Alphabet recognition using the LogisticRegression model from sklearn.linear_model (Udacity Assignment 1)
Continuing from the previous assignment, where we saved pickled data from sample alphabet images, we will now extract that data and train a model so that it can recognize the letters.
from __future__ import print_function
import os
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from six.moves import cPickle as pickle
pickle_file = 'notMNIST.pickle'

def read_data(pickle_name, data_name):
    # Load one array (e.g. 'train_dataset') from the saved pickle file.
    # Returns None if the pickle file is missing.
    if os.path.exists(pickle_name):
        with open(pickle_name, 'rb') as f:
            letter_set = pickle.load(f)
        return letter_set[data_name]

train_dataset = read_data(pickle_file, 'train_dataset')
train_labels = read_data(pickle_file, 'train_labels')
valid_dataset = read_data(pickle_file, 'valid_dataset')
valid_labels = read_data(pickle_file, 'valid_labels')
test_dataset = read_data(pickle_file, 'test_dataset')
test_labels = read_data(pickle_file, 'test_labels')
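Before going further, a quick sanity check that each split loaded with the expected shape (the 28x28 image size carries over from the previous assignment):

print('Training set:', train_dataset.shape, train_labels.shape)
print('Validation set:', valid_dataset.shape, valid_labels.shape)
print('Test set:', test_dataset.shape, test_labels.shape)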
Let's train the model with 50, 100, 1000, and 5000 training samples and see how the results change.
# Display a random sample of eight images with their labels
pretty_labels = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J'}

def disp_sample_dataset(dataset, labels):
    items = random.sample(range(len(labels)), 8)
    for i, item in enumerate(items):
        plt.subplot(2, 4, i + 1)
        plt.axis('off')
        plt.title(pretty_labels[labels[item]])
        plt.imshow(dataset[item])
    plt.show()
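As an optional sanity check before training, we can call the helper on the raw training data; it simply shows eight random letters with their true labels:

disp_sample_dataset(train_dataset, train_labels)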
# Train with Logistic Regression; images are flattened from 28x28 to 784 features
regr = LogisticRegression()
X_test = test_dataset.reshape(test_dataset.shape[0], 28 * 28)
y_test = test_labels

sample_size = 50
X_train = train_dataset[:sample_size].reshape(sample_size, 28 * 28)
y_train = train_labels[:sample_size]
print('Sample size:', sample_size)
%time regr.fit(X_train, y_train)
print('Score:', regr.score(X_test, y_test))
pred_labels = regr.predict(X_test)
disp_sample_dataset(test_dataset, pred_labels)
sample_size = 100
X_train = train_dataset[:sample_size].reshape(sample_size, 28 * 28)
y_train = train_labels[:sample_size]
print('Sample size:', sample_size)
%time regr.fit(X_train, y_train)
print('Score:', regr.score(X_test, y_test))
pred_labels = regr.predict(X_test)
disp_sample_dataset(test_dataset, pred_labels)
sample_size = 1000
X_train = train_dataset[:sample_size].reshape(sample_size, 28 * 28)
y_train = train_labels[:sample_size]
print('Sample size:', sample_size)
%time regr.fit(X_train, y_train)
print('Score:', regr.score(X_test, y_test))
pred_labels = regr.predict(X_test)
disp_sample_dataset(test_dataset, pred_labels)
# Check the 1000-sample model on the validation set as well
X_valid = valid_dataset[:sample_size].reshape(sample_size, 28 * 28)
y_valid = valid_labels[:sample_size]
%time regr.score(X_valid, y_valid)
pred_labels = regr.predict(X_valid)
disp_sample_dataset(valid_dataset, pred_labels)
sample_size = 5000
X_train = train_dataset[:sample_size].reshape(sample_size, 28 * 28)
y_train = train_labels[:sample_size]
print('Sample size:', sample_size)
%time regr.fit(X_train, y_train)
print('Score:', regr.score(X_test, y_test))
pred_labels = regr.predict(X_test)
disp_sample_dataset(test_dataset, pred_labels)
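Since the same fit-and-score steps repeat for each sample size, they could be folded into a small helper; the function below is a sketch (the name train_and_eval is ours, not from the original):

def train_and_eval(sample_size):
    # Fit on the first `sample_size` training images, score on the test set.
    model = LogisticRegression()
    X = train_dataset[:sample_size].reshape(sample_size, 28 * 28)
    y = train_labels[:sample_size]
    model.fit(X, y)
    print('Sample size:', sample_size, '- score:', model.score(X_test, y_test))
    return model

for size in (50, 100, 1000, 5000):
    train_and_eval(size)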
Based on these outcomes, accuracy improves as the number of training samples grows. Training on the full dataset this way would be too slow, however; we can speed it up by switching to the faster SAG solver.
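A minimal sketch of that idea, assuming the full training set fits in memory; solver='sag' is a standard LogisticRegression option, and max_iter is raised because SAG can need more iterations to converge:

# Sketch: train on all samples with the SAG solver (assumes enough RAM)
X_all = train_dataset.reshape(train_dataset.shape[0], 28 * 28)
regr_sag = LogisticRegression(solver='sag', max_iter=100)
%time regr_sag.fit(X_all, train_labels)
print('Score:', regr_sag.score(X_test, y_test))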
Outcomes:
Hmm, it seems the accuracy is still not 100%.
Anyway, thanks to the sample solutions provided by khanhnamle1994, which we could refer to and learn from. GitHub: https://github.com/khanhnamle1994