Using a Mini-Batch SGD Neural Network for Alphabet Recognition!
Here we are using a neural network trained with mini-batch stochastic gradient descent for alphabet recognition, on the notMNIST data set mentioned previously.
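For reference, within each mini-batch of size m the weights and biases are nudged by the average gradient over that batch; this is the standard mini-batch SGD update rule, which the update_mini_batch method below implements:

\[
w \rightarrow w - \frac{\eta}{m}\sum_{x}\frac{\partial C_x}{\partial w},
\qquad
b \rightarrow b - \frac{\eta}{m}\sum_{x}\frac{\partial C_x}{\partial b}
\]

where η is the learning rate and the sum runs over the examples x in the mini-batch.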
First, we are going to build the neural network part.
It has an adjustable number of hidden neurons, layers, epochs, and mini-batch sizes, which will allow us to tune the accuracy later.
#network part
import random
import numpy as np

class Network(object):

    def __init__(self, sizes):
        # sizes: number of neurons in each layer, e.g. [2, 3, 1]
        # biases and weights are initialized randomly
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        # return the output of the network if a is the input
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        # train the neural network using mini-batch stochastic gradient descent
        # training_data is a list of tuples (x, y)
        if test_data:
            n_test = len(test_data)
        n = len(training_data)
        for j in xrange(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in xrange(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print "Epoch {0}: {1} / {2}".format(j, self.evaluate(test_data), n_test)
            else:
                print "Epoch {0} complete".format(j)

    def update_mini_batch(self, mini_batch, eta):
        # update the network's weights and biases by applying gradient descent
        # using backpropagation to a single mini-batch; eta is the learning rate
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        # return a tuple (nabla_b, nabla_w) representing the gradient of the cost function C_x;
        # nabla_b and nabla_w are layer-by-layer lists of numpy arrays, similar to biases and weights
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x]  # list to store all the activations, layer by layer
        zs = []  # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # l = 1 means the last layer, l = 2 the second-last layer, and so on
        for l in xrange(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        # return the number of test inputs for which
        # the neural network outputs the correct result
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        # return the vector of partial derivatives dC_x/da for the output activations
        return (output_activations - y)

# Miscellaneous functions
def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    # derivative of the sigmoid function
    return sigmoid(z) * (1-sigmoid(z))
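Because the layer sizes are just a list, we can try different architectures without touching the class itself. For example (a hypothetical configuration, not the one used in the run below), a network with two hidden layers of 100 and 30 neurons:

# a 784-100-30-10 network: 784 inputs, two hidden layers, 10 outputs (A to J)
net2 = Network([784, 100, 30, 10])
print net2.num_layers                   # 4
print [w.shape for w in net2.weights]   # [(100, 784), (30, 100), (10, 30)]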
Then, we import the data set.
#Get dataset
import cPickle as pickle
import os

def load_data():
    test_filename = "notMNIST.pickle"
    if os.path.exists(test_filename):
        with open(test_filename, 'rb') as f:
            letter_dataset = pickle.load(f)
        return letter_dataset

def vectorized_result(j):
    # return a 10-dimensional unit vector with a 1.0 in the jth position and zeroes elsewhere;
    # converts a letter label (A to J) into the corresponding desired output of the network
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e

def load_data_wrapper():
    # return a tuple (training_data, validation_data, test_data)
    # in a format that is more convenient for the network
    lt_dt = load_data()
    train_dataset = lt_dt['train_dataset']
    train_labels = lt_dt['train_labels']
    valid_dataset = lt_dt['valid_dataset']
    valid_labels = lt_dt['valid_labels']
    test_dataset = lt_dt['test_dataset']
    test_labels = lt_dt['test_labels']
    training_inputs = (np.reshape(x, (784, 1)) for x in train_dataset)
    training_results = (vectorized_result(y) for y in train_labels)
    training_data = zip(training_inputs, training_results)
    validation_inputs = [np.reshape(x, (784, 1)) for x in valid_dataset]
    validation_data = zip(validation_inputs, valid_labels)
    test_inputs = [np.reshape(x, (784, 1)) for x in test_dataset]
    test_data = zip(test_inputs, test_labels)
    return (training_data, validation_data, test_data)
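Just to illustrate the encoding (not part of the pipeline itself): the letter 'A' has label 0, so its desired output vector has a 1.0 in the first position, and np.argmax recovers the label again:

print vectorized_result(0).T            # [[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
print np.argmax(vectorized_result(3))   # 3, i.e. the letter 'D'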
Next, we set up a network with 30 hidden neurons.
We'll use stochastic gradient descent to learn from the training data over 30 epochs, with a mini-batch size of 10 and a learning rate of η = 3.0.
#load data
%time training_data, validation_data, test_data = load_data_wrapper()
#run network
%time net = Network([784, 30, 10])
%time net.SGD(training_data, 30, 10, 3.0, test_data=test_data)
Here are the results. At its peak (epoch 29), the trained network gives us a classification rate of about 91.87% (the world record is 99.79%)! That's quite encouraging as a first attempt.
Wall time: 1.04 s
Wall time: 0 ns
Epoch 0: 8950 / 10000
Epoch 1: 9030 / 10000
Epoch 2: 9065 / 10000
Epoch 3: 9074 / 10000
Epoch 4: 9097 / 10000
Epoch 5: 9072 / 10000
Epoch 6: 9129 / 10000
Epoch 7: 9114 / 10000
Epoch 8: 9108 / 10000
Epoch 9: 9124 / 10000
Epoch 10: 9117 / 10000
Epoch 11: 9142 / 10000
Epoch 12: 9161 / 10000
Epoch 13: 9140 / 10000
Epoch 14: 9169 / 10000
Epoch 15: 9168 / 10000
Epoch 16: 9142 / 10000
Epoch 17: 9168 / 10000
Epoch 18: 9171 / 10000
Epoch 19: 9169 / 10000
Epoch 20: 9171 / 10000
Epoch 21: 9178 / 10000
Epoch 22: 9186 / 10000
Epoch 23: 9158 / 10000
Epoch 24: 9157 / 10000
Epoch 25: 9184 / 10000
Epoch 26: 9172 / 10000
Epoch 27: 9172 / 10000
Epoch 28: 9185 / 10000
Epoch 29: 9187 / 10000
Wall time: 10min 49s
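As a quick extra check (not part of the run above, and assuming net and test_data are still in memory), we can ask the trained network to classify a single test image; labels 0 to 9 correspond to the letters A to J:

# classify one held-out example with the trained network
x, y = test_data[0]
prediction = np.argmax(net.feedforward(x))
print "predicted: {0}, actual: {1}".format(prediction, y)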
We are going to improve this network soon and explore more neural network types!! 😆😆