Batch Normalization, Adam Optimizer, L2 Regularization, and Dropout¶
Training deep neural networks is complicated by the fact that the distribution of each layer's inputs changes during training as the parameters of the previous layers change. This slows down training by requiring lower learning rates and careful parameter initialization, and it makes models with saturating nonlinearities notoriously hard to train. This phenomenon is known as internal covariate shift, and batch normalization addresses it by normalizing the layer inputs. We will use the MNIST data set to test our implementation.
Article written by Soumil Shah
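Before diving into the code, here is a minimal, self-contained sketch (not part of the notebook's pipeline; all names below are illustrative) of what the batch normalization transform does to a mini-batch of pre-activations: subtract the per-neuron batch mean, divide by the batch standard deviation, then scale and shift with the learnable parameters gamma and beta.
In [ ]:
import numpy as np

s = np.random.randn(25, 50)      # illustrative mini-batch of pre-activations (25 examples x 50 neurons)
gamma_demo = np.ones(50)         # learnable scale (illustrative initialization)
beta_demo = np.zeros(50)         # learnable shift (illustrative initialization)
eps = 1e-8

mu = np.mean(s, axis=0)                    # per-neuron batch mean
var = np.var(s, axis=0)                    # per-neuron batch variance
s_hat = (s - mu) / np.sqrt(var + eps)      # normalized pre-activations
s_bn = gamma_demo * s_hat + beta_demo      # scaled and shifted output fed to the activation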
Step 1: Import the Necessary Library¶
In [ ]:
try:
    import os
    import sys
    import cv2
    import numpy as np
    from sklearn.utils import shuffle
    import matplotlib.pyplot as plt
    import time
    import datetime
    import math
    import cmath
except ImportError as e:
    print("Library not found: " + str(e))
Let us create lists for the epoch number and the loss so that we can append to them during training and plot the loss curve later with matplotlib. We also record the start time so we can measure how long training takes. A small plotting sketch follows the next cell.
In [ ]:
now_time = datetime.datetime.now()
x_data_epoch = []
y_data_error = []
train = np.empty((1000,28,28), dtype='float64')
trainY = np.zeros((1000,10))
test = np.empty((10000, 28, 28), dtype='float64')
testY = np.zeros((10000, 10))
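Note that the two lists defined above (x_data_epoch and y_data_error) are not filled anywhere in the code that follows; a minimal sketch of how the per-epoch loss could be appended and plotted with matplotlib (illustrative only, assuming the training loop appends to the lists) looks like this:
In [ ]:
# illustrative only: inside the epoch loop one would append
#     x_data_epoch.append(n)
#     y_data_error.append(loss)
# and after training, plot the recorded curve:
def plot_loss(epochs, losses):
    plt.plot(epochs, losses, marker='o')
    plt.xlabel('Epoch')
    plt.ylabel('Training loss')
    plt.title('Loss per epoch')
    plt.show()

plot_loss([0, 1, 2], [1.2, 0.8, 0.6])   # dummy data, just to show the call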
Step 2: Load Training Images¶
In [ ]:
i = 0
# read training data; the first character of each filename is the digit label
for filename in os.listdir('/Users/soumilshah/IdeaProjects/Deep Learning/MNIST /Training1000'):
    y = int(filename[0])
    trainY[i, y] = 1.0
    train[i] = cv2.imread('/Users/soumilshah/IdeaProjects/Deep Learning/MNIST /Training1000/{0}'.format(filename), 0)/255.0
    i = i + 1
Step 3: Load Test Images¶
In [ ]:
i = 0
# read test data; the first character of each filename is the digit label
for filename in os.listdir('/Users/soumilshah/IdeaProjects/Deep Learning/MNIST /Test10000'):
    y = int(filename[0])
    testY[i, y] = 1.0
    test[i] = cv2.imread('/Users/soumilshah/IdeaProjects/Deep Learning/MNIST /Test10000/{0}'.format(filename), 0)/255.0
    i = i + 1
Step 4: Declare the Parameters¶
In [ ]:
trainX = train.reshape(train.shape[0], train.shape[1]*train.shape[2])
testX = test.reshape(test.shape[0], test.shape[1] * test.shape[2])
In [ ]:
numNeuronsLayer1 = 50    # number of neurons in the hidden layer
numNeuronsLayer2 = 10    # number of output neurons
numEpochs = 20           # number of epochs
learningRate = 0.2       # learning rate
mini_batch_size = 25     # mini-batch size
Step 5: Define Weight Matrix and Bias Matrix¶
In [ ]:
w1 = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer1, 784))
b1 = np.random.uniform(low=-1, high=1, size=(numNeuronsLayer1))
w2 = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer2, numNeuronsLayer1))
b2 = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer2))
gamma = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer1))   # batch-norm scale
beta = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer1))    # batch-norm shift
Step 6: Define Optimization Parameters¶
In [ ]:
l2_reg = 1e-5
zero_out = np.random.binomial(n=1, p=0.8, size=(numNeuronsLayer1)) / 0.8   # inverted-dropout mask for the hidden layer
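The zero_out array is an inverted-dropout mask: each hidden unit is kept with probability 0.8, and the surviving activations are divided by 0.8 so their expected value stays the same. The tiny self-contained sketch below (illustrative names only) shows the idea; note that standard dropout usually resamples the mask for every mini-batch, whereas this notebook draws a single mask up front.
In [ ]:
import numpy as np

keep_prob = 0.8
acts = np.ones(10)                                               # dummy activations
mask = np.random.binomial(n=1, p=keep_prob, size=10) / keep_prob
dropped = acts * mask                                            # roughly 20% of units zeroed, the rest scaled up
print(dropped.mean())                                            # close to 1.0 in expectation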
Step 7: Define the Adam Parameters¶
In [ ]:
beta_momentum = 0.9
beta_rms = 0.99
l2_reg = 1e-5
epsilon = 1e-8
EPS = 1e-8
runningsigma2 = 0
runningmu = 0
sdw1 = np.zeros_like(w1)
sdw2 = np.zeros_like(w2)
sdb1 = np.zeros_like(b1)
sdb2 = np.zeros_like(b2)
vdw1 = np.zeros_like(w1)
vdw2 = np.zeros_like(w2)
vdb1 = np.zeros_like(b1)
vdb2 = np.zeros_like(b2)
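The vd* arrays hold the first-moment (momentum) estimates and the sd* arrays hold the second-moment (RMSProp) estimates that Adam keeps for every parameter. For reference, a minimal sketch of one standard Adam update on a single parameter array is shown below; the function name, the step counter t, and the default hyperparameters are illustrative and are not the notebook's values.
In [ ]:
import numpy as np

def adam_step(param, grad, v, s, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    """One standard Adam update; returns the updated parameter and moment estimates."""
    v = beta1 * v + (1 - beta1) * grad            # first moment (momentum)
    s = beta2 * s + (1 - beta2) * grad * grad     # second moment (RMS)
    v_hat = v / (1 - beta1 ** t)                  # bias-corrected first moment
    s_hat = s / (1 - beta2 ** t)                  # bias-corrected second moment
    return param - lr * v_hat / (np.sqrt(s_hat) + eps), v, s

# illustrative usage on a dummy parameter vector
w = np.zeros(3)
w, v, s = adam_step(w, np.array([0.1, -0.2, 0.3]), np.zeros(3), np.zeros(3), t=1)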
Now we write the outer for loop, which runs once per epoch. At the start of every epoch we reset the loss, shuffle the training data, and zero out the gradient accumulators.
In [ ]:
for n in range(0, numEpochs):
    loss = 0
    trainX, trainY = shuffle(trainX, trainY)
    gradw2 = np.zeros((numNeuronsLayer2, numNeuronsLayer1))
    gradw1 = np.zeros((numNeuronsLayer1, 784))
    gradb1 = np.zeros((numNeuronsLayer1))
    gradb2 = np.zeros((numNeuronsLayer2))
    dgamma = np.zeros((numNeuronsLayer1))
    dbeta = np.zeros((numNeuronsLayer1))
The inner for loop steps through the training set one mini-batch at a time.
Step 8: Forward Pass¶
In [ ]:
for i in range(0, int(trainX.shape[0]), mini_batch_size):   # 1000 samples / batch size 25 = 40 iterations per epoch
    X_train_mini = trainX[i:i + mini_batch_size]
    y_train_mini = trainY[i:i + mini_batch_size]
    s1 = np.dot(X_train_mini, w1.T) + b1
Add the batch normalization step: normalize the pre-activations with the mini-batch mean and variance, scale and shift with gamma and beta, and update the running mean and variance for use at test time.
In [ ]:
UB = np.mean(s1, axis=0)
sigma2 = np.var(s1,axis=0)
Shat = (s1 - UB)/np.sqrt(sigma2 + epsilon)
Sb = Shat * gamma + beta
runningmu = 0.9 * runningmu + (1 - 0.9) * UB
runningsigma2 = 0.9 * runningsigma2 + (1 - 0.9) * sigma2
a1 = 1 / (1 + np.exp(-1 * Sb))
Apply the dropout mask to the hidden activations, then compute the output layer. (The L2 regularization term is added to the loss in the next cell.)
In [ ]:
a1 = np.multiply(a1, zero_out)
s2 = np.dot(a1, w2.T) + b2
a2 = 1 / (1 + np.exp(-1 * s2))
Calculate the loss, including the L2 regularization cost.
In [ ]:
W = np.sum((w1)) + np.sum((w2))
reg_cost = 0.5 * l2_reg * W * W
loss = loss + (0.5 * np.multiply((y_train_mini - a2), (y_train_mini - a2))).sum() + reg_cost
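Here the regularization term is built from W, the plain sum of all weights. The more common L2 (weight-decay) penalty sums the squared weights instead; the line below shows that standard form for comparison only and is not the formula the notebook uses.
In [ ]:
# standard L2 penalty, shown for comparison only (the notebook keeps its own formulation above)
reg_cost_std = 0.5 * l2_reg * (np.sum(w1 * w1) + np.sum(w2 * w2))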
Step 9: Backpropagation¶
In [ ]:
derivative_error_2 = - (y_train_mini - a2)
derv_act_2 = np.multiply(a2, (1 - a2))
delta2 = np.multiply(derivative_error_2, derv_act_2)
derv_act_1 = np.multiply(a1, (1 - a1))
derv_act_1 = np.multiply(derv_act_1, zero_out)
error_2 = np.dot(delta2, w2)
delta1 = np.multiply(error_2, derv_act_1)
Calculate the delta propagated back through the batch normalization layer.
In [ ]:
deltabn = (delta1 *gamma)/(mini_batch_size * np.sqrt(sigma2 + epsilon)) * (mini_batch_size - 1 - (Shat *Shat))
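The deltabn expression above is a compact approximation of the batch-normalization backward pass. For reference, a sketch of the full input gradient derived from the normalization formula is given below (standard derivation with illustrative names that mirror the variables above; it is not the notebook's code).
In [ ]:
def batchnorm_backward(delta, s, mu, sigma2, gamma, eps=1e-8):
    """Standard batch-norm input gradient; delta is dLoss/dSb for the mini-batch."""
    N = delta.shape[0]
    dshat = delta * gamma
    dvar = np.sum(dshat * (s - mu), axis=0) * -0.5 * (sigma2 + eps) ** (-1.5)
    dmu = np.sum(-dshat / np.sqrt(sigma2 + eps), axis=0) + dvar * np.mean(-2.0 * (s - mu), axis=0)
    ds = dshat / np.sqrt(sigma2 + eps) + dvar * 2.0 * (s - mu) / N + dmu / N
    return ds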
Calculate the gradients for the weights, biases, gamma, and beta.
In [ ]:
gradw2 = np.dot(delta2.T, a1)
gradw1 = np.dot(deltabn.T, X_train_mini)
gradb2 = np.sum(delta2, axis=0)
gradb1 = np.sum(deltabn, axis=0)
In [ ]:
dbeta = np.sum(delta1, axis=0)
dgamma = np.sum(delta1 * Shat, axis=0)
Momentum
In [ ]:
# ---------------- MOMENTUM ------------------------------
vdw2 = beta_momentum * vdw2 + ((1 - beta_momentum) * gradw2)
vdw1 = beta_momentum * vdw1 + ((1 - beta_momentum) * gradw1)
vdb1 = beta_momentum * vdb1 + ((1 - beta_momentum) * gradb1)
vdb2 = beta_momentum * vdb2 + ((1 - beta_momentum) * gradb2)
RMSProp
In [ ]:
# RMS ------------------------------------------------------
sdw2 = beta_rms * sdw2 + ((1 - beta_rms) * (gradw2*gradw2))
sdw1 = beta_rms * sdw1 + ((1 - beta_rms) * (gradw1 * gradw1))
sdb1 = beta_rms * sdb1 + ((1 - beta_rms) * (gradb1*gradb1))
sdb2 = beta_rms * sdb2 + ((1 - beta_rms) * (gradb2 * gradb2))
Add bias correction to the first- and second-moment estimates.
In [ ]:
VDW1_CORRECTED = vdw1 / (1 - beta_momentum)
VDW2_CORRECTED = vdw2 / (1 - beta_momentum)
VDB1_CORRECTED = vdb1 / (1 - beta_momentum)
VDB2_CORRECTED = vdb2 / (1 - beta_momentum)
# ------------------------------------------
SDW1_CORRECTED = sdw1 / (1 - beta_rms)
SDW2_CORRECTED = sdw2 / (1 - beta_rms)
SDB1_CORRECTED = sdb1 / (1 - beta_rms)
SDB2_CORRECTED = sdb2 / (1 - beta_rms)
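The corrections above divide by the constant (1 - beta). Standard Adam instead divides by (1 - beta**t), where t counts the number of updates taken so far, so the correction is strong early in training and fades over time. A small sketch of that form, assuming a step counter t is maintained (the notebook does not keep one):
In [ ]:
def bias_correct(moment, beta, t):
    """Standard Adam bias correction: divide by (1 - beta**t), with t the update-step count."""
    return moment / (1 - beta ** t)

# illustrative usage with the notebook's constants (t would be incremented once per mini-batch):
# VDW1_CORRECTED = bias_correct(vdw1, beta_momentum, t)
# SDW1_CORRECTED = bias_correct(sdw1, beta_rms, t)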
Step 10: Update Weights, Biases, and the Other Parameters¶
In [ ]:
w2 = w2 - learningRate * (1/mini_batch_size) \
     * (VDW2_CORRECTED) * (1/(np.sqrt(SDW2_CORRECTED)+EPS)) \
     - (1/mini_batch_size) * learningRate * l2_reg * W
w1 = w1 - learningRate * (1/mini_batch_size) \
     * (VDW1_CORRECTED) * (1/(np.sqrt(SDW1_CORRECTED)+EPS)) \
     - (1/mini_batch_size) * learningRate * l2_reg * W
b1 = b1 - learningRate * (1/mini_batch_size) \
     * (VDB1_CORRECTED) * (1/(np.sqrt(SDB1_CORRECTED)+EPS)) \
     - (1/mini_batch_size) * learningRate * l2_reg * W
b2 = b2 - learningRate * (1/mini_batch_size) \
     * (VDB2_CORRECTED) * (1/(np.sqrt(SDB2_CORRECTED)+EPS)) \
     - (1/mini_batch_size) * learningRate * l2_reg * W
beta = beta - learningRate * dbeta
gamma = gamma - learningRate * dgamma
Print Loss
In [ ]:
print("epoch = " + str(n) + " loss = " + (str(loss)))
In [ ]:
print("done training , starting testing..")
accuracyCount = 0
Step 11: Test the Network¶
In [ ]:
for i in range(testY.shape[0]):
    s1 = np.dot(w1, testX[i]) + b1
    mu = runningmu
    sigma2 = runningsigma2
    Shat = (s1 - mu) / np.sqrt(sigma2 + EPS)
    sb = Shat * gamma + beta
    a1 = 1 / (1 + np.exp(-1 * sb))   # np.exp operates on the whole array
    s2 = np.dot(w2, a1) + b2
    a2 = 1 / (1 + np.exp(-1 * s2))
    a2index = a2.argmax(axis=0)      # index of the most active of the 10 output neurons
    if testY[i, a2index] == 1:
        accuracyCount = accuracyCount + 1
print("Accuracy count = " + str(accuracyCount/10000.0))
Putting It All Together¶
In [3]:
try:
    import os
    import sys
    import cv2
    import numpy as np
    from sklearn.utils import shuffle
    import matplotlib.pyplot as plt
    import time
    import datetime
    import math
    import cmath
except ImportError as e:
    print("Library not found: " + str(e))
now_time = datetime.datetime.now()
x_data_epoch = []
y_data_error = []
train = np.empty((1000,28,28), dtype='float64')
trainY = np.zeros((1000,10))
test = np.empty((10000, 28, 28), dtype='float64')
testY = np.zeros((10000, 10))
# --------------------------------------------------------Load in Image--------------------------------------
i = 0
for filename in os.listdir('/Users/soumilshah/IdeaProjects/Deep Learning/MNIST /Training1000'):
    y = int(filename[0])
    trainY[i, y] = 1.0
    train[i] = cv2.imread('/Users/soumilshah/IdeaProjects/Deep Learning/MNIST /Training1000/{0}'.format(filename), 0)/255.0
    i = i + 1
# -------------------------------------------------LOAD TEST IMAGE ------------------------------------------------
i = 0
# read test data
for filename in os.listdir('/Users/soumilshah/IdeaProjects/Deep Learning/MNIST /Test10000'):
    y = int(filename[0])
    testY[i, y] = 1.0
    test[i] = cv2.imread('/Users/soumilshah/IdeaProjects/Deep Learning/MNIST /Test10000/{0}'.format(filename), 0)/255.0
    i = i + 1
# ---------------------------------------------------------------------------------------------------------------------
trainX = train.reshape(train.shape[0], train.shape[1]*train.shape[2])
testX = test.reshape(test.shape[0], test.shape[1] * test.shape[2])
# --------------------------------- Neural Network Parameter ------------------------------------
numNeuronsLayer1 = 50    # number of neurons in the hidden layer
numNeuronsLayer2 = 10    # number of output neurons
numEpochs = 20           # number of epochs
learningRate = 0.2       # learning rate
mini_batch_size = 25     # mini-batch size
w1 = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer1, 784))
b1 = np.random.uniform(low=-1, high=1, size=(numNeuronsLayer1))
w2 = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer2, numNeuronsLayer1))
b2 = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer2))
gamma = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer1))   # batch-norm scale
beta = np.random.uniform(low=-0.1, high=0.1, size=(numNeuronsLayer1))    # batch-norm shift
zero_out = np.random.binomial(n=1, p=0.8, size=(numNeuronsLayer1)) / 0.8
beta_momentum = 0.9
beta_rms = 0.99
l2_reg = 1e-5
epsilon = 1e-8
EPS = 1e-8
runningsigma2 = 0
runningmu = 0
sdw1 = np.zeros_like(w1)
sdw2 = np.zeros_like(w2)
sdb1 = np.zeros_like(b1)
sdb2 = np.zeros_like(b2)
vdw1 = np.zeros_like(w1)
vdw2 = np.zeros_like(w2)
vdb1 = np.zeros_like(b1)
vdb2 = np.zeros_like(b2)
# =======================================================================================================
for n in range(0, numEpochs):
    loss = 0
    trainX, trainY = shuffle(trainX, trainY)
    gradw2 = np.zeros((numNeuronsLayer2, numNeuronsLayer1))
    gradw1 = np.zeros((numNeuronsLayer1, 784))
    gradb1 = np.zeros((numNeuronsLayer1))
    gradb2 = np.zeros((numNeuronsLayer2))
    dgamma = np.zeros((numNeuronsLayer1))
    dbeta = np.zeros((numNeuronsLayer1))
    for i in range(0, int(trainX.shape[0]), mini_batch_size):   # 1000 samples / batch size 25 = 40 iterations per epoch
        X_train_mini = trainX[i:i + mini_batch_size]
        y_train_mini = trainY[i:i + mini_batch_size]
        s1 = np.dot(X_train_mini, w1.T) + b1
        # ------------------------------- batch normalization
        UB = np.mean(s1, axis=0)
        sigma2 = np.var(s1, axis=0)
        Shat = (s1 - UB)/np.sqrt(sigma2 + epsilon)
        Sb = Shat * gamma + beta
        runningmu = 0.9 * runningmu + (1 - 0.9) * UB
        runningsigma2 = 0.9 * runningsigma2 + (1 - 0.9) * sigma2
        a1 = 1 / (1 + np.exp(-1 * Sb))
        a1 = np.multiply(a1, zero_out)
        s2 = np.dot(a1, w2.T) + b2
        a2 = 1 / (1 + np.exp(-1 * s2))
        W = np.sum(w1) + np.sum(w2)
        reg_cost = 0.5 * l2_reg * W * W
        loss = loss + (0.5 * np.multiply((y_train_mini - a2), (y_train_mini - a2))).sum() + reg_cost
        # ======= BACK PROPAGATION ==================================
        derivative_error_2 = - (y_train_mini - a2)
        derv_act_2 = np.multiply(a2, (1 - a2))
        delta2 = np.multiply(derivative_error_2, derv_act_2)
        derv_act_1 = np.multiply(a1, (1 - a1))
        derv_act_1 = np.multiply(derv_act_1, zero_out)
        error_2 = np.dot(delta2, w2)
        delta1 = np.multiply(error_2, derv_act_1)
        deltabn = (delta1 * gamma)/(mini_batch_size * np.sqrt(sigma2 + epsilon)) * (mini_batch_size - 1 - (Shat * Shat))
        gradw2 = np.dot(delta2.T, a1)
        gradw1 = np.dot(deltabn.T, X_train_mini)
        gradb2 = np.sum(delta2, axis=0)
        gradb1 = np.sum(deltabn, axis=0)
        dbeta = np.sum(delta1, axis=0)
        dgamma = np.sum(delta1 * Shat, axis=0)
        # ---------------- MOMENTUM ------------------------------
        vdw2 = beta_momentum * vdw2 + ((1 - beta_momentum) * gradw2)
        vdw1 = beta_momentum * vdw1 + ((1 - beta_momentum) * gradw1)
        vdb1 = beta_momentum * vdb1 + ((1 - beta_momentum) * gradb1)
        vdb2 = beta_momentum * vdb2 + ((1 - beta_momentum) * gradb2)
        # RMS ------------------------------------------------------
        sdw2 = beta_rms * sdw2 + ((1 - beta_rms) * (gradw2*gradw2))
        sdw1 = beta_rms * sdw1 + ((1 - beta_rms) * (gradw1 * gradw1))
        sdb1 = beta_rms * sdb1 + ((1 - beta_rms) * (gradb1*gradb1))
        sdb2 = beta_rms * sdb2 + ((1 - beta_rms) * (gradb2 * gradb2))
        VDW1_CORRECTED = vdw1 / (1 - beta_momentum)
        VDW2_CORRECTED = vdw2 / (1 - beta_momentum)
        VDB1_CORRECTED = vdb1 / (1 - beta_momentum)
        VDB2_CORRECTED = vdb2 / (1 - beta_momentum)
        # ------------------------------------------
        SDW1_CORRECTED = sdw1 / (1 - beta_rms)
        SDW2_CORRECTED = sdw2 / (1 - beta_rms)
        SDB1_CORRECTED = sdb1 / (1 - beta_rms)
        SDB2_CORRECTED = sdb2 / (1 - beta_rms)
        # ====================== after the mini-batch is done, update the weights ===============================
        w2 = w2 - learningRate * (1/mini_batch_size) \
             * (VDW2_CORRECTED) * (1/(np.sqrt(SDW2_CORRECTED)+EPS)) \
             - (1/mini_batch_size) * learningRate * l2_reg * W
        w1 = w1 - learningRate * (1/mini_batch_size) \
             * (VDW1_CORRECTED) * (1/(np.sqrt(SDW1_CORRECTED)+EPS)) \
             - (1/mini_batch_size) * learningRate * l2_reg * W
        b1 = b1 - learningRate * (1/mini_batch_size) \
             * (VDB1_CORRECTED) * (1/(np.sqrt(SDB1_CORRECTED)+EPS)) \
             - (1/mini_batch_size) * learningRate * l2_reg * W
        b2 = b2 - learningRate * (1/mini_batch_size) \
             * (VDB2_CORRECTED) * (1/(np.sqrt(SDB2_CORRECTED)+EPS)) \
             - (1/mini_batch_size) * learningRate * l2_reg * W
        beta = beta - learningRate * dbeta
        gamma = gamma - learningRate * dgamma
    # ====================================
    print("epoch = " + str(n) + " loss = " + (str(loss)))
print("done training , starting testing..")
accuracyCount = 0
# -------------------------------------- Testing Neural Network ------------------------------------------------------
for i in range(testY.shape[0]):
    s1 = np.dot(w1, testX[i]) + b1
    mu = runningmu
    sigma2 = runningsigma2
    Shat = (s1 - mu) / np.sqrt(sigma2 + EPS)
    sb = Shat * gamma + beta
    a1 = 1 / (1 + np.exp(-1 * sb))   # np.exp operates on the whole array
    s2 = np.dot(w2, a1) + b2
    a2 = 1 / (1 + np.exp(-1 * s2))
    a2index = a2.argmax(axis=0)      # index of the most active of the 10 output neurons
    if testY[i, a2index] == 1:
        accuracyCount = accuracyCount + 1
print("Accuracy count = " + str(accuracyCount/10000.0))