Monday, 26 April 2021

Unable to bring down the training loss in a multi-layer perceptron

I have implemented a multilayer perceptron from scratch in Python using NumPy. I have gone through the implementation many times but cannot figure out what is causing the loss to remain almost constant. Frankly, I don't even know whether it is a bug in the network or something else.

The dataset is CIFAR-10.

Here is my neural network code:

from typing import Sequence

import numpy as np


class NeuralNetwork:
    def __init__(
        self,
        input_size: int,
        hidden_sizes: Sequence[int],
        output_size: int,
        num_layers: int,
        reg: float,
        optim: str
    ):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.num_layers = num_layers
        self.reg_const = reg

        if optim != 'sgd' and optim != 'adam':
            print("Invalid choice of optimizer. Please choose from 'sgd' or 'adam'")
            return
        self.optim = optim

        # adam related vars
        if self.optim == 'adam':
            self.eta = 0.01
            self.beta1 = 0.9
            self.beta2 = 0.999
            self.epsilon = 1e-8
            self.fm = {}
            self.sm = {}

        assert len(hidden_sizes) == (num_layers - 1)

        sizes = [input_size] + hidden_sizes + [output_size]

        self.params = {}
        for i in range(1, num_layers + 1):
            self.params["W" + str(i)] = np.random.randn(
                sizes[i - 1], sizes[i]
            ) * np.sqrt(2 / sizes[i - 1])
            self.params["b" + str(i)] = np.zeros((sizes[i], 1))

        # moment initialization for the layers
        if self.optim == 'adam':
            for i in range(1, num_layers + 1):
                self.fm['f' + str(i)] = None
                self.sm['s' + str(i)] = None

    def linear(self, W: np.ndarray, X: np.ndarray, b: np.ndarray) -> np.ndarray:
        return np.dot(W.T, X) + b

    def relu(self, X: np.ndarray) -> np.ndarray:
        return np.maximum(X, np.zeros_like(X))

    def softmax(self, X: np.ndarray) -> np.ndarray:
        return np.exp(X - np.max(X)) / sum(np.exp(X))

    def forward(self, X: np.ndarray) -> np.ndarray:
        self.outputs = {}
        # For input layer to first hidden layer
        # OL - output of linear layer
        # OR - output of Relu layer
        # OS - output of softmax layer
        self.outputs["OL" + str(1)] = self.linear(self.params["W" + str(1)],
                                                  X, self.params["b" + str(1)])
        self.outputs["OR" + str(1)] = self.relu(self.outputs["OL" + str(1)])

        # For all subsequent layers
        for i in range(2, self.num_layers + 1):
            self.outputs["OL" + str(i)] = self.linear(self.params["W" + str(i)],
                                                      self.outputs["OR" + str(i - 1)],
                                                      self.params["b" + str(i)])
            self.outputs["OR" + str(i)] = self.relu(self.outputs["OL" + str(i)])

        self.outputs["OS" + str(self.num_layers + 1)] = self.softmax(
            self.outputs["OR" + str(self.num_layers)])

        return self.outputs["OS" + str(self.num_layers + 1)]

    def softmax_grad(self, s: np.ndarray,
                     X_in: np.ndarray, y: np.ndarray) -> np.ndarray:
        ds = s.copy()
        ds[y.astype(np.int32), len(y) - 1] -= 1
        # grad = np.dot(X_in, ds.T) / X_in.shape[0]
        return ds

    def relu_grad(self, grad: np.ndarray, inp: np.ndarray) -> np.ndarray:
        return inp.T > 0

    def linear_grad(self, h: np.ndarray, grad: np.ndarray,
                    W: np.ndarray, b: np.ndarray, layer: int, lr: float,
                    reg: float,
                    t: int) -> np.ndarray:
        local_grad = (np.dot(h, grad)) / grad.shape[1]
        upstream_grad = np.dot(W, grad.T) / grad.shape[1]
        temp = np.average(grad, axis=0).reshape((-1, 1))

        if self.optim == 'sgd':     # sgd implementation
            # parameter updates
            t1 = lr * local_grad
            t2 = self.reg_const * np.sum(self.params["W" + str(layer)])
            self.params["W" + str(layer)] -= t1 + t2

            self.params["b" + str(layer)] -= lr * temp
        else:       # adam implementation
            # moment initialization if first iter
            if self.fm['f' + str(layer)] is None:
                self.fm['f' + str(layer)] = np.zeros_like(local_grad)
                self.sm['s' + str(layer)] = np.zeros_like(local_grad)

            fms = self.fm['f' + str(layer)]
            sms = self.sm['s' + str(layer)]

            fms = self.beta1 * fms + (1 - self.beta1) * local_grad
            sms = self.beta2 * sms + (1 - self.beta2) * local_grad**2

            self.fm['f' + str(layer)] = fms
            self.sm['s' + str(layer)] = sms

            # bias correction
            fm_unbias = fms / (1 - self.beta1**(t + 1))
            sm_unbias = sms / (1 - self.beta2**(t + 1))

            self.params["W" + str(layer)] -= lr * fm_unbias / (np.sqrt(sm_unbias) + 1e-7)

        return upstream_grad

    def backward(self, X: np.ndarray, y: np.ndarray,
                 lr: float, reg: float, t: int) -> float:
        self.gradients = {}
        loss = 0.0
        # one-hot encoding for calculating loss
        one_hot = np.zeros((y.size, 10))
        rows = np.arange(y.size)
        one_hot[rows, y] = 1
        y = one_hot

        # Loss computation
        softmax_layer = self.outputs["OS" + str(self.num_layers + 1)]
        p = softmax_layer[y.astype(np.int32), len(y) - 1]
        loss = -np.log(p)
        loss = np.sum(loss) / len(y) + (2 * self.reg_const * np.sum(self.params["W" + str(self.num_layers)]**2))

        self.gradients["G" + str(self.num_layers + 1)] = self.softmax_grad(
            softmax_layer, self.outputs["OR" + str(self.num_layers)], y)
        for i in range(self.num_layers, 1, -1):
            self.gradients["G" + str(i)] = self.relu_grad(self.gradients["G" + str(i + 1)],
                                                          self.outputs["OL" + str(i)])
            self.gradients["G" + str(i)] = self.linear_grad(self.outputs["OR" + str(i - 1)],
                                                            self.gradients["G" + str(i)],
                                                            self.params["W" + str(i)],
                                                            self.params["b" + str(i)],
                                                            i, lr, reg, t)
        self.gradients["G" + str(1)] = self.relu_grad(self.gradients["G" + str(2)],
                                                      self.outputs["OL" + str(1)])
        self.gradients["G" + str(1)] = self.linear_grad(X,
                                                        self.gradients["G" + str(1)],
                                                        self.params["W" + str(1)],
                                                        self.params["b" + str(1)],
                                                        1, lr, reg, t)
        return loss
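
As a quick shape check on the forward pass I use something like this (a rough sketch, just pushing random data through with the same constructor arguments as below; it says nothing about whether the gradients are right):

check_net = NeuralNetwork(32 * 32 * 3, [64] * 5, 10, 6, 1e-6, 'adam')
X_dummy = np.random.rand(32 * 32 * 3, 8)  # 8 fake samples, one per column
probs = check_net.forward(X_dummy)
print(probs.shape)         # expect (10, 8): one column of class scores per sample
print(probs.sum(axis=0))   # each column should sum to 1 if the softmax is correct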

The network is initialized like this:

# Hyperparameters
input_size = 32 * 32 * 3
num_layers = 6
hidden_size = 64
hidden_sizes = [hidden_size] * (num_layers - 1)
num_classes = 10
epochs = 100
batch_size = 128
learning_rate = 1e-4
reg = 1e-6

# Change the optimizer here
optimizer = 'adam'  # 'sgd' or 'adam'

# Initialize a new neural network model
net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers, reg, optimizer)

# Variables to store performance for each epoch
train_loss = np.zeros(epochs)
train_accuracy = np.zeros(epochs)
val_accuracy = np.zeros(epochs)

This is the training module:

from random import shuffle  # used below to permute the sample indices in place

# For each epoch...
for epoch in range(epochs):
    print('epoch:', epoch)
    
    # Shuffle the dataset
    shuffled_indices = [i for i in range(X_train.shape[0])]
    shuffle(shuffled_indices)
    X_train_new = X_train[shuffled_indices, :]    # shuffling samples
    y_train_new = y_train[shuffled_indices,]  # shuffling labels
    
    # Training
    # For each mini-batch...
    for batch in range(TRAIN_IMAGES // batch_size):
        # Create a mini-batch of training data and labels
        X_batch = X_train_new[batch * batch_size : (batch + 1) * batch_size, :]
        y_batch = y_train_new[batch * batch_size : (batch + 1) * batch_size,].astype(int)
        X_batch = X_batch.astype(float) / 255.
        X_batch = X_batch.T

        # Run the forward pass of the model to get a prediction and compute the accuracy
        res = net.forward(X_batch)
        res = np.argmax(res, axis = 0)
        acc = 0
        for i in range(len(res)):
            if res[i] == y_batch[i]:
                acc += 1
        # Run the backward pass of the model to update the weights and compute the loss
        train_loss[epoch] += net.backward(X_batch, y_batch, learning_rate, reg, epoch)
        train_accuracy[epoch] += acc
    print("Epoch {}: Training accuracy: {:.3f}".format(epoch, train_accuracy[epoch] / (TRAIN_IMAGES // batch_size)))
    print("Epoch {}: Training loss: {:.3f}".format(epoch, train_loss[epoch] / (TRAIN_IMAGES // batch_size)))
    # Validation
    # No need to run the backward pass here, just run the forward pass to compute accuracy
    val_accuracy[epoch] += 0
    val_acc = 0
    for batch in range(VAL_IMAGES // batch_size):
        X_batch = X_val[batch * batch_size : (batch + 1) * batch_size, :]
        y_batch = y_val[batch * batch_size : (batch + 1) * batch_size,]
        X_batch = X_batch.T
        res = net.forward(X_batch)
        res = np.argmax(res, axis = 0)
        for i in range(len(res)):
            if res[i] == y_batch[i]:
                val_acc += 1
    val_accuracy[epoch] += val_acc / (VAL_IMAGES // batch_size)
    print("Epoch {} Validation accuracy: {:.3f}".format(epoch, val_accuracy[epoch]))

This is what the output of the training looks like:

epoch: 0
Epoch 0: Training accuracy: 12.770
Epoch 0: Training loss: 23.072
Epoch 0 Validation accuracy: 14.714
epoch: 1
Epoch 1: Training accuracy: 12.848
Epoch 1: Training loss: 23.040
Epoch 1 Validation accuracy: 14.714
epoch: 2
Epoch 2: Training accuracy: 12.969
Epoch 2: Training loss: 23.035
Epoch 2 Validation accuracy: 14.571
epoch: 3
Epoch 3: Training accuracy: 12.838
Epoch 3: Training loss: 23.033
Epoch 3 Validation accuracy: 14.857
epoch: 4
Epoch 4: Training accuracy: 12.796
Epoch 4: Training loss: 23.031
Epoch 4 Validation accuracy: 14.714
epoch: 5
Epoch 5: Training accuracy: 12.510
Epoch 5: Training loss: 23.030
Epoch 5 Validation accuracy: 15.000

These are just the first 5 epochs, but the numbers barely change. I tried changing the hyperparameters, and I also tried increasing the number of layers and the number of neurons per layer, but nothing changes much. This led me to believe that there is a bug somewhere in my network.
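
One thing that already looks suspicious to me: with 10 classes and a near-uniform softmax, the initial cross-entropy loss should be roughly -ln(1/10) ≈ 2.30 per sample, but mine starts around 23, about ten times that. A tiny check of that expectation (independent of the network code):

import numpy as np
expected_initial_loss = -np.log(1.0 / 10)   # 10 classes
print(expected_initial_loss)                # ~2.3026, versus the ~23.07 reported above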

Kindly tell me where I am going wrong, and please explain the bug; it would be a huge help. Thank you.

This is the dataset configuration:

TRAIN_IMAGES = 49000
VAL_IMAGES = 1000
TEST_IMAGES = 10000

data = get_CIFAR10_data(TRAIN_IMAGES, VAL_IMAGES, TEST_IMAGES) # you can load this from keras
X_train, y_train = data['X_train'], data['y_train']
X_val, y_val = data['X_val'], data['y_val']
X_test, y_test = data['X_test'], data['y_test']
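
In case it matters, this is roughly how the same arrays could be loaded straight from keras instead of the get_CIFAR10_data helper (a sketch; I am assuming the helper just flattens the images and splits the keras training set into train/val):

from tensorflow.keras.datasets import cifar10

(X_full, y_full), (X_test_raw, y_test_raw) = cifar10.load_data()
X_full = X_full.reshape(X_full.shape[0], -1)              # flatten each 32x32x3 image to 3072 values
X_test_raw = X_test_raw.reshape(X_test_raw.shape[0], -1)
y_full, y_test_raw = y_full.flatten(), y_test_raw.flatten()

X_train, y_train = X_full[:TRAIN_IMAGES], y_full[:TRAIN_IMAGES]
X_val, y_val = X_full[TRAIN_IMAGES:TRAIN_IMAGES + VAL_IMAGES], y_full[TRAIN_IMAGES:TRAIN_IMAGES + VAL_IMAGES]
X_test, y_test = X_test_raw[:TEST_IMAGES], y_test_raw[:TEST_IMAGES]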


