Multi-layer perceptron program is super slow


I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. The culprit seems to be my compute_gradients function, which according to my investigation accounts for most of the execution time. It looks like this:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
    # Y must be one-hot
    # Y and P must be (10, n)
    # X and H must be (3072, n)
    # P is the softmax layer
    if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
        raise ValueError("Y and P must have shape (10, k). Y: {}, P: {}".format(Y.shape, P.shape))
    if not X.shape[0] == n_input:
        raise ValueError("X must have shape ({}, k), has {}".format(n_input, X.shape))
    if not H.shape[0] == n_hidden:
        raise ValueError("H must have shape ({}, k)".format(n_hidden))
    grad_W1 = np.zeros([n_hidden, n_input])
    grad_W2 = np.zeros([10, n_hidden])
    grad_b1 = np.zeros([n_hidden, 1])
    grad_b2 = np.zeros([10, 1])

    X, Y, H, P = X.T, Y.T, H.T, P.T
    for x, y, s1, h, p in zip(X, Y, S1, H, P):
        h = np.reshape(h, [1, n_hidden])
        y = np.reshape(y, [10, 1])
        p = np.reshape(p, [10, 1])

        # Second layer
        g = -(y-p).T
        grad_b2 += g.T
        grad_W2 += np.matmul(g.T, h)
        # First layer
        g = np.matmul(g, W2)
        ind = np.zeros(h.shape[1])
        for i, val in enumerate(s1):
            if val > 0:
                ind[i] = 1
        diag = np.diag(ind)

        g = np.matmul(g, diag)

        grad_b1 += g.T
        grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
    # Divide by batch size
    grad_b1 /= X.shape[0]
    grad_b2 /= X.shape[0]
    grad_W1 /= X.shape[0]
    grad_W2 /= X.shape[0]
    # Add regularization term
    grad_W1 += 2*lamb*W1
    grad_W2 += 2*lamb*W2
    return grad_W1, grad_W2, grad_b1, grad_b2


If X, Y, H and P each hold 10 samples (n = 10), the computation takes about 1 second. This is way too slow compared to my friends who are doing the same task, but I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?



EDIT: Here's a runnable example. You also need the CIFAR dataset, found here: https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz .



import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time

n_input = 3072
n_hidden = 3072

def one_hot(Y):
    # assume Y = [1, 4, 9, 0, ...]
    result = [None]*len(Y)
    for i, cls in enumerate(Y):
        onehot = {
            0: lambda: [1,0,0,0,0,0,0,0,0,0],
            1: lambda: [0,1,0,0,0,0,0,0,0,0],
            2: lambda: [0,0,1,0,0,0,0,0,0,0],
            3: lambda: [0,0,0,1,0,0,0,0,0,0],
            4: lambda: [0,0,0,0,1,0,0,0,0,0],
            5: lambda: [0,0,0,0,0,1,0,0,0,0],
            6: lambda: [0,0,0,0,0,0,1,0,0,0],
            7: lambda: [0,0,0,0,0,0,0,1,0,0],
            8: lambda: [0,0,0,0,0,0,0,0,1,0],
            9: lambda: [0,0,0,0,0,0,0,0,0,1],
        }[cls]()
        result[i] = onehot

    result = np.array(result).T
    return result
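# Note: the table of lambdas above is one way to do it; an equivalent, shorter
# alternative (assuming the labels are integers 0-9) would be to index an
# identity matrix, e.g.:
#
#     def one_hot(Y):
#         return np.eye(10, dtype=int)[np.asarray(Y)].T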


def unpickle(file):
    import pickle
    with open(file, "rb") as fo:
        d = pickle.load(fo, encoding="bytes")
    return d

path = "../../datasets/cifar/"

names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]

dataset_raw = unpickle(os.path.join(path, names[0]))
dataset_small = {}
dataset_small["data"] = dataset_raw[b"data"]/255
dataset_small["labels"] = one_hot(dataset_raw[b"labels"]).T

validation_set_raw = unpickle(os.path.join(path, names[1]))
validation_small = {}
validation_small["data"] = validation_set_raw[b"data"]/255
validation_small["labels"] = one_hot(validation_set_raw[b"labels"]).T

print("small dataset data shape: {}".format(dataset_small["data"].shape))
print("small dataset labels shape: {}".format(dataset_small["labels"].shape))
print("small validation data shape: {}".format(validation_small["data"].shape))
print("small validation labels shape: {}".format(validation_small["labels"].shape))

test_set_raw = unpickle(os.path.join(path, "test_batch"))
X_test = test_set_raw[b"data"]
Y_test = one_hot(test_set_raw[b"labels"])

# All data sets

dataset_large = {"data": np.zeros([0, 3072]), "labels": np.array([])}
validation_large = {}

## All data batches

for name in names[1:4]:
    raw = unpickle(os.path.join(path, name))
    dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
    dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255

print("large dataset data: {}".format(dataset_large["data"].shape))
print("large dataset labels: {}".format(dataset_large["labels"].shape))
print("large validation set data: {}".format(validation_large["data"].shape))
print("large validation set labels: {}".format(validation_large["labels"].shape))

def softmax(s):
    numerator = np.exp(s)
    denominator = np.matmul(np.ones(10).T, np.exp(s))
    return numerator/denominator
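# Note: np.exp(s) can overflow for large scores. A numerically stable variant
# of the same (10, n) softmax would subtract the per-column maximum first, e.g.:
#
#     def softmax(s):
#         e = np.exp(s - s.max(axis=0, keepdims=True))
#         return e / e.sum(axis=0, keepdims=True)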

def relu(s1):
    h = np.maximum(np.zeros_like(s1), s1)
    return h

def layer_1(X, W1, b1):
    # WARNING: X must have shape (3072, n)
    # W1 must have shape (3072, 3072)
    # b1 must have shape (3072, 1)
    # s1 and h will get shape (3072, n)
    # Returns s1 and h

    if not X.shape[0] == n_input:
        raise ValueError("Got wrong shape of X: {}".format(X.shape))

    s1 = np.matmul(W1, X) + b1

    if not s1.shape == (n_hidden, X.shape[1]):
        raise ValueError("Got wrong shape of s1: {}".format(s1.shape))

    h = relu(s1)

    if not h.shape == (n_hidden, X.shape[1]):
        raise ValueError("Got wrong shape of h: {}".format(h.shape))

    return s1, h

def layer_2(h, W2, b2):
    # WARNING: h must have shape (3072, n)
    # W2 must have shape (10, 3072)
    # b2 must have shape (10, 1)
    # s will get shape (10, n)
    # Returns p with shape (10, n)

    s = np.matmul(W2, h) + b2

    if not s.shape == (10, h.shape[1]):
        raise ValueError("Got wrong shape of s: {}".format(s.shape))

    p = softmax(s)

    if not p.shape == (10, h.shape[1]):
        raise ValueError("Got wrong shape of p: {}".format(p.shape))

    return p

def evaluate_classifier(X, W1, W2, b1, b2):
    if not X.shape[0] == n_input:
        raise ValueError("Wrong shape of X: {}".format(X.shape))
    if not len(X.shape) == 2:
        raise ValueError("Wrong shape of X: {}".format(X.shape))
    if not W1.shape == (n_hidden, n_input):
        raise ValueError("Wrong shape of W1: {}".format(W1.shape))
    if not b1.shape == (n_hidden, 1):
        raise ValueError("Wrong shape of b1: {}".format(b1.shape))
    if not W2.shape == (10, n_hidden):
        raise ValueError("Wrong shape of W2: {}".format(W2.shape))
    if not b2.shape == (10, 1):
        raise ValueError("Wrong shape of b2: {}".format(b2.shape))

    s1, h = layer_1(X, W1, b1)

    p = layer_2(h, W2, b2)

    return s1, h, p

def compute_cost(X, Y, W1, W2, b1, b2, lamb = 0):
    # WARNING! Y must be one hot!
    # X must have shape (3072, -1)
    # Y and P must have shape (-1, 10)

    if not (Y.shape[1] == 10 and len(Y.shape) == 2):
        raise ValueError("Wrong shape of Y: {}. Must be (-1, 10)".format(Y.shape))
    if not X.shape[0] == 3072:
        raise ValueError("Wrong shape of X: {}. Must be (3072, -1)".format(X.shape))
    _, _, P = evaluate_classifier(X, W1, W2, b1, b2)
    P = P.T

    if not (P.shape[1] == 10 and len(P.shape) == 2):
        raise ValueError("Wrong shape of P: {}. Must be (-1, 10)".format(P.shape))
    if not P.shape == Y.shape:
        raise ValueError("Y and P must have the same shape. {} != {}".format(Y.shape, P.shape))

    l_cross = 0
    for i in range(X.shape[1]):
        y = Y[i]
        p = P[i]
        y = np.reshape(y, [1, 10])
        p = np.reshape(p, [10, 1])
        l_cross += -np.log(np.matmul(y, p))[0][0]
    avg_loss = l_cross/X.shape[1]
    reg = lamb * (np.sum(W1) + np.sum(W2))

    return avg_loss + reg
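# Note: with Y and P both laid out as (n, 10), the per-sample loop above can be
# collapsed into a single vectorized expression, e.g.:
#
#     l_cross = -np.sum(np.log(np.sum(Y * P, axis=1)))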

def compute_accuracy(X, Y, W1, W2, b1, b2):
    # X must have shape (3072, n)
    # Y must have shape (10, n)
    if not X.shape[0] == 3072:
        raise ValueError("X must have shape (3072, n)")
    try:
        X.shape[1]
    except IndexError:
        raise ValueError("X must have shape (3072, n)")
    if not Y.shape[0] == 10:
        raise ValueError("Y must have shape (10, n)")
    if not len(Y.shape) == 2:
        raise ValueError("Y must have shape (10, n)")
    correct = 0
    for x, y in zip(X.T, Y.T):
        _, _, p = evaluate_classifier(np.reshape(x, (3072, 1)), W1, W2, b1, b2)
        pred = np.argmax(p)
        corr = np.argmax(y)
        correct += 1 if pred == corr else 0
    return correct/X.shape[1]
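# Note: evaluate_classifier already accepts a whole batch, so the per-sample
# loop above could be replaced by a single forward pass, e.g.:
#
#     _, _, P = evaluate_classifier(X, W1, W2, b1, b2)
#     return np.mean(np.argmax(P, axis=0) == np.argmax(Y, axis=0))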

def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
    # Y must be one-hot
    # Y and P must be (10, n)
    # X and H must be (3072, n)
    # P is the softmax layer
    t = time.clock()
    if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
        raise ValueError("Y and P must have shape (10, k). Y: {}, P: {}".format(Y.shape, P.shape))
    if not X.shape[0] == n_input:
        raise ValueError("X must have shape ({}, k), has {}".format(n_input, X.shape))
    if not H.shape[0] == n_hidden:
        raise ValueError("H must have shape ({}, k)".format(n_hidden))
    grad_W1 = np.zeros([n_hidden, n_input])
    grad_W2 = np.zeros([10, n_hidden])
    grad_b1 = np.zeros([n_hidden, 1])
    grad_b2 = np.zeros([10, 1])

    X, Y, H, P = X.T, Y.T, H.T, P.T

    for x, y, s1, h, p in zip(X, Y, S1, H, P):
        h = np.reshape(h, [1, n_hidden])
        y = np.reshape(y, [10, 1])
        p = np.reshape(p, [10, 1])

        # Second layer
        g = -(y-p).T
        grad_b2 += g.T
        grad_W2 += np.matmul(g.T, h)
        # First layer
        g = np.matmul(g, W2)
        print(g.shape)
        ind = np.zeros(h.shape[1])
        for i, val in enumerate(s1):
            if val > 0:
                ind[i] = 1

        diag = np.diag(ind)

        g = np.matmul(g, diag)

        grad_b1 += g.T
        grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
    # Divide by batch size
    grad_b1 /= X.shape[0]
    grad_b2 /= X.shape[0]
    grad_W1 /= X.shape[0]
    grad_W2 /= X.shape[0]
    # Add regularization term
    grad_W1 += 2*lamb*W1
    grad_W2 += 2*lamb*W2
    dur = time.clock() - t
    print("compute_gradients took {} seconds".format(dur))
    return grad_W1, grad_W2, grad_b1, grad_b2
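# Note: the print(g.shape) inside the loop above runs once per sample and adds
# I/O overhead to the very function being timed. Also, time.clock() was removed
# in Python 3.8; time.perf_counter() is the portable replacement for this timing.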


def compute_gradient_num(X, Y, W1, W2, b1, b2, lamb = 0, h = 0.000001):
    Y = Y.T
    grad_W1 = np.zeros_like(W1)
    grad_W2 = np.zeros_like(W2)
    grad_b1 = np.zeros_like(b1)
    grad_b2 = np.zeros_like(b2)

    # gradient for b1 vector
    print("calculating b1 gradients...")
    for i in range(b1.shape[0]):
        b1_try = np.copy(b1)
        b1_try[i,0] = b1[i,0] - h
        c1 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
        b1_try = np.copy(b1)
        b1_try[i, 0] = b1[i, 0] + h
        c2 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
        grad_b1[i,0] = (c2-c1) / (2 * h)

    # gradient for W1 vector
    print("calculating W1 gradients...")
    for i in range(W1.shape[0]):
        for j in range(W1.shape[1]):
            W1_try = np.copy(W1)
            W1_try[i,j] = W1[i,j] - h
            c1 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
            W1_try = np.copy(W1)
            W1_try[i, j] = W1[i, j] + h
            c2 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
            grad_W1[i, j] = (c2 - c1) / (2 * h)

    # gradient for b2 vector
    print("calculating b2 gradients...")
    for i in range(b2.shape[0]):
        b2_try = np.copy(b2)
        b2_try[i,0] = b2[i,0] - h
        c1 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
        b2_try = np.copy(b2)
        b2_try[i, 0] = b2[i, 0] + h
        c2 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
        grad_b2[i,0] = (c2-c1) / (2 * h)

    # gradient for W2 vector
    print("calculating W2 gradients...")
    for i in range(W2.shape[0]):
        for j in range(W2.shape[1]):
            W2_try = np.copy(W2)
            W2_try[i,j] = W2[i,j] - h
            c1 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
            W2_try = np.copy(W2)
            W2_try[i, j] = W2[i, j] + h
            c2 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
            grad_W2[i, j] = (c2 - c1) / (2 * h)

    return grad_W1, grad_W2, grad_b1, grad_b2

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])


def train_batch(X, Y, W1, W2, b1, b2, gd_params, old_grads):
    S1, H, P = evaluate_classifier(X.T, W1, W2, b1, b2)
    grad_W1, grad_W2, grad_b1, grad_b2 = compute_gradients(X.T, Y.T, S1, H, P, W1, W2, gd_params["lambda"])
    try:
        W1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W1 + gd_params["rho"]*gd_params["eta"]*old_grads["W1"]
        W2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W2 + gd_params["rho"]*gd_params["eta"]*old_grads["W2"]
        b1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b1 + gd_params["rho"]*gd_params["eta"]*old_grads["b1"]
        b2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b2 + gd_params["rho"]*gd_params["eta"]*old_grads["b2"]
    except KeyError:
        W1 -= gd_params["eta"]*grad_W1
        W2 -= gd_params["eta"]*grad_W2
        b1 -= gd_params["eta"]*grad_b1
        b2 -= gd_params["eta"]*grad_b2
    return grad_W1, grad_W2, grad_b1, grad_b2

def minibatch_GD(X, Y, W1, W2, b1, b2, gd_params, validation=None, verbose = True):
    # X must have shape (n, 3072)
    # Y must have shape (k, 10)
    # validation["data"] must have shape (k, 3072)
    # validation["labels"] must have shape (k, 10)
    training_loss = [None]*gd_params["epochs"]
    validation_loss = training_loss[:] # copying the list
    training_accuracy = training_loss[:]
    validation_accuracy = training_loss[:]

    for i in range(gd_params["epochs"]):

        if i % 10 == 0 and verbose:
            print("Training epoch {} started".format(i+1))

        old_grads = {}
        # For each batch
        for j in range(X.shape[0]//gd_params["batch_size"]):
            start = j*gd_params["batch_size"]
            end = start + gd_params["batch_size"]
            batch = X[start: end]
            targets = Y[start:end]

            old_grads["W1"], old_grads["W2"], old_grads["b1"], old_grads["b2"] = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

        # Train the datapoints that didn't fit into the last minibatch
        n_rest = X.shape[0]%gd_params["batch_size"]
        if n_rest > 0:
            start = X.shape[0]-n_rest
            end = X.shape[0]
            batch = X[start: end]
            targets = Y[start: end]
            old_grads = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

        training_loss[i] = compute_cost(X.T, Y, W1, W2, b1, b2, gd_params["lambda"])
        training_accuracy[i] = compute_accuracy(X.T, Y.T, W1, W2, b1, b2)
        if validation:
            validation_loss[i] = compute_cost(validation["data"].T, validation["labels"], W1, W2, b1, b2, gd_params["lambda"])
            validation_accuracy[i] = compute_accuracy(validation["data"].T, validation["labels"].T, W1, W2, b1, b2)
    return training_loss, None, training_accuracy, None

dataset_mini = {}
dataset_mini["data"] = dataset_small["data"][0:100]
dataset_mini["labels"] = dataset_small["labels"][0:100]

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])

gd_params = {"epochs": 200, "batch_size": 10, "lambda": 0, "eta": 0.01, "rho": 0.5}
X = dataset_mini["data"]
Y = dataset_mini["labels"]
training_loss, _, training_accuracy, _ = minibatch_GD(X, Y, W1, W2, b1, b2, gd_params)


plt.plot(training_loss)
plt.figure()
plt.plot(training_accuracy)



























  • Could you modify the question to add a runnable example? It would make it easier to investigate, because then we can run python -m cProfile your_example.py
    – FirefoxMetzger
    Apr 18 at 4:57
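
    For example (with your_example.py standing in for the script above), sorted by cumulative time:

        python -m cProfile -s cumtime your_example.py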










  • On first glance the culprit is that you don't use vectorized computation, but rather compute the gradient row by row in a for loop. Also, you are using a linear output layer and a ReLU hidden layer, correct? You can vectorize the gradient computation of the ReLU instead of nesting for loops. [A sketch of this idea follows the comments below.]
    – FirefoxMetzger
    Apr 18 at 5:03










  • @FirefoxMetzger, added runnable example
    – Sandi
    Apr 18 at 6:59










  • You are linking to the wrong data set. Your code seems to use CIFAR-10, but you link to CIFAR-100 which has a slightly different structure and won't work.
    – FirefoxMetzger
    Apr 18 at 9:57
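
Following up on the vectorization comment above: the bulk of the loop's cost comes from building 3072x3072 temporaries for every sample. np.diag(ind) alone allocates a matrix with roughly 9.4 million entries, and multiplying by a diagonal 0/1 matrix is equivalent to a simple elementwise mask. A minimal sketch of a vectorized version (same shape conventions as the question; written as an illustration, not a tested drop-in replacement) could look like this:

def compute_gradients_vectorized(X, Y, S1, H, P, W1, W2, lamb):
    # X: (3072, n), Y and P: (10, n), S1 and H: (3072, n)
    n = X.shape[1]
    G = P - Y                                        # gradient at the softmax output
    grad_W2 = np.matmul(G, H.T) / n + 2*lamb*W2
    grad_b2 = np.sum(G, axis=1, keepdims=True) / n
    G = np.matmul(W2.T, G)                           # backpropagate through the second layer
    G = G * (S1 > 0)                                 # ReLU mask replaces the per-sample np.diag(ind)
    grad_W1 = np.matmul(G, X.T) / n + 2*lamb*W1
    grad_b1 = np.sum(G, axis=1, keepdims=True) / n
    return grad_W1, grad_W2, grad_b1, grad_b2

With this form the whole batch is handled by a few matrix products, so the cost no longer grows with an explicit Python loop over samples.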
















up vote
7
down vote

favorite












I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. the culprit seems to be my compute_gradients-function, which according to my investigation answers for most of the execution time. It looks like this:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T
for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1
diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
return grad_W1, grad_W2, grad_b1, grad_b2


If X, Y, H, P are all 10 rows long (n=10), the computations take about 1 second. This is way too much compared to my friends who are doing the same task. But I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?



EDIT: Here's a runnable example. You also need the CIFAR dataset, found here: https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz .



import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time

n_input = 3072
n_hidden = 3072

def one_hot(Y):
# assume Y = [1, 4, 9, 0, ...]
result = [None]*len(Y)
for i, cls in enumerate(Y):
onehot =
0: lambda: [1,0,0,0,0,0,0,0,0,0],
1: lambda: [0,1,0,0,0,0,0,0,0,0],
2: lambda: [0,0,1,0,0,0,0,0,0,0],
3: lambda: [0,0,0,1,0,0,0,0,0,0],
4: lambda: [0,0,0,0,1,0,0,0,0,0],
5: lambda: [0,0,0,0,0,1,0,0,0,0],
6: lambda: [0,0,0,0,0,0,1,0,0,0],
7: lambda: [0,0,0,0,0,0,0,1,0,0],
8: lambda: [0,0,0,0,0,0,0,0,1,0],
9: lambda: [0,0,0,0,0,0,0,0,0,1],
[cls]()
result[i] = onehot

result = np.array(result).T
return result


def unpickle(file):
import pickle
with open(file, "rb") as fo:
d = pickle.load(fo, encoding="bytes")
return d

path = "../../datasets/cifar/"

names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]

dataset_raw = unpickle(os.path.join(path, names[0]))
dataset_small =
dataset_small["data"] = dataset_raw[b"data"]/255
dataset_small["labels"] = one_hot(dataset_raw[b"labels"]).T

validation_set_raw = unpickle(os.path.join(path, names[1]))
validation_small =
validation_small["data"] = validation_set_raw[b"data"]/255
validation_small["labels"] = one_hot(validation_set_raw[b"labels"]).T

print("small dataset data shape: ".format(dataset_small["data"].shape))
print("small dataset labels shape: ".format(dataset_small["labels"].shape))
print("small validation data shape: ".format(validation_small["data"].shape))
print("small validation labels shape: ".format(validation_small["labels"].shape))

test_set_raw = unpickle(os.path.join(path, "test_batch"))
X_test = test_set_raw[b"data"]
Y_test = one_hot(test_set_raw[b"labels"])

# All data sets

dataset_large = "data": np.zeros([0, 3072]), "labels": np.array()
validation_large =

## All data batches

for name in names[1:4]:
raw = unpickle(os.path.join(path, name))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255

print("large dataset data: ".format(dataset_large["data"].shape))
print("large dataset labels: ".format(dataset_large["labels"].shape))
print("large validation set data: ".format(validation_large["data"].shape))
print("large validation set labels: ".format(validation_large["labels"].shape))

def softmax(s):
numerator = np.exp(s)
denominator = np.matmul(np.ones(10).T, np.exp(s))
return numerator/denominator

def relu(s1):
h = np.maximum(np.zeros_like(s1), s1)
return h

def layer_1(X, W1, b1):
# WARNING: X must have shape (3072, n)
# W1 must have shape (3072, 3072)
# b1 must have shape (3072, 1)
# s will get shape (10, n)
# Return p with shape (10, n)

if not X.shape[0] == n_input:
raise ValueError("Got wrong shape of s1: ".format(X.shape))


s1 = np.matmul(W1, X) + b1

if not s1.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of s1: ".format(s1.shape))

h = relu(s1)

if not h.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of h: ".format(h.shape))

return s1, h

def layer_2(h, W2, b2):
# WARNING: h must have shape (3072, n)
# W2 must have shape (10, 3072)
# b2 must have shape (10, 1)
# s will get shape (10, n)
# Returns p with shape (10, n)

s = np.matmul(W2, h) + b2

if not s.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of s: ".format(s.shape))

p = softmax(s)

if not p.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of p: ".format(p.shape))

return p

def evaluate_classifier(X, W1, W2, b1, b2):
if not X.shape[0] == n_input:
ValueError("Wrong shape of X: ".format(X.shape))
if not len(X.shape) == 2:
ValueError("Wrong shape of X: ".format(X.shape))
if not W1.shape == (n_hidden, n_input):
raise ValueError("Wrong shape of W1: ".format(W1.shape))
if not b1.shape == (n_hidden, 1):
raise ValueError("Wrong shape of b1: ".format(b1.shape))
if not W2.shape == (10, n_hidden):
raise ValueError("Wrong shape of W2: ".format(W2.shape))
if not b2.shape == (10, 1):
raise ValueError("Wrong shape of b2: ".format(b2.shape))

s1, h = layer_1(X, W1, b1)

p = layer_2(h, W2, b2)

return s1, h, p

def compute_cost(X, Y, W1, W2, b1, b2, lamb = 0):
# WARNING! Y must be one hot!
# X must have shape (3072, -1)
# Y and P must have shape (-1, 10)

if not Y.shape[1] == 10 and len(Y.shape) == 2:
raise ValueError("Wrong shape of Y: . Must be (-1, 10)".format(Y.shape))
if not X.shape[0] == 3072:
raise ValueError("Wrong shape of X: . Must be (3072, -1)".format(X.shape))
_, _, P = evaluate_classifier(X, W1, W2, b1, b2)
P = P.T

if not P.shape[1] == 10 and len(P.shape) == 2:
raise ValueError("Wrong shape of P: . Must be (-1, 10)".format(P.shape))
if not P.shape == Y.shape:
raise ValueError("Y and P must have the same shape. != ".format(Y.shape, P.shape))

l_cross = 0
for i in range(X.shape[1]):
y = Y[i]
p = P[i]
y = np.reshape(y, [1, 10])
p = np.reshape(p, [10, 1])
l_cross += -np.log(np.matmul(y, p))[0][0]
avg_loss = l_cross/X.shape[1]
reg = lamb * (np.sum(W1) + np.sum(W2))

return avg_loss + reg

def compute_accuracy(X, Y, W1, W2, b1, b2):
# X must have shape (3072, n)
# Y must have shape (10, n)
if(not X.shape[0] == 3072):
raise ValueError("X must have shape (3072, n)")
try:
X.shape[1]
except IndexError:
raise(ValueError("X must have shape (3072, n)"))
if not Y.shape[0] == 10:
raise ValueError("Y must have shape (10, n)")
if not len(Y.shape) == 2:
raise ValueError("Y mus have shape (10, n)")
correct = 0
for x, y in zip(X.T, Y.T):
_, _, p = evaluate_classifier(np.reshape(x, (3072,1)), W1, W2, b1, b2)
pred = np.argmax(p)
corr = np.argmax(y)
correct += 1 if pred == corr else 0
return correct/X.shape[1]

def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
t = time.clock()
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T

for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
print(g.shape)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1

diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
dur = time.clock() - t
print("compute_gradients took seconds".format(dur))
return grad_W1, grad_W2, grad_b1, grad_b2


def compute_gradient_num(X, Y, W1, W2, b1, b2, lamb = 0, h = 0.000001):
Y = Y.T
grad_W1 = np.zeros_like(W1)
grad_W2 = np.zeros_like(W2)
grad_b1 = np.zeros_like(b1)
grad_b2 = np.zeros_like(b2)

#gradient for b1 vector
print("calculating b1 gradients...")
for i in range(b1.shape[0]):
b1_try = np.copy(b1)
b1_try[i,0] = b1[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
b1_try = np.copy(b1)
b1_try[i, 0] = b1[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
grad_b1[i,0] = (c2-c1) / (2 * h)

# gradient for W1 vector
print("calculating W1 gradients...")
for i in range(W1.shape[0]):
for j in range(W1.shape[1]):
W1_try = np.copy(W1)
W1_try[i,j] = W1[i,j] - h
c1 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
W1_try = np.copy(W1)
W1_try[i, j] = W1[i, j] + h
c2 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
grad_W1[i, j] = (c2 - c1) / (2 * h)

#gradient for b2 vector
print("calculating b2 gradients...")
for i in range(b2.shape[0]):
b2_try = np.copy(b2)
b2_try[i,0] = b2[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
b2_try = np.copy(b2)
b2_try[i, 0] = b2[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
grad_b2[i,0] = (c2-c1) / (2 * h)

# gradient for W2 vector
print("calculating W2 gradients...")
for i in range(W2.shape[0]):
for j in range(W2.shape[1]):
W2_try = np.copy(W2)
W2_try[i,j] = W2[i,j] - h
c1 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
W2_try = np.copy(W2)
W2_try[i, j] = W2[i, j] + h
c2 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
grad_W2[i, j] = (c2 - c1) / (2 * h)

return grad_W1, grad_W2, grad_b1, grad_b2

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])


def train_batch(X, Y, W1, W2, b1, b2, gd_params, old_grads):
S1, H, P = evaluate_classifier(X.T, W1, W2, b1, b2)
grad_W1, grad_W2, grad_b1, grad_b2 = compute_gradients(X.T, Y.T, S1, H, P, W1, W2, gd_params["lambda"])
try:
W1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W1 + gd_params["rho"]*gd_params["eta"]*old_grads["W1"]
W2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W2 + gd_params["rho"]*gd_params["eta"]*old_grads["W2"]
b1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b1 + gd_params["rho"]*gd_params["eta"]*old_grads["b1"]
b2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b2 + gd_params["rho"]*gd_params["eta"]*old_grads["b2"]
except KeyError:
W1 -= gd_params["eta"]*grad_W1
W2 -= gd_params["eta"]*grad_W2
b1 -= gd_params["eta"]*grad_b1
b2 -= gd_params["eta"]*grad_b2
return grad_W1, grad_W2, grad_b1, grad_b2

def minibatch_GD(X, Y, W1, W2, b1, b2, gd_params, validation=None, verbose = True):
# X must have shape (n, 3072)
# Y must have shape (k, 10)
# validation["data"] must have shape (k, 3072)
# validation["labels"] must have shape (k, 10)
training_loss = [None]*gd_params["epochs"]
validation_loss = training_loss[:] # copying the list
training_accuracy = training_loss[:]
validation_accuracy = training_loss[:]

for i in range(gd_params["epochs"]):

if i % 10 == 0 and verbose:
print("Training epoch started".format(i+1))

old_grads =
# For each batch
for j in range(X.shape[0]//gd_params["batch_size"]):
start = j*gd_params["batch_size"]
end = start + gd_params["batch_size"]
batch = X[start: end]
targets = Y[start:end]

old_grads["W1"], old_grads["W2"], old_grads["b1"], old_grads["b2"] = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

# Train the datapoints that didn't fit into last minibatch
n_rest = X.shape[0]%gd_params["batch_size"]
if n_rest > 0:
start = X.shape[0]-n_rest
end = X.shape[0]
batch = X[start: end]
targets = Y[start: end]
old_grads = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

training_loss[i] = compute_cost(X.T, Y, W1, W2, b1, b2, gd_params["lambda"])
training_accuracy[i] = compute_accuracy(X.T, Y.T, W1, W2, b1, b2)
if validation:
validation_loss[i] = compute_cost(validation["data"].T, validation["labels"], W1, W2, b1, b2, gd_params["lambda"])
validation_accuracy[i] = compute_accuracy(validation["data"].T, validation["labels"].T, W1, W2, b1, b2)
return training_loss, None, training_accuracy, None

dataset_mini =
dataset_mini["data"] = dataset_small["data"][0:100]
dataset_mini["labels"] = dataset_small["labels"][0:100]

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])

gd_params = "epochs": 200, "batch_size":10, "lambda": 0, "eta": 0.01, "rho": 0.5
X = dataset_mini["data"]
Y = dataset_mini["labels"]
training_loss, _, training_accuracy, _ = minibatch_GD(X, Y, W1, W2, b1, b2, gd_params)


plt.plot(training_loss)
plt.figure()
plt.plot(training_accuracy)






share|improve this question





















  • Could you modify the question to add a runnable example? It would make it easier to investigate, because we can python -m cProfile your_example.py
    – FirefoxMetzger
    Apr 18 at 4:57










  • On first glance the the culprit is that you don't use vectorized computation, but rather compute the gradient row by row in a for loop. Also you are using a linear output layer and a ReLu hidden layer correct? You can vectorize gradient computation of the ReLu instead of nesting for loops
    – FirefoxMetzger
    Apr 18 at 5:03










  • @FirefoxMetzger, added runnable example
    – Sandi
    Apr 18 at 6:59










  • You are linking to the wrong data set. Your code seems to use CIFAR-10, but you link to CIFAR-100 which has a slightly different structure and won't work.
    – FirefoxMetzger
    Apr 18 at 9:57












up vote
7
down vote

favorite









up vote
7
down vote

favorite











I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. the culprit seems to be my compute_gradients-function, which according to my investigation answers for most of the execution time. It looks like this:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T
for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1
diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
return grad_W1, grad_W2, grad_b1, grad_b2


If X, Y, H, P are all 10 rows long (n=10), the computations take about 1 second. This is way too much compared to my friends who are doing the same task. But I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?



EDIT: Here's a runnable example. You also need the CIFAR dataset, found here: https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz .



import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time

n_input = 3072
n_hidden = 3072

def one_hot(Y):
# assume Y = [1, 4, 9, 0, ...]
result = [None]*len(Y)
for i, cls in enumerate(Y):
onehot =
0: lambda: [1,0,0,0,0,0,0,0,0,0],
1: lambda: [0,1,0,0,0,0,0,0,0,0],
2: lambda: [0,0,1,0,0,0,0,0,0,0],
3: lambda: [0,0,0,1,0,0,0,0,0,0],
4: lambda: [0,0,0,0,1,0,0,0,0,0],
5: lambda: [0,0,0,0,0,1,0,0,0,0],
6: lambda: [0,0,0,0,0,0,1,0,0,0],
7: lambda: [0,0,0,0,0,0,0,1,0,0],
8: lambda: [0,0,0,0,0,0,0,0,1,0],
9: lambda: [0,0,0,0,0,0,0,0,0,1],
[cls]()
result[i] = onehot

result = np.array(result).T
return result


def unpickle(file):
import pickle
with open(file, "rb") as fo:
d = pickle.load(fo, encoding="bytes")
return d

path = "../../datasets/cifar/"

names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]

dataset_raw = unpickle(os.path.join(path, names[0]))
dataset_small =
dataset_small["data"] = dataset_raw[b"data"]/255
dataset_small["labels"] = one_hot(dataset_raw[b"labels"]).T

validation_set_raw = unpickle(os.path.join(path, names[1]))
validation_small =
validation_small["data"] = validation_set_raw[b"data"]/255
validation_small["labels"] = one_hot(validation_set_raw[b"labels"]).T

print("small dataset data shape: ".format(dataset_small["data"].shape))
print("small dataset labels shape: ".format(dataset_small["labels"].shape))
print("small validation data shape: ".format(validation_small["data"].shape))
print("small validation labels shape: ".format(validation_small["labels"].shape))

test_set_raw = unpickle(os.path.join(path, "test_batch"))
X_test = test_set_raw[b"data"]
Y_test = one_hot(test_set_raw[b"labels"])

# All data sets

dataset_large = "data": np.zeros([0, 3072]), "labels": np.array()
validation_large =

## All data batches

for name in names[1:4]:
raw = unpickle(os.path.join(path, name))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255

print("large dataset data: ".format(dataset_large["data"].shape))
print("large dataset labels: ".format(dataset_large["labels"].shape))
print("large validation set data: ".format(validation_large["data"].shape))
print("large validation set labels: ".format(validation_large["labels"].shape))

def softmax(s):
numerator = np.exp(s)
denominator = np.matmul(np.ones(10).T, np.exp(s))
return numerator/denominator

def relu(s1):
h = np.maximum(np.zeros_like(s1), s1)
return h

def layer_1(X, W1, b1):
# WARNING: X must have shape (3072, n)
# W1 must have shape (3072, 3072)
# b1 must have shape (3072, 1)
# s will get shape (10, n)
# Return p with shape (10, n)

if not X.shape[0] == n_input:
raise ValueError("Got wrong shape of s1: ".format(X.shape))


s1 = np.matmul(W1, X) + b1

if not s1.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of s1: ".format(s1.shape))

h = relu(s1)

if not h.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of h: ".format(h.shape))

return s1, h

def layer_2(h, W2, b2):
# WARNING: h must have shape (3072, n)
# W2 must have shape (10, 3072)
# b2 must have shape (10, 1)
# s will get shape (10, n)
# Returns p with shape (10, n)

s = np.matmul(W2, h) + b2

if not s.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of s: ".format(s.shape))

p = softmax(s)

if not p.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of p: ".format(p.shape))

return p

def evaluate_classifier(X, W1, W2, b1, b2):
if not X.shape[0] == n_input:
ValueError("Wrong shape of X: ".format(X.shape))
if not len(X.shape) == 2:
ValueError("Wrong shape of X: ".format(X.shape))
if not W1.shape == (n_hidden, n_input):
raise ValueError("Wrong shape of W1: ".format(W1.shape))
if not b1.shape == (n_hidden, 1):
raise ValueError("Wrong shape of b1: ".format(b1.shape))
if not W2.shape == (10, n_hidden):
raise ValueError("Wrong shape of W2: ".format(W2.shape))
if not b2.shape == (10, 1):
raise ValueError("Wrong shape of b2: ".format(b2.shape))

s1, h = layer_1(X, W1, b1)

p = layer_2(h, W2, b2)

return s1, h, p

def compute_cost(X, Y, W1, W2, b1, b2, lamb = 0):
# WARNING! Y must be one hot!
# X must have shape (3072, -1)
# Y and P must have shape (-1, 10)

if not Y.shape[1] == 10 and len(Y.shape) == 2:
raise ValueError("Wrong shape of Y: . Must be (-1, 10)".format(Y.shape))
if not X.shape[0] == 3072:
raise ValueError("Wrong shape of X: . Must be (3072, -1)".format(X.shape))
_, _, P = evaluate_classifier(X, W1, W2, b1, b2)
P = P.T

if not P.shape[1] == 10 and len(P.shape) == 2:
raise ValueError("Wrong shape of P: . Must be (-1, 10)".format(P.shape))
if not P.shape == Y.shape:
raise ValueError("Y and P must have the same shape. != ".format(Y.shape, P.shape))

l_cross = 0
for i in range(X.shape[1]):
y = Y[i]
p = P[i]
y = np.reshape(y, [1, 10])
p = np.reshape(p, [10, 1])
l_cross += -np.log(np.matmul(y, p))[0][0]
avg_loss = l_cross/X.shape[1]
reg = lamb * (np.sum(W1) + np.sum(W2))

return avg_loss + reg

def compute_accuracy(X, Y, W1, W2, b1, b2):
# X must have shape (3072, n)
# Y must have shape (10, n)
if(not X.shape[0] == 3072):
raise ValueError("X must have shape (3072, n)")
try:
X.shape[1]
except IndexError:
raise(ValueError("X must have shape (3072, n)"))
if not Y.shape[0] == 10:
raise ValueError("Y must have shape (10, n)")
if not len(Y.shape) == 2:
raise ValueError("Y mus have shape (10, n)")
correct = 0
for x, y in zip(X.T, Y.T):
_, _, p = evaluate_classifier(np.reshape(x, (3072,1)), W1, W2, b1, b2)
pred = np.argmax(p)
corr = np.argmax(y)
correct += 1 if pred == corr else 0
return correct/X.shape[1]

def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
t = time.clock()
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T

for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
print(g.shape)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1

diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
dur = time.clock() - t
print("compute_gradients took seconds".format(dur))
return grad_W1, grad_W2, grad_b1, grad_b2


def compute_gradient_num(X, Y, W1, W2, b1, b2, lamb = 0, h = 0.000001):
Y = Y.T
grad_W1 = np.zeros_like(W1)
grad_W2 = np.zeros_like(W2)
grad_b1 = np.zeros_like(b1)
grad_b2 = np.zeros_like(b2)

#gradient for b1 vector
print("calculating b1 gradients...")
for i in range(b1.shape[0]):
b1_try = np.copy(b1)
b1_try[i,0] = b1[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
b1_try = np.copy(b1)
b1_try[i, 0] = b1[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
grad_b1[i,0] = (c2-c1) / (2 * h)

# gradient for W1 vector
print("calculating W1 gradients...")
for i in range(W1.shape[0]):
for j in range(W1.shape[1]):
W1_try = np.copy(W1)
W1_try[i,j] = W1[i,j] - h
c1 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
W1_try = np.copy(W1)
W1_try[i, j] = W1[i, j] + h
c2 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
grad_W1[i, j] = (c2 - c1) / (2 * h)

#gradient for b2 vector
print("calculating b2 gradients...")
for i in range(b2.shape[0]):
b2_try = np.copy(b2)
b2_try[i,0] = b2[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
b2_try = np.copy(b2)
b2_try[i, 0] = b2[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
grad_b2[i,0] = (c2-c1) / (2 * h)

# gradient for W2 vector
print("calculating W2 gradients...")
for i in range(W2.shape[0]):
for j in range(W2.shape[1]):
W2_try = np.copy(W2)
W2_try[i,j] = W2[i,j] - h
c1 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
W2_try = np.copy(W2)
W2_try[i, j] = W2[i, j] + h
c2 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
grad_W2[i, j] = (c2 - c1) / (2 * h)

return grad_W1, grad_W2, grad_b1, grad_b2

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])


def train_batch(X, Y, W1, W2, b1, b2, gd_params, old_grads):
S1, H, P = evaluate_classifier(X.T, W1, W2, b1, b2)
grad_W1, grad_W2, grad_b1, grad_b2 = compute_gradients(X.T, Y.T, S1, H, P, W1, W2, gd_params["lambda"])
try:
W1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W1 + gd_params["rho"]*gd_params["eta"]*old_grads["W1"]
W2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W2 + gd_params["rho"]*gd_params["eta"]*old_grads["W2"]
b1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b1 + gd_params["rho"]*gd_params["eta"]*old_grads["b1"]
b2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b2 + gd_params["rho"]*gd_params["eta"]*old_grads["b2"]
except KeyError:
W1 -= gd_params["eta"]*grad_W1
W2 -= gd_params["eta"]*grad_W2
b1 -= gd_params["eta"]*grad_b1
b2 -= gd_params["eta"]*grad_b2
return grad_W1, grad_W2, grad_b1, grad_b2

def minibatch_GD(X, Y, W1, W2, b1, b2, gd_params, validation=None, verbose = True):
# X must have shape (n, 3072)
# Y must have shape (k, 10)
# validation["data"] must have shape (k, 3072)
# validation["labels"] must have shape (k, 10)
training_loss = [None]*gd_params["epochs"]
validation_loss = training_loss[:] # copying the list
training_accuracy = training_loss[:]
validation_accuracy = training_loss[:]

for i in range(gd_params["epochs"]):

if i % 10 == 0 and verbose:
print("Training epoch started".format(i+1))

old_grads =
# For each batch
for j in range(X.shape[0]//gd_params["batch_size"]):
start = j*gd_params["batch_size"]
end = start + gd_params["batch_size"]
batch = X[start: end]
targets = Y[start:end]

old_grads["W1"], old_grads["W2"], old_grads["b1"], old_grads["b2"] = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

# Train the datapoints that didn't fit into last minibatch
n_rest = X.shape[0]%gd_params["batch_size"]
if n_rest > 0:
start = X.shape[0]-n_rest
end = X.shape[0]
batch = X[start: end]
targets = Y[start: end]
old_grads = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

training_loss[i] = compute_cost(X.T, Y, W1, W2, b1, b2, gd_params["lambda"])
training_accuracy[i] = compute_accuracy(X.T, Y.T, W1, W2, b1, b2)
if validation:
validation_loss[i] = compute_cost(validation["data"].T, validation["labels"], W1, W2, b1, b2, gd_params["lambda"])
validation_accuracy[i] = compute_accuracy(validation["data"].T, validation["labels"].T, W1, W2, b1, b2)
return training_loss, None, training_accuracy, None

dataset_mini =
dataset_mini["data"] = dataset_small["data"][0:100]
dataset_mini["labels"] = dataset_small["labels"][0:100]

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])

gd_params = "epochs": 200, "batch_size":10, "lambda": 0, "eta": 0.01, "rho": 0.5
X = dataset_mini["data"]
Y = dataset_mini["labels"]
training_loss, _, training_accuracy, _ = minibatch_GD(X, Y, W1, W2, b1, b2, gd_params)


plt.plot(training_loss)
plt.figure()
plt.plot(training_accuracy)






share|improve this question













I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. the culprit seems to be my compute_gradients-function, which according to my investigation answers for most of the execution time. It looks like this:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T
for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1
diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
return grad_W1, grad_W2, grad_b1, grad_b2


If X, Y, H, P are all 10 rows long (n=10), the computations take about 1 second. This is way too much compared to my friends who are doing the same task. But I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?



EDIT: Here's a runnable example. You also need the CIFAR dataset, found here: https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz .



import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time

n_input = 3072
n_hidden = 3072

def one_hot(Y):
# assume Y = [1, 4, 9, 0, ...]
result = [None]*len(Y)
for i, cls in enumerate(Y):
onehot =
0: lambda: [1,0,0,0,0,0,0,0,0,0],
1: lambda: [0,1,0,0,0,0,0,0,0,0],
2: lambda: [0,0,1,0,0,0,0,0,0,0],
3: lambda: [0,0,0,1,0,0,0,0,0,0],
4: lambda: [0,0,0,0,1,0,0,0,0,0],
5: lambda: [0,0,0,0,0,1,0,0,0,0],
6: lambda: [0,0,0,0,0,0,1,0,0,0],
7: lambda: [0,0,0,0,0,0,0,1,0,0],
8: lambda: [0,0,0,0,0,0,0,0,1,0],
9: lambda: [0,0,0,0,0,0,0,0,0,1],
[cls]()
result[i] = onehot

result = np.array(result).T
return result


def unpickle(file):
import pickle
with open(file, "rb") as fo:
d = pickle.load(fo, encoding="bytes")
return d

path = "../../datasets/cifar/"

names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]

dataset_raw = unpickle(os.path.join(path, names[0]))
dataset_small =
dataset_small["data"] = dataset_raw[b"data"]/255
dataset_small["labels"] = one_hot(dataset_raw[b"labels"]).T

validation_set_raw = unpickle(os.path.join(path, names[1]))
validation_small =
validation_small["data"] = validation_set_raw[b"data"]/255
validation_small["labels"] = one_hot(validation_set_raw[b"labels"]).T

print("small dataset data shape: ".format(dataset_small["data"].shape))
print("small dataset labels shape: ".format(dataset_small["labels"].shape))
print("small validation data shape: ".format(validation_small["data"].shape))
print("small validation labels shape: ".format(validation_small["labels"].shape))

test_set_raw = unpickle(os.path.join(path, "test_batch"))
X_test = test_set_raw[b"data"]
Y_test = one_hot(test_set_raw[b"labels"])

# All data sets

dataset_large = "data": np.zeros([0, 3072]), "labels": np.array()
validation_large =

## All data batches

for name in names[1:4]:
raw = unpickle(os.path.join(path, name))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255

print("large dataset data: ".format(dataset_large["data"].shape))
print("large dataset labels: ".format(dataset_large["labels"].shape))
print("large validation set data: ".format(validation_large["data"].shape))
print("large validation set labels: ".format(validation_large["labels"].shape))

def softmax(s):
numerator = np.exp(s)
denominator = np.matmul(np.ones(10).T, np.exp(s))
return numerator/denominator

def relu(s1):
h = np.maximum(np.zeros_like(s1), s1)
return h

def layer_1(X, W1, b1):
# WARNING: X must have shape (3072, n)
# W1 must have shape (3072, 3072)
# b1 must have shape (3072, 1)
# s will get shape (10, n)
# Return p with shape (10, n)

if not X.shape[0] == n_input:
raise ValueError("Got wrong shape of s1: ".format(X.shape))


s1 = np.matmul(W1, X) + b1

if not s1.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of s1: ".format(s1.shape))

h = relu(s1)

if not h.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of h: ".format(h.shape))

return s1, h

def layer_2(h, W2, b2):
# WARNING: h must have shape (3072, n)
# W2 must have shape (10, 3072)
# b2 must have shape (10, 1)
# s will get shape (10, n)
# Returns p with shape (10, n)

s = np.matmul(W2, h) + b2

if not s.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of s: ".format(s.shape))

p = softmax(s)

if not p.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of p: ".format(p.shape))

return p

def evaluate_classifier(X, W1, W2, b1, b2):
if not X.shape[0] == n_input:
ValueError("Wrong shape of X: ".format(X.shape))
if not len(X.shape) == 2:
ValueError("Wrong shape of X: ".format(X.shape))
if not W1.shape == (n_hidden, n_input):
raise ValueError("Wrong shape of W1: ".format(W1.shape))
if not b1.shape == (n_hidden, 1):
raise ValueError("Wrong shape of b1: ".format(b1.shape))
if not W2.shape == (10, n_hidden):
raise ValueError("Wrong shape of W2: ".format(W2.shape))
if not b2.shape == (10, 1):
raise ValueError("Wrong shape of b2: ".format(b2.shape))

s1, h = layer_1(X, W1, b1)

p = layer_2(h, W2, b2)

return s1, h, p

def compute_cost(X, Y, W1, W2, b1, b2, lamb = 0):
# WARNING! Y must be one hot!
# X must have shape (3072, -1)
# Y and P must have shape (-1, 10)

if not Y.shape[1] == 10 and len(Y.shape) == 2:
raise ValueError("Wrong shape of Y: . Must be (-1, 10)".format(Y.shape))
if not X.shape[0] == 3072:
raise ValueError("Wrong shape of X: . Must be (3072, -1)".format(X.shape))
_, _, P = evaluate_classifier(X, W1, W2, b1, b2)
P = P.T

if not P.shape[1] == 10 and len(P.shape) == 2:
raise ValueError("Wrong shape of P: . Must be (-1, 10)".format(P.shape))
if not P.shape == Y.shape:
raise ValueError("Y and P must have the same shape. != ".format(Y.shape, P.shape))

l_cross = 0
for i in range(X.shape[1]):
y = Y[i]
p = P[i]
y = np.reshape(y, [1, 10])
p = np.reshape(p, [10, 1])
l_cross += -np.log(np.matmul(y, p))[0][0]
avg_loss = l_cross/X.shape[1]
reg = lamb * (np.sum(W1) + np.sum(W2))

return avg_loss + reg

def compute_accuracy(X, Y, W1, W2, b1, b2):
# X must have shape (3072, n)
# Y must have shape (10, n)
if(not X.shape[0] == 3072):
raise ValueError("X must have shape (3072, n)")
try:
X.shape[1]
except IndexError:
raise(ValueError("X must have shape (3072, n)"))
if not Y.shape[0] == 10:
raise ValueError("Y must have shape (10, n)")
if not len(Y.shape) == 2:
raise ValueError("Y mus have shape (10, n)")
correct = 0
for x, y in zip(X.T, Y.T):
_, _, p = evaluate_classifier(np.reshape(x, (3072,1)), W1, W2, b1, b2)
pred = np.argmax(p)
corr = np.argmax(y)
correct += 1 if pred == corr else 0
return correct/X.shape[1]

def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
t = time.clock()
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T

for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
print(g.shape)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1

diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
dur = time.clock() - t
print("compute_gradients took seconds".format(dur))
return grad_W1, grad_W2, grad_b1, grad_b2


def compute_gradient_num(X, Y, W1, W2, b1, b2, lamb = 0, h = 0.000001):
Y = Y.T
grad_W1 = np.zeros_like(W1)
grad_W2 = np.zeros_like(W2)
grad_b1 = np.zeros_like(b1)
grad_b2 = np.zeros_like(b2)

#gradient for b1 vector
print("calculating b1 gradients...")
for i in range(b1.shape[0]):
b1_try = np.copy(b1)
b1_try[i,0] = b1[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
b1_try = np.copy(b1)
b1_try[i, 0] = b1[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
grad_b1[i,0] = (c2-c1) / (2 * h)

# gradient for W1 vector
print("calculating W1 gradients...")
for i in range(W1.shape[0]):
for j in range(W1.shape[1]):
W1_try = np.copy(W1)
W1_try[i,j] = W1[i,j] - h
c1 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
W1_try = np.copy(W1)
W1_try[i, j] = W1[i, j] + h
c2 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
grad_W1[i, j] = (c2 - c1) / (2 * h)

#gradient for b2 vector
print("calculating b2 gradients...")
for i in range(b2.shape[0]):
b2_try = np.copy(b2)
b2_try[i,0] = b2[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
b2_try = np.copy(b2)
b2_try[i, 0] = b2[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
grad_b2[i,0] = (c2-c1) / (2 * h)

# gradient for W2 vector
print("calculating W2 gradients...")
for i in range(W2.shape[0]):
for j in range(W2.shape[1]):
W2_try = np.copy(W2)
W2_try[i,j] = W2[i,j] - h
c1 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
W2_try = np.copy(W2)
W2_try[i, j] = W2[i, j] + h
c2 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
grad_W2[i, j] = (c2 - c1) / (2 * h)

return grad_W1, grad_W2, grad_b1, grad_b2

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])
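
A common way to use compute_gradient_num is a relative-error comparison against the analytic gradients. The sketch below is only an illustration: it assumes evaluate_classifier and compute_cost behave as they are used elsewhere in this script, and it reuses the parameters initialised just above. Note that with n_hidden = 3072 the numerical pass over W1 is far too slow to be practical, so you would temporarily shrink the hidden layer (say n_hidden = 10) before running it.

# Illustrative gradient check on a tiny batch. Shrink n_hidden and re-run the
# weight initialisation above first, otherwise compute_gradient_num has to
# evaluate the cost millions of times.
X_check = dataset_small["data"][:5].T     # (3072, 5)
Y_check = dataset_small["labels"][:5].T   # (10, 5), one-hot columns

S1_c, H_c, P_c = evaluate_classifier(X_check, W1, W2, b1, b2)
gW1, gW2, gb1, gb2 = compute_gradients(X_check, Y_check, S1_c, H_c, P_c, W1, W2, lamb=0)
nW1, nW2, nb1, nb2 = compute_gradient_num(X_check, Y_check, W1, W2, b1, b2, lamb=0)

def relative_error(a, b, eps=1e-9):
    # largest element-wise |a - b| / (|a| + |b|); small values mean the gradients agree
    return np.max(np.abs(a - b) / np.maximum(np.abs(a) + np.abs(b), eps))

for name, ana, num in [("W1", gW1, nW1), ("W2", gW2, nW2), ("b1", gb1, nb1), ("b2", gb2, nb2)]:
    print("{}: relative error {:.2e}".format(name, relative_error(ana, num)))

Large relative errors here are themselves useful information: they tell you which of the four gradients to look at first.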


def train_batch(X, Y, W1, W2, b1, b2, gd_params, old_grads):
    S1, H, P = evaluate_classifier(X.T, W1, W2, b1, b2)
    grad_W1, grad_W2, grad_b1, grad_b2 = compute_gradients(X.T, Y.T, S1, H, P, W1, W2, gd_params["lambda"])
    try:
        W1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W1 + gd_params["rho"]*gd_params["eta"]*old_grads["W1"]
        W2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W2 + gd_params["rho"]*gd_params["eta"]*old_grads["W2"]
        b1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b1 + gd_params["rho"]*gd_params["eta"]*old_grads["b1"]
        b2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b2 + gd_params["rho"]*gd_params["eta"]*old_grads["b2"]
    except KeyError:
        W1 -= gd_params["eta"]*grad_W1
        W2 -= gd_params["eta"]*grad_W2
        b1 -= gd_params["eta"]*grad_b1
        b2 -= gd_params["eta"]*grad_b2
    return grad_W1, grad_W2, grad_b1, grad_b2

def minibatch_GD(X, Y, W1, W2, b1, b2, gd_params, validation=None, verbose=True):
    # X must have shape (n, 3072)
    # Y must have shape (n, 10)
    # validation["data"] must have shape (k, 3072)
    # validation["labels"] must have shape (k, 10)
    training_loss = [None]*gd_params["epochs"]
    validation_loss = training_loss[:]  # copying the list
    training_accuracy = training_loss[:]
    validation_accuracy = training_loss[:]

    for i in range(gd_params["epochs"]):

        if i % 10 == 0 and verbose:
            print("Training epoch {} started".format(i+1))

        old_grads = {}
        # For each batch
        for j in range(X.shape[0]//gd_params["batch_size"]):
            start = j*gd_params["batch_size"]
            end = start + gd_params["batch_size"]
            batch = X[start:end]
            targets = Y[start:end]

            old_grads["W1"], old_grads["W2"], old_grads["b1"], old_grads["b2"] = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

        # Train the datapoints that didn't fit into the last minibatch
        n_rest = X.shape[0] % gd_params["batch_size"]
        if n_rest > 0:
            start = X.shape[0]-n_rest
            end = X.shape[0]
            batch = X[start:end]
            targets = Y[start:end]
            old_grads = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

        training_loss[i] = compute_cost(X.T, Y, W1, W2, b1, b2, gd_params["lambda"])
        training_accuracy[i] = compute_accuracy(X.T, Y.T, W1, W2, b1, b2)
        if validation:
            validation_loss[i] = compute_cost(validation["data"].T, validation["labels"], W1, W2, b1, b2, gd_params["lambda"])
            validation_accuracy[i] = compute_accuracy(validation["data"].T, validation["labels"].T, W1, W2, b1, b2)
    return training_loss, None, training_accuracy, None

dataset_mini = {}
dataset_mini["data"] = dataset_small["data"][0:100]
dataset_mini["labels"] = dataset_small["labels"][0:100]

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])

gd_params = {"epochs": 200, "batch_size": 10, "lambda": 0, "eta": 0.01, "rho": 0.5}
X = dataset_mini["data"]
Y = dataset_mini["labels"]
training_loss, _, training_accuracy, _ = minibatch_GD(X, Y, W1, W2, b1, b2, gd_params)


plt.plot(training_loss)
plt.figure()
plt.plot(training_accuracy)
plt.show()








asked Apr 17 at 18:21 by Sandi, edited Apr 18 at 6:56

  • Could you modify the question to add a runnable example? It would make it easier to investigate, because we can python -m cProfile your_example.py
    – FirefoxMetzger, Apr 18 at 4:57

  • On first glance the culprit is that you don't use vectorized computation, but rather compute the gradient row by row in a for loop. Also, you are using a linear output layer and a ReLU hidden layer, correct? You can vectorize the gradient computation of the ReLU instead of nesting for loops.
    – FirefoxMetzger, Apr 18 at 5:03

  • @FirefoxMetzger, added runnable example
    – Sandi, Apr 18 at 6:59

  • You are linking to the wrong data set. Your code seems to use CIFAR-10, but you link to CIFAR-100, which has a slightly different structure and won't work.
    – FirefoxMetzger, Apr 18 at 9:57
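
A minimal way to get the profile suggested in the first comment, assuming the script above is saved as your_example.py, is python -m cProfile -s cumtime your_example.py. An in-process variant that profiles a single gradient call (using the X, Y, W1, W2, b1, b2 defined in the script) could look like this:

import cProfile
import pstats

# Profile one call to compute_gradients on the first mini-batch.
S1, H, P = evaluate_classifier(X[:10].T, W1, W2, b1, b2)
cProfile.run("compute_gradients(X[:10].T, Y[:10].T, S1, H, P, W1, W2, 0)",
             "gradients.prof")
pstats.Stats("gradients.prof").sort_stats("cumulative").print_stats(15)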
















1 Answer (accepted)

Here is a faster version (around 300 ms per batch). It's essentially your function refactored for readability, and in vectorized code more readability / code beauty often makes the code run faster as well:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
    # Y must be one-hot
    # Y and P must be (10, n)
    # X and H must be (3072, n)
    # P is the softmax layer

    batch_size = X.shape[1]

    t = time.time()

    # gradients through the network
    dp = -(Y-P).T
    dl2 = dp
    drelu = np.matmul(dl2, W2)
    dl1 = drelu * (S1 > 0).T

    # updates
    dW2 = np.matmul(H, dl2).T/batch_size + 2*lamb*W2
    dW1 = np.matmul(X, dl1).T/batch_size + 2*lamb*W1

    db2 = np.sum(dl2, axis=0, keepdims=True).T/batch_size   # (10, 1): sum the per-sample gradients
    db1 = np.sum(dl1, axis=0, keepdims=True).T/batch_size   # (n_hidden, 1)

    dur = time.time() - t
    print("compute_gradients took {} s".format(dur))
    return dW1, dW2, db1, db2
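
As a quick sanity check, here is a minimal sketch that feeds random arrays of the documented shapes through this version and prints the resulting gradient shapes; it assumes the n_input/n_hidden globals and the W1/W2 initialisation from the question:

# Shape check with random data (assumes n_input = n_hidden = 3072 and the
# question's W1, W2).
rng = np.random.RandomState(0)
n = 10                                      # batch size

X_b = rng.rand(n_input, n)
Y_b = np.eye(10)[:, rng.randint(0, 10, n)]  # one-hot columns, (10, n)
S1_b = rng.randn(n_hidden, n)               # hidden-layer pre-activations
H_b = np.maximum(S1_b, 0)                   # ReLU activations
P_b = np.exp(rng.randn(10, n))
P_b /= P_b.sum(axis=0, keepdims=True)       # columns sum to 1, like a softmax

dW1, dW2, db1, db2 = compute_gradients(X_b, Y_b, S1_b, H_b, P_b, W1, W2, 0)
print(dW1.shape, dW2.shape, db1.shape, db2.shape)
# expected: (3072, 3072) (10, 3072) (3072, 1) (10, 1)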


Things I changed:



  • I threw out the shape assertions you did at the beginning. NumPy will complain about mismatched shapes anyway, and the checks really just slow you down; they also add visual clutter and make the code harder to read.

  • I vectorized the ReLU computation (comparing the values with 0 directly).

  • I vectorized the gradient of the ReLU (matmul(a, diag(b)) is the same as the element-wise product a*b; see the short sketch after this list).

  • I vectorized the accumulation of the gradient update (this one took me a long time to learn as a Bachelor's student; if you don't see it, try writing down the sum over an update batch and the dyadic product for each update on paper and refactor. The idea is that you do an outer product along one rank and an inner product along another).

  • I changed time.clock() to time.time(), so the reported duration is in wall-clock seconds rather than processor time.

There is a lot more you could do to the rest of the code, but that is out of scope for this question.
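
The two identities used above are easy to check numerically. A small self-contained sketch (the shapes are arbitrary): multiplying by a diagonal matrix is the same as an element-wise product, and summing per-sample outer products over a batch is a single matrix product:

import numpy as np

rng = np.random.RandomState(0)

# 1) matmul(a, diag(b)) == a * b  (b is broadcast across the rows of a)
a = rng.randn(4, 6)
b = rng.randn(6)
assert np.allclose(np.matmul(a, np.diag(b)), a * b)

# 2) sum_k g_k x_k^T == G^T Xs^T, where G holds the g_k as rows and Xs holds the x_k as columns
G = rng.randn(5, 3)     # 5 samples, gradient vectors of length 3 (one per row)
Xs = rng.randn(7, 5)    # 5 samples, input vectors of length 7 (one per column)
looped = sum(np.outer(G[k], Xs[:, k]) for k in range(5))
vectorized = np.matmul(G.T, Xs.T)
assert np.allclose(looped, vectorized)
print("both identities hold")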






answered Apr 18 at 11:42 by FirefoxMetzger, edited Apr 18 at 12:58

  • I don't think it's a bug actually; we're using the cross-entropy loss and softmax for the last layer, and ReLU for the hidden layer. The math ends up looking like that after some simplifications.
    – Sandi, Apr 18 at 12:03

  • Ah, yes, that's true. I didn't see the cross-entropy term.
    – FirefoxMetzger, Apr 18 at 12:58
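
For reference, the simplification mentioned in this exchange is the standard softmax/cross-entropy cancellation (a textbook result, stated here for completeness): with

$$ L = -\sum_i y_i \log p_i, \qquad p_i = \frac{e^{s_i}}{\sum_j e^{s_j}}, $$

and $y$ one-hot, the derivative with respect to the output scores is simply $\partial L / \partial s_i = p_i - y_i$, which is why the backward pass can start directly from $-(Y-P) = P-Y$ without ever differentiating the softmax explicitly.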









