Multi-layer perceptron program is super slow


I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. The culprit seems to be my compute_gradients function, which according to my investigation accounts for most of the execution time. It looks like this:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
    # Y must be one-hot
    # Y and P must be (10, n)
    # X and H must be (3072, n)
    # P is the softmax layer
    if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
        raise ValueError("Y and P must have shape (10, k). Y: {}, P: {}".format(Y.shape, P.shape))
    if not X.shape[0] == n_input:
        raise ValueError("X must have shape ({}, k), has {}".format(n_input, X.shape))
    if not H.shape[0] == n_hidden:
        raise ValueError("H must have shape ({}, k)".format(n_hidden))
    grad_W1 = np.zeros([n_hidden, n_input])
    grad_W2 = np.zeros([10, n_hidden])
    grad_b1 = np.zeros([n_hidden, 1])
    grad_b2 = np.zeros([10, 1])

    X, Y, H, P = X.T, Y.T, H.T, P.T
    for x, y, s1, h, p in zip(X, Y, S1, H, P):
        h = np.reshape(h, [1, n_hidden])
        y = np.reshape(y, [10, 1])
        p = np.reshape(p, [10, 1])

        # Second layer
        g = -(y-p).T
        grad_b2 += g.T
        grad_W2 += np.matmul(g.T, h)
        # First layer
        g = np.matmul(g, W2)
        ind = np.zeros(h.shape[1])
        for i, val in enumerate(s1):
            if val > 0:
                ind[i] = 1
        diag = np.diag(ind)

        g = np.matmul(g, diag)

        grad_b1 += g.T
        grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
    # Divide by batch size
    grad_b1 /= X.shape[0]
    grad_b2 /= X.shape[0]
    grad_W1 /= X.shape[0]
    grad_W2 /= X.shape[0]
    # Add regularization term
    grad_W1 += 2*lamb*W1
    grad_W2 += 2*lamb*W2
    return grad_W1, grad_W2, grad_b1, grad_b2


If X, Y, H and P each hold 10 samples (n = 10), the computation takes about 1 second. This is way too slow compared to my friends who are doing the same task, but I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?



EDIT: Here's a runnable example. You also need the CIFAR dataset, found here: https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz .



import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time

n_input = 3072
n_hidden = 3072

def one_hot(Y):
    # assume Y = [1, 4, 9, 0, ...]
    result = [None]*len(Y)
    for i, cls in enumerate(Y):
        onehot = {
            0: lambda: [1,0,0,0,0,0,0,0,0,0],
            1: lambda: [0,1,0,0,0,0,0,0,0,0],
            2: lambda: [0,0,1,0,0,0,0,0,0,0],
            3: lambda: [0,0,0,1,0,0,0,0,0,0],
            4: lambda: [0,0,0,0,1,0,0,0,0,0],
            5: lambda: [0,0,0,0,0,1,0,0,0,0],
            6: lambda: [0,0,0,0,0,0,1,0,0,0],
            7: lambda: [0,0,0,0,0,0,0,1,0,0],
            8: lambda: [0,0,0,0,0,0,0,0,1,0],
            9: lambda: [0,0,0,0,0,0,0,0,0,1],
        }[cls]()
        result[i] = onehot

    result = np.array(result).T
    return result
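# Note: the table of lambdas above is one way to do it; an equivalent, shorter
# alternative (assuming the labels are integers 0-9) would be to index an
# identity matrix, e.g.:
#
#     def one_hot(Y):
#         return np.eye(10, dtype=int)[np.asarray(Y)].T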


def unpickle(file):
    import pickle
    with open(file, "rb") as fo:
        d = pickle.load(fo, encoding="bytes")
    return d

path = "../../datasets/cifar/"

names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]

dataset_raw = unpickle(os.path.join(path, names[0]))
dataset_small = {}
dataset_small["data"] = dataset_raw[b"data"]/255
dataset_small["labels"] = one_hot(dataset_raw[b"labels"]).T

validation_set_raw = unpickle(os.path.join(path, names[1]))
validation_small = {}
validation_small["data"] = validation_set_raw[b"data"]/255
validation_small["labels"] = one_hot(validation_set_raw[b"labels"]).T

print("small dataset data shape: {}".format(dataset_small["data"].shape))
print("small dataset labels shape: {}".format(dataset_small["labels"].shape))
print("small validation data shape: {}".format(validation_small["data"].shape))
print("small validation labels shape: {}".format(validation_small["labels"].shape))

test_set_raw = unpickle(os.path.join(path, "test_batch"))
X_test = test_set_raw[b"data"]
Y_test = one_hot(test_set_raw[b"labels"])

# All data sets

dataset_large = {"data": np.zeros([0, 3072]), "labels": np.array([])}
validation_large = {}

## All data batches

for name in names[1:4]:
    raw = unpickle(os.path.join(path, name))
    dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
    dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255

print("large dataset data: {}".format(dataset_large["data"].shape))
print("large dataset labels: {}".format(dataset_large["labels"].shape))
print("large validation set data: {}".format(validation_large["data"].shape))
print("large validation set labels: {}".format(validation_large["labels"].shape))

def softmax(s):
    numerator = np.exp(s)
    denominator = np.matmul(np.ones(10).T, np.exp(s))
    return numerator/denominator
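# Note: np.exp(s) can overflow for large scores. A numerically stable variant
# of the same (10, n) softmax would subtract the per-column maximum first, e.g.:
#
#     def softmax(s):
#         e = np.exp(s - s.max(axis=0, keepdims=True))
#         return e / e.sum(axis=0, keepdims=True)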

def relu(s1):
    h = np.maximum(np.zeros_like(s1), s1)
    return h

def layer_1(X, W1, b1):
    # WARNING: X must have shape (3072, n)
    # W1 must have shape (3072, 3072)
    # b1 must have shape (3072, 1)
    # s1 and h will get shape (3072, n)
    # Returns s1 and h

    if not X.shape[0] == n_input:
        raise ValueError("Got wrong shape of X: {}".format(X.shape))

    s1 = np.matmul(W1, X) + b1

    if not s1.shape == (n_hidden, X.shape[1]):
        raise ValueError("Got wrong shape of s1: {}".format(s1.shape))

    h = relu(s1)

    if not h.shape == (n_hidden, X.shape[1]):
        raise ValueError("Got wrong shape of h: {}".format(h.shape))

    return s1, h

def layer_2(h, W2, b2):
    # WARNING: h must have shape (3072, n)
    # W2 must have shape (10, 3072)
    # b2 must have shape (10, 1)
    # s will get shape (10, n)
    # Returns p with shape (10, n)

    s = np.matmul(W2, h) + b2

    if not s.shape == (10, h.shape[1]):
        raise ValueError("Got wrong shape of s: {}".format(s.shape))

    p = softmax(s)

    if not p.shape == (10, h.shape[1]):
        raise ValueError("Got wrong shape of p: {}".format(p.shape))

    return p

def evaluate_classifier(X, W1, W2, b1, b2):
    if not X.shape[0] == n_input:
        raise ValueError("Wrong shape of X: {}".format(X.shape))
    if not len(X.shape) == 2:
        raise ValueError("Wrong shape of X: {}".format(X.shape))
    if not W1.shape == (n_hidden, n_input):
        raise ValueError("Wrong shape of W1: {}".format(W1.shape))
    if not b1.shape == (n_hidden, 1):
        raise ValueError("Wrong shape of b1: {}".format(b1.shape))
    if not W2.shape == (10, n_hidden):
        raise ValueError("Wrong shape of W2: {}".format(W2.shape))
    if not b2.shape == (10, 1):
        raise ValueError("Wrong shape of b2: {}".format(b2.shape))

    s1, h = layer_1(X, W1, b1)

    p = layer_2(h, W2, b2)

    return s1, h, p

def compute_cost(X, Y, W1, W2, b1, b2, lamb = 0):
    # WARNING! Y must be one hot!
    # X must have shape (3072, -1)
    # Y and P must have shape (-1, 10)

    if not (Y.shape[1] == 10 and len(Y.shape) == 2):
        raise ValueError("Wrong shape of Y: {}. Must be (-1, 10)".format(Y.shape))
    if not X.shape[0] == 3072:
        raise ValueError("Wrong shape of X: {}. Must be (3072, -1)".format(X.shape))
    _, _, P = evaluate_classifier(X, W1, W2, b1, b2)
    P = P.T

    if not (P.shape[1] == 10 and len(P.shape) == 2):
        raise ValueError("Wrong shape of P: {}. Must be (-1, 10)".format(P.shape))
    if not P.shape == Y.shape:
        raise ValueError("Y and P must have the same shape. {} != {}".format(Y.shape, P.shape))

    l_cross = 0
    for i in range(X.shape[1]):
        y = Y[i]
        p = P[i]
        y = np.reshape(y, [1, 10])
        p = np.reshape(p, [10, 1])
        l_cross += -np.log(np.matmul(y, p))[0][0]
    avg_loss = l_cross/X.shape[1]
    reg = lamb * (np.sum(W1) + np.sum(W2))

    return avg_loss + reg
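# Note: with Y and P both laid out as (n, 10), the per-sample loop above can be
# collapsed into a single vectorized expression, e.g.:
#
#     l_cross = -np.sum(np.log(np.sum(Y * P, axis=1)))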

def compute_accuracy(X, Y, W1, W2, b1, b2):
    # X must have shape (3072, n)
    # Y must have shape (10, n)
    if not X.shape[0] == 3072:
        raise ValueError("X must have shape (3072, n)")
    try:
        X.shape[1]
    except IndexError:
        raise ValueError("X must have shape (3072, n)")
    if not Y.shape[0] == 10:
        raise ValueError("Y must have shape (10, n)")
    if not len(Y.shape) == 2:
        raise ValueError("Y must have shape (10, n)")
    correct = 0
    for x, y in zip(X.T, Y.T):
        _, _, p = evaluate_classifier(np.reshape(x, (3072, 1)), W1, W2, b1, b2)
        pred = np.argmax(p)
        corr = np.argmax(y)
        correct += 1 if pred == corr else 0
    return correct/X.shape[1]
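# Note: evaluate_classifier already accepts a whole batch, so the per-sample
# loop above could be replaced by a single forward pass, e.g.:
#
#     _, _, P = evaluate_classifier(X, W1, W2, b1, b2)
#     return np.mean(np.argmax(P, axis=0) == np.argmax(Y, axis=0))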

def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
    # Y must be one-hot
    # Y and P must be (10, n)
    # X and H must be (3072, n)
    # P is the softmax layer
    t = time.clock()
    if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
        raise ValueError("Y and P must have shape (10, k). Y: {}, P: {}".format(Y.shape, P.shape))
    if not X.shape[0] == n_input:
        raise ValueError("X must have shape ({}, k), has {}".format(n_input, X.shape))
    if not H.shape[0] == n_hidden:
        raise ValueError("H must have shape ({}, k)".format(n_hidden))
    grad_W1 = np.zeros([n_hidden, n_input])
    grad_W2 = np.zeros([10, n_hidden])
    grad_b1 = np.zeros([n_hidden, 1])
    grad_b2 = np.zeros([10, 1])

    X, Y, H, P = X.T, Y.T, H.T, P.T

    for x, y, s1, h, p in zip(X, Y, S1, H, P):
        h = np.reshape(h, [1, n_hidden])
        y = np.reshape(y, [10, 1])
        p = np.reshape(p, [10, 1])

        # Second layer
        g = -(y-p).T
        grad_b2 += g.T
        grad_W2 += np.matmul(g.T, h)
        # First layer
        g = np.matmul(g, W2)
        print(g.shape)
        ind = np.zeros(h.shape[1])
        for i, val in enumerate(s1):
            if val > 0:
                ind[i] = 1

        diag = np.diag(ind)

        g = np.matmul(g, diag)

        grad_b1 += g.T
        grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
    # Divide by batch size
    grad_b1 /= X.shape[0]
    grad_b2 /= X.shape[0]
    grad_W1 /= X.shape[0]
    grad_W2 /= X.shape[0]
    # Add regularization term
    grad_W1 += 2*lamb*W1
    grad_W2 += 2*lamb*W2
    dur = time.clock() - t
    print("compute_gradients took {} seconds".format(dur))
    return grad_W1, grad_W2, grad_b1, grad_b2
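# Note: the print(g.shape) inside the loop above runs once per sample and adds
# I/O overhead to the very function being timed. Also, time.clock() was removed
# in Python 3.8; time.perf_counter() is the portable replacement for this timing.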


def compute_gradient_num(X, Y, W1, W2, b1, b2, lamb = 0, h = 0.000001):
    Y = Y.T
    grad_W1 = np.zeros_like(W1)
    grad_W2 = np.zeros_like(W2)
    grad_b1 = np.zeros_like(b1)
    grad_b2 = np.zeros_like(b2)

    # gradient for b1 vector
    print("calculating b1 gradients...")
    for i in range(b1.shape[0]):
        b1_try = np.copy(b1)
        b1_try[i,0] = b1[i,0] - h
        c1 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
        b1_try = np.copy(b1)
        b1_try[i, 0] = b1[i, 0] + h
        c2 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
        grad_b1[i,0] = (c2-c1) / (2 * h)

    # gradient for W1 vector
    print("calculating W1 gradients...")
    for i in range(W1.shape[0]):
        for j in range(W1.shape[1]):
            W1_try = np.copy(W1)
            W1_try[i,j] = W1[i,j] - h
            c1 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
            W1_try = np.copy(W1)
            W1_try[i, j] = W1[i, j] + h
            c2 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
            grad_W1[i, j] = (c2 - c1) / (2 * h)

    # gradient for b2 vector
    print("calculating b2 gradients...")
    for i in range(b2.shape[0]):
        b2_try = np.copy(b2)
        b2_try[i,0] = b2[i,0] - h
        c1 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
        b2_try = np.copy(b2)
        b2_try[i, 0] = b2[i, 0] + h
        c2 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
        grad_b2[i,0] = (c2-c1) / (2 * h)

    # gradient for W2 vector
    print("calculating W2 gradients...")
    for i in range(W2.shape[0]):
        for j in range(W2.shape[1]):
            W2_try = np.copy(W2)
            W2_try[i,j] = W2[i,j] - h
            c1 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
            W2_try = np.copy(W2)
            W2_try[i, j] = W2[i, j] + h
            c2 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
            grad_W2[i, j] = (c2 - c1) / (2 * h)

    return grad_W1, grad_W2, grad_b1, grad_b2

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])


def train_batch(X, Y, W1, W2, b1, b2, gd_params, old_grads):
    S1, H, P = evaluate_classifier(X.T, W1, W2, b1, b2)
    grad_W1, grad_W2, grad_b1, grad_b2 = compute_gradients(X.T, Y.T, S1, H, P, W1, W2, gd_params["lambda"])
    try:
        W1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W1 + gd_params["rho"]*gd_params["eta"]*old_grads["W1"]
        W2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W2 + gd_params["rho"]*gd_params["eta"]*old_grads["W2"]
        b1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b1 + gd_params["rho"]*gd_params["eta"]*old_grads["b1"]
        b2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b2 + gd_params["rho"]*gd_params["eta"]*old_grads["b2"]
    except KeyError:
        W1 -= gd_params["eta"]*grad_W1
        W2 -= gd_params["eta"]*grad_W2
        b1 -= gd_params["eta"]*grad_b1
        b2 -= gd_params["eta"]*grad_b2
    return grad_W1, grad_W2, grad_b1, grad_b2

def minibatch_GD(X, Y, W1, W2, b1, b2, gd_params, validation=None, verbose = True):
    # X must have shape (n, 3072)
    # Y must have shape (k, 10)
    # validation["data"] must have shape (k, 3072)
    # validation["labels"] must have shape (k, 10)
    training_loss = [None]*gd_params["epochs"]
    validation_loss = training_loss[:] # copying the list
    training_accuracy = training_loss[:]
    validation_accuracy = training_loss[:]

    for i in range(gd_params["epochs"]):

        if i % 10 == 0 and verbose:
            print("Training epoch {} started".format(i+1))

        old_grads = {}
        # For each batch
        for j in range(X.shape[0]//gd_params["batch_size"]):
            start = j*gd_params["batch_size"]
            end = start + gd_params["batch_size"]
            batch = X[start: end]
            targets = Y[start:end]

            old_grads["W1"], old_grads["W2"], old_grads["b1"], old_grads["b2"] = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

        # Train the datapoints that didn't fit into the last minibatch
        n_rest = X.shape[0]%gd_params["batch_size"]
        if n_rest > 0:
            start = X.shape[0]-n_rest
            end = X.shape[0]
            batch = X[start: end]
            targets = Y[start: end]
            old_grads = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

        training_loss[i] = compute_cost(X.T, Y, W1, W2, b1, b2, gd_params["lambda"])
        training_accuracy[i] = compute_accuracy(X.T, Y.T, W1, W2, b1, b2)
        if validation:
            validation_loss[i] = compute_cost(validation["data"].T, validation["labels"], W1, W2, b1, b2, gd_params["lambda"])
            validation_accuracy[i] = compute_accuracy(validation["data"].T, validation["labels"].T, W1, W2, b1, b2)
    return training_loss, None, training_accuracy, None

dataset_mini = {}
dataset_mini["data"] = dataset_small["data"][0:100]
dataset_mini["labels"] = dataset_small["labels"][0:100]

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])

gd_params = {"epochs": 200, "batch_size": 10, "lambda": 0, "eta": 0.01, "rho": 0.5}
X = dataset_mini["data"]
Y = dataset_mini["labels"]
training_loss, _, training_accuracy, _ = minibatch_GD(X, Y, W1, W2, b1, b2, gd_params)


plt.plot(training_loss)
plt.figure()
plt.plot(training_accuracy)



























  • Could you modify the question to add a runnable example? It would make it easier to investigate, because then we can run python -m cProfile your_example.py
    – FirefoxMetzger
    Apr 18 at 4:57
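
    For example (with your_example.py standing in for the script above), sorted by cumulative time:

        python -m cProfile -s cumtime your_example.py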










  • On first glance the culprit is that you don't use vectorized computation, but rather compute the gradient row by row in a for loop. Also, you are using a linear output layer and a ReLU hidden layer, correct? You can vectorize the gradient computation of the ReLU instead of nesting for loops. [A sketch of this idea follows the comments below.]
    – FirefoxMetzger
    Apr 18 at 5:03










  • @FirefoxMetzger, added runnable example
    – Sandi
    Apr 18 at 6:59










  • You are linking to the wrong data set. Your code seems to use CIFAR-10, but you link to CIFAR-100 which has a slightly different structure and won't work.
    – FirefoxMetzger
    Apr 18 at 9:57
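
Following up on the vectorization comment above: the bulk of the loop's cost comes from building 3072x3072 temporaries for every sample. np.diag(ind) alone allocates a matrix with roughly 9.4 million entries, and multiplying by a diagonal 0/1 matrix is equivalent to a simple elementwise mask. A minimal sketch of a vectorized version (same shape conventions as the question; written as an illustration, not a tested drop-in replacement) could look like this:

def compute_gradients_vectorized(X, Y, S1, H, P, W1, W2, lamb):
    # X: (3072, n), Y and P: (10, n), S1 and H: (3072, n)
    n = X.shape[1]
    G = P - Y                                        # gradient at the softmax output
    grad_W2 = np.matmul(G, H.T) / n + 2*lamb*W2
    grad_b2 = np.sum(G, axis=1, keepdims=True) / n
    G = np.matmul(W2.T, G)                           # backpropagate through the second layer
    G = G * (S1 > 0)                                 # ReLU mask replaces the per-sample np.diag(ind)
    grad_W1 = np.matmul(G, X.T) / n + 2*lamb*W1
    grad_b1 = np.sum(G, axis=1, keepdims=True) / n
    return grad_W1, grad_W2, grad_b1, grad_b2

With this form the whole batch is handled by a few matrix products, so the cost no longer grows with an explicit Python loop over samples.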
















up vote
7
down vote

favorite












I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. the culprit seems to be my compute_gradients-function, which according to my investigation answers for most of the execution time. It looks like this:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T
for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1
diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
return grad_W1, grad_W2, grad_b1, grad_b2


If X, Y, H, P are all 10 rows long (n=10), the computations take about 1 second. This is way too much compared to my friends who are doing the same task. But I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?



EDIT: Here's a runnable example. You also need the CIFAR dataset, found here: https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz .



import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time

n_input = 3072
n_hidden = 3072

def one_hot(Y):
# assume Y = [1, 4, 9, 0, ...]
result = [None]*len(Y)
for i, cls in enumerate(Y):
onehot =
0: lambda: [1,0,0,0,0,0,0,0,0,0],
1: lambda: [0,1,0,0,0,0,0,0,0,0],
2: lambda: [0,0,1,0,0,0,0,0,0,0],
3: lambda: [0,0,0,1,0,0,0,0,0,0],
4: lambda: [0,0,0,0,1,0,0,0,0,0],
5: lambda: [0,0,0,0,0,1,0,0,0,0],
6: lambda: [0,0,0,0,0,0,1,0,0,0],
7: lambda: [0,0,0,0,0,0,0,1,0,0],
8: lambda: [0,0,0,0,0,0,0,0,1,0],
9: lambda: [0,0,0,0,0,0,0,0,0,1],
[cls]()
result[i] = onehot

result = np.array(result).T
return result


def unpickle(file):
import pickle
with open(file, "rb") as fo:
d = pickle.load(fo, encoding="bytes")
return d

path = "../../datasets/cifar/"

names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]

dataset_raw = unpickle(os.path.join(path, names[0]))
dataset_small =
dataset_small["data"] = dataset_raw[b"data"]/255
dataset_small["labels"] = one_hot(dataset_raw[b"labels"]).T

validation_set_raw = unpickle(os.path.join(path, names[1]))
validation_small =
validation_small["data"] = validation_set_raw[b"data"]/255
validation_small["labels"] = one_hot(validation_set_raw[b"labels"]).T

print("small dataset data shape: ".format(dataset_small["data"].shape))
print("small dataset labels shape: ".format(dataset_small["labels"].shape))
print("small validation data shape: ".format(validation_small["data"].shape))
print("small validation labels shape: ".format(validation_small["labels"].shape))

test_set_raw = unpickle(os.path.join(path, "test_batch"))
X_test = test_set_raw[b"data"]
Y_test = one_hot(test_set_raw[b"labels"])

# All data sets

dataset_large = "data": np.zeros([0, 3072]), "labels": np.array()
validation_large =

## All data batches

for name in names[1:4]:
raw = unpickle(os.path.join(path, name))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255

print("large dataset data: ".format(dataset_large["data"].shape))
print("large dataset labels: ".format(dataset_large["labels"].shape))
print("large validation set data: ".format(validation_large["data"].shape))
print("large validation set labels: ".format(validation_large["labels"].shape))

def softmax(s):
numerator = np.exp(s)
denominator = np.matmul(np.ones(10).T, np.exp(s))
return numerator/denominator

def relu(s1):
h = np.maximum(np.zeros_like(s1), s1)
return h

def layer_1(X, W1, b1):
# WARNING: X must have shape (3072, n)
# W1 must have shape (3072, 3072)
# b1 must have shape (3072, 1)
# s will get shape (10, n)
# Return p with shape (10, n)

if not X.shape[0] == n_input:
raise ValueError("Got wrong shape of s1: ".format(X.shape))


s1 = np.matmul(W1, X) + b1

if not s1.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of s1: ".format(s1.shape))

h = relu(s1)

if not h.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of h: ".format(h.shape))

return s1, h

def layer_2(h, W2, b2):
# WARNING: h must have shape (3072, n)
# W2 must have shape (10, 3072)
# b2 must have shape (10, 1)
# s will get shape (10, n)
# Returns p with shape (10, n)

s = np.matmul(W2, h) + b2

if not s.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of s: ".format(s.shape))

p = softmax(s)

if not p.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of p: ".format(p.shape))

return p

def evaluate_classifier(X, W1, W2, b1, b2):
if not X.shape[0] == n_input:
ValueError("Wrong shape of X: ".format(X.shape))
if not len(X.shape) == 2:
ValueError("Wrong shape of X: ".format(X.shape))
if not W1.shape == (n_hidden, n_input):
raise ValueError("Wrong shape of W1: ".format(W1.shape))
if not b1.shape == (n_hidden, 1):
raise ValueError("Wrong shape of b1: ".format(b1.shape))
if not W2.shape == (10, n_hidden):
raise ValueError("Wrong shape of W2: ".format(W2.shape))
if not b2.shape == (10, 1):
raise ValueError("Wrong shape of b2: ".format(b2.shape))

s1, h = layer_1(X, W1, b1)

p = layer_2(h, W2, b2)

return s1, h, p

def compute_cost(X, Y, W1, W2, b1, b2, lamb = 0):
# WARNING! Y must be one hot!
# X must have shape (3072, -1)
# Y and P must have shape (-1, 10)

if not Y.shape[1] == 10 and len(Y.shape) == 2:
raise ValueError("Wrong shape of Y: . Must be (-1, 10)".format(Y.shape))
if not X.shape[0] == 3072:
raise ValueError("Wrong shape of X: . Must be (3072, -1)".format(X.shape))
_, _, P = evaluate_classifier(X, W1, W2, b1, b2)
P = P.T

if not P.shape[1] == 10 and len(P.shape) == 2:
raise ValueError("Wrong shape of P: . Must be (-1, 10)".format(P.shape))
if not P.shape == Y.shape:
raise ValueError("Y and P must have the same shape. != ".format(Y.shape, P.shape))

l_cross = 0
for i in range(X.shape[1]):
y = Y[i]
p = P[i]
y = np.reshape(y, [1, 10])
p = np.reshape(p, [10, 1])
l_cross += -np.log(np.matmul(y, p))[0][0]
avg_loss = l_cross/X.shape[1]
reg = lamb * (np.sum(W1) + np.sum(W2))

return avg_loss + reg

def compute_accuracy(X, Y, W1, W2, b1, b2):
# X must have shape (3072, n)
# Y must have shape (10, n)
if(not X.shape[0] == 3072):
raise ValueError("X must have shape (3072, n)")
try:
X.shape[1]
except IndexError:
raise(ValueError("X must have shape (3072, n)"))
if not Y.shape[0] == 10:
raise ValueError("Y must have shape (10, n)")
if not len(Y.shape) == 2:
raise ValueError("Y mus have shape (10, n)")
correct = 0
for x, y in zip(X.T, Y.T):
_, _, p = evaluate_classifier(np.reshape(x, (3072,1)), W1, W2, b1, b2)
pred = np.argmax(p)
corr = np.argmax(y)
correct += 1 if pred == corr else 0
return correct/X.shape[1]

def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
t = time.clock()
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T

for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
print(g.shape)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1

diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
dur = time.clock() - t
print("compute_gradients took seconds".format(dur))
return grad_W1, grad_W2, grad_b1, grad_b2


def compute_gradient_num(X, Y, W1, W2, b1, b2, lamb = 0, h = 0.000001):
Y = Y.T
grad_W1 = np.zeros_like(W1)
grad_W2 = np.zeros_like(W2)
grad_b1 = np.zeros_like(b1)
grad_b2 = np.zeros_like(b2)

#gradient for b1 vector
print("calculating b1 gradients...")
for i in range(b1.shape[0]):
b1_try = np.copy(b1)
b1_try[i,0] = b1[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
b1_try = np.copy(b1)
b1_try[i, 0] = b1[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
grad_b1[i,0] = (c2-c1) / (2 * h)

# gradient for W1 vector
print("calculating W1 gradients...")
for i in range(W1.shape[0]):
for j in range(W1.shape[1]):
W1_try = np.copy(W1)
W1_try[i,j] = W1[i,j] - h
c1 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
W1_try = np.copy(W1)
W1_try[i, j] = W1[i, j] + h
c2 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
grad_W1[i, j] = (c2 - c1) / (2 * h)

#gradient for b2 vector
print("calculating b2 gradients...")
for i in range(b2.shape[0]):
b2_try = np.copy(b2)
b2_try[i,0] = b2[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
b2_try = np.copy(b2)
b2_try[i, 0] = b2[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
grad_b2[i,0] = (c2-c1) / (2 * h)

# gradient for W2 vector
print("calculating W2 gradients...")
for i in range(W2.shape[0]):
for j in range(W2.shape[1]):
W2_try = np.copy(W2)
W2_try[i,j] = W2[i,j] - h
c1 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
W2_try = np.copy(W2)
W2_try[i, j] = W2[i, j] + h
c2 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
grad_W2[i, j] = (c2 - c1) / (2 * h)

return grad_W1, grad_W2, grad_b1, grad_b2

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])


def train_batch(X, Y, W1, W2, b1, b2, gd_params, old_grads):
S1, H, P = evaluate_classifier(X.T, W1, W2, b1, b2)
grad_W1, grad_W2, grad_b1, grad_b2 = compute_gradients(X.T, Y.T, S1, H, P, W1, W2, gd_params["lambda"])
try:
W1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W1 + gd_params["rho"]*gd_params["eta"]*old_grads["W1"]
W2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W2 + gd_params["rho"]*gd_params["eta"]*old_grads["W2"]
b1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b1 + gd_params["rho"]*gd_params["eta"]*old_grads["b1"]
b2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b2 + gd_params["rho"]*gd_params["eta"]*old_grads["b2"]
except KeyError:
W1 -= gd_params["eta"]*grad_W1
W2 -= gd_params["eta"]*grad_W2
b1 -= gd_params["eta"]*grad_b1
b2 -= gd_params["eta"]*grad_b2
return grad_W1, grad_W2, grad_b1, grad_b2

def minibatch_GD(X, Y, W1, W2, b1, b2, gd_params, validation=None, verbose = True):
# X must have shape (n, 3072)
# Y must have shape (k, 10)
# validation["data"] must have shape (k, 3072)
# validation["labels"] must have shape (k, 10)
training_loss = [None]*gd_params["epochs"]
validation_loss = training_loss[:] # copying the list
training_accuracy = training_loss[:]
validation_accuracy = training_loss[:]

for i in range(gd_params["epochs"]):

if i % 10 == 0 and verbose:
print("Training epoch started".format(i+1))

old_grads =
# For each batch
for j in range(X.shape[0]//gd_params["batch_size"]):
start = j*gd_params["batch_size"]
end = start + gd_params["batch_size"]
batch = X[start: end]
targets = Y[start:end]

old_grads["W1"], old_grads["W2"], old_grads["b1"], old_grads["b2"] = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

# Train the datapoints that didn't fit into last minibatch
n_rest = X.shape[0]%gd_params["batch_size"]
if n_rest > 0:
start = X.shape[0]-n_rest
end = X.shape[0]
batch = X[start: end]
targets = Y[start: end]
old_grads = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

training_loss[i] = compute_cost(X.T, Y, W1, W2, b1, b2, gd_params["lambda"])
training_accuracy[i] = compute_accuracy(X.T, Y.T, W1, W2, b1, b2)
if validation:
validation_loss[i] = compute_cost(validation["data"].T, validation["labels"], W1, W2, b1, b2, gd_params["lambda"])
validation_accuracy[i] = compute_accuracy(validation["data"].T, validation["labels"].T, W1, W2, b1, b2)
return training_loss, None, training_accuracy, None

dataset_mini =
dataset_mini["data"] = dataset_small["data"][0:100]
dataset_mini["labels"] = dataset_small["labels"][0:100]

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])

gd_params = "epochs": 200, "batch_size":10, "lambda": 0, "eta": 0.01, "rho": 0.5
X = dataset_mini["data"]
Y = dataset_mini["labels"]
training_loss, _, training_accuracy, _ = minibatch_GD(X, Y, W1, W2, b1, b2, gd_params)


plt.plot(training_loss)
plt.figure()
plt.plot(training_accuracy)






share|improve this question





















  • Could you modify the question to add a runnable example? It would make it easier to investigate, because we can python -m cProfile your_example.py
    – FirefoxMetzger
    Apr 18 at 4:57










  • On first glance the the culprit is that you don't use vectorized computation, but rather compute the gradient row by row in a for loop. Also you are using a linear output layer and a ReLu hidden layer correct? You can vectorize gradient computation of the ReLu instead of nesting for loops
    – FirefoxMetzger
    Apr 18 at 5:03










  • @FirefoxMetzger, added runnable example
    – Sandi
    Apr 18 at 6:59










  • You are linking to the wrong data set. Your code seems to use CIFAR-10, but you link to CIFAR-100 which has a slightly different structure and won't work.
    – FirefoxMetzger
    Apr 18 at 9:57












up vote
7
down vote

favorite









up vote
7
down vote

favorite











I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. the culprit seems to be my compute_gradients-function, which according to my investigation answers for most of the execution time. It looks like this:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T
for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1
diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
return grad_W1, grad_W2, grad_b1, grad_b2


If X, Y, H, P are all 10 rows long (n=10), the computations take about 1 second. This is way too much compared to my friends who are doing the same task. But I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?



EDIT: Here's a runnable example. You also need the CIFAR dataset, found here: https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz .



import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time

n_input = 3072
n_hidden = 3072

def one_hot(Y):
# assume Y = [1, 4, 9, 0, ...]
result = [None]*len(Y)
for i, cls in enumerate(Y):
onehot =
0: lambda: [1,0,0,0,0,0,0,0,0,0],
1: lambda: [0,1,0,0,0,0,0,0,0,0],
2: lambda: [0,0,1,0,0,0,0,0,0,0],
3: lambda: [0,0,0,1,0,0,0,0,0,0],
4: lambda: [0,0,0,0,1,0,0,0,0,0],
5: lambda: [0,0,0,0,0,1,0,0,0,0],
6: lambda: [0,0,0,0,0,0,1,0,0,0],
7: lambda: [0,0,0,0,0,0,0,1,0,0],
8: lambda: [0,0,0,0,0,0,0,0,1,0],
9: lambda: [0,0,0,0,0,0,0,0,0,1],
[cls]()
result[i] = onehot

result = np.array(result).T
return result


def unpickle(file):
import pickle
with open(file, "rb") as fo:
d = pickle.load(fo, encoding="bytes")
return d

path = "../../datasets/cifar/"

names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]

dataset_raw = unpickle(os.path.join(path, names[0]))
dataset_small =
dataset_small["data"] = dataset_raw[b"data"]/255
dataset_small["labels"] = one_hot(dataset_raw[b"labels"]).T

validation_set_raw = unpickle(os.path.join(path, names[1]))
validation_small =
validation_small["data"] = validation_set_raw[b"data"]/255
validation_small["labels"] = one_hot(validation_set_raw[b"labels"]).T

print("small dataset data shape: ".format(dataset_small["data"].shape))
print("small dataset labels shape: ".format(dataset_small["labels"].shape))
print("small validation data shape: ".format(validation_small["data"].shape))
print("small validation labels shape: ".format(validation_small["labels"].shape))

test_set_raw = unpickle(os.path.join(path, "test_batch"))
X_test = test_set_raw[b"data"]
Y_test = one_hot(test_set_raw[b"labels"])

# All data sets

dataset_large = "data": np.zeros([0, 3072]), "labels": np.array()
validation_large =

## All data batches

for name in names[1:4]:
raw = unpickle(os.path.join(path, name))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255

print("large dataset data: ".format(dataset_large["data"].shape))
print("large dataset labels: ".format(dataset_large["labels"].shape))
print("large validation set data: ".format(validation_large["data"].shape))
print("large validation set labels: ".format(validation_large["labels"].shape))

def softmax(s):
numerator = np.exp(s)
denominator = np.matmul(np.ones(10).T, np.exp(s))
return numerator/denominator

def relu(s1):
h = np.maximum(np.zeros_like(s1), s1)
return h

def layer_1(X, W1, b1):
# WARNING: X must have shape (3072, n)
# W1 must have shape (3072, 3072)
# b1 must have shape (3072, 1)
# s will get shape (10, n)
# Return p with shape (10, n)

if not X.shape[0] == n_input:
raise ValueError("Got wrong shape of s1: ".format(X.shape))


s1 = np.matmul(W1, X) + b1

if not s1.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of s1: ".format(s1.shape))

h = relu(s1)

if not h.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of h: ".format(h.shape))

return s1, h

def layer_2(h, W2, b2):
# WARNING: h must have shape (3072, n)
# W2 must have shape (10, 3072)
# b2 must have shape (10, 1)
# s will get shape (10, n)
# Returns p with shape (10, n)

s = np.matmul(W2, h) + b2

if not s.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of s: ".format(s.shape))

p = softmax(s)

if not p.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of p: ".format(p.shape))

return p

def evaluate_classifier(X, W1, W2, b1, b2):
if not X.shape[0] == n_input:
ValueError("Wrong shape of X: ".format(X.shape))
if not len(X.shape) == 2:
ValueError("Wrong shape of X: ".format(X.shape))
if not W1.shape == (n_hidden, n_input):
raise ValueError("Wrong shape of W1: ".format(W1.shape))
if not b1.shape == (n_hidden, 1):
raise ValueError("Wrong shape of b1: ".format(b1.shape))
if not W2.shape == (10, n_hidden):
raise ValueError("Wrong shape of W2: ".format(W2.shape))
if not b2.shape == (10, 1):
raise ValueError("Wrong shape of b2: ".format(b2.shape))

s1, h = layer_1(X, W1, b1)

p = layer_2(h, W2, b2)

return s1, h, p

def compute_cost(X, Y, W1, W2, b1, b2, lamb = 0):
# WARNING! Y must be one hot!
# X must have shape (3072, -1)
# Y and P must have shape (-1, 10)

if not Y.shape[1] == 10 and len(Y.shape) == 2:
raise ValueError("Wrong shape of Y: . Must be (-1, 10)".format(Y.shape))
if not X.shape[0] == 3072:
raise ValueError("Wrong shape of X: . Must be (3072, -1)".format(X.shape))
_, _, P = evaluate_classifier(X, W1, W2, b1, b2)
P = P.T

if not P.shape[1] == 10 and len(P.shape) == 2:
raise ValueError("Wrong shape of P: . Must be (-1, 10)".format(P.shape))
if not P.shape == Y.shape:
raise ValueError("Y and P must have the same shape. != ".format(Y.shape, P.shape))

l_cross = 0
for i in range(X.shape[1]):
y = Y[i]
p = P[i]
y = np.reshape(y, [1, 10])
p = np.reshape(p, [10, 1])
l_cross += -np.log(np.matmul(y, p))[0][0]
avg_loss = l_cross/X.shape[1]
reg = lamb * (np.sum(W1) + np.sum(W2))

return avg_loss + reg

def compute_accuracy(X, Y, W1, W2, b1, b2):
# X must have shape (3072, n)
# Y must have shape (10, n)
if(not X.shape[0] == 3072):
raise ValueError("X must have shape (3072, n)")
try:
X.shape[1]
except IndexError:
raise(ValueError("X must have shape (3072, n)"))
if not Y.shape[0] == 10:
raise ValueError("Y must have shape (10, n)")
if not len(Y.shape) == 2:
raise ValueError("Y mus have shape (10, n)")
correct = 0
for x, y in zip(X.T, Y.T):
_, _, p = evaluate_classifier(np.reshape(x, (3072,1)), W1, W2, b1, b2)
pred = np.argmax(p)
corr = np.argmax(y)
correct += 1 if pred == corr else 0
return correct/X.shape[1]

def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
t = time.clock()
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T

for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
print(g.shape)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1

diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
dur = time.clock() - t
print("compute_gradients took seconds".format(dur))
return grad_W1, grad_W2, grad_b1, grad_b2


def compute_gradient_num(X, Y, W1, W2, b1, b2, lamb = 0, h = 0.000001):
Y = Y.T
grad_W1 = np.zeros_like(W1)
grad_W2 = np.zeros_like(W2)
grad_b1 = np.zeros_like(b1)
grad_b2 = np.zeros_like(b2)

#gradient for b1 vector
print("calculating b1 gradients...")
for i in range(b1.shape[0]):
b1_try = np.copy(b1)
b1_try[i,0] = b1[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
b1_try = np.copy(b1)
b1_try[i, 0] = b1[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
grad_b1[i,0] = (c2-c1) / (2 * h)

# gradient for W1 vector
print("calculating W1 gradients...")
for i in range(W1.shape[0]):
for j in range(W1.shape[1]):
W1_try = np.copy(W1)
W1_try[i,j] = W1[i,j] - h
c1 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
W1_try = np.copy(W1)
W1_try[i, j] = W1[i, j] + h
c2 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
grad_W1[i, j] = (c2 - c1) / (2 * h)

#gradient for b2 vector
print("calculating b2 gradients...")
for i in range(b2.shape[0]):
b2_try = np.copy(b2)
b2_try[i,0] = b2[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
b2_try = np.copy(b2)
b2_try[i, 0] = b2[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
grad_b2[i,0] = (c2-c1) / (2 * h)

# gradient for W2 vector
print("calculating W2 gradients...")
for i in range(W2.shape[0]):
for j in range(W2.shape[1]):
W2_try = np.copy(W2)
W2_try[i,j] = W2[i,j] - h
c1 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
W2_try = np.copy(W2)
W2_try[i, j] = W2[i, j] + h
c2 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
grad_W2[i, j] = (c2 - c1) / (2 * h)

return grad_W1, grad_W2, grad_b1, grad_b2

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])


def train_batch(X, Y, W1, W2, b1, b2, gd_params, old_grads):
S1, H, P = evaluate_classifier(X.T, W1, W2, b1, b2)
grad_W1, grad_W2, grad_b1, grad_b2 = compute_gradients(X.T, Y.T, S1, H, P, W1, W2, gd_params["lambda"])
try:
W1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W1 + gd_params["rho"]*gd_params["eta"]*old_grads["W1"]
W2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W2 + gd_params["rho"]*gd_params["eta"]*old_grads["W2"]
b1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b1 + gd_params["rho"]*gd_params["eta"]*old_grads["b1"]
b2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b2 + gd_params["rho"]*gd_params["eta"]*old_grads["b2"]
except KeyError:
W1 -= gd_params["eta"]*grad_W1
W2 -= gd_params["eta"]*grad_W2
b1 -= gd_params["eta"]*grad_b1
b2 -= gd_params["eta"]*grad_b2
return grad_W1, grad_W2, grad_b1, grad_b2

def minibatch_GD(X, Y, W1, W2, b1, b2, gd_params, validation=None, verbose = True):
# X must have shape (n, 3072)
# Y must have shape (k, 10)
# validation["data"] must have shape (k, 3072)
# validation["labels"] must have shape (k, 10)
training_loss = [None]*gd_params["epochs"]
validation_loss = training_loss[:] # copying the list
training_accuracy = training_loss[:]
validation_accuracy = training_loss[:]

for i in range(gd_params["epochs"]):

if i % 10 == 0 and verbose:
print("Training epoch started".format(i+1))

old_grads =
# For each batch
for j in range(X.shape[0]//gd_params["batch_size"]):
start = j*gd_params["batch_size"]
end = start + gd_params["batch_size"]
batch = X[start: end]
targets = Y[start:end]

old_grads["W1"], old_grads["W2"], old_grads["b1"], old_grads["b2"] = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

# Train the datapoints that didn't fit into last minibatch
n_rest = X.shape[0]%gd_params["batch_size"]
if n_rest > 0:
start = X.shape[0]-n_rest
end = X.shape[0]
batch = X[start: end]
targets = Y[start: end]
old_grads = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

training_loss[i] = compute_cost(X.T, Y, W1, W2, b1, b2, gd_params["lambda"])
training_accuracy[i] = compute_accuracy(X.T, Y.T, W1, W2, b1, b2)
if validation:
validation_loss[i] = compute_cost(validation["data"].T, validation["labels"], W1, W2, b1, b2, gd_params["lambda"])
validation_accuracy[i] = compute_accuracy(validation["data"].T, validation["labels"].T, W1, W2, b1, b2)
return training_loss, None, training_accuracy, None

dataset_mini =
dataset_mini["data"] = dataset_small["data"][0:100]
dataset_mini["labels"] = dataset_small["labels"][0:100]

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])

gd_params = "epochs": 200, "batch_size":10, "lambda": 0, "eta": 0.01, "rho": 0.5
X = dataset_mini["data"]
Y = dataset_mini["labels"]
training_loss, _, training_accuracy, _ = minibatch_GD(X, Y, W1, W2, b1, b2, gd_params)


plt.plot(training_loss)
plt.figure()
plt.plot(training_accuracy)






share|improve this question













I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. the culprit seems to be my compute_gradients-function, which according to my investigation answers for most of the execution time. It looks like this:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T
for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1
diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
return grad_W1, grad_W2, grad_b1, grad_b2


If X, Y, H, P are all 10 rows long (n=10), the computations take about 1 second. This is way too much compared to my friends who are doing the same task. But I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?



EDIT: Here's a runnable example. You also need the CIFAR dataset, found here: https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz .



import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time

n_input = 3072
n_hidden = 3072

def one_hot(Y):
# assume Y = [1, 4, 9, 0, ...]
result = [None]*len(Y)
for i, cls in enumerate(Y):
onehot =
0: lambda: [1,0,0,0,0,0,0,0,0,0],
1: lambda: [0,1,0,0,0,0,0,0,0,0],
2: lambda: [0,0,1,0,0,0,0,0,0,0],
3: lambda: [0,0,0,1,0,0,0,0,0,0],
4: lambda: [0,0,0,0,1,0,0,0,0,0],
5: lambda: [0,0,0,0,0,1,0,0,0,0],
6: lambda: [0,0,0,0,0,0,1,0,0,0],
7: lambda: [0,0,0,0,0,0,0,1,0,0],
8: lambda: [0,0,0,0,0,0,0,0,1,0],
9: lambda: [0,0,0,0,0,0,0,0,0,1],
[cls]()
result[i] = onehot

result = np.array(result).T
return result


def unpickle(file):
import pickle
with open(file, "rb") as fo:
d = pickle.load(fo, encoding="bytes")
return d

path = "../../datasets/cifar/"

names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]

dataset_raw = unpickle(os.path.join(path, names[0]))
dataset_small =
dataset_small["data"] = dataset_raw[b"data"]/255
dataset_small["labels"] = one_hot(dataset_raw[b"labels"]).T

validation_set_raw = unpickle(os.path.join(path, names[1]))
validation_small =
validation_small["data"] = validation_set_raw[b"data"]/255
validation_small["labels"] = one_hot(validation_set_raw[b"labels"]).T

print("small dataset data shape: ".format(dataset_small["data"].shape))
print("small dataset labels shape: ".format(dataset_small["labels"].shape))
print("small validation data shape: ".format(validation_small["data"].shape))
print("small validation labels shape: ".format(validation_small["labels"].shape))

test_set_raw = unpickle(os.path.join(path, "test_batch"))
X_test = test_set_raw[b"data"]
Y_test = one_hot(test_set_raw[b"labels"])

# All data sets

dataset_large = "data": np.zeros([0, 3072]), "labels": np.array()
validation_large =

## All data batches

for name in names[1:4]:
raw = unpickle(os.path.join(path, name))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255

print("large dataset data: ".format(dataset_large["data"].shape))
print("large dataset labels: ".format(dataset_large["labels"].shape))
print("large validation set data: ".format(validation_large["data"].shape))
print("large validation set labels: ".format(validation_large["labels"].shape))

def softmax(s):
numerator = np.exp(s)
denominator = np.matmul(np.ones(10).T, np.exp(s))
return numerator/denominator

def relu(s1):
h = np.maximum(np.zeros_like(s1), s1)
return h

def layer_1(X, W1, b1):
# WARNING: X must have shape (3072, n)
# W1 must have shape (3072, 3072)
# b1 must have shape (3072, 1)
# s will get shape (10, n)
# Return p with shape (10, n)

if not X.shape[0] == n_input:
raise ValueError("Got wrong shape of s1: ".format(X.shape))


s1 = np.matmul(W1, X) + b1

if not s1.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of s1: ".format(s1.shape))

h = relu(s1)

if not h.shape == (n_hidden, X.shape[1]):
raise ValueError("Got wrong shape of h: ".format(h.shape))

return s1, h

def layer_2(h, W2, b2):
# WARNING: h must have shape (3072, n)
# W2 must have shape (10, 3072)
# b2 must have shape (10, 1)
# s will get shape (10, n)
# Returns p with shape (10, n)

s = np.matmul(W2, h) + b2

if not s.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of s: ".format(s.shape))

p = softmax(s)

if not p.shape == (10, h.shape[1]):
raise ValueError("Got wrong shape of p: ".format(p.shape))

return p

def evaluate_classifier(X, W1, W2, b1, b2):
if not X.shape[0] == n_input:
ValueError("Wrong shape of X: ".format(X.shape))
if not len(X.shape) == 2:
ValueError("Wrong shape of X: ".format(X.shape))
if not W1.shape == (n_hidden, n_input):
raise ValueError("Wrong shape of W1: ".format(W1.shape))
if not b1.shape == (n_hidden, 1):
raise ValueError("Wrong shape of b1: ".format(b1.shape))
if not W2.shape == (10, n_hidden):
raise ValueError("Wrong shape of W2: ".format(W2.shape))
if not b2.shape == (10, 1):
raise ValueError("Wrong shape of b2: ".format(b2.shape))

s1, h = layer_1(X, W1, b1)

p = layer_2(h, W2, b2)

return s1, h, p

def compute_cost(X, Y, W1, W2, b1, b2, lamb = 0):
# WARNING! Y must be one hot!
# X must have shape (3072, -1)
# Y and P must have shape (-1, 10)

if not Y.shape[1] == 10 and len(Y.shape) == 2:
raise ValueError("Wrong shape of Y: . Must be (-1, 10)".format(Y.shape))
if not X.shape[0] == 3072:
raise ValueError("Wrong shape of X: . Must be (3072, -1)".format(X.shape))
_, _, P = evaluate_classifier(X, W1, W2, b1, b2)
P = P.T

if not P.shape[1] == 10 and len(P.shape) == 2:
raise ValueError("Wrong shape of P: . Must be (-1, 10)".format(P.shape))
if not P.shape == Y.shape:
raise ValueError("Y and P must have the same shape. != ".format(Y.shape, P.shape))

l_cross = 0
for i in range(X.shape[1]):
y = Y[i]
p = P[i]
y = np.reshape(y, [1, 10])
p = np.reshape(p, [10, 1])
l_cross += -np.log(np.matmul(y, p))[0][0]
avg_loss = l_cross/X.shape[1]
reg = lamb * (np.sum(W1) + np.sum(W2))

return avg_loss + reg

def compute_accuracy(X, Y, W1, W2, b1, b2):
# X must have shape (3072, n)
# Y must have shape (10, n)
if(not X.shape[0] == 3072):
raise ValueError("X must have shape (3072, n)")
try:
X.shape[1]
except IndexError:
raise(ValueError("X must have shape (3072, n)"))
if not Y.shape[0] == 10:
raise ValueError("Y must have shape (10, n)")
if not len(Y.shape) == 2:
raise ValueError("Y mus have shape (10, n)")
correct = 0
for x, y in zip(X.T, Y.T):
_, _, p = evaluate_classifier(np.reshape(x, (3072,1)), W1, W2, b1, b2)
pred = np.argmax(p)
corr = np.argmax(y)
correct += 1 if pred == corr else 0
return correct/X.shape[1]

def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
# Y must be one-hot
# Y and P must be (10, n)
# X and H must be (3072, n)
# P is the softmax layer
t = time.clock()
if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
raise ValueError("Y and P must have shape (10, k). Y: , P: ".format(Y.shape, P.shape))
if not X.shape[0] == n_input:
raise ValueError("X must have shape (, k), has ".format(n_input, X.shape))
if not H.shape[0] == n_hidden:
raise ValueError("H must have shape (, k)".format(n_hidden))
grad_W1 = np.zeros([n_hidden, n_input])
grad_W2 = np.zeros([10, n_hidden])
grad_b1 = np.zeros([n_hidden, 1])
grad_b2 = np.zeros([10, 1])


X, Y, H, P = X.T, Y.T, H.T, P.T

for x, y, s1, h, p in zip(X, Y, S1, H, P):
h = np.reshape(h, [1, n_hidden])
y = np.reshape(y, [10, 1])
p = np.reshape(p, [10, 1])

# Second layer
g = -(y-p).T
grad_b2 += g.T
grad_W2 += np.matmul(g.T, h)
# First layer
g = np.matmul(g, W2)
print(g.shape)
ind = np.zeros(h.shape[1])
for i, val in enumerate(s1):
if val > 0:
ind[i] = 1

diag = np.diag(ind)

g = np.matmul(g, diag)

grad_b1 += g.T
grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
# Divide by batch size
grad_b1 /= X.shape[0]
grad_b2 /= X.shape[0]
grad_W1 /= X.shape[0]
grad_W2 /= X.shape[0]
# Add regularization term
grad_W1 += 2*lamb*W1
grad_W2 += 2*lamb*W2
dur = time.clock() - t
print("compute_gradients took seconds".format(dur))
return grad_W1, grad_W2, grad_b1, grad_b2


def compute_gradient_num(X, Y, W1, W2, b1, b2, lamb = 0, h = 0.000001):
Y = Y.T
grad_W1 = np.zeros_like(W1)
grad_W2 = np.zeros_like(W2)
grad_b1 = np.zeros_like(b1)
grad_b2 = np.zeros_like(b2)

#gradient for b1 vector
print("calculating b1 gradients...")
for i in range(b1.shape[0]):
b1_try = np.copy(b1)
b1_try[i,0] = b1[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
b1_try = np.copy(b1)
b1_try[i, 0] = b1[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1_try, b2, lamb)
grad_b1[i,0] = (c2-c1) / (2 * h)

# gradient for W1 vector
print("calculating W1 gradients...")
for i in range(W1.shape[0]):
for j in range(W1.shape[1]):
W1_try = np.copy(W1)
W1_try[i,j] = W1[i,j] - h
c1 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
W1_try = np.copy(W1)
W1_try[i, j] = W1[i, j] + h
c2 = compute_cost(X, Y, W1_try, W2, b1, b2, lamb)
grad_W1[i, j] = (c2 - c1) / (2 * h)

#gradient for b2 vector
print("calculating b2 gradients...")
for i in range(b2.shape[0]):
b2_try = np.copy(b2)
b2_try[i,0] = b2[i,0] - h
c1 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
b2_try = np.copy(b2)
b2_try[i, 0] = b2[i, 0] + h
c2 = compute_cost(X, Y, W1, W2, b1, b2_try, lamb)
grad_b2[i,0] = (c2-c1) / (2 * h)

# gradient for W2 vector
print("calculating W2 gradients...")
for i in range(W2.shape[0]):
for j in range(W2.shape[1]):
W2_try = np.copy(W2)
W2_try[i,j] = W2[i,j] - h
c1 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
W2_try = np.copy(W2)
W2_try[i, j] = W2[i, j] + h
c2 = compute_cost(X, Y, W1, W2_try, b1, b2, lamb)
grad_W2[i, j] = (c2 - c1) / (2 * h)

return grad_W1, grad_W2, grad_b1, grad_b2

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])
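
A common way to use compute_gradient_num is a relative-error comparison against the analytic gradients. The sketch below is only an illustration: it assumes evaluate_classifier and compute_cost behave as they are used elsewhere in this script, and it reuses the parameters initialised just above. Note that with n_hidden = 3072 the numerical pass over W1 is far too slow to be practical, so you would temporarily shrink the hidden layer (say n_hidden = 10) before running it.

# Illustrative gradient check on a tiny batch. Shrink n_hidden and re-run the
# weight initialisation above first, otherwise compute_gradient_num has to
# evaluate the cost millions of times.
X_check = dataset_small["data"][:5].T     # (3072, 5)
Y_check = dataset_small["labels"][:5].T   # (10, 5), one-hot columns

S1_c, H_c, P_c = evaluate_classifier(X_check, W1, W2, b1, b2)
gW1, gW2, gb1, gb2 = compute_gradients(X_check, Y_check, S1_c, H_c, P_c, W1, W2, lamb=0)
nW1, nW2, nb1, nb2 = compute_gradient_num(X_check, Y_check, W1, W2, b1, b2, lamb=0)

def relative_error(a, b, eps=1e-9):
    # largest element-wise |a - b| / (|a| + |b|); small values mean the gradients agree
    return np.max(np.abs(a - b) / np.maximum(np.abs(a) + np.abs(b), eps))

for name, ana, num in [("W1", gW1, nW1), ("W2", gW2, nW2), ("b1", gb1, nb1), ("b2", gb2, nb2)]:
    print("{}: relative error {:.2e}".format(name, relative_error(ana, num)))

Large relative errors here are themselves useful information: they tell you which of the four gradients to look at first.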


def train_batch(X, Y, W1, W2, b1, b2, gd_params, old_grads):
    S1, H, P = evaluate_classifier(X.T, W1, W2, b1, b2)
    grad_W1, grad_W2, grad_b1, grad_b2 = compute_gradients(X.T, Y.T, S1, H, P, W1, W2, gd_params["lambda"])
    try:
        W1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W1 + gd_params["rho"]*gd_params["eta"]*old_grads["W1"]
        W2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_W2 + gd_params["rho"]*gd_params["eta"]*old_grads["W2"]
        b1 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b1 + gd_params["rho"]*gd_params["eta"]*old_grads["b1"]
        b2 -= (1-gd_params["rho"])*gd_params["eta"]*grad_b2 + gd_params["rho"]*gd_params["eta"]*old_grads["b2"]
    except KeyError:
        W1 -= gd_params["eta"]*grad_W1
        W2 -= gd_params["eta"]*grad_W2
        b1 -= gd_params["eta"]*grad_b1
        b2 -= gd_params["eta"]*grad_b2
    return grad_W1, grad_W2, grad_b1, grad_b2

def minibatch_GD(X, Y, W1, W2, b1, b2, gd_params, validation=None, verbose=True):
    # X must have shape (n, 3072)
    # Y must have shape (n, 10)
    # validation["data"] must have shape (k, 3072)
    # validation["labels"] must have shape (k, 10)
    training_loss = [None]*gd_params["epochs"]
    validation_loss = training_loss[:]  # copying the list
    training_accuracy = training_loss[:]
    validation_accuracy = training_loss[:]

    for i in range(gd_params["epochs"]):

        if i % 10 == 0 and verbose:
            print("Training epoch {} started".format(i+1))

        old_grads = {}
        # For each batch
        for j in range(X.shape[0]//gd_params["batch_size"]):
            start = j*gd_params["batch_size"]
            end = start + gd_params["batch_size"]
            batch = X[start:end]
            targets = Y[start:end]

            old_grads["W1"], old_grads["W2"], old_grads["b1"], old_grads["b2"] = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

        # Train the datapoints that didn't fit into the last minibatch
        n_rest = X.shape[0] % gd_params["batch_size"]
        if n_rest > 0:
            start = X.shape[0]-n_rest
            end = X.shape[0]
            batch = X[start:end]
            targets = Y[start:end]
            old_grads = train_batch(batch, targets, W1, W2, b1, b2, gd_params, old_grads)

        training_loss[i] = compute_cost(X.T, Y, W1, W2, b1, b2, gd_params["lambda"])
        training_accuracy[i] = compute_accuracy(X.T, Y.T, W1, W2, b1, b2)
        if validation:
            validation_loss[i] = compute_cost(validation["data"].T, validation["labels"], W1, W2, b1, b2, gd_params["lambda"])
            validation_accuracy[i] = compute_accuracy(validation["data"].T, validation["labels"].T, W1, W2, b1, b2)
    return training_loss, None, training_accuracy, None

dataset_mini = {}
dataset_mini["data"] = dataset_small["data"][0:100]
dataset_mini["labels"] = dataset_small["labels"][0:100]

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])

gd_params = {"epochs": 200, "batch_size": 10, "lambda": 0, "eta": 0.01, "rho": 0.5}
X = dataset_mini["data"]
Y = dataset_mini["labels"]
training_loss, _, training_accuracy, _ = minibatch_GD(X, Y, W1, W2, b1, b2, gd_params)


plt.plot(training_loss)
plt.figure()
plt.plot(training_accuracy)
plt.show()








asked Apr 17 at 18:21 by Sandi, edited Apr 18 at 6:56

  • Could you modify the question to add a runnable example? It would make it easier to investigate, because we can python -m cProfile your_example.py
    – FirefoxMetzger, Apr 18 at 4:57

  • On first glance the culprit is that you don't use vectorized computation, but rather compute the gradient row by row in a for loop. Also, you are using a linear output layer and a ReLU hidden layer, correct? You can vectorize the gradient computation of the ReLU instead of nesting for loops.
    – FirefoxMetzger, Apr 18 at 5:03

  • @FirefoxMetzger, added runnable example
    – Sandi, Apr 18 at 6:59

  • You are linking to the wrong data set. Your code seems to use CIFAR-10, but you link to CIFAR-100, which has a slightly different structure and won't work.
    – FirefoxMetzger, Apr 18 at 9:57
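
A minimal way to get the profile suggested in the first comment, assuming the script above is saved as your_example.py, is python -m cProfile -s cumtime your_example.py. An in-process variant that profiles a single gradient call (using the X, Y, W1, W2, b1, b2 defined in the script) could look like this:

import cProfile
import pstats

# Profile one call to compute_gradients on the first mini-batch.
S1, H, P = evaluate_classifier(X[:10].T, W1, W2, b1, b2)
cProfile.run("compute_gradients(X[:10].T, Y[:10].T, S1, H, P, W1, W2, 0)",
             "gradients.prof")
pstats.Stats("gradients.prof").sort_stats("cumulative").print_stats(15)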
















1 Answer (accepted)

Here is a faster version (around 300 ms per batch). It's essentially your function refactored for readability, and in vectorized code more readability / code beauty often makes the code run faster as well:



def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
    # Y must be one-hot
    # Y and P must be (10, n)
    # X and H must be (3072, n)
    # P is the softmax layer

    batch_size = X.shape[1]

    t = time.time()

    # gradients through the network
    dp = -(Y-P).T
    dl2 = dp
    drelu = np.matmul(dl2, W2)
    dl1 = drelu * (S1 > 0).T

    # updates
    dW2 = np.matmul(H, dl2).T/batch_size + 2*lamb*W2
    dW1 = np.matmul(X, dl1).T/batch_size + 2*lamb*W1

    db2 = np.sum(dl2, axis=0, keepdims=True).T/batch_size   # (10, 1): sum the per-sample gradients
    db1 = np.sum(dl1, axis=0, keepdims=True).T/batch_size   # (n_hidden, 1)

    dur = time.time() - t
    print("compute_gradients took {} s".format(dur))
    return dW1, dW2, db1, db2
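
As a quick sanity check, here is a minimal sketch that feeds random arrays of the documented shapes through this version and prints the resulting gradient shapes; it assumes the n_input/n_hidden globals and the W1/W2 initialisation from the question:

# Shape check with random data (assumes n_input = n_hidden = 3072 and the
# question's W1, W2).
rng = np.random.RandomState(0)
n = 10                                      # batch size

X_b = rng.rand(n_input, n)
Y_b = np.eye(10)[:, rng.randint(0, 10, n)]  # one-hot columns, (10, n)
S1_b = rng.randn(n_hidden, n)               # hidden-layer pre-activations
H_b = np.maximum(S1_b, 0)                   # ReLU activations
P_b = np.exp(rng.randn(10, n))
P_b /= P_b.sum(axis=0, keepdims=True)       # columns sum to 1, like a softmax

dW1, dW2, db1, db2 = compute_gradients(X_b, Y_b, S1_b, H_b, P_b, W1, W2, 0)
print(dW1.shape, dW2.shape, db1.shape, db2.shape)
# expected: (3072, 3072) (10, 3072) (3072, 1) (10, 1)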


Things I changed:



  • I threw out the shape assertions you did at the beginning. NumPy will complain about mismatched shapes anyway, and the checks really just slow you down; they also add visual clutter and make the code harder to read.

  • I vectorized the ReLU computation (comparing the values with 0 directly).

  • I vectorized the gradient of the ReLU (matmul(a, diag(b)) is the same as the element-wise product a*b; see the short sketch after this list).

  • I vectorized the accumulation of the gradient update (this one took me a long time to learn as a Bachelor's student; if you don't see it, try writing down the sum over an update batch and the dyadic product for each update on paper and refactor. The idea is that you do an outer product along one rank and an inner product along another).

  • I changed time.clock() to time.time(), so the reported duration is in wall-clock seconds rather than processor time.

There is a lot more you could do to the rest of the code, but that is out of scope for this question.
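
The two identities used above are easy to check numerically. A small self-contained sketch (the shapes are arbitrary): multiplying by a diagonal matrix is the same as an element-wise product, and summing per-sample outer products over a batch is a single matrix product:

import numpy as np

rng = np.random.RandomState(0)

# 1) matmul(a, diag(b)) == a * b  (b is broadcast across the rows of a)
a = rng.randn(4, 6)
b = rng.randn(6)
assert np.allclose(np.matmul(a, np.diag(b)), a * b)

# 2) sum_k g_k x_k^T == G^T Xs^T, where G holds the g_k as rows and Xs holds the x_k as columns
G = rng.randn(5, 3)     # 5 samples, gradient vectors of length 3 (one per row)
Xs = rng.randn(7, 5)    # 5 samples, input vectors of length 7 (one per column)
looped = sum(np.outer(G[k], Xs[:, k]) for k in range(5))
vectorized = np.matmul(G.T, Xs.T)
assert np.allclose(looped, vectorized)
print("both identities hold")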






answered Apr 18 at 11:42 by FirefoxMetzger, edited Apr 18 at 12:58

  • I don't think it's a bug actually; we're using the cross-entropy loss and softmax for the last layer, and ReLU for the hidden layer. The math ends up looking like that after some simplifications.
    – Sandi, Apr 18 at 12:03

  • Ah, yes, that's true. I didn't see the cross-entropy term.
    – FirefoxMetzger, Apr 18 at 12:58
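
For reference, the simplification mentioned in this exchange is the standard softmax/cross-entropy cancellation (a textbook result, stated here for completeness): with

$$ L = -\sum_i y_i \log p_i, \qquad p_i = \frac{e^{s_i}}{\sum_j e^{s_j}}, $$

and $y$ one-hot, the derivative with respect to the output scores is simply $\partial L / \partial s_i = p_i - y_i$, which is why the backward pass can start directly from $-(Y-P) = P-Y$ without ever differentiating the softmax explicitly.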









