Training a model with pure Python (a different approach from using classes)
This time, instead of using classes, I am trying a different approach: plain functions. This is neither as flexible nor as expandable as using classes, and it is more cumbersome to write, since each parameter needs to be tracked and passed manually. Although I could have written it with type dispatch and layer abstractions, I just wanted to get a bare-bones working model first and worry about expanding it later.
import gzip
import matplotlib.pyplot as plt
import random
import statistics
import math
import operator as op
import time
from functools import reduce
It is pretty much the same program until the model part.
Because we are only using Python lists, we need to build the utilities we need in order to train our model. First, I start with shape, which returns the shape of a matrix or a list as a tuple.
def shape(t) -> tuple:
    "Uses a for loop to go deeper, but only goes up to 10 levels."
    res = tuple()
    for i in range(10):
        try:
            # Using eval is very slow. I should come up with another way to do this.
            res += (len(eval('t' + '[0]' * i)),)
        except TypeError:
            # Not a list
            break
        except IndexError:
            print("Can't index it")
            break
    return res
def shape(t) -> tuple:
    """ Uses a while loop instead, so we are not limited to 10 levels.
    However, using i to keep count does not seem very pythonic. """
    res = tuple()
    i = 0
    while True:
        try:
            # Using eval is very slow. I should come up with another way to do this.
            res += (len(eval('t' + '[0]' * i)),)
        except TypeError:
            # Not a list
            break
        except IndexError:
            print("Can't index it")
            break
        i += 1
    return res
def shape(t) -> tuple:
    """ A more elegant, recursive approach. """
    def loop(mat, result):
        if not isinstance(mat, list):
            return result
        else:
            return loop(mat[0], result + (len(mat),))
    return loop(t, tuple())
It still works:
shape([1, 2, 3]), shape([[1, 2, 3], [4, 5, 6]])
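For reference, the expected results are (3,) for the vector and (2, 3) for the matrix.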
Now that we have a way of getting the shape of a matrix, we can move on to map_mat, which takes a function and a vector or a matrix and applies the function to every element.
def map_mat(fn, mat):
"Apply fn into a matrix or a vector"
res = []
if len(shape(mat)) == 2: # It is a matrix
for i in range(len(mat)):
res.append([fn(m) for m in mat[i]])
else: # It is a vector
return list(map(fn, mat))
return res
lst = [1, 2, 3, 4, 5]
mat1 = [[1, 2, 3],
[4, 5, 6],
[7, 8, 9]]
map_mat(lambda x: x + 1, lst)
map_mat(lambda x: x + 1, mat1)
Instead of using for loops, it is faster to use list comprehension.
def map_mat2(fn, mat):
"A little faster than map_mat."
return [list(map(fn, mat[i])) for i in range(len(mat))]
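Note that unlike map_mat, this version only handles matrices, not flat vectors. On a matrix it behaves the same:
map_mat2(lambda x: x + 1, mat1)  # [[2, 3, 4], [5, 6, 7], [8, 9, 10]]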
It would be better if we could build matrices easily instead of writing them out by hand.
def lst_nums(shape, num=1):
"Use optional num to define what a list is full of. Default is 1"
if isinstance(shape, tuple):
x, y = shape
return [[num]*y for _ in range(x)]
else:
x = shape
return [num]*x
hund_1s = lst_nums((10, 10), 1)
len(hund_1s), len(hund_1s[0])
hund_1s
This one fills a matrix with random numbers.
def lst_random(shape, init_parms=False):
"return a list of randoms and if init_parms is True, initialize parameters using Kaiming init."
x, y = shape
res = lst_nums(shape, 0)
for i in range(x):
for j in range(y):
res[i][j] = random.normalvariate(0,1)
if init_parms: res[i][j] *= math.sqrt(2/x)
return res
rand_mat = lst_random((10,10))
shape(rand_mat)
Here is the transpose function, which transposes a matrix.
def transpose (mat):
"Transpose the matrix"
return [[m[i] for m in mat] for i in range(len(mat[0]))]
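A quick check with mat1 from above:
transpose(mat1)  # [[1, 4, 7], [2, 5, 8], [3, 6, 9]]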
Now that we can make matrices with ease, we need a function that operates on two matrices at once. elementwise calls a function on pairs of entries from two matrices, with some simple broadcasting between a matrix and a vector. This is a very useful function when it comes to training a model later on.
def elementwise(fn, mat1, mat2):
    "Apply fn elementwise to two vectors/matrices, broadcasting (a, b) with (b,) or (a,)."
    # It can handle (64,), (64,) as well.
    mat = []
    try:
        m1r, m1c = shape(mat1)
    except ValueError:
        m1r = shape(mat1)[0]
        m1c = 0
    try:
        m2r, m2c = shape(mat2)
    except ValueError:
        m2r = shape(mat2)[0]
        m2c = 0
    if m1c == m2c == 0:  # two 1D vectors
        return list(map(fn, mat1, mat2))
    elif (m1r, m1c) == (m2r, m2c):  # two matrices with the same shape
        return [[fn(x, y) for x, y in zip(mat1[i], mat2[i])] for i in range(len(mat1))]
    elif m1c == m2r and m2c == 0:  # shapes (a, b) and (b,)
        for i in range(m1r):
            mat.append([fn(x, y) for x, y in zip(mat1[i], mat2)])
        return mat
    elif m1r == m2r and m2c == 0:  # shapes (a, b) and (a,)
        for i in range(m1r):
            mat.append([fn(m, mat2[i]) for m in mat1[i]])
        return mat
    else:
        raise ValueError(f"Incompatible shapes: {(m1r, m1c)} and {(m2r, m2c)}")
hund_2s = lst_nums((10, 10), 2)
elementwise(lambda x, y: x+y, hund_1s, hund_2s)
rand_mat[0]
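The broadcasting branches are handy as well. Adding a shape (3,) vector to a shape (3, 3) matrix applies it row by row:
elementwise(op.add, mat1, [10, 20, 30])  # [[11, 22, 33], [14, 25, 36], [17, 28, 39]]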
Now, we need to reshape our matrices into whatever shape we want.
def reshape(matrix, new_shape) -> list:
    """ If matrix can be reshaped into new_shape, return a new
    matrix with that shape. Since shaping only splits along the
    first dimension, the result is always (at most) 2-dimensional. """
    old_shape = shape(matrix)
    elem_nums = mul(old_shape)
    if old_shape == new_shape:
        return matrix
    elif elem_nums != mul(new_shape):
        raise ValueError("Wrong shape!")
    else:
        return shaping(flatten(matrix), new_shape, elem_nums, list())
def mul(lst: list) -> int:
""" Return a result of all numbers multiplied.
Like sum, but multiplying. """
return reduce(op.mul, lst, 1)
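Like sum but with multiplication: mul([2, 3, 4]) returns 24, so calling it on a shape tuple gives the total element count.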
def shaping(flat, new_shape, elem_nums, result):
    """ Split a flat list into new_shape[0] equal chunks.
    Only the first dimension of new_shape is used, so the
    result is always a 2-dimensional list. """
    if len(new_shape) == 0:
        return result
    div = elem_nums // new_shape[0]
    for i in range(new_shape[0]):
        result.append(flat[(i * div):((i + 1) * div)])
    return result
def flatten(matrix):
""" Flatten a matrix into a 1 dimensional list. """
result = []
for i in range(len(matrix)):
if isinstance(matrix[i], list):
result.extend(flatten(matrix[i]))
else:
result.append(matrix[i])
return result
Testing the new tools:
shaping(flatten([1, [2, [[[4]]]], 3]), (2,2), 4, [])
shape(flatten(hund_1s))
shape(reshape(hund_1s, (100, 1))), shape(reshape(hund_1s, (1, 100)))
mat3 = [[[1, 2],
[3, 4]],
[[5, 6],
[7, 8]]]
mat3, shape(mat3)
shape(reshape(mat3, (4, 2))), reshape(mat3, (4, 2))
First, we need data if we want to do some training. We are using the MNIST dataset from Yann LeCun's website. The dataset has training images and testing/validation images.
!wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Here, I convert the gzip files into lists of images with functions I adapted from here. Everything stays in plain Python lists; no numpy is needed.
def mnist_images(fname: str, pct=1) -> list:
    """
    Convert gzip files into lists of images.
    Only returns the first pct fraction of the data (pct=1 keeps everything).
    """
    with gzip.open('data/' + fname, 'r') as f:
        # first 4 bytes is a magic number
        magic_number = int.from_bytes(f.read(4), 'big')
        # second 4 bytes is the number of images
        image_count = int.from_bytes(f.read(4), 'big')
        # third 4 bytes is the row count
        row_count = int.from_bytes(f.read(4), 'big')
        # fourth 4 bytes is the column count
        column_count = int.from_bytes(f.read(4), 'big')
        # the rest is the image pixel data; each pixel is an unsigned byte, 0 to 255
        image_data = f.read()
        # shaping only splits along the first dimension, so each image comes out flattened to 784 values
        images = reshape(list(image_data), (image_count, column_count, row_count))
    return images[:int(image_count * pct)]
def mnist_labels(fname: str, pct=1) -> list:
    """
    Convert gzip files into lists of labels.
    Only returns the first pct fraction of the data (pct=1 keeps everything).
    """
    with gzip.open('data/' + fname, 'r') as f:
        # first 4 bytes is a magic number
        magic_number = int.from_bytes(f.read(4), 'big')
        # second 4 bytes is the number of labels
        label_count = int.from_bytes(f.read(4), 'big')
        # the rest is the label data; each label is an unsigned byte, 0 to 9
        label_data = f.read()
        labels = list(label_data)
    return labels[:int(label_count * pct)]
Now, I make a directory for the data and move the downloaded files inside.
!mkdir data
!mv train-images-idx3-ubyte.gz data
!mv train-labels-idx1-ubyte.gz data
!mv t10k-images-idx3-ubyte.gz data
!mv t10k-labels-idx1-ubyte.gz data
!ls data/
Now that we have the data we need, let's make it usable with the functions from above, mnist_images and mnist_labels. With mnist_images, I get lists of images, and with mnist_labels, I get the label for each image.
py_imgs = mnist_images('train-images-idx3-ubyte.gz')
py_train_labels = mnist_labels('train-labels-idx1-ubyte.gz')
py_test_imgs = mnist_images('t10k-images-idx3-ubyte.gz')
py_test_labels = mnist_labels('t10k-labels-idx1-ubyte.gz')
Everything is already a plain Python list; we can check the types:
type(py_imgs[0])
type(py_imgs[0][0])
type(py_imgs), type(py_train_labels)
Now that we have some tools to work with, we can prepare our data for training. The images already come out of mnist_images flattened to vectors of 784 values, so all that is left is scaling: we divide by 255, the highest pixel value, so everything lands between 0 and 1.
py_imgs = map_mat2(lambda x: x / 255, py_imgs)
py_test_imgs = map_mat2(lambda x: x / 255, py_test_imgs)
shape(py_imgs), shape(py_test_imgs)
We have a dataset now. Now we can:
- Train with dataset.
- Get predictions and find loss.
- Get metrics.
- Get gradients and update parameters (weight and bias).
Now that we have a dataset, it is time to look at matrix multiplication, the most important operation in deep learning. First, we initialize a random matrix to work with.
x = lst_random((200,100))
x[1][:5]
A matrix of shape (2, 3) looks like [[1, 1, 1], [1, 1, 1]]. Matrix multiplication between shapes (2, 3) and (3, 4) should give a result of shape (2, 4).
def py_matmul(a, b):
    "Needs some speed-ups"
    ar, ac = len(a), len(a[0])
    br, bc = len(b), len(b[0])
    assert ac == br, f'Inner sizes do not match: ac ({ac}) != br ({br}).'
    c = lst_nums((ar, bc), 0)
    for i in range(ar):
        for j in range(bc):
            for z in range(ac):
                c[i][j] += a[i][z] * b[z][j]
    return c
m1 = [[1,2],[3,4]]
m2 = [[2,3],[4,5]]
m5 = [[1,2,3,4],[5,6]]
py_matmul(m1,m2)
This is not fast enough for the bigger matrices we will actually use:
ml1 = lst_random((784, 100))
ml2 = lst_random((100, 10))
It works, but it is slow. We can make it faster by getting rid of one of the for loops.
def col_mat (mat:list, col:int) -> list:
"Get a column of a matrix."
return [m[col] for m in mat]
def py_matmul2(a, b):
    "Use the sum function"
    ar, ac = len(a), len(a[0])
    br, bc = len(b), len(b[0])
    assert ac == br, f'Inner sizes do not match: ac ({ac}) != br ({br}).'
    c = lst_nums((ar, bc), 0)
    for i in range(ar):
        for j in range(bc):
            c[i][j] = sum(elementwise(op.mul, a[i], col_mat(b, j)))
    return c
py_matmul2(m1, m2)
Using two for loops is faster than using three.
def py_matmul3(a, b):
    "Reduce to one explicit loop with a list comprehension"
    ar, ac = len(a), len(a[0])
    br, bc = len(b), len(b[0])
    assert ac == br, f'Inner sizes do not match: ac ({ac}) != br ({br}).'
    c = lst_nums((ar, bc), 0)
    for i in range(ar):
        c[i] = [sum(elementwise(op.mul, a[i], col_mat(b, j))) for j in range(bc)]
    return c
py_matmul3(m1, m2)
Even after reducing it to one loop, we did not really gain much speed. After profiling with prun, we can see that elementwise uses a lot of the time. We can probably do matrix multiplication without elementwise.
def py_matmul4(a, b):
    "Skip elementwise; zip the row with the transposed column"
    ar, ac = len(a), len(a[0])
    br, bc = len(b), len(b[0])
    assert ac == br, f'Inner sizes do not match: ac ({ac}) != br ({br}).'
    c = lst_nums((ar, bc), 0)
    t = transpose(b)
    for i in range(ar):
        c[i] = [sum(map(lambda x: x[0] * x[1], zip(a[i], t[j]))) for j in range(bc)]
    return c
py_matmul4(m1, m2)
Without elementwise, we gained some speed compared to the other versions.
I am still not satisfied with the result; I am sure we can do better. Let's get some help from operator's mul (an itertools.starmap variant is left in a comment below).
The built-in sum now takes the longest time to execute, but it is still the fastest option we have, compared to a for loop or the reduce function.
def py_matmul5(a, b):
    "The fastest version: map with operator.mul"
    ar, ac = len(a), len(a[0])
    br, bc = len(b), len(b[0])
    assert ac == br, f'Inner sizes do not match: ac ({ac}) != br ({br}).'
    c = lst_nums((ar, bc), 0)
    t = transpose(b)
    for i in range(ar):
        # c[i] = [sum(itertools.starmap(op.mul, zip(a[i], t[j]))) for j in range(bc)]
        c[i] = [sum(map(op.mul, a[i], t[j])) for j in range(bc)]
    return c
py_matmul5(m1, m2)
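The speed claims above came from the notebook's %timeit/%prun cells; as a rough stand-in, here is a minimal timing sketch using the time module imported earlier, run on the bigger matrices:
def time_matmul(fn, a, b, reps=3):
    "Best wall-clock time of fn(a, b) over reps runs."
    best = float('inf')
    for _ in range(reps):
        start = time.perf_counter()
        fn(a, b)
        best = min(best, time.perf_counter() - start)
    return best
for fn in (py_matmul, py_matmul2, py_matmul3, py_matmul4, py_matmul5):
    print(fn.__name__, time_matmul(fn, ml1, ml2))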
sum_test = list(range(10_000_000))
len(sum_test)
def reduce_sum(lst):
    "Sum with functools.reduce"
    return reduce(op.add, lst)
def for_sum(lst):
    "Sum with a plain for loop"
    res = 0
    for v in lst:
        res += v
    return res
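A quick comparison sketch (again plain time calls standing in for the original %timeit cells):
for fn in (sum, for_sum, reduce_sum):
    start = time.perf_counter()
    fn(sum_test)
    print(fn.__name__, time.perf_counter() - start)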
Now let's use Kaiming init. With Kaiming init, weights are scaled by sqrt(2/n_in), which gives us a head start compared to using plain random numbers.
sample = lst_random((200, 100), True)
# x = map_mat(lambda x: x*0.1, x)
# statistics.stdev(x[0])
Checking whether the initialization works. The standard deviation should equal sqrt(2/n_in), and the mean should be 0. And it works. With this initialization, we can train deeper networks. For more information, the paper is here.
def check_dist(x):
for i in range(len(x)//10):
print(statistics.stdev(x[i]), statistics.mean(x[i]))
math.sqrt(2/200)
statistics.variance(sample[0])
check_dist(sample)
shape(m1), shape(m2)
Now, it is time to diverge from using classes. Each layer becomes a pair of plain functions: a forward function, and a backward function that takes the saved inputs and the incoming gradient.
def relu(old_x):
return 0 if old_x < 0 else old_x
def relu_b (old_x, grad):
return grad if old_x > 0 else 0
def softmax(inp):
    "Row-wise softmax. Note: no max subtraction, so very large inputs can overflow math.exp."
    mat = map_mat2(math.exp, inp)
    res = []
    for i in range(len(mat)):
        s = sum(mat[i])
        res.append([x / s for x in mat[i]])
    return res
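A quick sanity check: every row of the softmax output should sum to 1.
probs = softmax([[1.0, 2.0, 3.0]])
sum(probs[0])  # ≈ 1.0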
def softmax_b(old_y, grad):
res = elementwise(op.mul, old_y, grad)
res = [sum(res[i]) for i in range(len(old_y))] # shape is (64,)
return elementwise(op.mul, old_y, elementwise(op.sub, grad, res))
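softmax_b implements the softmax Jacobian-vector product: for a softmax output y and incoming gradient g, the gradient with respect to the input is y * (g - sum(y * g)), with the sum taken over each row. The first two lines compute the row sums, and the last line applies the formula.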
def crossentropyloss(inp, targ):
mat = inp
res = []
for i in range(len(mat)):
for j in range(len(targ[0])):
if targ[i][j] == 1:
res.append(-math.log(mat[i][j]))
return res
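Since the targets are one-hot, each row's loss is simply -log of the probability assigned to the correct class; for instance, assigning probability 0.5 to the right digit costs -math.log(0.5) ≈ 0.693.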
def crossen_b(old_x, old_y):
mat = map_mat2(lambda x: x if x>1e-8 else 1e-8, old_x)
res = lst_nums(shape(old_x), num=0.)
for i in range(len(mat)):
for j in range(len(old_y[0])):
if old_y[i][j] == 1:
res[i][j] = (-1/(mat[i][j]))
return res
def linear(x, w, b):
return elementwise(lambda x,y: x+y, py_matmul5(x, w), b)
def linear_b(old_x, w, grad):
    grad_b = mean_0(grad)                        # gradient w.r.t. the bias, averaged over the batch
    grad_w = py_matmul5(transpose(old_x), grad)  # gradient w.r.t. the weights
    out = py_matmul5(grad, transpose(w))         # gradient w.r.t. the input, passed back
    return out, grad_w, grad_b
def mean_0(matrix):
    "Mean of each column of a matrix (over axis 0)."
    return [statistics.mean([m[i] for m in matrix]) for i in range(len(matrix[0]))]
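For example, mean_0([[1, 2], [3, 4]]) returns [2, 3]: the mean of each column.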
Now, we will take mini-batches of data of batch size bs and train.
def prep_data(size):
    "Take the first size images and one-hot encode their labels."
    xb = py_imgs[:size]
    yb = lst_nums((size, 10), 0)
    yb_vals = py_train_labels[:size]
    for i in range(size):
        yb[i][yb_vals[i]] = 1  # one-hot: put a 1 at the label's index
    return xb, yb
x, y = prep_data(25600)
shape(x), shape(y)
def forward_and_backward(inp, targ, w1, b1, w2, b2):
# Forward pass
l1 = linear(inp,w1,b1)
l2 = map_mat2(relu, l1)
sm_old_y = linear(l2,w2,b2)
cel_old_x = softmax(sm_old_y)
cel_old_x = map_mat2(lambda x: x if x>1e-8 else 1e-8, cel_old_x)
# Calculate loss
loss = crossentropyloss(cel_old_x, targ)
total_loss = sum(loss) / len(targ)
# Backward pass
grad = crossen_b(cel_old_x, targ)
grad = softmax_b(cel_old_x,grad)
grad, grad_w2, grad_b2 = linear_b(l2,w2,grad)
grad = elementwise(relu_b,l1,grad)
grad, grad_w1, grad_b1 = linear_b(inp,w1,grad)
return (grad_w1, grad_b1, grad_w2, grad_b2), total_loss, w1, b1, w2, b2
def make_prediction(inp, w1, b1, w2, b2):
inp = reshape(inp, (1, 784))
l1 = linear(inp,w1,b1)
l2 = map_mat2(relu, l1)
sm_old_y = linear(l2,w2,b2)
result = softmax(sm_old_y)
result = result[0]
return result.index(max(result))
w1 = lst_random((784, 56), True)
w2 = lst_random((56, 10), True)
b1 = lst_nums(56, 0)
b2 = lst_nums(10, 0)
wbs = (w1, b1, w2, b2)
With our dataset, it took me about five minutes to run on Google Colab.
def train(n, x=x, y=y, bs=64, lr=0.01):
    """
    Train for n epochs and return the weights and biases.
    """
    # Initialize weights and biases
    w1 = lst_random((784, 56), True)
    w2 = lst_random((56, 10), True)
    b1 = lst_nums(56, 0)
    b2 = lst_nums(10, 0)
    for i in range(n):
        for j in range(len(x) // bs):
            xb = x[j*bs:(j+1)*bs]
            yb = y[j*bs:(j+1)*bs]
            # Do a forward and backward pass, then get the grads
            grads, loss, w1, b1, w2, b2 = forward_and_backward(xb, yb, w1, b1, w2, b2)
            # Scale the grads by lr and update the weights and biases
            grads = [map_mat(lambda g: g * lr, mat) for mat in grads]
            w1 = elementwise(op.sub, w1, grads[0])
            b1 = elementwise(op.sub, b1, grads[1])
            w2 = elementwise(op.sub, w2, grads[2])
            b2 = elementwise(op.sub, b2, grads[3])
            if j % 50 == 0:
                # Accuracy over the first 100 test images
                accuracy = len(list(filter(None, [make_prediction(py_test_imgs[k], w1, b1, w2, b2) == py_test_labels[k] for k in range(100)])))
                print(f" Batch #{j}: loss is {loss}, accuracy is {accuracy}%")
        print(f"Epoch {i+1} / {n}: loss is {loss}, accuracy is {accuracy}%")
    return (w1, b1, w2, b2)
w1, b1, w2, b2 = train(1, lr=0.01)
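The accuracy printed during training only looks at the first 100 test images. As a follow-up, here is a small sketch that scores a larger slice of the test set with the weights we just trained:
def test_accuracy(n=1000):
    "Accuracy (%) of the model over the first n test images."
    correct = sum(make_prediction(py_test_imgs[i], w1, b1, w2, b2) == py_test_labels[i]
                  for i in range(n))
    return 100 * correct / n
test_accuracy()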
This approach is not as expandable or flexible as the class-based approach. I could have grouped the functions into packages, each consisting of a forward and a backward version, and then dispatched to one or the other depending on which pass is needed. We do not need classes or objects to write code in an object-oriented style. I will rewrite this code in that manner later.