I’ve seen a bunch of tutorials on neural networks in Python, but I’ve never found any of them to be particularly good. They’re either riddled with errors or use simple, single-example training with basic arrays. I’ve always wanted something a little more robust than the typical simple, C-ish implementation and something less mathematically terse than the average neural network paper. I made this implementation in the hope that it explains how neural networks work and how you’d use a matrix library to train multiple examples at the same time. It’s not optimized (activation values could be cached and reused rather than recomputed, for example), but it should be easy to tune.
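The core trick behind training multiple examples at once is that if each example is a row of a matrix, a single matrix multiply pushes the whole batch through a layer. As a rough sketch (the names here are purely illustrative and don't appear in the code below):

import numpy

batch = numpy.random.rand(10, 2)                        # Ten examples, one per row, two features each.
weights = numpy.random.uniform(-0.1, 0.1, size=(2, 3))  # A layer mapping 2 inputs to 3 hidden units.
bias = numpy.zeros((1, 3))
hidden = numpy.tanh(numpy.dot(batch, weights) + bias)   # Shape (10, 3): all ten examples at once.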
Here’s the source:
#!/usr/bin/env python
# Author: Joseph Catrambone <jo.jcat@gmail.com>
# Obtained from https://gist.github.com/JosephCatrambone/b8a6509384d3858974c2
# License:
# The MIT License (MIT)
#
# Copyright (c) 2015 Joseph Catrambone
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import print_function, division

import numpy
import math
def sanity_check(val, val_limit=float('inf')):
    return False  # Comment out this line to enable the checks below.
    if numpy.any(numpy.isnan(val)) or numpy.any(numpy.isinf(val)) or numpy.any(val > val_limit) or numpy.any(val < -val_limit):
        print("Breaking")
        import pdb; pdb.set_trace()
# Activation functions.
# Each activation function a(x) takes the raw pre-activation x.
# Each delta_* function also takes the raw pre-activation x and returns a'(x), NOT a'(a(x)).

def linear(x):
    return x

def delta_linear(x):
    return numpy.ones(x.shape, dtype=float)

def sigmoid(x):
    return 1.0/(1.0+numpy.exp(-x))

def delta_sigmoid(x):
    # Equivalently, if y = sigmoid(x), then the derivative is y*(1-y).
    return numpy.multiply(sigmoid(x), (1-sigmoid(x)))

def tanh(x):
    return numpy.tanh(x)

def delta_tanh(x):
    return 1 - numpy.power(tanh(x), 2)

def softplus(x):  # Smooth ReLU.
    return numpy.log(1 + numpy.exp(x))

def delta_softplus(x):
    return sigmoid(x)  # The derivative of softplus happens to be exactly the sigmoid.
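
# As an aside (this helper is illustrative only and is not used anywhere below), the
# delta_* functions can be spot-checked against a central finite difference:
def check_derivative(func, delta_func, x, epsilon=1e-5):
    numeric = (func(x + epsilon) - func(x - epsilon)) / (2.0 * epsilon)
    return numpy.allclose(numeric, delta_func(x), atol=1e-4)
# For example, check_derivative(sigmoid, delta_sigmoid, numpy.linspace(-3, 3, 7)) should return True.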
class NeuralNetwork(object):
    def __init__(self, layers, activation_functions, delta_activation_functions, weight_range=0.1):
        """Construct a neural network.
        layers should be an array of sizes. [2, 4, 10, 1] will have an input of two and an output of one.
        activation_functions should be an array of functions (one per layer) which take an array and return an array.
        delta_activation_functions should take an array and return f'(x) for each x in the array, NOT f'(f(x))."""
        self.weights = list()
        self.biases = list()
        self.activation_functions = activation_functions
        self.delta_activation_functions = delta_activation_functions
        #for l1, l2 in zip(layers, layers[1:]):
        for index in range(len(layers)-1):
            l1 = layers[index]
            l2 = layers[index+1]
            self.weights.append(numpy.random.uniform(low=-weight_range, high=weight_range, size=(l1, l2)))
        for layer_size in layers:
            self.biases.append(numpy.zeros((1, layer_size)))  # One bias row per layer; repeated across the batch in _add_bias.

    def _add_bias(self, data, bias):
        # Repeat the single bias row so it can be added to every example in the batch.
        return data + bias.repeat(data.shape[0], axis=0)

    def predict(self, examples):
        activities, activations = self.forward_propagate(examples)
        return self.activation_functions[-1](activities[-1])

    def forward_propagate(self, examples):
        """Returns the pre-activation values and the activations."""
        activities = list()   # Pre-activations.
        activations = list()  # After the activation function.
        # Populate the input layer.
        activities.append(examples)
        activations.append(self.activation_functions[0](examples))
        # Forward prop through the remaining layers.
        for weight, bias, func in zip(self.weights, self.biases[1:], self.activation_functions[1:]):
            preactivation = self._add_bias(numpy.dot(activations[-1], weight), bias)
            activities.append(preactivation)
            activations.append(func(preactivation))
        return activities, activations

    def backward_propagate(self, expected, activities, activations):
        """Given the expected values and the activities, return the delta_weight and delta_bias arrays."""
        # Following Bishop's book:
        # Forward propagate to get pre-activations (a) and activations (z).
        # Evaluate dk for each output unit as dk = tk - yk (the error at the output).
        # Backpropagate the error using dj = deltaAct(aj) * sum(wkj * dk).
        # Use dEn/dwji = dj*zi.
        expected = numpy.atleast_2d(expected)
        delta_weights = list()
        delta_biases = list()
        # Calculate blame/error at the output.
        last_error = expected - activations[-1]  # Linear loss.
        delta = numpy.multiply(last_error, self.delta_activation_functions[-1](activities[-1]))
        delta_biases.append(delta.sum(axis=0))  # Sum over the batch; fit() divides by batch_size.
        # (Dot of weight and delta) * gradient at the pre-activation.
        for k in range(len(activities)-2, -1, -1):
            layer_error = numpy.dot(delta, self.weights[k].T)
            delta_weights.append(numpy.dot(activations[k].T, delta))  # Get the weight change before calculating blame at this level.
            delta = numpy.multiply(layer_error, self.delta_activation_functions[k](activities[k]))
            delta_biases.append(delta.sum(axis=0))
        delta_biases.reverse()
        delta_weights.reverse()
        return delta_weights, delta_biases

    def fit(self, examples, labels, epochs=1000, shuffle_data=True, batch_size=10, learning_rate=0.01, momentum=0.0, early_cutoff=0.0, update_every=0, update_func=None):
        """Train the neural network on the given examples and labels.
        epochs is the maximum number of iterations that should be spent training on the data.
        batch_size is the number of examples which should be used at a time.
        learning_rate is the amount by which the delta weights are scaled.
        momentum is the amount by which old changes are carried over.
        early_cutoff is the error threshold which, if a given epoch undershoots it, stops training.
        After every 'update_every' epochs (k % update_every == 0), update_func (if not None) will be called with the epoch and error."""
        # To calculate momentum, maintain the last changes.
        # We prepopulate these lists with zero matrices since there are no changes at the start.
        last_delta_weights = list()
        last_delta_biases = list()
        for i in range(len(self.weights)):
            last_delta_weights.append(numpy.zeros(self.weights[i].shape))
        for i in range(len(self.biases)):
            last_delta_biases.append(numpy.zeros(self.biases[i].shape))
        for k in range(epochs):
            # Randomly select examples from the list.
            samples = numpy.random.randint(low=0, high=examples.shape[0], size=[batch_size,])
            x = numpy.atleast_2d(examples[samples])
            y = numpy.atleast_2d(labels[samples])
            # Forward propagate.
            activities, activations = self.forward_propagate(x)
            # Backprop errors.
            dws, dbs = self.backward_propagate(y, activities, activations)
            # Apply deltas.
            for i, dw in enumerate(dws):
                last_delta_weights[i] = last_delta_weights[i]*momentum + ((learning_rate*dw)/float(batch_size))*(1.0-momentum)
                self.weights[i] += last_delta_weights[i]
            for i, db in enumerate(dbs):
                last_delta_biases[i] = last_delta_biases[i]*momentum + ((learning_rate*db)/float(batch_size))*(1.0-momentum)
                self.biases[i] += last_delta_biases[i]
            # Calculate error.
            error = numpy.sum(numpy.abs(y - self.activation_functions[-1](activities[-1])))
            # Check to see if we should call the user's progress function.
            if update_every != 0 and update_func is not None and k % update_every == 0:
                update_func(k, error)
            # Break early.
            if error < early_cutoff:
                return

if __name__ == "__main__":
    # XOR problem.
    examples = numpy.asarray([
        [0.0, 0.0],
        [1.0, 0.0],
        [0.0, 1.0],
        [1.0, 1.0]
    ])
    labels = numpy.asarray([
        [0.0,],
        [1.0,],
        [1.0,],
        [0.0,]
    ])
    #nn = NeuralNetwork([2, 3, 1], [sigmoid, sigmoid, sigmoid], [delta_sigmoid, delta_sigmoid, delta_sigmoid], weight_range=1.0)
    nn = NeuralNetwork([2, 3, 1], [tanh, tanh, tanh], [delta_tanh, delta_tanh, delta_tanh])
    #import pdb; pdb.set_trace()
    nn.fit(examples, labels, epochs=100000, learning_rate=0.9, momentum=0.3, update_every=100, update_func=lambda i, x: print("Iteration {}: {}".format(i, x)))
    print(nn.predict(examples))
And here are some visualized examples as the different activation functions try to learn f(x) = sin(x).
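A setup along those lines looks roughly like the following. This is only a sketch, not the exact script that produced the figures: the layer sizes, training settings, and use of matplotlib are assumptions, and it relies on the NeuralNetwork class and activation functions defined above. A linear output layer is used so the network can produce values outside (0, 1).

import numpy
import matplotlib.pyplot as plt

xs = numpy.linspace(-numpy.pi, numpy.pi, 200).reshape(-1, 1)
ys = numpy.sin(xs)

# One network per hidden activation function; the input and output layers stay linear.
for name, act, dact in [("tanh", tanh, delta_tanh), ("sigmoid", sigmoid, delta_sigmoid), ("softplus", softplus, delta_softplus)]:
    nn = NeuralNetwork([1, 10, 1], [linear, act, linear], [delta_linear, dact, delta_linear])
    nn.fit(xs, ys, epochs=50000, learning_rate=0.05, momentum=0.5)
    plt.plot(xs, nn.predict(xs), label=name)

plt.plot(xs, ys, label="sin(x)", linestyle="--")
plt.legend()
plt.show()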