I’ve seen a bunch of tutorials on neural networks in Python, but I’ve never found any of them to be particularly good. They’re either riddled with errors or use simple, single-example training with basic arrays. I’ve always wanted something a little more robust than the typical C-ish implementation and something less mathematically terse than the average neural network paper. I made this implementation in the hope that it explains how neural networks work and how you’d use a matrix library to train multiple examples at the same time. It’s not optimized (you could, for instance, save your activation values and re-use them), but it should be easy to tune.

Here’s the source:

#!/usr/bin/env python
# Author: Joseph Catrambone <jo.jcat@gmail.com>
# Obtained from https://gist.github.com/JosephCatrambone/b8a6509384d3858974c2
# License:
# The MIT License (MIT)
#
# Copyright (c) 2015 Joseph Catrambone
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from __future__ import print_function, division
import numpy
import math
def sanity_check(val, val_limit=float('inf')):
    return False  # Comment out this line to enable the checks below.
    if numpy.any(numpy.isnan(val)) or numpy.any(numpy.isinf(val)) or numpy.any(val > val_limit) or numpy.any(val < -val_limit):
        print("Breaking")
        import pdb; pdb.set_trace()
# Activation functions and their derivatives.
# Convention: each delta_* function below takes the pre-activation value x and
# returns f'(x), NOT f'(f(x)). Keep that in mind if you add your own.
def linear(x):
    return x
def delta_linear(x):
    return numpy.ones(x.shape, dtype=float)
def sigmoid(x):
    return 1.0/(1.0+numpy.exp(-x))
def delta_sigmoid(x):
    # Equivalently, if s = sigmoid(x), this is s*(1-s).
    return numpy.multiply(sigmoid(x), (1-sigmoid(x)))
def tanh(x):
    return numpy.tanh(x)
def delta_tanh(x):
    return 1 - numpy.power(tanh(x), 2)
def softplus(x):  # Smooth approximation to ReLU.
    return numpy.log(1 + numpy.exp(x))
def delta_softplus(x):
    return sigmoid(x)  # The derivative of softplus is exactly the sigmoid.
class NeuralNetwork(object):
    def __init__(self, layers, activation_functions, delta_activation_functions, weight_range=0.1):
        """Construct a neural network.
        layers should be an array of sizes. [2, 4, 10, 1] will have an input of two and an output of one.
        activation_functions should be an array of functions (one per layer) which take an array and return an array.
        delta_activation_functions should take an array and return f'(x) for each x in the array, NOT f'(f(x))."""
        self.weights = list()
        self.biases = list()
        self.activation_functions = activation_functions
        self.delta_activation_functions = delta_activation_functions
        for index in range(len(layers)-1):
            l1 = layers[index]
            l2 = layers[index+1]
            self.weights.append(numpy.random.uniform(low=-weight_range, high=weight_range, size=(l1, l2)))
        for layer_size in layers:
            self.biases.append(numpy.zeros((1, layer_size)))  # One bias row per layer; the input layer's bias is never used.
    def _add_bias(self, data, bias):
        return data + bias.repeat(data.shape[0], axis=0)
    def predict(self, examples):
        activities, activations = self.forward_propagate(examples)
        return activations[-1]
    def forward_propagate(self, examples):
        """Returns the pre-activation values and the activations for every layer."""
        activities = list()   # Pre-activations.
        activations = list()  # After the activation function.
        # Populate the input layer.
        activities.append(examples)
        activations.append(self.activation_functions[0](examples))
        # Forward propagation. activation_functions[0] belongs to the input layer,
        # so the remaining layers consume activation_functions[1:]; this keeps the
        # indexing consistent with backward_propagate.
        for weight, bias, func in zip(self.weights, self.biases[1:], self.activation_functions[1:]):
            preactivation = self._add_bias(numpy.dot(activations[-1], weight), bias)
            activities.append(preactivation)
            activations.append(func(preactivation))
        return activities, activations
    def backward_propagate(self, expected, activities, activations):
        """Given the expected values, the pre-activations, and the activations, return the delta_weight and delta_bias arrays."""
        # From Bishop's book:
        # Forward propagate to get pre-activations (a) and activations (z).
        # Evaluate dk for all outputs with dk = yk - tk (basically, get the error at the output).
        # Backpropagate the error dk using dj = deltaAct(aj) * sum(wkj * dk).
        # Use dEn/dwji = dj*zi.
        # Note: we use (expected - output) and later *add* the deltas, which is equivalent.
        expected = numpy.atleast_2d(expected)
        delta_weights = list()
        delta_biases = list()
        # Calculate blame/error at the output. This is the gradient of a squared-error loss.
        last_error = expected - activations[-1]
        delta = numpy.multiply(last_error, self.delta_activation_functions[-1](activities[-1]))
        delta_biases.append(delta.mean(axis=0))  # Mean over the batch. fit() divides by batch_size again, so biases effectively use a smaller step than the weights.
        # (Dot of weight and delta) * gradient at the pre-activation.
        for k in range(len(activities)-2, -1, -1):
            layer_error = numpy.dot(delta, self.weights[k].T)
            delta_weights.append(numpy.dot(activations[k].T, delta))  # Get the weight change before calculating blame at this level.
            delta = numpy.multiply(layer_error, self.delta_activation_functions[k](activities[k]))
            delta_biases.append(delta.mean(axis=0))
        delta_biases.reverse()
        delta_weights.reverse()
        return delta_weights, delta_biases
    def fit(self, examples, labels, epochs=1000, shuffle_data=True, batch_size=10, learning_rate=0.01, momentum=0.0, early_cutoff=0.0, update_every=0, update_func=None):
        """Train the neural network on the given examples and labels.
        epochs is the maximum number of training iterations (each iteration uses one random batch).
        batch_size is the number of examples used in each update.
        learning_rate is the factor by which the weight deltas are scaled.
        momentum is the amount by which the previous changes are carried over.
        early_cutoff is an error threshold; if an iteration's error falls below it, training stops.
        Every 'update_every' epochs (k % update_every == 0), update_func (if not None) is called with the epoch and error.
        shuffle_data is currently unused; batches are drawn at random with replacement."""
        # To calculate momentum, maintain the last changes.
        # We prepopulate these lists with zero matrices since there are no changes at the start.
        last_delta_weights = list()
        last_delta_biases = list()
        for i in range(len(self.weights)):
            last_delta_weights.append(numpy.zeros(self.weights[i].shape))
        for i in range(len(self.biases)):
            last_delta_biases.append(numpy.zeros(self.biases[i].shape))
        for k in range(epochs):
            # Randomly select a batch of examples (with replacement).
            samples = numpy.random.randint(low=0, high=examples.shape[0], size=[batch_size,])
            x = numpy.atleast_2d(examples[samples])
            y = numpy.atleast_2d(labels[samples])
            # Forward propagate.
            activities, activations = self.forward_propagate(x)
            # Backpropagate the errors.
            dws, dbs = self.backward_propagate(y, activities, activations)
            # Apply the deltas.
            for i, dw in enumerate(dws):
                last_delta_weights[i] = last_delta_weights[i]*momentum + ((learning_rate*dw)/float(batch_size))*(1.0-momentum)
                self.weights[i] += last_delta_weights[i]
            for i, db in enumerate(dbs):
                last_delta_biases[i] = last_delta_biases[i]*momentum + ((learning_rate*db)/float(batch_size))*(1.0-momentum)
                self.biases[i] += last_delta_biases[i]
            # Calculate the error on this batch (before the update).
            error = numpy.sum(numpy.abs(y - activations[-1]))
            # Check whether we should call the user's progress function.
            if update_every != 0 and update_func is not None and k % update_every == 0:
                update_func(k, error)
            # Break early if the error is already below the cutoff.
            if error < early_cutoff:
                return
if __name__ == "__main__":
    # Learn XOR.
    examples = numpy.asarray([
        [0.0, 0.0],
        [1.0, 0.0],
        [0.0, 1.0],
        [1.0, 1.0]
    ])
    labels = numpy.asarray([
        [0.0,],
        [1.0,],
        [1.0,],
        [0.0,]
    ])
    #nn = NeuralNetwork([2, 3, 1], [sigmoid, sigmoid, sigmoid], [delta_sigmoid, delta_sigmoid, delta_sigmoid], weight_range=1.0)
    nn = NeuralNetwork([2, 3, 1], [tanh, tanh, tanh], [delta_tanh, delta_tanh, delta_tanh])
    nn.fit(examples, labels, epochs=100000, learning_rate=0.9, momentum=0.3, update_every=100, update_func=lambda i, x: print("Iteration {}: {}".format(i, x)))
    print(nn.predict(examples))
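
The main trick the listing relies on is that a whole batch of examples moves through a layer with a single matrix product. To make that concrete, here is a tiny standalone sketch (independent of the class above, with made-up layer sizes) of one layer applied to a batch:

import numpy

# A batch of four examples with two features each, like the XOR demo above. Shape: (4, 2).
batch = numpy.asarray([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])

# Made-up weights and bias for a 2 -> 3 layer, matching the shapes NeuralNetwork uses.
weights = numpy.random.uniform(low=-0.1, high=0.1, size=(2, 3))
bias = numpy.zeros((1, 3))

# One matrix product pushes every example through the layer at once:
# (4, 2) dot (2, 3) -> (4, 3), and the bias row broadcasts across the batch.
preactivation = numpy.dot(batch, weights) + bias
activation = numpy.tanh(preactivation)
print(activation.shape)  # (4, 3)

Backpropagation works the same way: the per-example deltas stay stacked in a matrix, and numpy.dot(activations[k].T, delta) accumulates the weight gradients over the whole batch in one call.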

And here are some visualized examples of the different activation functions trying to learn f(x) = sin(x).

A five-layer neural network with sigmoid activation for the three hidden layers.

The same network structure as above, but with softplus activation for the hidden layers.

The same network with tanh activation.
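
The post doesn’t include the exact layer sizes or plotting code behind these figures, so here is a rough sketch of how such an experiment might be set up with the class above. The layer sizes, hyperparameters, and the use of matplotlib are my own assumptions, and the snippet is meant to be pasted below the listing so NeuralNetwork and the activation functions are in scope:

import numpy
import matplotlib.pyplot as plt  # Assumed for plotting; not part of the original listing.

# Illustrative values only: the actual figures may use different sizes and settings.
x = numpy.linspace(-numpy.pi, numpy.pi, 200).reshape(-1, 1)
y = numpy.sin(x)

# A five-layer network (input, three hidden layers, output) with tanh hidden units
# and a linear output layer.
nn = NeuralNetwork(
    [1, 16, 16, 16, 1],
    [linear, tanh, tanh, tanh, linear],
    [delta_linear, delta_tanh, delta_tanh, delta_tanh, delta_linear],
)
nn.fit(x, y, epochs=50000, batch_size=20, learning_rate=0.05, momentum=0.5,
       update_every=5000, update_func=lambda i, err: print("Iteration {}: {}".format(i, err)))

plt.plot(x, y, label="sin(x)")
plt.plot(x, nn.predict(x), label="network output")
plt.legend()
plt.show()

Swapping the tanh hidden units for sigmoid or softplus (with the matching delta_* functions) gives the other two variants shown above.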