MLP classifier in theano settles at local minima
I have written an MLP classifier using Theano. The training function, which uses the back-propagation algorithm, looks like this:
```python
self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
self.layers=network.layers
self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
prediction=T.dmatrix()
output=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(prediction,output).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)
```
I tried to train the classifier on the XOR problem. The implementation is:
```python
network=FeedForwardNetwork([2,2,2])
network.initialize()
network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
print network.predict(numpy.array([[1.,0.]]))
print network.predict(numpy.array([[0.,0.]]))
```
The initialize() method just compiles all the functions in the backend: the back-propagation function, the forward-pass function used to compute predictions, and a few other Theano functions. Now, when I run this code, training gets stuck at a local minimum:
```
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
```
At the start of training the loss is about 0.92. It decreases steadily to the value above and stays there. I have tried varying alpha and the momentum. What am I doing wrong?
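For reference, the plateau value is exactly -ln(0.5), i.e. the mean categorical cross-entropy of a two-class softmax that assigns probability 0.5 to both classes on every example:

```python
# The stuck loss equals the cross-entropy of a constant 0.5/0.5 prediction.
import numpy
print -numpy.log(0.5)  # 0.69314718056
```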
P.S.
The whole code is here:
network.py
```python
import theano
import theano.tensor as T
import numpy
from layers import *
from backend import NetworkBackend

class Network:

    def __init__(self,architecture):
        self.architecture=architecture
        self.layers=[]
        self.weights=[]
        self.bias=[]

    def __str__(self):
        banner=''
        for i in range(len(self.weights)):
            banner+=str(self.weights[i])+'\n'
            banner+=str(self.bias[i])+'\n'
        return banner

class FeedForwardNetwork(Network):

    def initialize(self):
        self.layers.append(InputLayer(units=self.architecture[0]))
        for i in range(1,len(self.architecture[:-1])):
            self.layers.append(SigmoidLayer(units=self.architecture[i]))
        self.layers.append(SoftmaxLayer(units=self.architecture[-1]))
        self.backend=NetworkBackend(self)

    def predict(self,inputs):
        return self.backend.activate(inputs)

    def train(self,X,y,alpha=100,reg_lambda=0.0001,epochs=10000,momentum=0.9):
        cost=1
        while cost>0.01 and epochs:
            prediction=self.predict(X)
            cost=self.backend.backprop(prediction,y,reg_lambda,alpha,momentum)
            print cost
            epochs-=1

if __name__=='__main__':
    network=FeedForwardNetwork([2,2,2])
    network.initialize()
    network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
    print network.predict(numpy.array([[1.,0.]]))
    print network.predict(numpy.array([[0.,0.]]))
```
layers.py
```python
import theano
import theano.tensor as T
import scipy
from backend import ComputationBackend

class Layer:

    def __init__(self,units):
        self.units=units
        self.backend=ComputationBackend()

    def __str__(self):
        banner=self.__class__.__name__
        banner+=" Units:%d"%self.units
        return banner

class SigmoidLayer(Layer):

    def forwardPass(self,inputs):
        return self.backend.sigmoid(inputs)

class InputLayer(Layer):

    def forwardPass(self,inputs):
        return inputs

class SoftmaxLayer(Layer):

    def forwardPass(self,inputs):
        return self.backend.softmax(inputs)
```
backend.py
```python
import theano
import theano.tensor as T
import numpy

class NetworkBackend:

    def __init__(self,network):

        # initialize shared variables
        self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
        self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
        self.layers=network.layers
        self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]

        # activation for network layers
        inputs=T.dmatrix()
        temp=self.layers[0].forwardPass(inputs)
        for i in range(1,len(self.layers[:-1])):
            temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
        output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
        self.activate=theano.function([inputs],output)

        prediction=T.dmatrix()
        output=T.dmatrix()
        reg_lambda=T.dscalar()
        alpha=T.dscalar()
        momentum=T.dscalar()
        cost=T.nnet.categorical_crossentropy(prediction,output).mean()
        for i,j in zip(self.weights,self.bias):
            cost+=T.sum(i**2)*reg_lambda
            cost+=T.sum(j**2)*reg_lambda
        parameters=self.weights+self.bias
        rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
        updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
        self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)

class ComputationBackend:

    def __init__(self):

        # sigmoid activation
        self.sigmoid=T.nnet.sigmoid

        # softmax activation
        self.softmax=T.nnet.softmax
```
This is probably caused by the parameter initialization. The following code sample implements a basic XOR learner using a neural network with a single hidden layer.
```python
import numpy
import theano
import theano.tensor as tt

def compile(input_size, hidden_size):
    w_h = theano.shared(numpy.random.standard_normal(size=(input_size, hidden_size)).astype(theano.config.floatX))
    b_h = theano.shared(numpy.zeros((hidden_size,), dtype=theano.config.floatX))
    w_y = theano.shared(numpy.zeros((hidden_size,), dtype=theano.config.floatX))
    b_y = theano.shared(numpy.zeros(1, dtype=theano.config.floatX), broadcastable=(True,))
    x = tt.matrix()
    z = tt.ivector()
    learning_rate = tt.scalar()
    h = tt.tanh(tt.dot(x, w_h) + b_h)
    y = tt.nnet.sigmoid(tt.dot(h, w_y) + b_y)
    cost = tt.nnet.binary_crossentropy(y, z).mean()
    updates = [(p, p - learning_rate * tt.grad(cost, p)) for p in [w_h, b_h, w_y, b_y]]
    return theano.function([x, z, learning_rate], outputs=cost, updates=updates), theano.function([x], outputs=y)

def main():
    numpy.random.seed(5)
    train, test = compile(2, 2)
    for _ in xrange(100000):
        print train([[1, 1], [1, 0], [0, 1], [0, 0]], [0, 1, 1, 0], 0.1)
    print test([[1, 1], [1, 0], [0, 1], [0, 0]])

main()
```
Note the seed value for the random number generator. Using a different random initialization approach may produce better results, i.e. one that is less sensitive to the RNG seed.
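For instance, a scaled-uniform ("Glorot"-style) initialization could replace the plain standard_normal draw above; the init_weights helper below is only a sketch of that idea and is not part of the original answer:

```python
import numpy
import theano

# Hypothetical helper: draw weights uniformly within +/- sqrt(6 / (fan_in + fan_out)),
# which keeps activation variance roughly constant across layers.
def init_weights(fan_in, fan_out):
    bound = numpy.sqrt(6.0 / (fan_in + fan_out))
    values = numpy.random.uniform(-bound, bound, size=(fan_in, fan_out))
    return theano.shared(values.astype(theano.config.floatX))

w_h = init_weights(2, 2)  # e.g. in place of the standard_normal hidden weights above
```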
Finally figured it out! In NetworkBackend, when building the cost, I was computing the cross-entropy between the expected output and a prediction passed in as an argument to the Theano function, instead of using the prediction computed by the activation graph. The Theano graph therefore did not contain the forward pass, so theano.tensor.grad could only find the gradient of the regularization terms, not of the actual cost function! The correct implementation is:
```python
inputs=T.dmatrix()
temp=self.layers[0].forwardPass(inputs)
for i in range(1,len(self.layers[:-1])):
    temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
self.activate=theano.function([inputs],output)

label=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(output,label).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([inputs,label,reg_lambda,alpha,momentum],cost,updates=updates)
```
So instead of declaring a new prediction matrix, the training function now takes the inputs and computes the prediction with the same expressions used in the activation function. This completes the Theano graph, and theano.tensor.grad() now computes the gradient of the cost function together with the regularization.
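A minimal sketch of the difference, with hypothetical names (pred_in, cost_disconnected, etc.) rather than the original code: when the prediction is a fresh input matrix, the cross-entropy term has no path back to the weights and T.grad only picks up the regularization; building the prediction from the inputs inside the same graph restores the full gradient.

```python
import numpy
import theano
import theano.tensor as T

w = theano.shared(numpy.random.random((2, 2)))
x = T.dmatrix()
y = T.dmatrix()

# Broken variant: the "prediction" is an independent input, so the
# cross-entropy part of the cost is disconnected from w; only the L2
# penalty contributes to the gradient (grad == 2 * 0.01 * w).
pred_in = T.dmatrix()
cost_disconnected = T.nnet.categorical_crossentropy(pred_in, y).mean() + 0.01*T.sum(w**2)
grad_disconnected = T.grad(cost_disconnected, w)

# Fixed variant: the prediction is built from x and w inside the same
# graph, so the gradient now includes the cross-entropy term as well.
pred = T.nnet.softmax(T.dot(x, w))
cost_connected = T.nnet.categorical_crossentropy(pred, y).mean() + 0.01*T.sum(w**2)
grad_connected = T.grad(cost_connected, w)
```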