▶ Recurrent Neural Networks
● Code, adapted from [https://zybuluo.com/hanbingtao/note/581764]. The original code had multiple errors plus many inefficient loops and redundant function calls; the issues were reported to the author on GitHub, though it is unclear whether they were ever fixed. The code below implements a single-layer LSTM class, LstmLayer, with both forward and backward computation.
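For reference, the forward() method below implements the standard LSTM recurrence; the member arrays f, i, ct, c, o, h of LstmLayer store these vectors row by row, one row per time step (\sigma is the sigmoid, \circ is element-wise multiplication):

    f_t = \sigma(W_{fh} h_{t-1} + W_{fx} x_t + b_f)
    i_t = \sigma(W_{ih} h_{t-1} + W_{ix} x_t + b_i)
    \tilde{c}_t = \tanh(W_{ch} h_{t-1} + W_{cx} x_t + b_c)
    c_t = f_t \circ c_{t-1} + i_t \circ \tilde{c}_t
    o_t = \sigma(W_{oh} h_{t-1} + W_{ox} x_t + b_o)
    h_t = o_t \circ \tanh(c_t)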
import numpy as np

global_epsilon = 1e-3
global_ita = 0.2
#np.random.seed(107)

class SigmoidActivator(object):                    # the two activation functions
    def forward(self, weighted_input):
        return 1.0 / (1.0 + np.exp(-weighted_input))

    def backward(self, output):
        return output * (1 - output)

class TanhActivator(object):
    def forward(self, weighted_input):
        return 2.0 / (1.0 + np.exp(-2 * weighted_input)) - 1.0

    def backward(self, output):
        return 1 - output * output

class LstmLayer(object):
    def __init__(self, sCol, dCol, nSample, ita = global_ita):  # the constructor takes the sample count (number of time steps) so enough memory can be allocated up front
        self.sCol = sCol
        self.dCol = dCol
        self.ita = ita
        self.nSample = nSample
        self.time = 0
        self.gActivator = SigmoidActivator()       # gate activation (sigmoid)
        self.dActivator = TanhActivator()          # state/output activation (tanh)

        self.f = np.zeros((self.nSample + 1, self.dCol))  # initialize the state vectors; row 0 stays all zeros to simplify the recurrence
        self.i = np.zeros((self.nSample + 1, self.dCol))
        self.ct = np.zeros((self.nSample + 1, self.dCol))
        self.c = np.zeros((self.nSample + 1, self.dCol))
        self.o = np.zeros((self.nSample + 1, self.dCol))
        self.h = np.zeros((self.nSample + 1, self.dCol))
        self.Wfh, self.Wfx, self.bf = self.weightMatrix()
        self.Wih, self.Wix, self.bi = self.weightMatrix()
        self.Wch, self.Wcx, self.bct = self.weightMatrix()
        self.Woh, self.Wox, self.bo = self.weightMatrix()

    def weightMatrix(self):                        # initialize one set of weight matrices
        Wh = np.random.uniform(-1, 1, (self.dCol, self.dCol))
        Wx = np.random.uniform(-1, 1, (self.dCol, self.sCol))
        b = np.zeros(self.dCol)
        return Wh, Wx, b

    def forward(self, x):                          # forward pass for one time step
        self.time += 1
        tt = self.time
        self.f[tt] = self.gActivator.forward(np.dot(self.Wfh, self.h[tt - 1]) + np.dot(self.Wfx, x) + self.bf)
        self.i[tt] = self.gActivator.forward(np.dot(self.Wih, self.h[tt - 1]) + np.dot(self.Wix, x) + self.bi)
        self.ct[tt] = self.dActivator.forward(np.dot(self.Wch, self.h[tt - 1]) + np.dot(self.Wcx, x) + self.bct)  # note: the candidate value uses tanh, unlike the other gates
        self.c[tt] = self.f[tt] * self.c[tt - 1] + self.i[tt] * self.ct[tt]
        self.o[tt] = self.gActivator.forward(np.dot(self.Woh, self.h[tt - 1]) + np.dot(self.Wox, x) + self.bo)
        self.h[tt] = self.o[tt] * self.dActivator.forward(self.c[tt])

    def backward(self, x, deltaNextLayer):         # backward pass
        self.deltaF = np.zeros((self.time + 1, self.dCol))   # error terms, one row per time step
        self.deltaI = np.zeros((self.time + 1, self.dCol))
        self.deltaO = np.zeros((self.time + 1, self.dCol))
        self.deltaCt = np.zeros((self.time + 1, self.dCol))
        self.deltaH = np.zeros((self.time + 1, self.dCol))    # deltaH is the error term of the output h, to be distributed over the gate vectors
        self.deltaH[-1] = deltaNextLayer                      # error term passed down from the layer above
        for tt in range(self.time, 0, -1):                    # propagate backwards in time within this layer
            f = self.f[tt]
            i = self.i[tt]
            ct = self.ct[tt]
            o = self.o[tt]
            h = self.deltaH[tt]                               # note: h here is the delta of the hidden state, not the hidden state itself
            cPre = self.c[tt - 1]
            tanhC = self.dActivator.forward(self.c[tt])
            inverseTanhC = self.dActivator.backward(tanhC)

            self.deltaF[tt] = h * o * inverseTanhC * cPre * self.gActivator.backward(f)  # split the total delta of this step into per-gate deltas
            self.deltaI[tt] = h * o * inverseTanhC * ct * self.gActivator.backward(i)
            self.deltaCt[tt] = h * o * inverseTanhC * i * self.dActivator.backward(ct)
            self.deltaO[tt] = h * tanhC * self.gActivator.backward(o)
            self.deltaH[tt - 1] = np.dot(self.deltaO[tt], self.Woh) + np.dot(self.deltaI[tt], self.Wih) + \
                np.dot(self.deltaF[tt], self.Wfh) + np.dot(self.deltaCt[tt], self.Wch)   # combine the per-gate deltas into the total delta of the previous step; the previous step's output is this step's recurrent input, so no extra derivative factor appears

        self.WfhGrad = np.sum(np.array([np.outer(self.deltaF[1 + i], self.h[i]) for i in range(self.time)]), 0)  # gradients of the recurrent weights, fully vectorized with outer products, similar to a tensor contraction
        self.WihGrad = np.sum(np.array([np.outer(self.deltaI[1 + i], self.h[i]) for i in range(self.time)]), 0)
        self.WohGrad = np.sum(np.array([np.outer(self.deltaO[1 + i], self.h[i]) for i in range(self.time)]), 0)
        self.WchGrad = np.sum(np.array([np.outer(self.deltaCt[1 + i], self.h[i]) for i in range(self.time)]), 0)

        self.bfGrad = np.sum(self.deltaF[1:1 + self.time], 0)   # gradients of the biases: the same sums without the outer-product factor
        self.biGrad = np.sum(self.deltaI[1:1 + self.time], 0)
        self.boGrad = np.sum(self.deltaO[1:1 + self.time], 0)
        self.bcGrad = np.sum(self.deltaCt[1:1 + self.time], 0)

        self.WfxGrad = np.outer(self.deltaF[-1], x)             # gradients of the input weights; as in the reference implementation, only the final time step contributes, since backward() only receives that step's input x
        self.WixGrad = np.outer(self.deltaI[-1], x)
        self.WoxGrad = np.outer(self.deltaO[-1], x)
        self.WcxGrad = np.outer(self.deltaCt[-1], x)

    def update(self):                              # update the weights by gradient descent
        self.Wfh -= self.ita * self.WfhGrad
        self.Wfx -= self.ita * self.WfxGrad
        self.bf -= self.ita * self.bfGrad
        self.Wih -= self.ita * self.WihGrad
        self.Wix -= self.ita * self.WixGrad
        self.bi -= self.ita * self.biGrad
        self.Woh -= self.ita * self.WohGrad
        self.Wox -= self.ita * self.WoxGrad
        self.bo -= self.ita * self.boGrad
        self.Wch -= self.ita * self.WchGrad
        self.Wcx -= self.ita * self.WcxGrad
        self.bct -= self.ita * self.bcGrad

    def reset(self):                               # reset all state vectors
        self.time = 0
        self.f = np.zeros((self.nSample + 1, self.dCol))
        self.i = np.zeros((self.nSample + 1, self.dCol))
        self.ct = np.zeros((self.nSample + 1, self.dCol))
        self.c = np.zeros((self.nSample + 1, self.dCol))
        self.o = np.zeros((self.nSample + 1, self.dCol))
        self.h = np.zeros((self.nSample + 1, self.dCol))

    def printLstmLayer(self):                      # print all parameters of this layer
        print("sCol = %d, dCol = %d, ita = %g, nSample = %d, time = %d" % (self.sCol, self.dCol, self.ita, self.nSample, self.time))
        print("f=\n", self.f, "\ni=\n", self.i, "\nct=\n", self.ct, "\nc=\n", self.c, "\no=\n", self.o, "\nh=\n", self.h)
        print("Wfh=\n", self.Wfh, "\nWfx=\n", self.Wfx, "\nbf=\n", self.bf)
        print("Wih=\n", self.Wih, "\nWix=\n", self.Wix, "\nbi=\n", self.bi)
        print("Wch=\n", self.Wch, "\nWcx=\n", self.Wcx, "\nbc=\n", self.bct)
        print("Woh=\n", self.Woh, "\nWox=\n", self.Wox, "\nbo=\n", self.bo)

        print("deltaF=\n", self.deltaF, "\ndeltaI=\n", self.deltaI, "\ndeltaO=\n", self.deltaO, "\ndeltaCt=\n", self.deltaCt, "\ndeltaH=\n", self.deltaH)
        print("WfhGrad=\n", self.WfhGrad, "\nWfxGrad=\n", self.WfxGrad, "\nbfGrad=\n", self.bfGrad)
        print("WihGrad=\n", self.WihGrad, "\nWixGrad=\n", self.WixGrad, "\nbiGrad=\n", self.biGrad)
        print("WohGrad=\n", self.WohGrad, "\nWoxGrad=\n", self.WoxGrad, "\nboGrad=\n", self.boGrad)
        print("WchGrad=\n", self.WchGrad, "\nWcxGrad=\n", self.WcxGrad, "\nbcGrad=\n", self.bcGrad)

def createTestData():                              # create test data
    s = [np.array([1, 2, 3]), np.array([2, 3, 4])]
    d = np.array([1, 2])
    return s, d

def test():
    lstmLayer = LstmLayer(3, 2, 2)                 # input dimension, output dimension, sample count; the learning rate is an optional argument
    x, d = createTestData()
    lstmLayer.forward(x[0]), lstmLayer.forward(x[1])
    lstmLayer.backward(x[1], d)
    lstmLayer.printLstmLayer()

def gradCheck(epsilon = global_epsilon):
    lstm = LstmLayer(3, 2, 2, epsilon)             # the learning rate is irrelevant here, since update() is never called
    s, d = createTestData()
    lstm.forward(s[0]), lstm.forward(s[1])
    lstm.backward(s[1], np.ones(lstm.h[-1].shape, dtype=np.float64))  # analytic gradient, taking the final error term to be an all-ones vector
    for i in range(lstm.Wfh.shape[0]):
        for j in range(lstm.Wfh.shape[1]):
            lstm.Wfh[i, j] += epsilon
            lstm.reset()
            lstm.forward(s[0]), lstm.forward(s[1])
            err1 = np.sum(lstm.h[-1])
            lstm.Wfh[i, j] -= 2 * epsilon
            lstm.reset()
            lstm.forward(s[0]), lstm.forward(s[1])
            err2 = np.sum(lstm.h[-1])
            lstm.Wfh[i, j] += epsilon
            print('weights(%d,%d): expected <-> actual %.4e <-> %.4e' % (i, j, (err1 - err2) / (2 * epsilon), lstm.WfhGrad[i, j]))

if __name__ == "__main__":
    test()
    gradCheck()
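For reference, the error terms computed in backward() are the usual BPTT expressions for this parameterization (writing \delta_{h,t} for deltaH[t], and so on; \circ is element-wise multiplication):

    \delta_{o,t} = \delta_{h,t} \circ \tanh(c_t) \circ o_t (1 - o_t)
    \delta_{f,t} = \delta_{h,t} \circ o_t \circ (1 - \tanh^2 c_t) \circ c_{t-1} \circ f_t (1 - f_t)
    \delta_{i,t} = \delta_{h,t} \circ o_t \circ (1 - \tanh^2 c_t) \circ \tilde{c}_t \circ i_t (1 - i_t)
    \delta_{\tilde{c},t} = \delta_{h,t} \circ o_t \circ (1 - \tanh^2 c_t) \circ i_t \circ (1 - \tilde{c}_t^2)
    \delta_{h,t-1} = W_{oh}^{\top} \delta_{o,t} + W_{ih}^{\top} \delta_{i,t} + W_{fh}^{\top} \delta_{f,t} + W_{ch}^{\top} \delta_{\tilde{c},t}

The recurrent-weight gradients are then sums of outer products, e.g. \partial E / \partial W_{fh} = \sum_t \delta_{f,t} \, h_{t-1}^{\top}, which is exactly what WfhGrad accumulates. gradCheck() verifies WfhGrad against a central difference with E = \sum_j h_{T,j} and \epsilon = 10^{-3},

    \partial E / \partial W_{fh,ij} \approx ( E(W_{fh,ij} + \epsilon) - E(W_{fh,ij} - \epsilon) ) / (2\epsilon),

and the "expected <-> actual" lines in the output compare these two values.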
● Output
sCol = 3, dCol = 2, ita = 0.2, nSample = 2, time = 2
f=
 [[0.         0.        ]
 [0.08364582 0.64637889]
 [0.04687823 0.75054254]]
i=
 [[0.         0.        ]
 [0.64994358 0.11231909]
 [0.71495984 0.07056953]]
ct=
 [[ 0.          0.        ]
 [-0.77851627 -0.99959223]
 [-0.75373868 -0.99999457]]
c=
 [[ 0.          0.        ]
 [-0.50599165 -0.11227329]
 [-0.56261288 -0.15483503]]
o=
 [[0.         0.        ]
 [0.05723445 0.06130245]
 [0.0173681  0.01644683]]
h=
 [[ 0.          0.        ]
 [-0.02671797 -0.00685385]
 [-0.00885623 -0.00252639]]
Wfh=
 [[-0.44650757 -0.34150997]
 [ 0.1461234   0.7320657 ]]
Wfx=
 [[ 0.61845573 -0.74104458 -0.51005937]
 [ 0.50410244 -0.08955573  0.09272295]]
bf=
 [0. 0.]
Wih=
 [[ 0.05587383 -0.25802153]
 [-0.73662134 -0.25832213]]
Wix=
 [[ 0.33294318 -0.38308928  0.35067554]
 [ 0.03109526  0.40860802 -0.97185992]]
bi=
 [0. 0.]
Wch=
 [[-0.16803787 -0.149016  ]
 [-0.68550217  0.24428858]]
Wcx=
 [[ 0.29142476  0.62232088 -0.85921977]
 [-0.81363189 -0.65205061 -0.71037887]]
bc=
 [0. 0.]
Woh=
 [[-0.09910883 -0.49439315]
 [-0.90781981  0.44788208]]
Wox=
 [[-0.23362093 -0.45101893 -0.55533428]
 [-0.88301662  0.34405375 -0.84458816]]
bo=
 [0. 0.]
deltaF=
 [[ 0.          0.        ]
 [ 0.          0.        ]
 [-0.00029056 -0.00067513]]
deltaI=
 [[ 0.00000000e+00  0.00000000e+00]
 [-4.89958523e-05 -1.29338011e-05]
 [-1.97417511e-03 -2.10655809e-03]]
deltaO=
 [[ 0.00000000e+00  0.00000000e+00]
 [-1.55659244e-04 -1.37923566e-05]
 [-8.70241385e-03 -4.96967325e-03]]
deltaCt=
 [[0.00000000e+00 0.00000000e+00]
 [7.08195651e-05 1.18852402e-08]
 [3.96844066e-03 2.46176389e-08]]
deltaH=
 [[2.28293924e-05 7.62122414e-05]
 [6.17970530e-03 2.14376899e-03]
 [1.00000000e+00 2.00000000e+00]]
WfhGrad=
 [[7.76324944e-06 1.99147518e-06]
 [1.80382086e-05 4.62726915e-06]]
WfxGrad=
 [[-0.00058113 -0.00087169 -0.00116225]
 [-0.00135027 -0.0020254  -0.00270054]]
bfGrad=
 [-0.00029056 -0.00067513]
WihGrad=
 [[5.27459502e-05 1.35307066e-05]
 [5.62829545e-05 1.44380401e-05]]
WixGrad=
 [[-0.00394835 -0.00592253 -0.0078967 ]
 [-0.00421312 -0.00631967 -0.00842623]]
biGrad=
 [-0.00202317 -0.00211949]
WohGrad=
 [[2.32510827e-04 5.96450681e-05]
 [1.32779578e-04 3.40614115e-05]]
WoxGrad=
 [[-0.01740483 -0.02610724 -0.03480966]
 [-0.00993935 -0.01490902 -0.01987869]]
boGrad=
 [-0.00885807 -0.00498347]
WchGrad=
 [[-1.06028676e-04 -2.71991102e-05]
 [-6.57733325e-10 -1.68725686e-10]]
WcxGrad=
 [[7.93688131e-03 1.19053220e-02 1.58737626e-02]
 [4.92352779e-08 7.38529168e-08 9.84705558e-08]]
bcGrad=
 [4.03926022e-03 3.65028791e-08]
weights(0,0): expected <-> actual 1.4570e-02 <-> 1.4570e-02
weights(0,1): expected <-> actual -2.4253e-02 <-> -2.4253e-02
weights(1,0): expected <-> actual -5.2460e-03 <-> -5.2460e-03
weights(1,1): expected <-> actual 8.7327e-03 <-> 8.7327e-03