RNN-theano代码解析

import theano

import numpy

import os

import pdb

from theano import tensor as T

from collections import OrderedDict

class model(object):
    '''
    Elman-style recurrent network for per-word labelling, built with Theano.

    The constructor creates the shared parameters, builds the symbolic graph
    and compiles three Theano functions exposed as attributes:

    classify(idxs)     -> predicted class index for every row of ``idxs``
    train(idxs, y, lr) -> negative log-likelihood of label ``y`` for the LAST
                          row of ``idxs``; applies one SGD step to all params
    normalize()        -> rescales every embedding row to unit L2 norm
    '''

    def __init__(self, nh, nc, ne, de, cs):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        '''
        floatX = theano.config.floatX

        def uniform(shape):
            # Uniform draw in [-0.2, 0.2), cast to Theano's configured float.
            # The cast happens before the scaling, exactly as in the original
            # expression, so seeded runs produce the same values.
            return 0.2 * numpy.random.uniform(-1.0, 1.0, shape).astype(floatX)

        # parameters of the model
        # (draw order emb, Wx, Wh, W is kept so seeded runs are reproducible)
        self.emb = theano.shared(uniform((ne + 1, de)))  # add one for PADDING at the end
        self.Wx = theano.shared(uniform((de * cs, nh)))  # input window -> hidden
        self.Wh = theano.shared(uniform((nh, nh)))       # hidden -> hidden
        self.W = theano.shared(uniform((nh, nc)))        # hidden -> output
        self.bh = theano.shared(numpy.zeros(nh, dtype=floatX))  # hidden bias
        self.b = theano.shared(numpy.zeros(nc, dtype=floatX))   # output bias
        self.h0 = theano.shared(numpy.zeros(nh, dtype=floatX))  # initial hidden state

        # bundle (names are used as file names by save())
        self.params = [self.emb, self.Wx, self.Wh, self.W, self.bh, self.b, self.h0]
        self.names = ['embeddings', 'Wx', 'Wh', 'W', 'bh', 'b', 'h0']

        # as many columns as context window size / lines as words in the sentence
        idxs = T.imatrix()
        # Look up the embeddings of every index and flatten each window into
        # one row of length de*cs.
        x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
        y = T.iscalar('y')  # label

        def recurrence(x_t, h_tm1):
            # One time step: new hidden state from the current input row and
            # the previous hidden state, then class scores through softmax.
            h_t = T.nnet.sigmoid(T.dot(x_t, self.Wx) + T.dot(h_tm1, self.Wh) + self.bh)
            s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)
            return [h_t, s_t]

        # Iterate over the rows of x; only the hidden state is fed back
        # (the second output has no recurrence, hence None).
        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x,
                                outputs_info=[self.h0, None],
                                n_steps=x.shape[0])

        # s is indexed with three subscripts; the middle axis is always
        # taken at position 0.
        p_y_given_x_lastword = s[-1, 0, :]
        p_y_given_x_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.log(p_y_given_x_lastword)[y]
        gradients = T.grad(nll, self.params)
        # Plain SGD: p <- p - lr * dnll/dp for every parameter.
        updates = OrderedDict((p, p - lr * g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)

        self.train = theano.function(inputs=[idxs, y, lr],
                                     outputs=nll,
                                     updates=updates)

        # Divide each embedding row by its L2 norm.
        row_norm = T.sqrt((self.emb ** 2).sum(axis=1)).dimshuffle(0, 'x')
        self.normalize = theano.function(inputs=[],
                                         updates={self.emb: self.emb / row_norm})

    def save(self, folder):
        '''
        Write every parameter to ``<folder>/<name>.npy`` with numpy.save.

        ``folder`` is created (including missing parents) when it does not
        exist yet; an empty string keeps meaning "current directory".
        '''
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)
        for param, name in zip(self.params, self.names):
            numpy.save(os.path.join(folder, name + '.npy'), param.get_value())

上述是RNN在deep learning tutorial上的代码，我们来逐层解释一下。

 self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\

                   (ne+1, de)).astype(theano.config.floatX)) # add one for PADDING at the end

        self.Wx  = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\

                   (de * cs, nh)).astype(theano.config.floatX))

        self.Wh  = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\

                   (nh, nh)).astype(theano.config.floatX))

        self.W   = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\

                   (nh, nc)).astype(theano.config.floatX))

        self.bh  = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))

        self.b   = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))

        self.h0  = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))

这一段很明显是初始化参数，emb是词向量，一共ne+1个词，de是维度，是超参数，需要给定。在elman-forward中有这样对应的输入：

 s = {'fold':3, # 5 folds 0,1,2,3,4

         'lr':0.0627142536696559,

         'verbose':1,

         'decay':False, # decay on the learning rate if improvement stops

         'win':7, # number of words in the context window

         'bs':9, # number of backprop through time steps

         'nhidden':100, # number of hidden units

         'seed':345,

         'emb_dimension':100, # dimension of word embedding

         'nepochs':50}

    folder = os.path.basename(__file__).split('.')[0]

    if not os.path.exists(folder): os.mkdir(folder)

    # load the dataset

    train_set, valid_set, test_set, dic = load.atisfold(s['fold'])

    idx2label = dict((k,v) for v,k in dic['labels2idx'].iteritems())

    idx2word  = dict((k,v) for v,k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set

    valid_lex, valid_ne, valid_y = valid_set

    test_lex,  test_ne,  test_y  = test_set

    vocsize = len(dic['words2idx'])

    nclasses = len(dic['labels2idx'])

    nsentences = len(train_lex)

    # instanciate the model

    numpy.random.seed(s['seed'])

    random.seed(s['seed'])

    rnn = model(    nh = s['nhidden'],

                    nc = nclasses,

                    ne = vocsize,

                    de = s['emb_dimension'],

                    cs = s['win'] )

我们可以看到在

train_set, valid_set, test_set, dic = load.atisfold(s['fold'])

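以及 vocsize = len(dic['words2idx']) 可知，emb 的行数是词表中单词的总数（另外多加一行用于 PADDING），emb 本身也是需要训练得到的参数。Wx 是 (de*cs)×nh 的矩阵，是输入层到隐藏层之间的参数：每个单词被扩充成大小为 cs 的上下文窗口，窗口内每个词的词向量维度是 de，所以一个单词对应的输入长度就是 de*cs；bh 为这一层的 bias。Wh 是 nh×nh 的矩阵，连接前一时刻的隐藏层与当前时刻的隐藏层；h0 是初始隐藏状态（不是 bias）。W 是 nh×nc 的矩阵，连接隐藏层与输出层，b 为输出层的 bias。再看训练循环：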

# Training loop excerpt. Relies on names defined outside this excerpt:
# s, shuffle, contextwin, minibatch, rnn, train_lex, train_ne, train_y,
# nsentences, time and numpy.
# NOTE(review): the nesting below was reconstructed from a paste that had
# lost its indentation -- confirm against the original script.
for e in xrange(s['nepochs']):
    # shuffle
    shuffle([train_lex, train_ne, train_y], s['seed'])
    s['ce'] = e
    tic = time.time()
    for i in xrange(nsentences):
        # One context window per word of sentence i.
        cwords = contextwin(train_lex[i], s['win'])
        # Convert every group returned by minibatch into an int32 array.
        words = map(lambda x: numpy.asarray(x).astype('int32'),
                    minibatch(cwords, s['bs']))
        labels = train_y[i]
        # Pair each group of windows with one label of the sentence.
        for word_batch, label_last_word in zip(words, labels):
            # assumes s['clr'] was initialised before this loop -- TODO confirm
            rnn.train(word_batch, label_last_word, s['clr'])
            rnn.normalize()

cwords = contextwin(train_lex[i], s['win']) 是将每一条训练句子中的每个单词扩充成上下文窗口，比如窗口大小为7时，[0,1,2,3,4]将变为5行7列的矩阵，每一行的中心依次为0,1,2,3,4，不足处用-1填充：[[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4], [-1, 0, 1, 2, 3, 4,-1], [ 0, 1, 2, 3, 4,-1,-1], [ 1, 2, 3, 4,-1,-1,-1]]。minibatch是将这个list分组：第k组包含到第k个单词为止的最后至多bs行（句子长度不超过bs时，就是前k行），分组后为

[[[-1, -1, -1, 0, 1, 2, 3]],

 [[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4]],

[[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4], [-1, 0, 1, 2, 3, 4, -1]],

 [[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4], [-1, 0, 1, 2, 3, 4, -1], [0, 1, 2, 3, 4, -1, -1]],

[[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4], [-1, 0, 1, 2, 3, 4, -1], [0, 1, 2, 3, 4, -1, -1], [1, 2, 3, 4, -1, -1, -1]]]

labels = train_y[i]，则labels就是一条句子中每个单词的标签list，比如[0,1,2,3,4]对应的可能是[126,126,45,126,55]（用idx2word可以把0,1,2,3,4转换为word，用idx2label可以把126,126,45,126,55转换为label），所以第一组word_batch、label_last_word为[[-1, -1, -1, 0, 1, 2, 3]]和126，以此类推。

self.train = theano.function( inputs  = [idxs, y, lr],

                                      outputs = nll,

                                      updates = updates )

idxs传入后也就是这里的word_batch，先初始化为词向量x = self.emb[idxs].reshape((idxs.shape[0], de*cs))，比如第二个batch处理后就是2*700的x，然后

        def recurrence(x_t, h_tm1):

            h_t = T.nnet.sigmoid(T.dot(x_t, self.Wx) + T.dot(h_tm1, self.Wh) + self.bh)

            s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)

            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence, \

            sequences=x, outputs_info=[self.h0, None], \

            n_steps=x.shape[0])

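scan 按行遍历 x：每一步取出一个单词的context window组成的700维词向量(直接拼接) x_t，与Wx相乘，加上前一时刻的隐藏状态 h_tm1（第一步即初始状态h0）乘以Wh，再加上偏置bh，经过sigmoid得到当前的隐藏层h_t；h_t与W相乘加上偏置b，再经过softmax得到输出s_t。如果x不只两行，就是如此循环下去：n行相当于考虑了n个单词，rnn循环了n次。[h,s]是每一步的隐藏层与输出层：h是n×nh的矩阵；s是n×1×nc的三维张量——从后面 s[-1,0,:]、s[:,0,:] 的取法可以看出中间多了一个长度为1的维度，这应当是因为softmax作用在向量上时返回的是1×nc的矩阵。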

        p_y_given_x_lastword = s[-1,0,:]

        p_y_given_x_sentence = s[:,0,:]

        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

p_y_given_x_lastword是最后一个单词属于nc个类别（这个数据集里是127类）中每一类的概率（向量），而p_y_given_x_sentence是这个句子里每个单词属于每一类的概率（矩阵）；y_pred对每一行取概率最大的类别，作为该单词的预测标签。

        lr = T.scalar('lr')

        nll = -T.log(p_y_given_x_lastword)[y]

        gradients = T.grad( nll, self.params )

        updates = OrderedDict(( p, p-lr*g ) for p, g in zip( self.params , gradients))

        # theano functions

        self.classify = theano.function(inputs=[idxs], outputs=y_pred)

所以上面这段代码中，nll是最后一个单词被分到正确类别y的概率取-log（即负对数似然），然后对所有参数求导，此处params有

self.params = [ self.emb, self.Wx, self.Wh, self.W, self.bh, self.b, self.h0 ]

修改每个参数，梯度下降法，相当于一次训练一个单词，当然利用到了前面n-1个单词的信息，从第一个训练到最后一个单词，即

[[[-1, -1, -1, 0, 1, 2, 3]],

 [[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4]],

[[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4], [-1, 0, 1, 2, 3, 4, -1]],

 [[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4], [-1, 0, 1, 2, 3, 4, -1], [0, 1, 2, 3, 4, -1, -1]],

[[-1, -1, -1, 0, 1, 2, 3], [-1, -1, 0, 1, 2, 3, 4], [-1, 0, 1, 2, 3, 4, -1], [0, 1, 2, 3, 4, -1, -1], [1, 2, 3, 4, -1, -1, -1]]]

这里面一次训练一行经过emb处理后的n×700维矩阵，只对最后一个单词求代价cost，而分类classify里面包含了一个句子的所有单词，取每个单词最终127个分类的最大概率作为
单词分类（标签）

        predictions_test = [ map(lambda x: idx2label[x], \

                             rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\

                             for x in test_lex ]

        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]

        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

train_lex, train_ne, train_y = train_set

    valid_lex, valid_ne, valid_y = valid_set

    test_lex,  test_ne,  test_y  = test_set

这里面不确定test_ne具体是什么（推测是ATIS数据集中每个单词的命名实体标注，本模型没有用到，有待确认），不过train_lex、test_lex都是二维的list，每一行是一个句子。我们再看上上面那段代码：predictions_test相当于取出test_lex中的每个句子，先扩充成n×7的矩阵，每一行是一个单词的context window，放入classify分类器里面得到的是每个单词的label ID，再转化成label；groundtruth_test是每个单词真正的label；words_test是每个句子原本的单词序列。

最后输出是一个文件，包括单词，真实标签，预测标签。