python - Tensorflow seq2seq多维回归

编辑:我编辑了代码以制作seq2seq教程/练习，它们是:
https://github.com/guillaume-chevalier/seq2seq-signal-prediction

我尝试对多维输入和输出进行序列到序列(seq2seq)回归。我尝试了一些随着时间的推移会产生以下损失的东西:

该模型完全无法学习预测在每个输入和输出维度上克隆的窦，即使我尝试非常小的学习率也是如此。

为RNN构建的Tensorflow损失函数似乎解决了我们直接想要训练标签或词嵌入的情况，因此我尝试自己计算损失。关于这一点，我不知道我们应该如何处理dec_inp(解码器输入)变量，我尝试做的事情似乎在Tensorflow中尚未完成，但从概念上讲特别简单(请参见标题)。

这是张量图:

图上有些事情是我无法预期的，例如RMSProp优化器和basic_rnn_seq2seq之间的链接。

这是我尝试过的:

import tensorflow as tf

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import tempfile
import math


rnn_cell = tf.nn.rnn_cell
seq2seq = tf.nn.seq2seq

tf.reset_default_graph()
sess = tf.InteractiveSession()


# Neural net's parameters
seq_length = 5  # Inputs and outputs are sequences of 5 units
batch_size = 1  # Keeping it simple for now

# Each unit in the sequence is a float32 vector of lenght 10:
# Same dimension sizes just for simplicity now
output_dim = hidden_dim = input_dim = 10

# Optmizer:
learning_rate = 0.0007  # Small lr to avoid problem
nb_iters = 2000  # Crank up the iters in consequence
lr_decay = 0.85  # 0.9 default
momentum = 0.01  # 0.0 default


# Create seq2seq's args
enc_inp = [tf.placeholder(tf.float32, shape=(None, input_dim),
                          name="inp%i" % t)
           for t in range(seq_length)]

# sparse "labels" that are not labels:
expected_sparse_output = [tf.placeholder(tf.float32, shape=(None, output_dim),
                        name="expected_sparse_output%i" % t)
          for t in range(seq_length)]

# Decoder input: prepend some "GO" token and drop the final
# There might be a problem there too,
# my outputs are not tokens integer, but float vectors.
dec_inp = [tf.zeros_like(enc_inp[0], dtype=np.float32, name="GO")] + enc_inp[:-1]

# Initial memory value for recurrence.
prev_mem = tf.zeros((batch_size, hidden_dim))


# Create rnn cell and decoder's sequence
cell = rnn_cell.GRUCell(hidden_dim)
# cell = tf.nn.rnn_cell.MultiRNNCell([cell] * layers_stacked_count)
dec_outputs, dec_memory = seq2seq.basic_rnn_seq2seq(
    enc_inp,
    dec_inp,
    cell
)


# Training loss and optimizer
loss = 0
for _y, _Y in zip(dec_outputs, expected_sparse_output):
    loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(_y, _Y)) # Softmax loss
    # loss + tf.reduce_mean(tf.squared_difference(_y, _Y))

# The following commented loss function did not worked because
# I want a sparse output rather than labels
# weights = [tf.ones_like(labels_t, dtype=tf.float32)
#            for labels_t in expected_sparse_output]
# loss = seq2seq.sequence_loss(dec_outputs, labels, weights)

tf.scalar_summary("loss", loss)
summary_op = tf.merge_all_summaries()

# optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
# optimizer = tf.train.AdagradOptimizer(learning_rate)
optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=lr_decay, momentum=momentum)
train_op = optimizer.minimize(loss)

logdir = tempfile.mkdtemp()
print logdir
summary_writer = tf.train.SummaryWriter(logdir, sess.graph)


sess.run(tf.initialize_all_variables())

def gen_data_x_y():
    """
    Simply returns data of shape:
        (seq_lenght, batch_size, output_dim)

    X is a sine of domain 0.0*pi to 1.5*pi
    Y is a sine of domain 1.5*pi to 3.0*pi

    To temporarily deal with the number of dimensions
    """
    # Create the sine in x and it's continuation in y
    x = np.sin(np.linspace(0.0*math.pi, 1.5*math.pi, seq_length))
    y = np.sin(np.linspace(1.5*math.pi, 3.0*math.pi, seq_length))

    # Clone the sine for every input_dim.
    # Normaly those dims would containt different signals
    # happening at the same time of a single timestep of
    # a single training example, such as other features of
    # the signal such as various moving averages
    x = np.array([x for i in range(input_dim)])
    y = np.array([y for i in range(output_dim)])
    x, y = x.T, y.T

    x = np.array([x]*batch_size) # simple for now: batch_size of 1
    y = np.array([y]*batch_size)
    # shape: (batch_size, seq_lenght, output_dim)
    x = np.array(x).transpose((1, 0, 2))
    y = np.array(y).transpose((1, 0, 2))
    # shape: (seq_lenght, batch_size, output_dim)

    # print "X_SHAPE: " + str(x.shape)
    return x, y

def train_batch(batch_size):
    """
    Training step: we optimize for every outputs Y at once,
    feeding all inputs X

    I do not know yet how to deal with
    the enc_inp tensor declared earlier
    """
    X, Y = gen_data_x_y()

    feed_dict = {
        enc_inp[t]: X[t] for t in range(seq_length)
    }
    feed_dict.update({expected_sparse_output[t]: Y[t] for t in range(seq_length)})
    feed_dict.update({prev_mem: np.zeros((batch_size, hidden_dim))})

    _, loss_t, summary = sess.run([train_op, loss, summary_op], feed_dict)
    return loss_t, summary

# Train
for t in range(nb_iters):
    loss_t, summary = train_batch(batch_size)
    print loss_t
    summary_writer.add_summary(summary, t)
summary_writer.flush()

# Visualise the loss
# !tensorboard --logdir {logdir}


# Test the training
X, Y = gen_data_x_y()

feed_dict = {
    enc_inp[t]: X[t] for t in range(seq_length)
}
# feed_dict.update({expected_sparse_output[t]: Y[t] for t in range(seq_length)})

outputs = sess.run([dec_outputs], feed_dict)


# Evaluate model
np.set_printoptions(suppress=True)  # No scientific exponents
expected = Y[:,0,0]
print "Expected: "
print expected
print ""
print "The following results now represents each timesteps of a different output dim:"

mses = []
for i in range(output_dim):
    pred = np.array(outputs[0])[:,0,i]
    print pred
    mse = math.sqrt(np.mean((pred - expected)**2))
    print "mse: " + str(mse)
    mses.append(mse)
    print ""

print ""
print "FINAL MEAN SQUARED ERROR ON RESULT: " + str(np.mean(mses))

打印:

/tmp/tmpVbO48U
5.87742
5.87894
5.88054
5.88221
5.88395
[...]
5.71791
5.71791
5.71791
5.71791
5.71791
Expected:
[-1.         -0.38268343  0.70710678  0.92387953  0.        ]

The following results now represents each timesteps of a different output dim:
[-0.99999893 -0.99999893  0.96527898  0.99995273 -0.01624492]
mse: 0.301258140201

[-0.99999952 -0.99999952  0.98715001  0.9999997  -0.79249388]
mse: 0.467620401096

[-0.99999946 -0.9999994   0.97464144  0.99999654 -0.30602577]
mse: 0.332294862093

[-0.99999893 -0.99999893  0.95765316  0.99917656  0.36947867]
mse: 0.342355383387

[-0.99999964 -0.99999952  0.9847464   0.99999964 -0.70281279]
mse: 0.43769921227

[-0.99999744 -0.9999975   0.97723919  0.99999851 -0.39834118]
mse: 0.351715216206

[-0.99999964 -0.99999952  0.97650111  0.99999803 -0.37042192]
mse: 0.34544431708

[-0.99999648 -0.99999893  0.99999917  0.99999917  0.99999726]
mse: 0.542706750242

[-0.99999917 -0.99999917  0.96115535  0.99984574  0.12008631]
mse: 0.305224828554

[-0.99999952 -0.99999946  0.98291612  0.99999952 -0.62598646]
mse: 0.413473861107


FINAL MEAN SQUARED ERROR ON RESULT: 0.383979297224

似乎我的代码中缺少一件事，否则就有一个小错误。

最佳答案

对于sin(x)之类的学习功能，使用softmax损失不是很好。
* softmax损失通常用于多类离散预测
*对于连续预测，请使用例如l2_loss

另外，由于sin(x)是x的函数，因此我认为您不需要RNN。我真的会首先尝试2层或3层完全连接的网络。在这种情况下，您可以尝试RNN。但是sin(x)仅取决于x，而不取决于整个历史记录，因此在这种情况下，循环状态将毫无用处。