I am trying to train a regression model that predicts 4 scalar float outputs. As it stands, the network diverges almost immediately and the loss grows to NaN. I can't figure out what is going on.
Below is a self-contained example, tested with TensorFlow 1.1.0 on Windows 10 with an NVidia GPU.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy
import tensorflow as tf
IMAGE_HEIGHT = 320
IMAGE_WIDTH = 160
NUM_CHANNELS = 3
PIXEL_DEPTH = 255
SEED = 66479 # Set to None for random seed.
BATCH_SIZE = 5
NUM_OUTPUTS = 4 # the four outputs
def data_type():
  return tf.float32
# The variables below hold all the trainable weights. They are passed an
# initial value which will be assigned when we call:
# {tf.global_variables_initializer().run()}
conv1_weights = tf.Variable(
    tf.truncated_normal([5, 5, NUM_CHANNELS, 32],  # 5x5 filter, depth 32.
                        stddev=0.1,
                        seed=SEED, dtype=data_type()))
conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))
conv2_weights = tf.Variable(tf.truncated_normal(
    [5, 5, 32, 64], stddev=0.1,
    seed=SEED, dtype=data_type()))
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
fc1_weights = tf.Variable(  # fully connected, depth 512.
    tf.truncated_normal([IMAGE_HEIGHT // 4 * IMAGE_WIDTH // 4 * 64, 512],
                        stddev=0.1,
                        seed=SEED,
                        dtype=data_type()))
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))
fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_OUTPUTS],
                                              stddev=0.1,
                                              seed=SEED,
                                              dtype=data_type()))
fc2_biases = tf.Variable(tf.constant(
    0.1, shape=[NUM_OUTPUTS], dtype=data_type()))
# We will replicate the model structure for the training subgraph, as well
# as the evaluation subgraphs, while sharing the trainable parameters.
def model(data, train=False):
  """The Model definition."""
  # 2D convolution, with 'SAME' padding (i.e. the output feature map has
  # the same size as the input). Note that {strides} is a 4D array whose
  # shape matches the data layout: [image index, y, x, depth].
  conv = tf.nn.conv2d(data,
                      conv1_weights,
                      strides=[1, 1, 1, 1],
                      padding='SAME')
  # Bias and rectified linear non-linearity.
  relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
  # Max pooling. The kernel size spec {ksize} also follows the layout of
  # the data. Here we have a pooling window of 2, and a stride of 2.
  pool = tf.nn.max_pool(relu,
                        ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1],
                        padding='SAME')
  conv = tf.nn.conv2d(pool,
                      conv2_weights,
                      strides=[1, 1, 1, 1],
                      padding='SAME')
  relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
  pool = tf.nn.max_pool(relu,
                        ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1],
                        padding='SAME')
  # Reshape the feature map cuboid into a 2D matrix to feed it to the
  # fully connected layers.
  pool_shape = pool.get_shape().as_list()
  reshape = tf.reshape(
      pool,
      [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
  # Fully connected layer. Note that the '+' operation automatically
  # broadcasts the biases.
  hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
  # Add a 50% dropout during training only. Dropout also scales
  # activations such that no rescaling is needed at evaluation time.
  if train:
    hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
  return tf.matmul(hidden, fc2_weights) + fc2_biases
def main():
  train_data_batch = tf.placeholder(tf.float32, shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS))
  train_label_batch = tf.placeholder(tf.float32, shape=(BATCH_SIZE, NUM_OUTPUTS))
  with tf.name_scope('pred'):
    train_pred = model(train_data_batch, train=True)
  with tf.name_scope('loss'):
    loss = tf.reduce_sum(tf.square(train_pred - train_label_batch))
    tf.summary.scalar('loss', loss)
  # L2 regularization for the fully connected parameters.
  regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                  tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
  # Add the regularization term to the loss.
  loss += 5e-4 * regularizers
  optimizer = tf.train.GradientDescentOptimizer(0.01)
  train_op = optimizer.minimize(loss)
  with tf.Session() as sess:
    # The op for initializing the variables.
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    while True:
      predictions, l, _ = sess.run([train_pred, loss, train_op], feed_dict={
          train_data_batch: numpy.zeros([BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS]) + 0.2,
          train_label_batch: numpy.zeros([BATCH_SIZE, 4])})
      print(l)
if __name__ == "__main__":
  main()
Output:
9031.0
5.6838e+22
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
Best answer
It looks like my model was diverging. I solved the problem by switching to the AdamOptimizer:
optimizer = tf.train.AdamOptimizer(0.5)
This sets the parameters for momentum-based optimization adaptively.
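For reference, a minimal sketch of where this change lands in the script above: only the optimizer line in main() is replaced, everything else stays as posted. The commented-out alternative at the end (averaging the loss with tf.reduce_mean and keeping plain SGD at a much smaller learning rate) is my own assumption about another way to tame the exploding updates, not part of the accepted fix.

# Inside main() from the question, replacing the original optimizer line:
# optimizer = tf.train.GradientDescentOptimizer(0.01)  # original line that diverged
optimizer = tf.train.AdamOptimizer(0.5)  # accepted fix: Adam adapts per-parameter step sizes,
                                         # so it tolerates the large summed squared-error loss
train_op = optimizer.minimize(loss)

# Untested alternative (assumption, not from the answer): keep plain SGD, but
# average the squared error over the batch instead of summing it, and use a
# smaller learning rate.
# loss = tf.reduce_mean(tf.square(train_pred - train_label_batch))
# optimizer = tf.train.GradientDescentOptimizer(1e-4)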