喂入嵌入层时,我已经看到两种填充。
例如:
考虑两个句子:
word1 =“我是狗人。”
word2 =“克里希尼和普拉迪帕都爱猫。”
word1_int = [1,2,3,4,5,6]
word2_int = [7,8,9,10,11,12,13]
将两个单词都填充到length = 8
填充方法1(开头为0)
word1_int = [0,0,1,2,3,4,5,6]
word2_int = [0,7,8,9,10,11,12,13]
填充方法2(末尾加0)
word1_int = [1,2,3,4,5,6,0,0]
word2_int = [7,8,9,10,11,12,13,0]
我正在尝试使用20个新闻组数据集进行在线分类。我目前正在使用第一种方法来填充文本。
问:在我的实现中,使用第一种方法相对于另一种方法有什么优势吗?
先感谢您!
我的代码如下所示:
from collections import Counter
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
import matplotlib as mplt
mplt.use('agg') # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
from string import punctuation
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
def pre_process():
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
words = []
temp_post_text = []
print(len(newsgroups_data.data))
for post in newsgroups_data.data:
all_text = ''.join([text for text in post if text not in punctuation])
all_text = all_text.split('\n')
all_text = ''.join(all_text)
temp_text = all_text.split(" ")
for word in temp_text:
if word.isalpha():
temp_text[temp_text.index(word)] = word.lower()
# temp_text = [word for word in temp_text if word not in stopwords.words('english')]
temp_text = list(filter(None, temp_text))
temp_text = ' '.join([i for i in temp_text if not i.isdigit()])
words += temp_text.split(" ")
temp_post_text.append(temp_text)
# temp_post_text = list(filter(None, temp_post_text))
dictionary = Counter(words)
# deleting spaces
# del dictionary[""]
sorted_split_words = sorted(dictionary, key=dictionary.get, reverse=True)
vocab_to_int = {c: i for i, c in enumerate(sorted_split_words,1)}
message_ints = []
for message in temp_post_text:
temp_message = message.split(" ")
message_ints.append([vocab_to_int[i] for i in temp_message])
# maximum message length = 6577
# message_lens = Counter([len(x) for x in message_ints])AAA
seq_length = 6577
num_messages = len(temp_post_text)
features = np.zeros([num_messages, seq_length], dtype=int)
for i, row in enumerate(message_ints):
print(features[i, -len(row):])
features[i, -len(row):] = np.array(row)[:seq_length]
print(features[i, -len(row):])
lb = LabelBinarizer()
lbl = newsgroups_data.target
labels = np.reshape(lbl, [-1])
labels = lb.fit_transform(labels)
return features, labels, len(sorted_split_words)+1
def get_batches(x, y, batch_size=1):
for ii in range(0, len(y), batch_size):
yield x[ii:ii + batch_size], y[ii:ii + batch_size]
def plot(noOfWrongPred, dataPoints):
font_size = 14
fig = plt.figure(dpi=100,figsize=(10, 6))
mplt.rcParams.update({'font.size': font_size})
plt.title("Distribution of wrong predictions", fontsize=font_size)
plt.ylabel('Error rate', fontsize=font_size)
plt.xlabel('Number of data points', fontsize=font_size)
plt.plot(dataPoints, noOfWrongPred, label='Prediction', color='blue', linewidth=1.8)
# plt.legend(loc='upper right', fontsize=14)
plt.savefig('distribution of wrong predictions.png')
# plt.show()
def train_test():
features, labels, n_words = pre_process()
print(features.shape)
print(labels.shape)
# Defining Hyperparameters
lstm_layers = 1
batch_size = 1
lstm_size = 200
learning_rate = 0.01
# --------------placeholders-------------------------------------
# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
tf.set_random_seed(1)
inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
# labels_ = tf.placeholder(dtype= tf.int32)
labels_ = tf.placeholder(tf.float32, [None, None], name="labels")
# output_keep_prob is the dropout added to the RNN's outputs, the dropout will have no effect on the calculation of the subsequent states.
keep_prob = tf.placeholder(tf.float32, name="keep_prob")
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300
# generating random values from a uniform distribution (minval included and maxval excluded)
embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1),trainable=True)
embed = tf.nn.embedding_lookup(embedding, inputs_)
print(embedding.shape)
print(embed.shape)
print(embed[0])
# Your basic LSTM cell
lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
# Add dropout to the cell
drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
# Stack up multiple LSTM layers, for deep learning
cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
# Getting an initial state of all zeros
initial_state = cell.zero_state(batch_size, tf.float32)
outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
# hidden layer
hidden = tf.layers.dense(outputs[:, -1], units=25, activation=tf.nn.relu)
print(hidden.shape)
logit = tf.contrib.layers.fully_connected(hidden, num_outputs=20, activation_fn=None)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=labels_))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
saver = tf.train.Saver()
# ----------------------------online training-----------------------------------------
with tf.Session(graph=graph) as sess:
tf.set_random_seed(1)
sess.run(tf.global_variables_initializer())
iteration = 1
state = sess.run(initial_state)
wrongPred = 0
noOfWrongPreds = []
dataPoints = []
for ii, (x, y) in enumerate(get_batches(features, labels, batch_size), 1):
feed = {inputs_: x,
labels_: y,
keep_prob: 0.5,
initial_state: state}
embedzz = sess.run(embedding, feed_dict=feed)
print(embedzz)
predictions = tf.nn.softmax(logit).eval(feed_dict=feed)
print("----------------------------------------------------------")
print("Iteration: {}".format(iteration))
isequal = np.equal(np.argmax(predictions[0], 0), np.argmax(y[0], 0))
print(np.argmax(predictions[0], 0))
print(np.argmax(y[0], 0))
if not (isequal):
wrongPred += 1
print("nummber of wrong preds: ",wrongPred)
if iteration%50 == 0:
noOfWrongPreds.append(wrongPred/iteration)
dataPoints.append(iteration)
loss, states, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
print("Train loss: {:.3f}".format(loss))
iteration += 1
saver.save(sess, "checkpoints/sentiment.ckpt")
errorRate = wrongPred / len(labels)
print("ERRORS: ", wrongPred)
print("ERROR RATE: ", errorRate)
plot(noOfWrongPreds, dataPoints)
if __name__ == '__main__':
train_test()
这是我用来填充所有句子的代码示例。
seq_length = 6577
num_messages = len(temp_post_text)
features = np.zeros([num_messages, seq_length], dtype=int)
for i, row in enumerate(message_ints):
print(features[i, -len(row):])
features[i, -len(row):] = np.array(row)[:seq_length]
print(features[i, -len(row):])
最佳答案
通常,当我们使用LSTM或RNN时,我们使用最终输出或隐藏状态并将其传递进行预测。您也正在做与以下行相同的操作:
logit = tf.contrib.layers.fully_connected(hidden, num_outputs=20, activation_fn=None)
在这里,两种填充方法有所区别。如果您使用填充的第二种方法(填充后),那么最终的隐藏状态将被冲掉,因为大多数情况下它是
0
,而使用第一种方法,我们可以确保隐藏状态输出正确。关于python-3.x - 为tf.nn.embedding_lookup预设置不同的文本大小时,预填充和后填充文本的区别,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/52423147/