This post is mainly based on https://github.com/zhedongzheng/finch. It differs from the original code in that tf.estimator is not used, and the data preprocessing has been partially modified (to use tf.data.Dataset).
# -*- coding:utf-8 -*-
from collections import Counter
import tensorflow as tf
import numpy as np
import re
PARAMS = {
    'min_freq': 5,
    'window_size': 3,
    'n_sampled': 100,
    'embed_dim': 200,
}
def preprocess_text(text):
    # 1. Replace newlines with spaces
    text = text.replace('\n', ' ')
    # 2. Collapse runs of whitespace into single spaces and lowercase everything
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # 3. Split on spaces, i.e. a very simple form of tokenization
    words = text.split()
    # 4. Count word frequencies
    word2freq = Counter(words)
    # 5. Drop low-frequency words
    words = [word for word in words if word2freq[word] > PARAMS['min_freq']]
    print("Total words:", len(words))
    # 6. Deduplicate to build the vocabulary
    _words = set(words)
    PARAMS['word2idx'] = {c: i for i, c in enumerate(_words)}
    PARAMS['idx2word'] = {i: c for i, c in enumerate(_words)}
    PARAMS['vocab_size'] = len(PARAMS['idx2word'])
    print('Vocabulary size:', PARAMS['vocab_size'])
    indexed = [PARAMS['word2idx'][w] for w in words]
    # 7. Subsample overly frequent words
    indexed = filter_high_freq(indexed)
    print("Word preprocessing completed ...")
    return indexed
def filter_high_freq(int_words, t=1e-5, threshold=0.8):
    int_word_counts = Counter(int_words)
    total_count = len(int_words)
    # 1. Compute each word's relative frequency: count / total
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
    # 2. Compute each word's drop probability; the more frequent a word, the more likely it is dropped.
    #    For example, 'the' is extremely frequent but carries little information, so it should be subsampled.
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
    # 3. Drop words whose drop probability exceeds the threshold
    train_words = [w for w in int_words if prob_drop[w] < threshold]
    return train_words
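# A worked example of the subsampling formula above (frequencies assumed for illustration):
# with t = 1e-5, a word with relative frequency 0.05 gets
#     prob_drop = 1 - sqrt(1e-5 / 0.05) ≈ 0.986 > 0.8, so it is dropped,
# while a word with relative frequency 1e-4 gets
#     prob_drop = 1 - sqrt(1e-5 / 1e-4) ≈ 0.684 < 0.8, so it is kept.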
def make_data(int_words):
    x, y = [], []
    for i in range(PARAMS['window_size'], len(int_words) - PARAMS['window_size']):
        # 1. Build the context of the word at position i
        inputs = get_x(int_words, i)
        # 2. Append the whole context of one word as a single entry of x,
        #    e.g. (illustrated with window_size=2): x = [['a','b','d','e'], ['b','c','e','f']]
        x.append(inputs)
        # 3. Append each label as a one-element list to y,
        #    e.g.: y = [['c'], ['d']]
        # 4. So each training example is one (context, word) pair
        y.append([int_words[i]])
    return np.array(x), np.array(y)
def get_x(words, idx):
    left = idx - PARAMS['window_size']
    right = idx + PARAMS['window_size']
    return words[left: idx] + words[idx + 1: right + 1]
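# A quick illustration of get_x (toy values, not from the real data):
# with PARAMS['window_size'] = 3, words = [10, 11, 12, 13, 14, 15, 16] and idx = 3,
# it returns words[0:3] + words[4:7] = [10, 11, 12, 14, 15, 16], i.e. the 6 surrounding words.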
# 1. Preprocess the data
with open(r'E:\nlp_data\ptb_train.txt') as f:
    x_train, y_train = make_data(preprocess_text(f.read()))
# 2. Wrap the data in a tf.data.Dataset
# What does one example look like here?
# One example is x = 6 context words, y = 1 target word,
# because of x.append(inputs) and y.append([int_words[i]]) in make_data
dataset = tf.data.Dataset.from_tensor_slices(tensors=(x_train,y_train))
dataset = dataset.batch(batch_size=100).repeat(5)
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()
# 3. Build the CBOW model
# The placeholder shapes are (None,6) and (None,1),
# again because of x.append(inputs) and y.append([int_words[i]]) in make_data:
# with window_size = 3, the context size is 2 * 3 = 6.
# None will be 100 at run time, because of dataset.batch(batch_size=100)
x = tf.placeholder(shape=(None,6),dtype=tf.int32)
y_ = tf.placeholder(shape=(None,1),dtype=tf.int32)
E = tf.get_variable(name="E",shape=(PARAMS['vocab_size'],PARAMS['embed_dim']))
embedding = tf.nn.embedding_lookup(params=E,ids=x)
embedding = tf.reduce_mean(embedding,axis=[1])
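# Shape flow of the two lines above (with batch_size=100, context size 6, embed_dim=200):
# x: (100, 6) -> embedding_lookup: (100, 6, 200) -> reduce_mean over axis 1: (100, 200),
# i.e. CBOW averages the embeddings of the 6 context words into a single vector.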
W = tf.get_variable(name="w",shape=(PARAMS['vocab_size'],PARAMS['embed_dim']),dtype=tf.float32)
b = tf.get_variable(name="b",shape=(PARAMS['vocab_size']),dtype=tf.float32)
loss_op = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=W,
    biases=b,
    labels=y_,
    inputs=embedding,
    num_sampled=PARAMS['n_sampled'],
    num_classes=PARAMS['vocab_size']))
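# Note: sampled_softmax_loss is only an approximation of the full softmax, used to speed up training.
# If full logits were needed at evaluation time (not done in this script), they would be computed
# along the lines of: logits = tf.matmul(embedding, W, transpose_b=True) + b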
opt = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss=loss_op)
init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)
    try:
        while True:
            inputs, labels = session.run(next_data)
            session.run(fetches=opt, feed_dict={x: inputs, y_: labels})
    except tf.errors.OutOfRangeError:
        print("train complete")