This post is based mainly on https://github.com/zhedongzheng/finch. It differs from the original code in that tf.estimator is not used, and the data preprocessing was partly reworked (the data is fed through tf.data.Dataset).

# -*- coding:utf-8 -*-
from collections import Counter

import tensorflow as tf
import numpy as np
import re

PARAMS = {
    'min_freq': 5,       # words occurring at most this many times are dropped
    'window_size': 3,    # number of context words taken from each side of the target word
    'n_sampled': 100,    # number of sampled (negative) classes for the sampled softmax
    'embed_dim': 200,    # dimensionality of the word embeddings
}


def preprocess_text(text):
    # 1. Replace newline characters with spaces
    text = text.replace('\n', ' ')
    # 2. Collapse runs of whitespace into single spaces and lowercase everything
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # 3. Split on spaces, i.e. a very simple tokenization
    words = text.split()
    # 4. Count word frequencies
    word2freq = Counter(words)
    # 5. Drop low-frequency words (those appearing at most min_freq times)
    words = [word for word in words if word2freq[word] > PARAMS['min_freq']]
    print("Total words:", len(words))
    # 6. Deduplicate to build the vocabulary and the word <-> index mappings
    _words = set(words)
    PARAMS['word2idx'] = {c: i for i, c in enumerate(_words)}
    PARAMS['idx2word'] = {i: c for i, c in enumerate(_words)}
    PARAMS['vocab_size'] = len(PARAMS['idx2word'])
    print('Vocabulary size:', PARAMS['vocab_size'])

    indexed = [PARAMS['word2idx'][w] for w in words]
    # 7. Subsample away overly frequent words (see filter_high_freq)
    indexed = filter_high_freq(indexed)
    print("Word preprocessing completed ...")
    return indexed

def filter_high_freq(int_words, t=1e-5, threshold=0.8):
    int_word_counts = Counter(int_words)
    total_count = len(int_words)
    # 1. Compute each word's relative frequency, count / total
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
    # 2. Compute each word's drop probability: the more frequent a word, the more likely it is
    #    to be dropped. E.g. 'the' occurs very often but carries little information
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
    # 3. Drop the words whose drop probability exceeds the threshold
    train_words = [w for w in int_words if prob_drop[w] < threshold]
    return train_words
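
# A quick numeric check of the subsampling rule above (the example frequencies are made up):
# a word making up 5% of the corpus gets prob_drop = 1 - sqrt(1e-5 / 0.05) ≈ 0.986 > 0.8 and is dropped,
# while a word at 0.01% gets prob_drop = 1 - sqrt(1e-5 / 1e-4) ≈ 0.684 < 0.8 and is kept.
assert 1 - np.sqrt(1e-5 / 0.05) > 0.8
assert 1 - np.sqrt(1e-5 / 1e-4) < 0.8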


def make_data(int_words):
    x, y = [], []
    for i in range(PARAMS['window_size'], len(int_words) - PARAMS['window_size']):
        # 1. Collect the context words around position i
        inputs = get_x(int_words, i)
        # 2. Append the whole context of one word as a single row of x,
        #    e.g. x = [['a','b','d','e'], ['b','c','e','f'], ...]
        x.append(inputs)
        # 3. Append each label as a one-element list,
        #    e.g. y = [['c'], ['d'], ...]
        # 4. So each training example is one (context, word) pair
        y.append([int_words[i]])
    return np.array(x), np.array(y)


def get_x(words, idx):
    # Collect the window_size words on each side of position idx; the word at idx itself is excluded
    left = idx - PARAMS['window_size']
    right = idx + PARAMS['window_size']
    return words[left: idx] + words[idx + 1: right + 1]
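
# A tiny illustration of the two helpers above (the numbers and the _x_demo/_y_demo names are
# only for demonstration): with window_size = 3 the context of position 3 is the 3 ids on each
# side of it, and 10 ids yield 10 - 2 * 3 = 4 (context, target) pairs.
assert get_x([10, 11, 12, 13, 14, 15, 16], 3) == [10, 11, 12, 14, 15, 16]
_x_demo, _y_demo = make_data(list(range(10)))
assert _x_demo.shape == (4, 6) and _y_demo.shape == (4, 1)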

# 1. Preprocess the data
with open(r'E:\nlp_data\ptb_train.txt') as f:
    x_train, y_train = make_data(preprocess_text(f.read()))
# 2. Wrap the data in a tf.data.Dataset
# What does one example look like here?
# One example is x = 6 words (the context) and y = 1 word (the target),
# because of x.append(inputs) and y.append([int_words[i]]) in make_data
dataset = tf.data.Dataset.from_tensor_slices(tensors=(x_train, y_train))
dataset = dataset.batch(batch_size=100).repeat(5)
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()
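# Each get_next() call yields one batch of (context, target) pairs; once the 5 repeats of the
# data are exhausted, the iterator raises tf.errors.OutOfRangeError, which ends the training
# loop below.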
# 3. Build the CBOW model
# x has shape (None, 6) and y_ has shape (None, 1)
# because of x.append(inputs) and y.append([int_words[i]]) in make_data:
# with window_size = 3 the context holds 2 * 3 = 6 words
# The None dimension is the batch size, 100 here because of dataset.batch(batch_size=100)
# (the final batch may be smaller)
x = tf.placeholder(shape=(None, 6), dtype=tf.int32)
y_ = tf.placeholder(shape=(None, 1), dtype=tf.int32)
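# Note: the batches produced by next_data are fed back into the graph through these two
# placeholders via feed_dict in the training loop; next_data could also be used as the
# model input directly.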

# CBOW: look up the embeddings of the 6 context words and average them over axis 1,
# giving one (batch_size, embed_dim) vector per example
E = tf.get_variable(name="E", shape=(PARAMS['vocab_size'], PARAMS['embed_dim']))
embedding = tf.nn.embedding_lookup(params=E, ids=x)
embedding = tf.reduce_mean(embedding, axis=[1])

# Output-layer weights and biases for the softmax over the vocabulary
W = tf.get_variable(name="w", shape=(PARAMS['vocab_size'], PARAMS['embed_dim']), dtype=tf.float32)
b = tf.get_variable(name="b", shape=(PARAMS['vocab_size'],), dtype=tf.float32)

# Sampled softmax: instead of normalizing over the whole vocabulary, each training step only
# draws n_sampled negative classes, which makes the loss much cheaper to compute
loss_op = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=W,
    biases=b,
    labels=y_,
    inputs=embedding,
    num_sampled=PARAMS['n_sampled'],
    num_classes=PARAMS['vocab_size']))
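# Note: sampled_softmax_loss is a training-time approximation. At evaluation time one would
# score against the full output layer instead, roughly logits = tf.matmul(embedding, W,
# transpose_b=True) + b (just a sketch, not built in this graph).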

opt = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss=loss_op)

init = tf.global_variables_initializer()

with tf.Session() as session:
    session.run(init)
    try:
        while True:
            inputs,labels = session.run(next_data)
            session.run(fetches=opt,feed_dict={x:inputs,y_:labels})
    except tf.errors.OutOfRangeError:
        print("train complete")