TFIDF练习 | TFIDF

直接上代码吧：

 """

     测试Demo

 """

 import lightgbm as lgb

 import numpy as np

 from sklearn.feature_extraction.text import TfidfVectorizer

 from sklearn.feature_extraction.text import CountVectorizer

 def use_lgb():
     """Build a LightGBM ``Dataset`` from random binary-classification data and print it.

     The feature matrix is drawn first and the labels second, both from
     NumPy's global random state.
     """
     n_samples = 500
     n_features = 10

     # Random feature matrix: one row per sample, one column per feature.
     features = np.random.rand(n_samples, n_features)
     # Random 0/1 target for every sample.
     targets = np.random.randint(2, size=n_samples)

     # Wrap features and labels in a LightGBM Dataset and show the object.
     print(lgb.Dataset(features, label=targets))

 def use_tfidf():
     """Demonstrate ``CountVectorizer`` and ``TfidfVectorizer`` on two sentences.

     The sentences are already word-segmented (tokens separated by spaces).
     The function prints, in order: the learned vocabulary, the
     term -> column-index mapping, the TF-IDF weight matrix, the number of
     documents, and every (term, weight) pair for each document.

     Returns:
         None. All results are written to standard output.
     """
     sentence = ['没有 你 的 地方 都是 他乡', '没有 你 的 旅行 都是 流浪']

     # Plain term counting, without removing any stop words.
     c = CountVectorizer(stop_words=None)
     # Fit the model and obtain the document-term count matrix.
     count_word_tf = c.fit_transform(sentence)
     # Uncomment to inspect the count model:
     # print(count_word_tf.toarray())
     # # The vocabulary as a term -> column-index dict.
     # print(c.vocabulary_)
     # # The feature (term) names.
     # print(c.get_feature_names_out())

     # ---------------- TF-IDF ----------------
     stopword = ['都是']
     # Build a TF-IDF vectorizer that drops the given stop words.
     # NOTE: scikit-learn's default token_pattern only keeps tokens of two or
     # more characters, so single-character words are not part of the vocabulary.
     tfidf = TfidfVectorizer(stop_words=stopword)
     # Fit, transform, and densify the TF-IDF weights (one row per document).
     weight = tfidf.fit_transform(sentence).toarray()

     # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
     # 1.2; prefer get_feature_names_out() and fall back only on old releases.
     # The ndarray is converted to a list so the printed form stays the same.
     if hasattr(tfidf, "get_feature_names_out"):
         word = tfidf.get_feature_names_out().tolist()
     else:
         word = tfidf.get_feature_names()

     print("有哪些词：")
     print(word)

     print("\n词汇表以及他们的位置索引：")
     for key, value in tfidf.vocabulary_.items():
         print(key, value)

     # The label below reads "term-frequency matrix", but the values printed
     # are the TF-IDF weights; the text is kept as-is so the output is unchanged.
     print("\n词频矩阵：")
     print(weight)
     print(len(weight))

     # Outer loop walks over the documents, inner loop over every vocabulary
     # term, printing the term next to its TF-IDF weight in that document.
     for i, row in enumerate(weight):
         print("这里输出的是第{}文本的词语tfidf权重".format(i))
         for term, value in zip(word, row):
             print(term, value)

 # Run the TF-IDF demo only when this file is executed as a script.
 if __name__ == '__main__':

     use_tfidf()

输出（注意：“你”、“的”是单字词，会被 TfidfVectorizer 默认的 token_pattern 过滤掉；“都是”作为停用词被去除，因此它们都不在词汇表中）：

 有哪些词：

 ['他乡', '地方', '旅行', '没有', '流浪']

 词汇表以及他们的位置索引：

 他乡 0

 旅行 2

 流浪 4

 地方 1

 没有 3

 词频矩阵：

 [[0.6316672  0.6316672  0.         0.44943642 0.        ]

  [0.         0.         0.6316672  0.44943642 0.6316672 ]]

 2

 这里输出的是第0文本的词语tfidf权重

 他乡 0.6316672017376245

 地方 0.6316672017376245

 旅行 0.0

 没有 0.4494364165239821

 流浪 0.0

 这里输出的是第1文本的词语tfidf权重

 他乡 0.0

 地方 0.0

 旅行 0.6316672017376245

 没有 0.4494364165239821

 流浪 0.6316672017376245

本文参考：https://blog.csdn.net/the_lastest/article/details/79093407