
(1, 12) 0.656240233446
  (1, 11)   0.754552023393
  (2, 6)    1.0
  (3, 13)   1.0
  (4, 2)    1.0
  (7, 9)    1.0
  (9, 4)    0.742540927053
  (9, 5)    0.66980069547
  (11, 19)  0.735138466738
  (11, 7)   0.677916982176
  (12, 18)  1.0
  (13, 14)  0.697455191865
  (13, 11)  0.716628394177
  (14, 5)   1.0
  (15, 8)   1.0
  (16, 17)  1.0
  (18, 1)   1.0
  (19, 17)  1.0
  (22, 13)  1.0
  (23, 3)   1.0
  (25, 6)   1.0
  (26, 19)  0.476648253537
  (26, 7)   0.879094103268
  (28, 10)  0.532672175403
  (28, 7)   0.523456282204



# Names of the files to process (assigned by TokenizingAndPanda.getFilnames).
_filenames =[]
# Text contents of the documents; this is the list that gets tokenized and vectorized.
_contents = []
# Mapping of filename -> content of that file.
_file_contents = {}
class KmeansClustering():
    """Cluster the TF-IDF representation of the corpus with k-means."""

    def kmeansClusters(self):
        """
        Fit a 5-cluster KMeans model on the TF-IDF matrix and persist it.

        Sets ``self.num_clusters``, ``self.tfidf_matrix``, ``self.terms``,
        ``self.dist`` and ``self.clusters`` (one cluster label per row of the
        TF-IDF matrix), and writes the fitted model to 'doc_cluster2.pkl'.
        :return: None
        """
        # NOTE(review): _report is not assigned in the visible code; the
        # declaration is kept in case the remainder of the method relies on it.
        global _report
        self.num_clusters = 5
        km = KMeans(n_clusters=self.num_clusters)
        # The returned frame is not used here; the call is retained because it
        # is part of the original pipeline.
        TokenizingAndPanda().createPandaVocabFrame()
        self.tfidf_matrix, self.terms, self.dist = TfidfProcessing().getTfidFPropertyData()
        # labels_ only exists on a fitted estimator, so fit before reading it.
        km.fit(self.tfidf_matrix)
        self.clusters = km.labels_.tolist()
        # Persist the fitted model, then reload it from disk as the original did.
        joblib.dump(km, 'doc_cluster2.pkl')
        km = joblib.load('doc_cluster2.pkl')

class TokenizingAndPanda():
    """Tokenizing/stemming helpers and loaders for the module-level corpus globals."""

    def tokenize_only(self, text):
        """
        This function tokenizes the text
        :param text: Give the text that you want to tokenize
        :return: the lower-cased tokens that contain at least one ASCII letter
        """
        # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
        tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        return [token for token in tokens if re.search('[a-zA-Z]', token)]

    def tokenize_and_stem(self, text):
        """
        Tokenize the text exactly like tokenize_only, then stem every token.
        :param text: the text that you want to tokenize and stem
        :return: list of stems, one per filtered token, in token order
        """
        # Reuse tokenize_only so both methods always filter tokens identically.
        return [_stemmer.stem(t) for t in self.tokenize_only(text)]

    def getFilnames(self):
        """
        Read all file names found under the module-level _path into _filenames.
        :return: None (the result is stored in the global _filenames)
        """
        global _filenames
        _filenames = FileAccess().read_all_file_names(_path)

    def getContentsForFilenames(self):
        """
        Read the content of every file listed in _filenames.

        Each content is stored in _file_contents under its filename and is
        also appended to _contents.
        :return: None
        """
        for filename in _filenames:
            content = FileAccess().read_the_contents_from_files(_path, filename)
            _file_contents[filename] = content
            # The original declared `global _contents` but never stored into it;
            # _contents is the list the rest of this file tokenizes/vectorizes.
            _contents.append(content)

    def createPandaVocabFrame(self):
        """
        Build a DataFrame that maps each stem back to its original token.

        Every document in _contents is tokenized twice (stemmed and
        unstemmed); the results are accumulated in the module-level
        _totalvocab_stemmed and _totalvocab_tokenized lists.
        :return: DataFrame with a 'words' column (tokens) indexed by the stems
        """
        global _totalvocab_stemmed
        global _totalvocab_tokenized
        # Enable this if you want to load the filenames and contents from a file structure.
        # self.getFilnames()
        # self.getContentsForFilenames()
        for text in _contents:
            # Both lists grow in lockstep (same filter, one stem per token), so
            # they stay aligned for the DataFrame below.
            _totalvocab_stemmed.extend(self.tokenize_and_stem(text))
            _totalvocab_tokenized.extend(self.tokenize_only(text))
        vocab_frame = pd.DataFrame({'words': _totalvocab_tokenized}, index=_totalvocab_stemmed)
        return vocab_frame

class TfidfProcessing():
    """Compute the TF-IDF representation of the documents in _contents."""

    def getTfidFPropertyData(self):
        """
        Vectorize _contents with TF-IDF and derive pairwise document distances.

        :return: tuple (tfidf_matrix, terms, dist) where tfidf_matrix is the
                 result of fit_transform on _contents, terms is the list of
                 feature names of the vectorizer, and dist is
                 1 - cosine_similarity(tfidf_matrix)
        """
        tfidf_vectorizer = TfidfVectorizer(max_df=0.4, max_features=200000,
                                           min_df=0.02, stop_words='english',
                                           use_idf=True,
                                           tokenizer=TokenizingAndPanda().tokenize_and_stem,
                                           ngram_range=(1, 1))
        tfidf_matrix = tfidf_vectorizer.fit_transform(_contents)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement when available and keep returning a plain list.
        if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
            terms = tfidf_vectorizer.get_feature_names_out().tolist()
        else:
            terms = tfidf_vectorizer.get_feature_names()
        dist = 1 - cosine_similarity(tfidf_matrix)

        return tfidf_matrix, terms, dist



(1, 12) 0.656240233446

表示第 1 个文档中第 12 个词(按 sklearn 构建的词汇表编号)的归一化权重(TF-IDF 值)为 0.656240233446。“缺失”的项均为零,例如第 1 个文档中不包含第 3 个词(因为输出中没有 (1, 3)),依此类推。

至于输出中缺少某些文档,这是由您的具体代码/数据(问题中未给出)造成的——也许您手动设置了词汇表?或者限制了最大特征数(max_features)?TfidfVectorizer 中有许多参数都可能导致这种情况,但在没有确切代码(和一些示例数据)的情况下无法下结论。例如,设置 min_df 就可能导致这种结果(因为它会丢弃非常罕见的词),max_features 也有同样的效果。

关于python-3.x - 理想情况下给出的tfidf矩阵是什么,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/42489589/

10-12 07:32