1. 停用词(stopwords)
ref: Removing stop words with NLTK in Python
ref: Remove Stop Words
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))
output:
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
2. 介词(prepositions, part of speech)
ref: How do I remove verbs, prepositions, conjunctions etc from my text? [closed]
ref: Alphabetical list of part-of-speech tags used in the Penn Treebank Project:
>>> import nltk
>>> sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""
>>> tokens = nltk.word_tokenize(sentence)
>>> tokens
['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
>>> tagged = nltk.pos_tag(tokens)
>>> tagged[0:6]
[('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN')]
3. Named Entity Recognition (NER)
ref: Introduction to Named Entity Recognition
ref: Named Entity Recognition with NLTK and SpaCy
- Stanford NER
- spaCy
- NLTK
Stanford NER
article = ''' Asian shares skidded on Tuesday after a rout in tech stocks put Wall Street to the sword, while a sharp drop in oil prices and political risks in Europe pushed the dollar to 16-month highs as investors dumped riskier assets. MSCI’s broadest index of Asia-Pacific shares outside Japan dropped 1.7 percent to a 1-1/2 week trough, with Australian shares sinking 1.6 percent. Japan’s Nikkei dived 3.1 percent led by losses in electric machinery makers and suppliers of Apple’s iphone parts. Sterling fell to $1.286 after three straight sessions of losses took it to the lowest since Nov.1 as there were still considerable unresolved issues with the European Union over Brexit, British Prime Minister Theresa May said on Monday.''' import nltk from nltk.tag import StanfordNERTagger print('NTLK Version: %s' % nltk.__version__) stanford_ner_tagger = StanfordNERTagger( r"D:\Twitter Data\Data\NER\stanford-ner-2018-10-16\classifiers\english.muc.7class.distsim.crf.ser.gz", r"D:\Twitter Data\Data\NER\stanford-ner-2018-10-16\stanford-ner-3.9.2.jar" ) results = stanford_ner_tagger.tag(article.split()) print('Original Sentence: %s' % (article)) for result in results: tag_value = result[0] tag_type = result[1] if tag_type != 'O': print('Type: %s, Value: %s' % (tag_type, tag_value)) output: NTLK Version: 3.4 Original Sentence: Asian shares skidded on Tuesday after a rout in tech stocks put Wall Street to the sword, while a sharp drop in oil prices and political risks in Europe pushed the dollar to 16-month highs as investors dumped riskier assets. MSCI’s broadest index of Asia-Pacific shares outside Japan dropped 1.7 percent to a 1-1/2 week trough, with Australian shares sinking 1.6 percent. Japan’s Nikkei dived 3.1 percent led by losses in electric machinery makers and suppliers of Apple’s iphone parts. 
Sterling fell to $1.286 after three straight sessions of losses took it to the lowest since Nov.1 as there were still considerable unresolved issues with the European Union over Brexit, British Prime Minister Theresa May said on Monday. Type: DATE, Value: Tuesday Type: LOCATION, Value: Europe Type: ORGANIZATION, Value: Asia-Pacific Type: LOCATION, Value: Japan Type: PERCENT, Value: 1.7 Type: PERCENT, Value: percent Type: ORGANIZATION, Value: Nikkei Type: PERCENT, Value: 3.1 Type: PERCENT, Value: percent Type: LOCATION, Value: European Type: LOCATION, Value: Union Type: PERSON, Value: Theresa Type: PERSON, Value: May
spaCy
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp(article)
for X in doc.ents:
    print('Value: %s, Type: %s' % (X.text, X.label_))
output:
Value: Asian, Type: NORP
Value: Tuesday, Type: DATE
Value: Europe, Type: LOC
Value: MSCI’s, Type: ORG
Value: Asia-Pacific, Type: LOC
Value: Japan, Type: GPE
Value: 1.7 percent, Type: PERCENT
Value: 1-1/2, Type: CARDINAL
Value: Australian, Type: NORP
Value: 1.6 percent, Type: PERCENT
Value: Japan, Type: GPE
Value: 3.1 percent, Type: PERCENT
Value: Apple, Type: ORG
Value: 1.286, Type: MONEY
Value: three, Type: CARDINAL
Value: Nov.1, Type: NORP
Value: the European Union, Type: ORG
Value: Brexit, Type: GPE
Value: British, Type: NORP
Value: Theresa May, Type: PERSON
Value: Monday, Type: DATE
NLTK
def fn_preprocess(art): art = nltk.word_tokenize(art) art = nltk.pos_tag(art) return art art_processed = fn_preprocess(article) print(art_processed) output: [('Asian', 'JJ'), ('shares', 'NNS'), ('skidded', 'VBN'), ('on', 'IN'), ('Tuesday', 'NNP'), ('after', 'IN'), ('a', 'DT'), ('rout', 'NN'), ('in', 'IN'), ('tech', 'JJ'), ('stocks', 'NNS'), ('put', 'VBD'), ('Wall', 'NNP'), ('Street', 'NNP'), ('to', 'TO'), ('the', 'DT'), ('sword', 'NN'), (',', ','), ('while', 'IN'), ('a', 'DT'), ('sharp', 'JJ'), ('drop', 'NN'), ('in', 'IN'), ('oil', 'NN'), ('prices', 'NNS'), ('and', 'CC'), ('political', 'JJ'), ('risks', 'NNS'), ('in', 'IN'), ('Europe', 'NNP'), ('pushed', 'VBD'), ('the', 'DT'), ('dollar', 'NN'), ('to', 'TO'), ('16-month', 'JJ'), ('highs', 'NNS'), ('as', 'IN'), ('investors', 'NNS'), ('dumped', 'VBD'), ('riskier', 'JJR'), ('assets', 'NNS'), ('.', '.'), ('MSCI', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('broadest', 'JJS'), ('index', 'NN'), ('of', 'IN'), ('Asia-Pacific', 'NNP'), ('shares', 'NNS'), ('outside', 'IN'), ('Japan', 'NNP'), ('dropped', 'VBD'), ('1.7', 'CD'), ('percent', 'NN'), ('to', 'TO'), ('a', 'DT'), ('1-1/2', 'JJ'), ('week', 'NN'), ('trough', 'NN'), (',', ','), ('with', 'IN'), ('Australian', 'JJ'), ('shares', 'NNS'), ('sinking', 'VBG'), ('1.6', 'CD'), ('percent', 'NN'), ('.', '.'), ('Japan', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('Nikkei', 'NNP'), ('dived', 'VBD'), ('3.1', 'CD'), ('percent', 'NN'), ('led', 'VBN'), ('by', 'IN'), ('losses', 'NNS'), ('in', 'IN'), ('electric', 'JJ'), ('machinery', 'NN'), ('makers', 'NNS'), ('and', 'CC'), ('suppliers', 'NNS'), ('of', 'IN'), ('Apple', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('iphone', 'NN'), ('parts', 'NNS'), ('.', '.'), ('Sterling', 'NN'), ('fell', 'VBD'), ('to', 'TO'), ('$', '$'), ('1.286', 'CD'), ('after', 'IN'), ('three', 'CD'), ('straight', 'JJ'), ('sessions', 'NNS'), ('of', 'IN'), ('losses', 'NNS'), ('took', 'VBD'), ('it', 'PRP'), ('to', 'TO'), ('the', 'DT'), ('lowest', 'JJS'), ('since', 'IN'), ('Nov.1', 'NNP'), 
('as', 'IN'), ('there', 'EX'), ('were', 'VBD'), ('still', 'RB'), ('considerable', 'JJ'), ('unresolved', 'JJ'), ('issues', 'NNS'), ('with', 'IN'), ('the', 'DT'), ('European', 'NNP'), ('Union', 'NNP'), ('over', 'IN'), ('Brexit', 'NNP'), (',', ','), ('British', 'NNP'), ('Prime', 'NNP'), ('Minister', 'NNP'), ('Theresa', 'NNP'), ('May', 'NNP'), ('said', 'VBD'), ('on', 'IN'), ('Monday', 'NNP'), ('.', '.')]