【R语言文本挖掘】:情感分析与词云图绘制
引言
1.情感数据集
- AFINN
- bing
- nrc
library(`tidytext`)
library(dplyr)
get_sentiments("nrc") %>% head()
get_sentiments("bing") %>% head()
get_sentiments("afinn") %>% head()
2.使用内连接进行情感分析
library(janeaustenr)
library(dplyr)
library(stringr)
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TrUE)))) %>%#使用正则表达式来定义章节
ungroup() %>%
unnest_tokens(word, text)#分词
tidy_books %>% head()
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book=='Emma') %>%
inner_join(nrc_joy) %>%
count(word,sort=TrUE)%>%
head()
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%#使用bing情绪词典进行内连接
count(book, index = linenumber %/% 80, sentiment) %>%#按八十行为一个小段进行记数
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%#将数据转换成宽数据
mutate(sentiment = positive - negative)#计算净情绪,如果大于0说明是积极情绪,小于0说明是消极的
jane_austen_sentiment %>% head()
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col()+#绘制柱形图
facet_wrap(~book, ncol = 2, scales = "free_x")#根据不同书进行分面绘图,两行
3.对比三种情感字典
pride_prejudice <- tidy_books %>%
filter(book == 'Pride & Prejudice')
pride_prejudice %>% head()
# 使用AFINN词典
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>% #内连接,得到带有情感的文本
group_by(index = linenumber %/% 80) %>% #每隔80行作为一小段
summarise(sentiment = sum(value)) %>% #这里我们进行一个求和处理,因为这里是以数字表示情感的
mutate(method = "AFINN")
# bing词典
bing <- pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%#使用bing词典进行内连接
mutate(method = "Bing")%>%
count(index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
# 使用nrc词典
nrc <- pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c('positive','negative'))) %>%
mutate(method = "NRC")%>%
count(index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments('nrc') %>%
filter(sentiment %in% c('positive','negative')) %>%
count(sentiment)
get_sentiments('bing') %>%
count(sentiment)
4.最常见的积极和消极的单词
bing_word_count <- tidy_books %>%
inner_join(get_sentiments('bing')) %>%
count(word, sentiment, sort = TrUE) %>%
ungroup()
[1m[22mJoining, by = "word"
bing_word_count %>% head()
bing_word_count %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>% #删选出前10的
ungroup() %>%
mutate(word = reorder(word, n)) %>%#重新排序
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words %>% head()
5.词云绘制
library(wordcloud)
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))#出现次数最多的前100个
library(reshape2)
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TrUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("blue", "red"),
max.words = 100)
6.总结
参考资料:Text Mining with R