JoJo的数据分析历险记

JoJo的数据分析历险记

【R语言文本挖掘】:分析单词和文档频率——TF-IDF


【R语言文本挖掘】:分析单词和文档频率——TF-IDF-LMLPHP

引言

1.简·奥斯汀小说集的频率

接下来我们导入相关库

# 导入相关库
library(dplyr)
library(janeaustenr)
library(tidytext)

分词处理

# 将文本分词成word
book_words <- austen_books() %>%
unnest_tokens(word, text) %>% #将文本分词
count(book, word, sort = TrUE)
book_words%>%head()
total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

total_words%>%head()
book_words <- left_join(book_words, total_words)#左连接

book_words %>% head()
[1m[22mJoining, by = c("book", "total")
library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")#分面绘图

【R语言文本挖掘】:分析单词和文档频率——TF-IDF-LMLPHP

2.齐夫定律

freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(rank = row_number(),
         `term frequency` = n/total)#计算rank和频率
freq_by_rank%>%head()
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

【R语言文本挖掘】:分析单词和文档频率——TF-IDF-LMLPHP

rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
​    Call:
​    lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
​
​    Coefficients:
​    (Intercept)  log10(rank)
​        -0.6226      -1.1125
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_abline(intercept = -0.62, slope = -1.1,
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()


【R语言文本挖掘】:分析单词和文档频率——TF-IDF-LMLPHP

3.bind_tf_idf() 函数

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)

book_tf_idf %>% head()
book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf)) %>%head()
library(forcats)

book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

【R语言文本挖掘】:分析单词和文档频率——TF-IDF-LMLPHP

4.物理文档语料库

library(gutenbergr)
physics <- gutenberg_download(c(37729, 14725, 13476, 30155),
                              meta_fields = "author")
physics_words <- physics %>%
    unnest_tokens(word,text)%>%
    count(author,word, sort = TrUE)
physics_words %>% head()
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan",
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

plot_physics %>%
  group_by(author) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~author, ncol = 2, scales = "free")


【R语言文本挖掘】:分析单词和文档频率——TF-IDF-LMLPHP

library(stringr)

physics %>%
  filter(str_detect(text, "_k_")) #%>% 
  #select(text)
physics %>%
  filter(str_detect(text, "rC")) %>%
  select(text) %>%
  head()
mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn",
                                   "fig", "file", "cg", "cb", "cm",
                               "ab", "_k", "_k_", "_x"))

physics_words <- anti_join(physics_words, mystopwords,
                           by = "word")

plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  group_by(author) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = fct_reorder(word, tf_idf)) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan",
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

ggplot(plot_physics, aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~author, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)


【R语言文本挖掘】:分析单词和文档频率——TF-IDF-LMLPHP

5.总结

07-11 17:23