用R读取PDF并进行数据挖掘,例子如下:
# Example: download a PDF, convert it to plain text with xpdf's pdftotext,
# and draw a word cloud of the (stemmed) terms it contains.
#
# NOTE(review): in the listing this script was recovered from, the `<-`
# operators and several right-hand sides were missing (apparently eaten as
# markup during extraction). They are reconstructed here from the surviving
# fragments and comments -- confirm against the original post if available.

library(tm)
library(wordcloud)
library(Rstem)

# here is a pdf for mining
url <- "http://www.noisyroom.net/blog/RomneySpeech072912.pdf"
dest <- tempfile(fileext = ".pdf")
download.file(url, dest, mode = "wb")

# set path to pdftotext.exe and convert pdf to text
exe <- "C:\\Program Files\\xpdfbin-win-3.03\\bin32\\pdftotext.exe"

# Run the converter synchronously: launching it in the background meant the
# .txt file might not exist yet when it was opened or read below.
status <- system2(exe, args = shQuote(dest))
if (status != 0L) {
  stop("pdftotext failed with exit status ", status, call. = FALSE)
}

# get txt-file name and open it
# (escape the dot and anchor at the end so only the extension is replaced)
filetxt <- sub("\\.pdf$", ".txt", dest)
if (!file.exists(filetxt)) {
  stop("pdftotext did not produce ", filetxt, call. = FALSE)
}
shell.exec(filetxt)  # Windows-only: opens the text file in its default viewer

# do something with it, i.e. a simple word cloud
# (warn = FALSE silences the "incomplete final line" warning)
txt <- readLines(filetxt, warn = FALSE)
txt <- tolower(txt)
# drop form-feed page separators and common stop words
txt <- removeWords(txt, c("\\f", stopwords()))

corpus <- Corpus(VectorSource(txt))
corpus <- tm_map(corpus, removePunctuation)
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
d <- data.frame(freq = sort(rowSums(m), decreasing = TRUE))

# Stem words
d$stem <- wordStem(row.names(d), language = "english")

# and put words to column, otherwise they would be lost when aggregating
d$word <- row.names(d)

# remove web address (very long string):
d <- d[nchar(row.names(d)) < 20, ]

# aggregate frequency by word stem and
# keep first words..
agg_freq <- aggregate(freq ~ stem, data = d, FUN = sum)
agg_word <- aggregate(word ~ stem, data = d, FUN = function(x) x[1])
d <- cbind(freq = agg_freq[, 2], agg_word)

# sort by frequency
d <- d[order(d$freq, decreasing = TRUE), ]

# print wordcloud:
wordcloud(d$word, d$freq)

# remove only the files this script created, not everything in tempdir()
file.remove(dest, filetxt)