功能描述:

获取某个路径下的全部文件,提取出每个文件中出现频率最高的前300个词,并保存到数据库中。

前提:你需要先配置好 nltk。

#!/usr/bin/python
#coding=utf-8
'''
function : Extract the most frequent words of every privacy-policy file in a
           directory and store them in the `filekeywords` table of the MySQL
           database `mydb`. The database and the table (with `id` and `name`
           columns) are expected to exist already; the key0..key299 columns
           are added by this script.
author   : Chicho
date     : 2014/7/28
running  : python key_extract.py -d path_of_file
           python key_extract.py path_of_file
'''
import getopt
import sys

import MySQLdb
import nltk
from nltk.corpus import PlaintextCorpusReader

# Directory holding the privacy-policy files; filled in from the command line.
corpus_root = ""

# Number of keyN columns in `filekeywords`, i.e. the maximum number of
# keywords stored per file.
MAX_KEYWORDS = 300

USAGE = "usage: python key_extract.py -d path_of_file"


def _parse_corpus_root(argv):
    '''Return the corpus directory named in *argv* (sys.argv without argv[0]).

    Accepts `-d DIR` / `--directory DIR`, or a bare positional path.
    `-h` / `--help` prints the usage line and exits with status 0.
    Exits with an error message when the options are invalid or no
    directory was given.
    '''
    try:
        # The long options must be a list; a single "directory=help" string
        # would be treated as one option name that takes no argument.
        opts, args = getopt.getopt(argv, "d:h", ["directory=", "help"])
    except getopt.GetoptError as err:
        sys.exit("%s\n%s" % (err, USAGE))

    root = ""
    for op, value in opts:
        if op in ("-h", "--help"):
            print(USAGE)
            sys.exit(0)
        if op in ("-d", "--directory"):
            root = value

    # Simpler invocation: python key_extract.py path_of_file
    if not root and args:
        root = args[0]
    if not root:
        sys.exit(USAGE)
    return root


def _top_keywords(words, limit=MAX_KEYWORDS):
    '''Return up to *limit* distinct words, most frequent first.

    The ranking is done explicitly instead of indexing FreqDist.keys():
    on Python 3 / NLTK 3 keys() is an unordered view that cannot be
    subscripted. Ties are broken alphabetically so the result is stable.
    '''
    fdist = nltk.FreqDist(words)
    ranked = sorted(fdist.items(), key=lambda item: (-item[1], item[0]))
    return [word for word, _count in ranked[:limit]]


def _add_keyword_columns(curs):
    '''Add the key0..key299 columns to `filekeywords` (best effort).

    A database error (typically "duplicate column" on a re-run) is printed
    and stops the loop, exactly like the original script.
    '''
    try:
        for j in range(MAX_KEYWORDS):
            curs.execute("alter table filekeywords add key%d varchar(45)" % j)
    except MySQLdb.Error as e:
        print(e)


def _store_file_keywords(curs, file_id, name, keywords):
    '''Insert one row for file *name* and fill its keyN columns.

    Values are passed as query parameters so that quotes in file names or
    words cannot break (or inject into) the SQL statement. Column names
    cannot be parameterized, but they are built from integers only.
    '''
    curs.execute("insert into filekeywords set id = %s, name = %s",
                 (file_id, name))
    if not keywords:
        return
    # One UPDATE for all columns instead of one statement per keyword.
    assignments = ", ".join("key%d = %%s" % j for j in range(len(keywords)))
    curs.execute("update filekeywords set " + assignments + " where id = %s",
                 tuple(keywords) + (file_id,))


def main(root):
    '''Read every file under *root* and store its top keywords in MySQL.'''
    # The '.*' pattern makes the reader pick up every file under root.
    filelists = PlaintextCorpusReader(root, '.*')
    files = filelists.fileids()

    # NOTE: host, user, port and passwd are placeholders; in particular
    # `your_port` must be replaced by the real integer port before running.
    conn = MySQLdb.connect(host = 'your_personal_host_ip_address', user = 'rusername', port =your_port, passwd = 'U_password')
    try:
        curs = conn.cursor()
        try:
            # Make the whole session talk UTF-8.
            conn.set_character_set('utf8')
            curs.execute('set names utf8')
            curs.execute('SET CHARACTER SET utf8;')
            curs.execute('SET character_set_connection=utf8;')

            # The database must already exist; create it once with
            # "create database mydb" if necessary.
            conn.select_db('mydb')

            _add_keyword_columns(curs)

            for file_id, privacyfile in enumerate(files):
                # Keep alphabetic tokens longer than two characters only.
                wordlist = [w for w in filelists.words(privacyfile)
                            if w.isalpha() and len(w) > 2]
                _store_file_keywords(curs, file_id, privacyfile,
                                     _top_keywords(wordlist))

            conn.commit()
        finally:
            curs.close()
    finally:
        # Always release the connection, even when a statement failed.
        conn.close()


if __name__ == '__main__':
    corpus_root = _parse_corpus_root(sys.argv[1:])
    main(corpus_root)

转载注明出处:http://blog.csdn.net/chichoxian/article/details/42003603

05-11 17:22
查看更多