# -*- coding: utf-8 -*-
import errno
import os


def fileName(path):
    """Return *path* without its first backslash-separated component.

    Every remaining component is followed by a backslash, so
    ``'F:\\a\\b'`` becomes ``'a\\b\\'``. A path with no backslash
    (including the empty string) yields ``''``.
    """
    return ''.join(part + '\\' for part in path.split('\\')[1:])


def mkdir_p(path):
    """Create *path* and any missing parent directories.

    An already existing directory is not an error. Any other ``OSError``
    (including *path* existing as a regular file) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only "already exists, and it is a directory" is tolerated.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


def fileTraverse(filepath, dest_root="E:\\"):
    """Walk *filepath* recursively, mirroring its directory tree.

    For every sub-directory found, the matching directory is created
    under *dest_root* (the source path minus its first component, see
    ``fileName``). The path of every regular file is printed.

    *dest_root* is concatenated as-is, so it should end with a path
    separator. It defaults to ``"E:\\"``.
    """
    for entry in os.listdir(filepath):
        fi_d = os.path.join(filepath, entry)
        if os.path.isdir(fi_d):
            # Recreate the directory in the mirror tree, then descend.
            mkdir_p(dest_root + fileName(fi_d))
            fileTraverse(fi_d, dest_root)
        else:
            # fi_d is already the joined path; joining it with filepath
            # again would duplicate the prefix for a relative root.
            print(fi_d)


if __name__ == "__main__":
    root = 'F:\\目标2'
    # On Python 2 the literal is a byte string and the directory name
    # contains Chinese characters, so it must be decoded to unicode.
    # On Python 3 it is already text and is left alone.
    if isinstance(root, bytes):
        root = root.decode('utf-8')
    fileTraverse(root)
# -*- coding: utf-8 -*-
import errno
import io
import os

try:
    import jieba.posseg as pseg
except ImportError:
    # Keep the path helpers importable without jieba; segmentation
    # itself fails with an explicit error (see _require_pseg).
    pseg = None


def _require_pseg():
    """Raise ImportError if the jieba part-of-speech tagger is missing."""
    if pseg is None:
        raise ImportError("jieba is required for word segmentation")


def fileName(filePath):
    """Return *filePath* without its first backslash-separated component.

    Every remaining component is followed by a backslash, so
    ``'F:\\a\\b'`` becomes ``'a\\b\\'``. A path with no backslash
    (including the empty string) yields ``''``.
    """
    return ''.join(part + '\\' for part in filePath.split('\\')[1:])


def mkdir_p(path):
    """Create *path* and any missing parent directories.

    An already existing directory is not an error. Any other ``OSError``
    (including *path* existing as a regular file) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only "already exists, and it is a directory" is tolerated.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


def splitSentence(inputFile, dest_root="E:\\"):
    """Segment *inputFile* with jieba and write the tagged result.

    The output file has the same base name as *inputFile* and is placed
    under *dest_root* at the source directory path minus its first
    component (see ``fileName``). Each input line is stripped of
    surrounding ASCII whitespace, decoded as UTF-8 (undecodable bytes
    are dropped), segmented, and written as one line of ``word/flag``
    tokens encoded as UTF-8.

    Raises ImportError if jieba is not installed.
    """
    _require_pseg()
    out_dir = dest_root + fileName(os.path.dirname(inputFile))
    # The mirror directory may not exist yet: the traversal only creates
    # directories for sub-directories it has already visited.
    if out_dir and not os.path.isdir(out_dir):
        mkdir_p(out_dir)
    out_path = out_dir + os.path.basename(inputFile)

    # Read bytes and decode per line; write through a UTF-8 text stream.
    # Both files are closed even if segmentation raises.
    with io.open(inputFile, 'rb') as fin, \
            io.open(out_path, 'w', encoding='utf-8') as fout:
        for raw in fin:
            line = raw.strip().decode('utf-8', 'ignore')
            # NOTE(review): tokens are concatenated with no separator,
            # matching the original output format - confirm intended.
            tagged = u''.join(
                u'%s/%s' % (pair.word, pair.flag) for pair in pseg.cut(line)
            )
            fout.write(tagged + u'\n')


def fileTraverse(filePath, dest_root="E:\\"):
    """Walk *filePath* recursively and segment every file found.

    For every sub-directory, the matching directory is created under
    *dest_root* before descending into it. Every regular file is passed
    to ``splitSentence``.

    *dest_root* is concatenated as-is, so it should end with a path
    separator. It defaults to ``"E:\\"``.
    """
    for entry in os.listdir(filePath):
        fi_d = os.path.join(filePath, entry)
        if os.path.isdir(fi_d):
            # Recreate the directory in the target tree, then descend.
            mkdir_p(dest_root + fileName(fi_d))
            fileTraverse(fi_d, dest_root)
        else:
            splitSentence(fi_d, dest_root)


if __name__ == "__main__":
    # Fail before touching the file system if jieba is unavailable.
    _require_pseg()
    root = 'F:\\source'  # source tree to segment
    # On Python 2 the literal is a byte string; decode it so directory
    # names containing Chinese characters are handled as unicode.
    if isinstance(root, bytes):
        root = root.decode('utf-8')
    fileTraverse(root)
# 05-11 11:21