问题描述
我想要能够将PDF转换为CSV文件,并找到了几个有用的脚本,但作为Python新手,我有一个问题:
应该在哪里指定PDF的文件路径以及要输出到的CSV的文件路径?
我使用的是Python 2.7.11和PDFMiner 20140328。
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def pdfparser(data):
    """Extract the text of a PDF file and print it to standard output.

    Args:
        data: Path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # open() replaces the Python-2-only file() builtin; the with/try pair
    # releases the PDF handle and the converter even if a page fails to parse.
    with open(data, 'rb') as fp:
        try:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            device.close()
    text = retstr.getvalue()
    # For a single argument the parenthesised form behaves exactly like the
    # Python 2 print statement and is also valid Python 3 syntax.
    print(text)
if __name__ == '__main__':
    # The PDF to convert is taken from the first command-line argument,
    # e.g.  python <this_script>.py path/to/input.pdf
    if len(sys.argv) < 2:
        sys.exit('usage: %s <input.pdf>' % sys.argv[0])
    pdfparser(sys.argv[1])
以下是根据 tgray 所写的这个 SO 回答修改而来的代码:
def pdf_to_csv(filename, separator, threshold):
    """Convert the text of a PDF into separator-delimited lines.

    Characters are grouped into rows by their vertical coordinate; within a
    row, a separator is inserted wherever the horizontal gap between two
    neighbouring characters is large compared with the row's average gap.

    Args:
        filename: Path of the PDF file to read.
        separator: String inserted between detected columns.
        threshold: Multiplier applied to a row's average character distance;
            a gap larger than the product starts a new column.

    Returns:
        The converted text of all pages, each page wrapped in
        "START PAGE n" / "END PAGE n" marker lines.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """TextConverter that writes one delimited line per row of characters."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            # separator/threshold come from the enclosing pdf_to_csv() call.
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            """Write the characters collected for the current page, row by row."""
            # Bucket characters by int(-y); each bucket maps x -> encoded text.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    lines[int(-y)][x] = child._text.encode(self.codec)
            # Keys are -y, so ascending order emits the largest y first.
            for y in sorted(lines):
                self.outfp.write(self.line_creator(lines[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            """Join one row's characters, inserting separators at wide gaps.

            Args:
                line: Non-empty dict mapping x coordinate -> encoded character.

            Returns:
                The row as a single string.
            """
            keys = sorted(line)
            gaps = [right - left for left, right in zip(keys, keys[1:])]
            # The average is taken over the number of characters (not gaps),
            # as in the original answer; float() keeps true division under
            # Python 2 even if the coordinates happen to be integers.
            average_distance = sum(gaps) / float(len(keys))
            limit = average_distance * self.threshold
            # The first character never needs a leading separator.
            result = [line[keys[0]]]
            for key, gap in zip(keys[1:], gaps):
                if gap > limit:
                    result.append(self.separator)
                result.append(line[key])
            return ''.join(result)

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 matches the test documents (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)
    # with/finally releases the file and the converter even if parsing fails.
    with open(filename, 'rb') as fp:
        try:
            for i, page in enumerate(PDFPage.get_pages(fp)):
                outfp.write("START PAGE %d\n" % i)
                # A stray debug print of 'none' used to sit here; only real
                # pages are handed to the interpreter.
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()
    return outfp.getvalue()
if __name__ == '__main__':
    # the separator to use with the CSV
    separator = ';'
    # the distance multiplier after which a character is considered part of
    # a new word/column/block. Usually 1.5 works quite well
    threshold = 1.5
    # The input PDF path is the first argument of pdf_to_csv(); the CSV text
    # is written to standard output, so redirect it to a file to save it.
    print(pdf_to_csv('myLovelyFile.pdf', separator, threshold))
链接中的答案和这个答案之间的主要区别是line_creator方法,它试图从PDF中提取一些结构。
这段代码应该适用于PDFMiner 20140328。
I want to be able to convert PDFs to CSV files and have found several useful scripts but, being new to Python, I have a question:
Where do you specify the filepath of the PDF and the CSV you want to print to?
I'm using Python 2.7.11 and PDFMiner 20140328.
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def pdfparser(data):
    """Extract the text of a PDF file and print it to standard output.

    Args:
        data: Path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # open() replaces the Python-2-only file() builtin; the with/try pair
    # releases the PDF handle and the converter even if a page fails to parse.
    with open(data, 'rb') as fp:
        try:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            device.close()
    # A separate name avoids rebinding the `data` path parameter.
    text = retstr.getvalue()
    # For a single argument the parenthesised form behaves exactly like the
    # Python 2 print statement and is also valid Python 3 syntax.
    print(text)
if __name__ == '__main__':
    # The PDF to convert is taken from the first command-line argument,
    # e.g.  python <this_script>.py path/to/input.pdf
    if len(sys.argv) < 2:
        sys.exit('usage: %s <input.pdf>' % sys.argv[0])
    pdfparser(sys.argv[1])
Here is some modified code from this SO answer written by tgray:
def pdf_to_csv(filename, separator, threshold):
    """Convert the text of a PDF into separator-delimited lines.

    Characters are grouped into rows by their vertical coordinate; within a
    row, a separator is inserted wherever the horizontal gap between two
    neighbouring characters is large compared with the row's average gap.

    Args:
        filename: Path of the PDF file to read.
        separator: String inserted between detected columns.
        threshold: Multiplier applied to a row's average character distance;
            a gap larger than the product starts a new column.

    Returns:
        The converted text of all pages, each page wrapped in
        "START PAGE n" / "END PAGE n" marker lines.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """TextConverter that writes one delimited line per row of characters."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            # separator/threshold come from the enclosing pdf_to_csv() call.
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            """Write the characters collected for the current page, row by row."""
            # Bucket characters by int(-y); each bucket maps x -> encoded text.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    lines[int(-y)][x] = child._text.encode(self.codec)
            # Keys are -y, so ascending order emits the largest y first.
            for y in sorted(lines):
                # Build each row once (it used to be built twice, with the
                # first result thrown away).
                self.outfp.write(self.line_creator(lines[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            """Join one row's characters, inserting separators at wide gaps.

            Args:
                line: Non-empty dict mapping x coordinate -> encoded character.

            Returns:
                The row as a single string.
            """
            keys = sorted(line)
            gaps = [right - left for left, right in zip(keys, keys[1:])]
            # The average is taken over the number of characters (not gaps),
            # as in the original answer; float() keeps true division under
            # Python 2 even if the coordinates happen to be integers.
            average_distance = sum(gaps) / float(len(keys))
            limit = average_distance * self.threshold
            # The first character never needs a leading separator.
            result = [line[keys[0]]]
            for key, gap in zip(keys[1:], gaps):
                if gap > limit:
                    result.append(self.separator)
                result.append(line[key])
            return ''.join(result)

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 matches the test documents (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)
    # with/finally releases the file and the converter even if parsing fails.
    with open(filename, 'rb') as fp:
        try:
            for i, page in enumerate(PDFPage.get_pages(fp)):
                outfp.write("START PAGE %d\n" % i)
                # A stray debug print of 'none' used to sit here; only real
                # pages are handed to the interpreter.
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()
    return outfp.getvalue()
if __name__ == '__main__':
    # the separator to use with the CSV
    separator = ';'
    # the distance multiplier after which a character is considered part of
    # a new word/column/block. Usually 1.5 works quite well
    threshold = 1.5
    # The input PDF path is the first argument of pdf_to_csv(); the CSV text
    # is written to standard output, so redirect it to a file to save it.
    print(pdf_to_csv('myLovelyFile.pdf', separator, threshold))
The main difference between the answer in the link and this one is the line_creator method, which tries to extract some structure out of the PDF.
This should work with PDFMiner 20140328.
这篇关于Python PDFMIner - PDF到CSV的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!