问题描述
我有一个将 PDF 转换为 HTML 的程序,我需要对此程序进行补充:在转换之后,它应搜索标签 PA/ 和它前面的字符,并将这些标签和字符保存为 CSV 文件。我正试图做到这一点,但没有成功,有人可以帮我吗?
以下是目前的代码:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
def rename_files(directory=r'C:\\PROJECT\\pdfs'):
    """Remove space characters from the name of every entry in *directory*.

    The listing and the current working directory are printed first, as a
    simple progress trace.

    Args:
        directory: Folder whose entries are renamed. Defaults to the
            project's PDF folder.

    Raises:
        OSError: If the directory cannot be listed or a rename fails.
    """
    file_list = os.listdir(directory)
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    for file_name in file_list:
        # str.replace works on Python 2 and 3; translate(None, " ") is
        # Python-2-only.
        new_name = file_name.replace(" ", "")
        if new_name == file_name:
            continue
        # Rename through full paths so the working directory is never
        # changed and cannot be left pointing at `directory` on failure.
        os.rename(os.path.join(directory, file_name),
                  os.path.join(directory, new_name))
rename_files()
def run(command):
    """Run *command* as a child process and capture what it prints.

    Args:
        command: The full command line as a single string.

    Returns:
        A ``(success, output, errors)`` tuple: ``success`` is True when the
        process exit code is 0; ``output`` and ``errors`` are the captured
        stdout and stderr.
    """
    if platform.system() != 'Windows':
        # Outside Windows the command line is split into an argument list.
        args = shlex.split(command)
    else:
        # On Windows the command string is handed to Popen unchanged.
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors
# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print("%s is not a directory" % base_directory)
    exit(1)

# Change this to the location of the converter script (pdfminer's pdf2txt.py)
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print("Could not find %s" % bin_path)
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF file
        if not file_name.endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert the PDF to HTML. The output name is "<file_name>.html"
        # with no directory part.
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)
htmls_path = 'C:\\PROJECT'
# Two capture groups: the non-space run right after "PA/" and the next
# whitespace-separated token.
pa_pattern = re.compile(r"PA/(\S*)\s*(\S*)")
# Open the CSV once so rows from every HTML file end up in the same file;
# reopening it with 'w' per file would keep only the last file's rows.
# NOTE(review): on Windows, Python 2's csv module expects mode 'wb' and
# Python 3 expects newline='' to avoid blank lines between rows.
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            # os.walk yields bare names; join with dir_path so files in
            # sub-directories (or outside the cwd) can be opened.
            with open(os.path.join(dir_path, file_name)) as markup:
                soup = BeautifulSoup(markup.read(), 'html.parser')
            text = soup.get_text()
            match = pa_pattern.findall(text)
            print(match)
            # Each match is a 2-tuple and becomes one CSV row.
            writer.writerows(match)
html太大了,我会在这里写下它的一部分,包括PA和我不想要的文本:
< html>
< title>测试< / title>
< body>
< div style =position:absolute; border:textbox 1px solid; writing-mode:lr-tb; left:59px; top:34023px; width:84px; height:32px;>< span style =font-family:YFEHEP + Times-Bold; font-size:17px>只是一些文本,我不想在CSV文件中使用
< br>< / span><< ; < / div>< div style =position:absolute; border:textbox 1px solid; writing-mode:lr-tb; left:59px; top:34066px; width:84px; height:16px;>< span style =font-family: YFEHEP + Times-Roman; font-size:16px> PA / 01008/17 GTD
< br>< / span>< / div>< div style =position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34105px; width:84px; height:16px;>< span style =font-family:YFEHEP + Times-Roman; font-size :16px> PA / 01095/17 GTD
< / body>
< / html>
import re
from bs4 import BeautifulSoup
html_doc =
< html>
< title>测试< / title>
< body>
< div style =位置:绝对; border:textbox 1px solid;写入模式:LR-TB;左:59px;顶部:34023px;宽度:84px; height:32px;>< span style =font-family:YFEHEP + Times-Bold;字体大小:17px>只是一些文本,我不希望在CSV文件中有
< br>< / span>< span style =font-family:YFEHEP + Times-罗马; font-size:16px> PA / 00986/17 GTD
< br>< / span>< / div>< div style =position:absolute; border:textbox 1px solid;写入模式:LR-TB;左:59px;顶部:34066px;宽度:84px; height:16px;>< span style =font-family:YFEHEP + Times-Roman; font-size:16px> PA / 01008/17 GTD
< br>< / span>< / div>< div style =position:absolute; border:textbox 1px solid;写入模式:LR-TB;左:59px;顶部:34105px;宽度:84px; height:16px;>< span style =font-family:YFEHEP + Times-Roman; font-size:16px> PA / 01095/17 GTD
< / body>
< / html>
# Parse the sample markup with the standard-library HTML parser.
soup = BeautifulSoup(html_doc, 'html.parser')
# Work on the document text only, so tag attributes cannot match.
text = soup.get_text()
# Each result is a 2-tuple: the non-space run after "PA/" and the next token.
match = re.findall(r"PA/(\S*)\s*(\S*)", text)
print(match)
I have a program that converts PDFs into HTML, and I need to extend it so that, after converting, it searches for the tags PA/ and the characters in front of them and saves these tags and characters to a CSV file. I'm trying to do it but I can't; could someone help me out, please? Here's the code so far: The HTML is too big, so I'll write here a part of it that includes the PA's and the text that I don't want: For writing to CSV: 这篇关于如何使用美丽的汤列出所有具有PA /在html文件内部的字符串的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!
import csv

# Write a header row followed by one row per (first, second) match tuple.
# 'wb' is the Python 2 csv idiom; `match` is the list produced by
# re.findall in the preceding snippet.
with open('ur file.csv', 'wb') as out:
    csv_out = csv.writer(out)
    # NOTE(review): 'fist_col' looks like a typo for 'first_col'.
    csv_out.writerow(['fist_col', 'second_col'])
    csv_out.writerows(match)
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
def rename_files(directory=r'C:\\PROJECT\\pdfs'):
    """Remove space characters from the name of every entry in *directory*.

    The listing and the current working directory are printed first, as a
    simple progress trace.

    Args:
        directory: Folder whose entries are renamed. Defaults to the
            project's PDF folder.

    Raises:
        OSError: If the directory cannot be listed or a rename fails.
    """
    file_list = os.listdir(directory)
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    for file_name in file_list:
        # str.replace works on Python 2 and 3; translate(None, " ") is
        # Python-2-only.
        new_name = file_name.replace(" ", "")
        if new_name == file_name:
            continue
        # Rename through full paths so the working directory is never
        # changed and cannot be left pointing at `directory` on failure.
        os.rename(os.path.join(directory, file_name),
                  os.path.join(directory, new_name))
rename_files()
def run(command):
    """Run *command* as a child process and capture what it prints.

    Args:
        command: The full command line as a single string.

    Returns:
        A ``(success, output, errors)`` tuple: ``success`` is True when the
        process exit code is 0; ``output`` and ``errors`` are the captured
        stdout and stderr.
    """
    if platform.system() != 'Windows':
        # Outside Windows the command line is split into an argument list.
        args = shlex.split(command)
    else:
        # On Windows the command string is handed to Popen unchanged.
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors
# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print("%s is not a directory" % base_directory)
    exit(1)

# Change this to the location of the converter script (pdfminer's pdf2txt.py)
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print("Could not find %s" % bin_path)
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF file
        if not file_name.endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert the PDF to HTML. The output name is "<file_name>.html"
        # with no directory part.
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)
htmls_path = 'C:\\PROJECT'
# Two capture groups: the non-space run right after "PA/" and the next
# whitespace-separated token.
pa_pattern = re.compile(r"PA/(\S*)\s*(\S*)")
# Open the CSV once so rows from every HTML file end up in the same file;
# reopening it with 'w' per file would keep only the last file's rows.
# NOTE(review): on Windows, Python 2's csv module expects mode 'wb' and
# Python 3 expects newline='' to avoid blank lines between rows.
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            # os.walk yields bare names; join with dir_path so files in
            # sub-directories (or outside the cwd) can be opened.
            with open(os.path.join(dir_path, file_name)) as markup:
                soup = BeautifulSoup(markup.read(), 'html.parser')
            text = soup.get_text()
            match = pa_pattern.findall(text)
            print(match)
            # Each match is a 2-tuple and becomes one CSV row.
            writer.writerows(match)
<html>
<title>Testing</title>
<body>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34023px; width:84px; height:32px;"><span style="font-family: YFEHEP+Times-Bold; font-size:17px">JUST SOME TEXT THAT I DON'T WANT TO HAVE ON THE CSV FILE
<br></span><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/00986/17 GTD
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34066px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01008/17 GTD
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34105px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01095/17 GTD
</body>
</html>
import re
from bs4 import BeautifulSoup
html_doc = """
<html>
<title>Testing</title>
<body>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34023px; width:84px; height:32px;"><span style="font-family: YFEHEP+Times-Bold; font-size:17px">JUST SOME TEXT THAT I DON'T WANT TO HAVE ON THE CSV FILE
<br></span><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/00986/17 GTD
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34066px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01008/17 GTD
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34105px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01095/17 GTD
</body>
</html>
"""
# Parse the sample markup with the standard-library HTML parser.
soup = BeautifulSoup(html_doc, 'html.parser')
# Work on the document text only, so tag attributes cannot match.
text = soup.get_text()
# Each result is a 2-tuple: the non-space run after "PA/" and the next token.
match = re.findall(r"PA/(\S*)\s*(\S*)", text)
print(match)
import csv
# Write a header row followed by one row per (first, second) match tuple.
# 'wb' is the Python 2 csv idiom; `match` is the list produced by
# re.findall in the preceding snippet.
with open('ur file.csv', 'wb') as out:
    csv_out = csv.writer(out)
    # NOTE(review): 'fist_col' looks like a typo for 'first_col'.
    csv_out.writerow(['fist_col', 'second_col'])
    csv_out.writerows(match)