我正在自学 Python 多线程的基础知识,但遇到了困难。
我希望脚本对一组 PDF 文件应用同一个函数。简单地说,这个函数应该统计每个 PDF 文件中的表格数量,然后返回一个汇总列表,列出每个文件各包含多少个表格。
现在,我收到一条错误消息,提示“File format not supported”(文件格式不受支持)。据我所知,列表中的每个路径都是以 .pdf 结尾的完整路径。我不知道自己哪里做错了。
我已将代码精简到只保留要点,如下所示:
import camelot
from multiprocessing.dummy import Pool as ThreadPool
import glob
import os
#get a list of all the pdf paths in the directory I am interested in
# NOTE(review): os.listdir() returns bare file names, and os.path.abspath()
# resolves them against the current working directory, not the listed folder.
# The FileNotFoundError traceback below shows the resulting path without the
# Desktop\threading_learning part.
pdfs = [os.path.abspath(x) for x in os.listdir(r'C:\Users\josiahh\Desktop\threading_learning')]
#format each path to have the r letter in front of it
# NOTE(review): this wraps every path in the literal characters r'...', so the
# string now ends with a quote instead of ".pdf". camelot checks
# filepath.lower().endswith('.pdf') and raises
# NotImplementedError("File format not supported") when that check fails.
rpdfs = ["r'" + pdf + "'" for pdf in pdfs]
#function that counts each table in the pdf. THIS IS WHERE SOMETHING IS WRONG...I THINK
# Shared list that len_table5 appends one table count to per call.
listoflengths = []
# NOTE(review): the function body below appears without indentation in this
# paste. The function has no return statement, so pool.map() collects None
# for every file; the counts only end up in listoflengths.
def len_table5(filepath):
tables = camelot.read_pdf(filepath, pages = '1-end',flavor='stream')
tablelength = len(tables)
listoflengths.append(tablelength)
#threading code
# Run len_table5 over the (quoted) paths on a pool of 5 worker threads.
pool = ThreadPool(5)
results = pool.map(len_table5, rpdfs)
pool.close()
pool.join()
如能提供任何帮助,我将不胜感激。如果有需要我进一步说明的地方,请告诉我。
编辑:
在文件路径前加上 r 前缀时的回溯信息:
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-57-d38bdf75c567> in <module>
1
2 pool = ThreadPool(5)
----> 3 results = pool.map(len_table5, rpdfs)
4 pool.close()
5 pool.join()
~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in map(self, func, iterable, chunksize)
286 in a list that is returned.
287 '''
--> 288 return self._map_async(func, iterable, mapstar, chunksize).get()
289
290 def starmap(self, func, iterable, chunksize=None):
~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in get(self, timeout)
668 return self._value
669 else:
--> 670 raise self._value
671
672 def _set(self, i, obj):
~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
117 job, i, func, args, kwds = task
118 try:
--> 119 result = (True, func(*args, **kwds))
120 except Exception as e:
121 if wrap_exception and func is not _helper_reraises_exception:
~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in mapstar(args)
42
43 def mapstar(args):
---> 44 return list(map(*args))
45
46 def starmapstar(args):
<ipython-input-54-025080eb0d6f> in len_table5(filepath)
1 listoflengths = []
2 def len_table5(filepath):
----> 3 tables = camelot.read_pdf(filepath, pages = '1-end',flavor='stream')
4 tablelength = len(tables)
5 listoflengths.append(tablelength)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\io.py in read_pdf(filepath, pages, password, flavor, suppress_stdout, layout_kwargs, **kwargs)
101
102 validate_input(kwargs, flavor=flavor)
--> 103 p = PDFHandler(filepath, pages=pages, password=password)
104 kwargs = remove_extra(kwargs, flavor=flavor)
105 tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\handlers.py in __init__(self, filepath, pages, password)
33 self.filepath = filepath
34 if not filepath.lower().endswith('.pdf'):
---> 35 raise NotImplementedError("File format not supported")
36
37 if password is None:
NotImplementedError: File format not supported
文件路径前不加 r 前缀时的回溯信息:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-59-07744a46a83f> in <module>
1
2 pool = ThreadPool(5)
----> 3 results = pool.map(len_table5, pdfs)
4 pool.close()
5 pool.join()
~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in map(self, func, iterable, chunksize)
286 in a list that is returned.
287 '''
--> 288 return self._map_async(func, iterable, mapstar, chunksize).get()
289
290 def starmap(self, func, iterable, chunksize=None):
~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in get(self, timeout)
668 return self._value
669 else:
--> 670 raise self._value
671
672 def _set(self, i, obj):
~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
117 job, i, func, args, kwds = task
118 try:
--> 119 result = (True, func(*args, **kwds))
120 except Exception as e:
121 if wrap_exception and func is not _helper_reraises_exception:
~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in mapstar(args)
42
43 def mapstar(args):
---> 44 return list(map(*args))
45
46 def starmapstar(args):
<ipython-input-58-e6499958826d> in len_table5(filepath)
14 listoflengths = []
15 def len_table5(filepath):
---> 16 tables = camelot.read_pdf(filepath, pages = '1-end',flavor='stream')
17 tablelength = len(tables)
18 listoflengths.append(tablelength)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\io.py in read_pdf(filepath, pages, password, flavor, suppress_stdout, layout_kwargs, **kwargs)
101
102 validate_input(kwargs, flavor=flavor)
--> 103 p = PDFHandler(filepath, pages=pages, password=password)
104 kwargs = remove_extra(kwargs, flavor=flavor)
105 tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\handlers.py in __init__(self, filepath, pages, password)
41 if sys.version_info[0] < 3:
42 self.password = self.password.encode('ascii')
---> 43 self.pages = self._get_pages(self.filepath, pages)
44
45 def _get_pages(self, filepath, pages):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\handlers.py in _get_pages(self, filepath, pages)
64 page_numbers.append({'start': 1, 'end': 1})
65 else:
---> 66 infile = PdfFileReader(open(filepath, 'rb'), strict=False)
67 if infile.isEncrypted:
68 infile.decrypt(self.password)
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\josiahh\\camelot - Copy (2) - Copy.pdf'
最佳答案
有一些事情要考虑:
不要硬编码路径字符串,这样不够灵活,而且很可能无法在其他计算机上使用。
os.listdir() 只返回文件夹中的文件名(不含目录部分),因此 os.path.abspath() 会把它们解析到当前工作目录下,得不到正确的路径。
不确定为什么要在文件名前添加 r 前缀,您真的需要它吗?r 前缀只对源代码中的字符串字面量有意义,把它拼接进字符串只会改变路径本身。
更正后的版本如下:
import os
from multiprocessing.dummy import Pool as ThreadPool
import camelot
# Resolve the folder relative to this script instead of hard-coding an
# absolute path. __file__ is not defined in interactive sessions (IPython,
# Jupyter), so fall back to the current working directory there.
try:
    BASE_PATH = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_PATH = os.getcwd()
FOLDER_PATH = os.path.join(BASE_PATH, "threading_learning")
# os.listdir() returns bare file names, so each one is joined back onto the
# folder it came from. Only *.pdf entries are kept because camelot raises
# NotImplementedError("File format not supported") for any other extension.
# Sorting makes the processing order reproducible; os.listdir() order is
# arbitrary.
pdfs = [
    os.path.join(FOLDER_PATH, file_name)
    for file_name in sorted(os.listdir(FOLDER_PATH))
    if file_name.lower().endswith(".pdf")
]
# Table counts appended by len_table5(), one entry per processed file.
listoflengths = []
def len_table5(filepath):
    """Count the tables camelot detects in a single PDF.

    The count is appended to the module-level ``listoflengths`` and is also
    returned, so that ``pool.map`` yields the counts in the same order as
    the input paths instead of a list of ``None``.

    Args:
        filepath: Path of the PDF file to scan; camelot requires it to end
            with ``.pdf``.

    Returns:
        The number of tables found by ``camelot.read_pdf`` for this file.
    """
    tables = camelot.read_pdf(filepath, pages='1-end', flavor='stream')
    tablelength = len(tables)
    # Worker threads finish in no fixed order, so this list follows
    # completion order rather than input order.
    listoflengths.append(tablelength)
    return tablelength
# threading code
# Run len_table5 over every PDF path on a pool of 5 worker threads.
pool = ThreadPool(5)
try:
    # map() blocks until every file has been processed and re-raises the
    # first exception raised inside a worker.
    results = pool.map(len_table5, pdfs)
finally:
    # Always shut the pool down, even when a worker failed, so the worker
    # threads are not left running.
    pool.close()
    pool.join()
print(listoflengths)