我正在学习使用Fonduer从文本文档构建知识库。在随附的Jupyter笔记本中运行max_storage_temp_tutorial教程时,执行以下代码会报错:
corpus_parser = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
%time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
以下是我得到的错误:
UnicodeEncodeError: 'ascii' codec can't encode character '\uf0b7' in position 6282: ordinal not in range(128)
以下是错误的堆栈跟踪:
[INFO] fonduer.utils.udf - Clearing existing...
[INFO] fonduer.utils.udf - Running UDF...
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<timed eval> in <module>()
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/fonduer/utils/udf.py in apply(self, xs, clear, parallelism, progress_bar, count, **kwargs)
48 self.logger.info("Running UDF...")
49 if parallelism is None or parallelism < 2:
---> 50 self.apply_st(xs, progress_bar, clear=clear, count=count, **kwargs)
51 else:
52 self.apply_mt(xs, parallelism, clear=clear, **kwargs)
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/fonduer/utils/udf.py in apply_st(self, xs, progress_bar, count, **kwargs)
81
82 # Commit session and close progress bar if applicable
---> 83 udf.session.commit()
84 if pb:
85 pb.close()
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in commit(self)
941 raise sa_exc.InvalidRequestError("No transaction is begun.")
942
--> 943 self.transaction.commit()
944
945 def prepare(self):
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in commit(self)
465 self._assert_active(prepared_ok=True)
466 if self._state is not PREPARED:
--> 467 self._prepare_impl()
468
469 if self._parent is None or self.nested:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _prepare_impl(self)
445 if self.session._is_clean():
446 break
--> 447 self.session.flush()
448 else:
449 raise exc.FlushError(
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in flush(self, objects)
2252 try:
2253 self._flushing = True
-> 2254 self._flush(objects)
2255 finally:
2256 self._flushing = False
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _flush(self, objects)
2378 except:
2379 with util.safe_reraise():
-> 2380 transaction.rollback(_capture_exception=True)
2381
2382 def bulk_save_objects(
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py in __exit__(self, type_, value, traceback)
64 self._exc_info = None # remove potential circular references
65 if not self.warn_only:
---> 66 compat.reraise(exc_type, exc_value, exc_tb)
67 else:
68 if not compat.py3k and self._exc_info and self._exc_info[1]:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause)
247 if value.__traceback__ is not tb:
248 raise value.with_traceback(tb)
--> 249 raise value
250
251 else:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _flush(self, objects)
2342 self._warn_on_events = True
2343 try:
-> 2344 flush_context.execute()
2345 finally:
2346 self._warn_on_events = False
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/unitofwork.py in execute(self)
384 while set_:
385 n = set_.pop()
--> 386 n.execute_aggregate(self, set_)
387 else:
388 for rec in topological.sort(
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/unitofwork.py in execute_aggregate(self, uow, recs)
666 [self.state] +
667 [r.state for r in our_recs],
--> 668 uow)
669
670 def __repr__(self):
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/persistence.py in save_obj(base_mapper, states, uowtransaction, single)
179 _emit_insert_statements(base_mapper, uowtransaction,
180 cached_connections,
--> 181 mapper, table, insert)
182
183 _finalize_insert_update_commands(
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/persistence.py in _emit_insert_statements(base_mapper, uowtransaction, cached_connections, mapper, table, insert, bookkeeping)
828
829 c = cached_connections[connection].\
--> 830 execute(statement, multiparams)
831
832 if bookkeeping:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in execute(self, object, *multiparams, **params)
946 raise exc.ObjectNotExecutableError(object)
947 else:
--> 948 return meth(self, multiparams, params)
949
950 def _execute_function(self, func, multiparams, params):
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/sql/elements.py in _execute_on_connection(self, connection, multiparams, params)
267 def _execute_on_connection(self, connection, multiparams, params):
268 if self.supports_execution:
--> 269 return connection._execute_clauseelement(self, multiparams, params)
270 else:
271 raise exc.ObjectNotExecutableError(self)
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_clauseelement(self, elem, multiparams, params)
1058 compiled_sql,
1059 distilled_params,
-> 1060 compiled_sql, distilled_params
1061 )
1062 if self._has_events or self.engine._has_events:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1198 parameters,
1199 cursor,
-> 1200 context)
1201
1202 if self._has_events or self.engine._has_events:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
1414 )
1415 else:
-> 1416 util.reraise(*exc_info)
1417
1418 finally:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause)
247 if value.__traceback__ is not tb:
248 raise value.with_traceback(tb)
--> 249 raise value
250
251 else:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1168 statement,
1169 parameters,
-> 1170 context)
1171 elif not parameters and context.no_parameters:
1172 if self.dialect._has_events:
~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/dialects/postgresql/psycopg2.py in do_executemany(self, cursor, statement, parameters, context)
681 extras.execute_batch(cursor, statement, parameters)
682 else:
--> 683 cursor.executemany(statement, parameters)
684
685 @util.memoized_instancemethod
UnicodeEncodeError: 'ascii' codec can't encode character '\uf0b7' in position 6282: ordinal not in range(128)
我把传给executemany函数的输入打印出来后,发现其中存在无效的Unicode字符,但不知道接下来该如何处理。
请注意:
我已使用本教程中的download_data.sh脚本下载了pdf和html文件。
我已经安装了安装文档中提到的所有先决条件
适用于Windows的Ubuntu 16.04 bash
PostgreSQL版本:[9.5.13]
Poppler Utils版本:[0.41.0-0ubuntu1.7]
Fonduer版本:[0.2.3]
可以在这里找到这些教程(原文此处为超链接)。
我是在Windows上的Ubuntu环境中运行所需服务的。
最佳答案
问题是由PostgreSQL数据库的编码引起的。Fonduer需要UTF-8编码,而Windows默认情况下使用其他编码。
我只需要执行以下两步:
1.删除(drop)教程所用的数据库。
dropdb stg_temp_max
2.使用UTF-8编码创建一个新数据库。
createdb -E UTF8 -T template0 stg_temp_max