我正在学习使用Fonduer从文本文档构建知识库。在随附的Jupyter笔记本中运行max_storage_temp_tutorial教程时,执行以下代码出现了错误:

corpus_parser = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
%time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)


以下是我得到的错误:

UnicodeEncodeError: 'ascii' codec can't encode character '\uf0b7' in position 6282: ordinal not in range(128)


以下是错误的堆栈跟踪:

[INFO] fonduer.utils.udf - Clearing existing...
[INFO] fonduer.utils.udf - Running UDF...
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<timed eval> in <module>()

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/fonduer/utils/udf.py in apply(self, xs, clear, parallelism, progress_bar, count, **kwargs)
     48         self.logger.info("Running UDF...")
     49         if parallelism is None or parallelism < 2:
---> 50             self.apply_st(xs, progress_bar, clear=clear, count=count, **kwargs)
     51         else:
     52             self.apply_mt(xs, parallelism, clear=clear, **kwargs)

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/fonduer/utils/udf.py in apply_st(self, xs, progress_bar, count, **kwargs)
     81
     82         # Commit session and close progress bar if applicable
---> 83         udf.session.commit()
     84         if pb:
     85             pb.close()

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in commit(self)
    941                 raise sa_exc.InvalidRequestError("No transaction is begun.")
    942
--> 943         self.transaction.commit()
    944
    945     def prepare(self):

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in commit(self)
    465         self._assert_active(prepared_ok=True)
    466         if self._state is not PREPARED:
--> 467             self._prepare_impl()
    468
    469         if self._parent is None or self.nested:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _prepare_impl(self)
    445                 if self.session._is_clean():
    446                     break
--> 447                 self.session.flush()
    448             else:
    449                 raise exc.FlushError(

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in flush(self, objects)
   2252         try:
   2253             self._flushing = True
-> 2254             self._flush(objects)
   2255         finally:
   2256             self._flushing = False

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _flush(self, objects)
   2378         except:
   2379             with util.safe_reraise():
-> 2380                 transaction.rollback(_capture_exception=True)
   2381
   2382     def bulk_save_objects(

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py in __exit__(self, type_, value, traceback)
     64             self._exc_info = None   # remove potential circular references
     65             if not self.warn_only:
---> 66                 compat.reraise(exc_type, exc_value, exc_tb)
     67         else:
     68             if not compat.py3k and self._exc_info and self._exc_info[1]:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause)
    247         if value.__traceback__ is not tb:
    248             raise value.with_traceback(tb)
--> 249         raise value
    250
    251 else:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _flush(self, objects)
   2342             self._warn_on_events = True
   2343             try:
-> 2344                 flush_context.execute()
   2345             finally:
   2346                 self._warn_on_events = False

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/unitofwork.py in execute(self)
    384                 while set_:
    385                     n = set_.pop()
--> 386                     n.execute_aggregate(self, set_)
    387         else:
    388             for rec in topological.sort(

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/unitofwork.py in execute_aggregate(self, uow, recs)
    666                              [self.state] +
    667                              [r.state for r in our_recs],
--> 668                              uow)
    669
    670     def __repr__(self):

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/persistence.py in save_obj(base_mapper, states, uowtransaction, single)
    179         _emit_insert_statements(base_mapper, uowtransaction,
    180                                 cached_connections,
--> 181                                 mapper, table, insert)
    182
    183     _finalize_insert_update_commands(

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/persistence.py in _emit_insert_statements(base_mapper, uowtransaction, cached_connections, mapper, table, insert, bookkeeping)
    828
    829             c = cached_connections[connection].\
--> 830                 execute(statement, multiparams)
    831
    832             if bookkeeping:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in execute(self, object, *multiparams, **params)
    946             raise exc.ObjectNotExecutableError(object)
    947         else:
--> 948             return meth(self, multiparams, params)
    949
    950     def _execute_function(self, func, multiparams, params):

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/sql/elements.py in _execute_on_connection(self, connection, multiparams, params)
    267     def _execute_on_connection(self, connection, multiparams, params):
    268         if self.supports_execution:
--> 269             return connection._execute_clauseelement(self, multiparams, params)
    270         else:
    271             raise exc.ObjectNotExecutableError(self)

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_clauseelement(self, elem, multiparams, params)
   1058             compiled_sql,
   1059             distilled_params,
-> 1060             compiled_sql, distilled_params
   1061         )
   1062         if self._has_events or self.engine._has_events:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
   1198                 parameters,
   1199                 cursor,
-> 1200                 context)
   1201
   1202         if self._has_events or self.engine._has_events:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
   1414                 )
   1415             else:
-> 1416                 util.reraise(*exc_info)
   1417
   1418         finally:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause)
    247         if value.__traceback__ is not tb:
    248             raise value.with_traceback(tb)
--> 249         raise value
    250
    251 else:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
   1168                         statement,
   1169                         parameters,
-> 1170                         context)
   1171             elif not parameters and context.no_parameters:
   1172                 if self.dialect._has_events:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/dialects/postgresql/psycopg2.py in do_executemany(self, cursor, statement, parameters, context)
    681             extras.execute_batch(cursor, statement, parameters)
    682         else:
--> 683             cursor.executemany(statement, parameters)
    684
    685     @util.memoized_instancemethod

UnicodeEncodeError: 'ascii' codec can't encode character '\uf0b7' in position 6282: ordinal not in range(128)


打印传给executemany函数的输入时,我发现其中存在无效的unicode字符,但我不知道接下来该怎么办。

请注意:


我已使用本教程中的download_data.sh脚本下载了pdf和html文件。
我已经安装了安装文档中提到的所有先决条件


适用于Windows的Ubuntu 16.04 bash
PostgreSQL版本:[9.5.13]
Poppler Utils版本:[0.41.0-0ubuntu1.7]
Fonduer版本:[0.2.3]

可以在这里找到这些教程。
我已使用Windows的Ubuntu运行所需的服务

最佳答案

问题是由PostgreSQL数据库的编码引起的。Fonduer需要UTF-8编码,而Windows默认情况下使用其他编码。

我要做的就是:

1.删除(drop)所需的数据库。

dropdb stg_temp_max


2.使用UTF-8编码创建一个新数据库。

 createdb -E UTF8 -T template0 stg_temp_max

08-25 08:49