思路
主要为开发者提供一个思路,这里并不是完整的商业项目,只是一时兴起写的一份demo,希望对大家有帮助。
- 制作一个接口用于上传文件
- 写一个程序把文件上传到上面的接口中
- 对得到的文件进行文本提取,并调用 GPT 进行分析
开源仓库地址:GPT-PDF
接口代码
from flask import Flask, request, Response
import PyPDF2
app = Flask(__name__)


@app.route('/upload', methods=['POST'])
def upload_file():
    """Extract the text of an uploaded PDF and return it as plain text.

    Expects a POST request whose uploaded files include a ``pdf`` field.

    Returns:
        The concatenated text of every page with content type
        ``text/plain; charset=utf-8``; ``("No file part", 400)`` when the
        ``pdf`` field is missing; ``("No selected file", 400)`` when the
        filename is empty; ``(str(error), 500)`` when reading the PDF or
        building the response fails.
    """
    if 'pdf' not in request.files:
        return "No file part", 400
    file = request.files['pdf']
    if file.filename == '':
        return "No selected file", 400
    try:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string page by page; "or ''" keeps
        # the join safe if a page produces no text.
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        # Declare the type and the utf-8 charset explicitly for the client.
        return Response(text, content_type="text/plain; charset=utf-8")
    except Exception as e:
        # Demo-level error reporting: the exception message is sent back
        # to the caller as the response body.
        return str(e), 500


if __name__ == '__main__':
    # NOTE(review): debug=True is meant for local development only; turn it
    # off before exposing this service to anything but localhost.
    app.run(debug=True)
上传代码
# coding=gbk
import requests

url = 'http://localhost:5000/upload'
# Open the PDF in a context manager so the handle is closed as soon as the
# upload request has completed.
with open('2.pdf', 'rb') as pdf_file:
    response = requests.post(url, files={'pdf': pdf_file})
# Print the decoded text rather than the raw encoded bytes.
print(response.text)
pdf转文本代码
# coding=gbk
# pip install pypdf2 --upgrade
import PyPDF2

# Keep all reading inside the "with" block so the file is still open while
# the pages are being extracted.
with open('2.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # Print the extracted text of each page, in page order.
    for page_obj in reader.pages:
        print(page_obj.extract_text())
综合上述步骤完整代码
import http.client
import json
import requests
# import time
# 开始计时
# start_time = time.time()
# Fetch the PDF text from the upload endpoint.
url = 'http://localhost:5000/upload'
# Context manager closes the PDF handle once the upload has finished.
with open('3.pdf', 'rb') as pdf_file:
    response = requests.post(url, files={'pdf': pdf_file})
# Stop here on a 4xx/5xx reply; otherwise the endpoint's error message would
# be sent on for summarisation as if it were the paper.
response.raise_for_status()
long_text = response.text  # full text returned by the endpoint
# print(long_text)
def split_text(text, max_size):
    """Split *text* into consecutive segments of at most *max_size* characters.

    Args:
        text: The string to split.
        max_size: Maximum number of characters per segment; must be positive.

    Returns:
        An iterator over the segments in order. Every segment except possibly
        the last has exactly ``max_size`` characters; an empty *text* produces
        no segments.

    Raises:
        ValueError: If ``max_size`` is not positive.
    """
    # Validate up front: a negative step would otherwise produce no segments
    # at all and silently drop the whole text.
    if max_size <= 0:
        raise ValueError("max_size must be a positive integer")
    return (
        text[start:start + max_size]
        for start in range(0, len(text), max_size)
    )
# GPT API configuration: request headers and the HTTPS connection to
# api.zhangsan.cloud.
# NOTE(review): the bearer token is written directly in the source; it looks
# like a masked placeholder, so substitute a real key before running.
headers = {
    'Accept': 'application/json',
    'Authorization': 'Bearer sk-zkyXXXXXXXXXXXXXXXaA47c77',
    'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
    'Content-Type': 'application/json',
}
conn = http.client.HTTPSConnection("api.zhangsan.cloud")
# Send the paper to the GPT API segment by segment and collect the decoded
# JSON response of every request.
all_responses = []
# Instructions for the model, sent as the system message.
system_prompt = "请总结本篇论文,并详细告诉我论文中是基于什么背景.例如:用到了什么方法/算法,是怎么解决的,得到了什么结果,一步步详细告诉我,reply in chinese."
try:
    for segment in split_text(long_text, 8000):
        # Each iteration is a separate request that carries only the messages
        # listed here, so the system prompt has to accompany every segment,
        # not just the first one.
        payload = json.dumps({
            "model": "gpt-3.5-turbo-16k-0613",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": segment},
            ],
        })
        conn.request("POST", "/v1/chat/completions", payload, headers)
        res = conn.getresponse()
        # Read the body in full before the next request reuses the connection.
        data = res.read()
        all_responses.append(json.loads(data.decode("utf-8")))
finally:
    # Release the connection even when a request or JSON decoding fails.
    conn.close()
# Print the model's answer for every segment.
for reply in all_responses:
    choices = reply.get("choices")
    if not choices:
        # A payload without "choices" (e.g. an API error object) is printed
        # as-is instead of aborting the loop with a KeyError.
        print(reply)
        continue
    print(choices[0]["message"]["content"])
# print('\n\n')
# # 结束计时并输出运行时间
# end_time = time.time()
# print("Flask API 请求运行时间: {:.2f}秒".format(end_time - start_time))
效果
先运行接口,再运行分析。
效果如下: