LLM-Generated Summary of nvidia-h100-tensor-core-hopper-whitepaper.pdf
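The script below works in two stages: stage_0 extracts the body text from pages 6-70 of the whitepaper with pdfplumber, strips the per-page header and footer lines, and marks sentence boundaries with a '####' delimiter; stage_1 packs those sections into chunks of at most 8192 characters and asks qwen-max (via the DashScope SDK) to summarize the H100 features described in each chunk.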
Code
import pdfplumber
import time


def split_text_to_chunks(text, max_chunk_size=8192, delimiter='####'):
    """
    Split a long text into chunks no larger than max_chunk_size, never
    splitting across sections. Each section begins with the given delimiter.

    :param text: the long text to split
    :param max_chunk_size: maximum size of each chunk
    :param delimiter: the delimiter that starts each section
    :return: list of chunks
    """
    # Split on the delimiter and drop empty sections
    sections = [section for section in text.split(delimiter) if section.strip()]
    chunks = []
    current_chunk = ""
    for section in sections:
        # Re-attach the leading delimiter to every section
        section = delimiter + section
        # Size check leaves a little headroom for delimiters and newlines.
        # Note: a single section longer than max_chunk_size still becomes
        # its own oversized chunk.
        if len(current_chunk) + len(section) + len(delimiter) * 2 + 2 <= max_chunk_size:
            current_chunk += section
        else:
            # Adding this section would exceed the limit: save the current
            # chunk and start a new one with this section
            if current_chunk:
                chunks.append(current_chunk.strip() + "\n")
            current_chunk = section
    # Save the last chunk if it has content
    if current_chunk:
        chunks.append(current_chunk.strip() + "\n")
    return chunks
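As a quick illustration of the chunker (the sample string and chunk size below are made up for demonstration), each '####' section stays whole and a new chunk starts once the size budget would be exceeded:

# Hypothetical smoke test for split_text_to_chunks
sample = "####one. ####two. ####three."
for chunk in split_text_to_chunks(sample, max_chunk_size=20):
    print(repr(chunk))
# At this tiny size each section becomes its own chunk: three chunks total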
def llm_summary(question):
    from http import HTTPStatus
    import dashscope
    from dashscope import Generation

    dashscope.api_key = "sk-"  # fill in your DashScope API key
    # System prompt (Chinese): "You are a GPU expert; extract the
    # H100-related features from the user's article and answer in Chinese."
    system_prompt = "你是一位GPU专家,能从用户输入的文章中提取出H100相关的特性,输出中文"
    messages = [{'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': question}]
    response = Generation.call(model="qwen-max", messages=messages, result_format='message')
    if response.status_code == HTTPStatus.OK:
        return response.output.choices[0]['message']['content']
    # On any non-OK status, return an empty summary
    return ""
def stage_0():
    pdf_path = 'nvidia-h100-tensor-core-hopper-whitepaper.pdf'
    output = ""
    fo = open("H100_Architecture.txt", "w", encoding="utf-8")
    with pdfplumber.open(pdf_path) as pdf:
        # Pages 6-70 hold the architecture content; drop the first line
        # (header) and last two lines (footer) of every page
        for idx, page in enumerate(pdf.pages[5:70]):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")[1:-2]
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                output += " " + line
                # Start a new '####' section after each sentence-ending period
                if line.endswith('.'):
                    output += "\n\n####"
    fo.write(output)
    fo.close()
def stage_1():
    article_body_content = open("H100_Architecture.txt", "r", encoding="utf-8").read()
    f = open("H100_Architecture_Summary.txt", "a+", encoding="utf-8")
    chunks = split_text_to_chunks(article_body_content)
    total = len(chunks)
    for idx, chunk in enumerate(chunks):
        # print(f"Chunk {idx + 1}: {len(chunk)} \n{chunk}\n")
        summary = llm_summary(chunk)
        print(f" --------- {idx + 1}/{total} @{(idx + 1) / total:.3f} --------- ")
        print(summary)
        f.write(f"\n####{summary}")
        f.flush()
        # Pause between calls to stay under the API rate limit
        time.sleep(5)
    f.close()
if __name__ == "__main__":
    stage_0()
    stage_1()
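After both stages finish, H100_Architecture.txt holds the extracted whitepaper text as '####'-delimited sections, and H100_Architecture_Summary.txt accumulates one '####'-prefixed Chinese summary per chunk. Because the summary file is opened in append mode, rerunning stage_1 adds to it rather than overwriting; delete the file first for a fresh run.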