获取gitee上某个组织所有仓库的介绍
背景: 想用LLM总结一下ascend的开源项目
步骤:
1.用下面的脚本抓取所有项目介绍
2.合并文件
3.上传到智谱长文档解读
4.提问
代码(由GPT-4o自动生成):
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import html2text
def download_and_extract_links(url, timeout=10):
    """Fetch one Gitee organization project-list page and collect repo links.

    Parameters:
        url: the organization "projects" page to scrape.
        timeout: seconds to wait for the HTTP response (new optional
            parameter, default 10, so a hung connection cannot stall
            the whole crawl).

    Returns:
        A list of (title, absolute_repo_url) tuples, one per
        <a class="repository"> anchor found on the page.

    Raises:
        ValueError: if the page does not answer with HTTP 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"无法访问网页: {url}")
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for a_tag in soup.find_all('a', class_='repository'):
        href = a_tag.get('href')
        if href:
            # .strip(): anchor text usually carries surrounding
            # whitespace/newlines, and callers use the title as a filename.
            # urljoin handles both relative ("org/repo") and absolute
            # ("/org/repo") hrefs without producing a double slash.
            links.append((a_tag.text.strip(), urljoin("https://gitee.com/", href)))
    return links
def download_and_save_markdown_content(url, file_path, timeout=10):
    """Download a Gitee repo page, convert its README panel to Markdown, save it.

    Parameters:
        url: the repository page to scrape.
        file_path: destination path for the Markdown output; parent
            directories are created if missing (previously the write
            crashed when e.g. "docs/" did not exist).
        timeout: seconds to wait for the HTTP response (new optional
            parameter, default 10).

    Raises:
        ValueError: if the page is not HTTP 200, or if it contains no
            <div class="file_content markdown-body"> README panel.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"无法访问网页: {url}")
    soup = BeautifulSoup(response.text, 'html.parser')
    # The README is rendered inside this specific div on Gitee repo pages.
    div_tag = soup.find('div', class_='file_content markdown-body')
    if div_tag is None:
        raise ValueError(f"网页中未找到 <div class='file_content markdown-body'> 标签")
    # decode_contents() yields the inner HTML without the wrapping <div>.
    html_content = div_tag.decode_contents()
    markdown_content = html2text.html2text(html_content)
    # "or '.'" guards the bare-filename case, where dirname() returns "".
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(markdown_content)
def _safe_filename(title):
    """Collapse path separators and whitespace runs so a repo title is a safe file stem."""
    cleaned = re.sub(r'[\\/:*?"<>|\s]+', ' ', title).strip()
    return cleaned or "untitled"


def main():
    """Crawl pages 1-3 of the Ascend org on Gitee and save each repo's README as Markdown."""
    # Create the output directory up front; previously the first write failed
    # when docs/ did not exist.
    os.makedirs("docs", exist_ok=True)
    for page in range(1, 4):
        url = f'https://gitee.com/organizations/ascend/projects?page={page}'
        for title, link in download_and_extract_links(url):
            print(title, link)
            try:
                download_and_save_markdown_content(link, f"docs/{_safe_filename(title)}.md")
            except ValueError as err:
                # Best effort: one repo without a README panel (or a bad
                # response) should not abort the whole crawl.
                print(f"跳过 {title}: {err}")


if __name__ == "__main__":
    main()