获取gitee上某个组织所有仓库的介绍

背景: 想用LLM总结一下ascend的开源项目
步骤:
1.用下面的脚本抓取所有项目介绍
2.合并文件
3.上传到智谱长文档解读
4.提问

代码(由GPT-4o自动生成)

import os

import requests
import html2text
from bs4 import BeautifulSoup

def download_and_extract_links(url, timeout=10):
    """Fetch a Gitee project-list page and extract repository links.

    Args:
        url: Project-list page URL to download.
        timeout: Seconds to wait for the HTTP response. Added (with a
            backward-compatible default) so a hung connection cannot
            stall the crawl indefinitely.

    Returns:
        List of ``(title, absolute_url)`` tuples, one per
        ``<a class="repository">`` anchor found on the page.

    Raises:
        ValueError: If the page does not return HTTP 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"无法访问网页: {url}")

    soup = BeautifulSoup(response.text, 'html.parser')

    # hrefs on this page are site-relative ("/org/repo"), so prefix the
    # Gitee host to form an absolute URL.
    links = []
    for a_tag in soup.find_all('a', class_='repository'):
        href = a_tag.get('href')
        if href:
            # .strip() removes the surrounding whitespace/newlines Gitee
            # leaves inside the anchor; the raw text is later used as a
            # filename, where stray newlines break open().
            links.append((a_tag.text.strip(), f"https://gitee.com/{href}"))

    return links

def download_and_save_markdown_content(url, file_path, timeout=10):
    """Download a Gitee repo page, convert its README body to Markdown, save it.

    Args:
        url: Repository page URL (its rendered README lives in a
            ``<div class="file_content markdown-body">``).
        file_path: Destination path for the generated Markdown file.
        timeout: Seconds to wait for the HTTP response. Added (with a
            backward-compatible default) so one slow repo cannot hang
            the whole crawl.

    Raises:
        ValueError: If the page does not return HTTP 200, or contains no
            rendered README div (e.g. the repo has no README).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"无法访问网页: {url}")

    soup = BeautifulSoup(response.text, 'html.parser')

    # Gitee renders the README inside this specific div.
    div_tag = soup.find('div', class_='file_content markdown-body')
    if div_tag is None:
        # Bug fix: the original f-string had no placeholder, so the error
        # never said WHICH page lacked the div — include the url.
        raise ValueError(f"网页中未找到 <div class='file_content markdown-body'> 标签: {url}")

    # Inner HTML only (without the wrapping div), then convert to Markdown.
    html_content = div_tag.decode_contents()
    markdown_content = html2text.html2text(html_content)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(markdown_content)

def main():
    """Crawl pages 1-3 of Ascend's Gitee project list and save each README.

    Writes one ``docs/<repo>.md`` file per repository. Repositories whose
    page has no rendered README are reported and skipped instead of
    aborting the whole crawl.
    """
    # Bug fix: the original crashed on a fresh checkout because docs/
    # never existed before the first write.
    os.makedirs("docs", exist_ok=True)
    for page in range(1, 4):
        url = f'https://gitee.com/organizations/ascend/projects?page={page}'
        for title, link in download_and_extract_links(url):
            print(title, link)
            # Titles scraped from anchor text can carry whitespace or '/',
            # either of which breaks the single-segment output path.
            safe_title = title.strip().replace('/', '_')
            try:
                download_and_save_markdown_content(link, f"docs/{safe_title}.md")
            except ValueError as exc:
                # Best-effort crawl: one README-less repo should not kill
                # the remaining downloads.
                print(f"跳过 {link}: {exc}")


if __name__ == "__main__":
    main()
06-07 10:36