一、第一版

第一版比较通俗易懂:使用 urllib 中的 request 模块获取网页,再用 bs4 中的 BeautifulSoup 解析(也可以用第三方的 requests 库代替 urllib.request),结果直接打印在命令行中。

from urllib import request
from urllib import error
from bs4 import BeautifulSoup

def getHtml(url, ua_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko', num_retries=5):
    """
    Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with ``ua_agent`` as its User-Agent header. When the
    server answers with a 5xx status the request is retried, up to
    ``num_retries`` additional times. Any other ``URLError``/``HTTPError``
    (for example a 404 or a connection failure) is not retried.

    Args:
        url: Address of the page to download.
        ua_agent: Value sent in the ``User-Agent`` request header.
        num_retries: Maximum number of retries after a 5xx response; values
            of zero or less mean a single attempt.

    Returns:
        The page source as ``str``, or ``None`` if it could not be fetched.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {"User-Agent": ua_agent}
    req = request.Request(url, headers=headers)

    # One initial attempt plus at most ``num_retries`` retries.
    for _ in range(max(num_retries, 0) + 1):
        try:
            # The context manager closes the connection once the body is read.
            with request.urlopen(req) as response:
                return response.read().decode('utf-8')
        except error.URLError as e:
            # HTTPError is a subclass of URLError, so one clause covers both.
            # Only HTTPError carries ``code``; a plain URLError is not retried.
            code = getattr(e, 'code', None)
            if code is None or not 500 <= code < 600:
                return None
    return None

def get_movie_all(html):
    """
    Return every movie entry found in the page source ``html``.

    An entry is a ``<div>`` whose class attribute is ``bd doulist-subject``.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) is accepted.

    Returns:
        A list of the matching tags; empty when there is no markup or no
        entry matches.
    """
    # A failed fetch yields no markup; there is nothing to parse.
    if not html:
        return []
    soup = BeautifulSoup(html, "html.parser")
    return soup.find_all('div', class_='bd doulist-subject')

def get_movie_one(movie_list):
    """
    Build a one-line text summary for a single movie entry.

    The summary is the concatenation of the text inside the entry's
    ``div.title``, a ``"|| 评分:"`` field for each text fragment of
    ``span.rating_nums`` (or the placeholder rating ``5.0`` when that span is
    absent), and a ``"|| "``-prefixed field for every text fragment of
    ``div.abstract``. The result ends with a newline.

    Args:
        movie_list: One movie entry (despite the name, a single item); it is
            converted with ``str()`` and parsed as HTML.

    Returns:
        The summary string. A missing title or abstract contributes nothing
        instead of raising ``IndexError``.
    """
    soup = BeautifulSoup(str(movie_list), "html.parser")
    parts = []

    title = soup.find('div', class_="title")
    if title is not None:
        parts.extend(title.stripped_strings)

    score = soup.find('span', class_='rating_nums')
    if score is None:
        # No rating element on the entry: fall back to the placeholder score.
        parts.append("|| 评分:5.0")
    else:
        for line in score.stripped_strings:
            parts.append("|| 评分:" + line)

    abstract = soup.find('div', class_='abstract')
    if abstract is not None:
        for line in abstract.stripped_strings:
            parts.append("|| " + line)

    parts.append('\n')
    return "".join(parts)

if __name__ == "__main__":
    import sys

    # range(0, 25, 25) yields only 0, so just the first page is crawled;
    # raise the stop value to walk further pages in steps of 25.
    for page in range(0, 25, 25):
        url = "https://www.douban.com/doulist/3516235/?start={}&sort=seq&playable=0&sub_type=".format(page)
        htmlInfo = getHtml(url)
        if htmlInfo is None:
            # getHtml returns None when the download fails; skip this page
            # rather than handing None to the parser.
            print("failed to fetch {}".format(url), file=sys.stderr)
            continue
        for movie in get_movie_all(htmlInfo):
            # Each summary already ends with '\n', so print leaves a blank
            # line between movies.
            print(get_movie_one(movie))
01-07 11:58