import requests
import os
import re
from bs4 import BeautifulSoup

# Start URL
all_url = 'http://www.7160.com/xiaohua/'
# Save path
path = 'H:/school_girl/'
# Request headers
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
}

################################# Request (list pages) #################################
html = requests.get(all_url, headers=header)
# The site serves GBK/GB2312, but requests falls back to ISO-8859-1,
# so round-trip the text back to bytes and decode it as GBK
start_html = html.text.encode('iso-8859-1').decode('gbk')
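# Added remark: an equivalent, arguably clearer way to get correctly decoded
# text is to tell the Response object its real encoding up front and let
# .text do the decoding (a sketch of the same idea using the requests API):
#   html.encoding = 'gbk'
#   start_html = html.text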
################################# Parse #################################
soup = BeautifulSoup(start_html, 'lxml')
# Maximum list-page number (hard-coded here rather than scraped from the pager)
page = 255
# Base URL shared by every list page
same_url = 'http://www.7160.com/xiaohua/'

for n in range(1, int(page) + 1):
    ul = same_url + 'list_6_' + str(n) + '.html'
    #################### Request (one list page) ###############
    html = requests.get(ul, headers=header)
    start_html = html.text.encode('iso-8859-1').decode('gbk')
    ######################## Parse ##########################
    soup = BeautifulSoup(start_html, 'lxml')
    all_a = soup.find('div', class_='news_bom-left').find_all('a', target='_blank')
    for a in all_a:
        title = a.get_text()
        if title != '':
            ######################## Create the album directory ##########################
            # Windows cannot create a directory whose name contains '?'
            if os.path.exists(path + title.strip().replace('?', '')):
                # print('directory already exists')
                flag = 1
            else:
                os.makedirs(path + title.strip().replace('?', ''))
                flag = 0
            os.chdir(path + title.strip().replace('?', ''))
            ######################### END ###########################

            ################### Request (one album page) ###############
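            # Added remark: os.chdir() switches the process-wide working
            # directory, so the open(filename, 'wb') in the download step
            # below implicitly writes into this album's directory.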
            print('About to crawl: ' + title)
            hrefs = a['href']
            in_url = 'http://www.7160.com'
            href = in_url + hrefs
            htmls = requests.get(href, headers=header)
            html = htmls.text.encode('iso-8859-1').decode('gbk')
            ####################### Parse ######################
            mess = BeautifulSoup(html, 'lxml')
            titles = mess.find('h1').text
            # The second-to-last pager link holds the number of pages in the album
            pic_max = mess.find('div', class_='itempage').find_all('a')[-2].text
            # If the directory already existed and holds at least pic_max files,
            # the album was fully downloaded on an earlier run
            if flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max):
                print('Already saved, skipping')
                continue
            for num in range(1, int(pic_max) + 1):
                href = a['href']
                # Keep only full 14-character chunks, which drops the
                # trailing '.html' from the fixed-length album href
                hrefs = re.findall(r'.{14}', href)
                href = "".join(hrefs)
                if num == 1:
                    html = in_url + href + '.html'
                else:
                    html = in_url + href + '_' + str(num) + '.html'
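                # Added remark: the site paginates an album as <id>.html for
                # the first image and <id>_2.html, <id>_3.html, ... for the
                # rest; the two branches above rebuild exactly that scheme.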
                ################### Request (one image page) ###############
                htmls = requests.get(html, headers=header)
                html = htmls.text.encode('iso-8859-1').decode('gbk')
                ####################### Parse ######################
                mess = BeautifulSoup(html, 'lxml')
                # The full-size image is the <img> whose alt text is the album title
                pic_url = mess.find('img', alt=titles)
                print(pic_url['src'])
                ######################### Download #####################
                html = requests.get(pic_url['src'], headers=header)
                filename = pic_url['src'].split(r'/')[-1]
                f = open(filename, 'wb')
                f.write(html.content)
                f.close()
            print('Done')
    print('Page', n, 'done')
The printed output looks like this:
About to crawl:
阳光下校花美女迷人桃花眼嘴
http://img.7160.com/uploads/allimg/180913/13-1P913102541.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102541-50.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102541-51.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-50.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-51.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-52.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-53.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-54.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102543.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102543-50.jpg
Done
About to crawl:
黑长直发美女学生日系风制服
http://img.7160.com/uploads/allimg/180912/13-1P912102159.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-50.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-51.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-52.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102200.jpg
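The download step in the script has no error handling: a single timeout or 404 aborts the whole crawl, and a partially written file can be left behind. A minimal hardened variant could look like the sketch below; the helper name download_image, the dest_dir parameter, and the 10-second timeout are illustrative choices, not part of the original script.

import os
import requests

def download_image(src, dest_dir, headers, timeout=10):
    # Derive the file name from the URL, as the original script does
    filename = os.path.join(dest_dir, src.split('/')[-1])
    # Skip files already on disk from an earlier run
    if os.path.exists(filename):
        return False
    resp = requests.get(src, headers=headers, timeout=timeout)
    resp.raise_for_status()  # surface 4xx/5xx instead of saving an error page
    with open(filename, 'wb') as f:
        f.write(resp.content)
    return True

Passing dest_dir explicitly also avoids relying on os.chdir(), and the with block guarantees the file handle is closed even if the write fails, which the bare open()/close() pair in the script does not.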