import requests
import os
import re
from bs4 import BeautifulSoup

# Start URL
all_url = 'http://www.7160.com/xiaohua/'
# Save path
path = 'H:/school_girl/'
# Request headers
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
}

################################# Request (list pages) #################################
html = requests.get(all_url, headers=header)
# The site serves GBK/GB2312, but requests falls back to ISO-8859-1,
# so round-trip the text back to bytes and decode it as GBK
start_html = html.text.encode('iso-8859-1').decode('gbk')
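# Added remark: an equivalent, arguably clearer way to get correctly decoded
# text is to tell the Response object its real encoding up front and let
# .text do the decoding (a sketch of the same idea using the requests API):
#   html.encoding = 'gbk'
#   start_html = html.text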
################################# Parse #################################
soup = BeautifulSoup(start_html, 'lxml')
# Maximum list-page number (hard-coded here rather than scraped from the pager)
page = 255
# Base URL shared by every list page
same_url = 'http://www.7160.com/xiaohua/'

for n in range(1, int(page) + 1):
    ul = same_url + 'list_6_' + str(n) + '.html'
    #################### Request (one list page) ###############
    html = requests.get(ul, headers=header)
    start_html = html.text.encode('iso-8859-1').decode('gbk')
    ######################## Parse ##########################
    soup = BeautifulSoup(start_html, 'lxml')
    all_a = soup.find('div', class_='news_bom-left').find_all('a', target='_blank')
    for a in all_a:
        title = a.get_text()
        if title != '':
            ######################## Create the album directory ##########################
            # Windows cannot create a directory whose name contains '?'
            if os.path.exists(path + title.strip().replace('?', '')):
                # print('directory already exists')
                flag = 1
            else:
                os.makedirs(path + title.strip().replace('?', ''))
                flag = 0
            os.chdir(path + title.strip().replace('?', ''))
            ######################### END ###########################

            ################### Request (one album page) ###############
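            # Added remark: os.chdir() switches the process-wide working
            # directory, so the open(filename, 'wb') in the download step
            # below implicitly writes into this album's directory.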
            print('About to crawl: ' + title)
            hrefs = a['href']
            in_url = 'http://www.7160.com'
            href = in_url + hrefs
            htmls = requests.get(href, headers=header)
            html = htmls.text.encode('iso-8859-1').decode('gbk')
            ####################### Parse ######################
            mess = BeautifulSoup(html, 'lxml')
            titles = mess.find('h1').text
            # The second-to-last pager link holds the number of pages in the album
            pic_max = mess.find('div', class_='itempage').find_all('a')[-2].text
            # If the directory already existed and holds at least pic_max files,
            # the album was fully downloaded on an earlier run
            if flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max):
                print('Already saved, skipping')
                continue
            for num in range(1, int(pic_max) + 1):
                href = a['href']
                # Keep only full 14-character chunks, which drops the
                # trailing '.html' from the fixed-length album href
                hrefs = re.findall(r'.{14}', href)
                href = "".join(hrefs)
                if num == 1:
                    html = in_url + href + '.html'
                else:
                    html = in_url + href + '_' + str(num) + '.html'
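                # Added remark: the site paginates an album as <id>.html for
                # the first image and <id>_2.html, <id>_3.html, ... for the
                # rest; the two branches above rebuild exactly that scheme.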
                ################### Request (one image page) ###############
                htmls = requests.get(html, headers=header)
                html = htmls.text.encode('iso-8859-1').decode('gbk')
                ####################### Parse ######################
                mess = BeautifulSoup(html, 'lxml')
                # The full-size image is the <img> whose alt text is the album title
                pic_url = mess.find('img', alt=titles)
                print(pic_url['src'])
                ######################### Download #####################
                html = requests.get(pic_url['src'], headers=header)
                filename = pic_url['src'].split(r'/')[-1]
                f = open(filename, 'wb')
                f.write(html.content)
                f.close()
            print('Done')
    print('Page', n, 'done')
The printed output looks like this:
About to crawl:
阳光下校花美女迷人桃花眼嘴
http://img.7160.com/uploads/allimg/180913/13-1P913102541.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102541-50.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102541-51.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-50.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-51.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-52.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-53.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-54.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102543.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102543-50.jpg
Done
About to crawl:
黑长直发美女学生日系风制服
http://img.7160.com/uploads/allimg/180912/13-1P912102159.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-50.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-51.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-52.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102200.jpg
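The download step in the script has no error handling: a single timeout or 404 aborts the whole crawl, and a partially written file can be left behind. A minimal hardened variant could look like the sketch below; the helper name download_image, the dest_dir parameter, and the 10-second timeout are illustrative choices, not part of the original script.

import os
import requests

def download_image(src, dest_dir, headers, timeout=10):
    # Derive the file name from the URL, as the original script does
    filename = os.path.join(dest_dir, src.split('/')[-1])
    # Skip files already on disk from an earlier run
    if os.path.exists(filename):
        return False
    resp = requests.get(src, headers=headers, timeout=timeout)
    resp.raise_for_status()  # surface 4xx/5xx instead of saving an error page
    with open(filename, 'wb') as f:
        f.write(resp.content)
    return True

Passing dest_dir explicitly also avoids relying on os.chdir(), and the with block guarantees the file handle is closed even if the write fails, which the bare open()/close() pair in the script does not.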