import os
import re
import threading
import time

import requests
from lxml import etree
from lxml import etree all_img_urls = [] # 图片列表页面的数组 g_lock = threading.Lock() # 初始化一个锁 # 声明一个生产者的类,来不断地获取图片详情页地址,然后添加到 all_img_url列表中 # url = "http://www.xiaohuar.com/" all_urls = [] class Spider(object):
# 构造函数,初始化数据实用
def __init__(self,target_url,headers):
self.target_url = target_url
self.headers = headers # 获取所有的想要抓取的URL
def getUrls(self,start_page,page_num):
for i in range(start_page,page_num):
url = self.target_url % i
all_urls.append(url) if __name__ == '__main__':
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
"Host":"eclick.baidu.com", }
target_url = "http://www.xiaohuar.com/list-1-%d.html" # 抓取链接的样式 spider = Spider(target_url,headers) # 抓取链接的对象传入 链接与请求头
spider.getUrls(0,14) # 抓取的多少页面的链接
# print (all_urls) class Producer(threading.Thread): #创建一个生产者用来批量的'生产'链接 def run(self):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
"Host": "eclick.baidu.com", } while len(all_urls) > 0: # 这里写了一个死循环,为的是能够一直抓取为爬去数据的链接
g_lock.acquire() # 锁,为的是不让不同的线程共同使用同一条连接
# for url in all_urls:
url = all_urls.pop() # 使用pop方法,可以获取链接
g_lock.release() # 获取连接后 释放锁,让其他线程可前去列表中获取链接
response = requests.get(url,headers).text selector = etree.HTML(response) # 使用xpath mods = selector.xpath("//div[@class='item_t']") # 获取指定标签 for i in mods:
img_link = i.xpath("div[@class='img']/a/img/@src")
name = i.xpath("div[@class='img']/span/text()")
name = name[0].encode("utf8")
img_link = img_link[0].encode("utf8") comment = {name: img_link}
if img_link.startswith("/"): # 因为抓取的链接,有一部分是本地,所以在此处将之拼接成可直接访问的url
str = "http://www.xiaohuar.com"
img_link = str + img_link
comment = {name: img_link}
all_img_urls.append(comment)
all_img_urls.append(comment) for x in range(10): # 创建10个线程用来爬去链接
down = Producer()
down.run()
# print all_img_urls class DownPic(threading.Thread): # 用来下载爬取数据的类 def run(self):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
"Host": "eclick.baidu.com", } while True: # 这个地方写成死循环,为的是不断监控图片链接数组是否更新
g_lock.acquire()
if len(all_img_urls) == 0: #没有图片了,就解锁
g_lock.release()
continue
else:
img = all_img_urls.pop()
g_lock.release()
# 遍历字典列表
for key,value in img.items():
path = "xiaohua/%s.jpg"% key.decode("utf8")
response = requests.get(value)
# print path
with open (path,"wb") as f:
f.write(response.content)
f.close()#
# #
#
#
for x in range(10):
down = DownPic()
down.run()