# Scrapy proxy configuration (scrapy代理的设置)

import random
import scrapy
import logging
class proxMiddleware(object):
    """Scrapy downloader middleware that sends each request through a random proxy.

    For every outgoing request one entry of ``proxy_list`` is picked at
    random and stored under ``request.meta['proxy']``, the key Scrapy's
    built-in HttpProxyMiddleware reads to route the download.
    """

    # Candidate HTTP proxies, each given as a full ``http://host:port`` URL.
    proxy_list = [
        "http://180.76.154.5:8888",
        "http://14.109.107.1:8998",
        "http://106.46.136.159:808",
        "http://175.155.24.107:808",
        "http://124.88.67.10:80",
        "http://124.88.67.14:80",
        "http://58.23.122.79:8118",
        "http://123.157.146.116:8123",
        "http://124.88.67.21:843",
        "http://106.46.136.226:808",
        "http://101.81.120.58:8118",
        "http://180.175.145.148:808",
    ]

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy to ``request``.

        Args:
            request: Object exposing a mutable ``meta`` mapping; its
                ``'proxy'`` entry is overwritten unconditionally.
            spider: Unused; present to match the middleware hook signature.

        Returns:
            None, so request processing continues normally.
        """
        proxy = random.choice(self.proxy_list)
        # Report the choice through logging rather than printing to stdout,
        # so it obeys the configured log level and handlers.
        logging.getLogger(__name__).debug("Using proxy %s", proxy)
        request.meta['proxy'] = proxy
#coding:utf-8
import requests
from bs4 import BeautifulSoup
import threading
import Queue
class Get_ips(object):
    """Scrape candidate proxies from xicidaili.com and keep the ones that respond.

    Usage: ``Get_ips(page).main()`` fetches ``page`` listing pages, queues
    every ``http://ip:port`` candidate found, checks each candidate by
    requesting http://www.baidu.com through it from 40 worker threads, and
    returns the list of candidates that answered with HTTP 200.
    """

    def __init__(self, page):
        """Prepare the listing URLs and the shared work state.

        Args:
            page: Number of listing pages to scrape; URLs are built for
                page indexes ``0 .. page - 1``.
        """
        # Proxies that passed verification (appended to by worker threads).
        self.ips = []
        self.urls = ["http://www.xicidaili.com/nn/" + str(i) for i in range(page)]
        self.header = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'}
        # Candidates waiting to be verified.
        self.q = Queue.Queue()
        # Guards ``self.ips`` and the progress output across worker threads.
        self.Lock = threading.Lock()

    def get_ips(self):
        """Download every listing page and enqueue the proxies found on it."""
        for url in self.urls:
            # NOTE(review): no timeout is passed here, so a stalled server
            # blocks this call indefinitely.
            res = requests.get(url, headers=self.header)
            soup = BeautifulSoup(res.text, 'lxml')
            rows = soup.find_all('tr')
            # The first <tr> is skipped, as in the original index loop
            # (presumably the table header - confirm against the page).
            for row in rows[1:]:
                cells = row.find_all("td")
                if len(cells) < 3:
                    # Cells 1 and 2 hold the address and port; a shorter
                    # row cannot describe a proxy.
                    continue
                candidate = "http://" + cells[1].contents[0] + ":" + cells[2].contents[0]
                self.q.put(str(candidate))

    def review_ips(self):
        """Worker loop: verify queued candidates until the queue is empty.

        A candidate is kept when a GET of http://www.baidu.com through it
        returns status 200 within 5 seconds. Candidates that fail in any
        way are dropped silently, since dead proxies are the expected case.
        """
        while True:
            try:
                # Non-blocking get: checking ``empty()`` first and then
                # calling a blocking ``get()`` races with the other workers
                # and can leave a thread waiting forever.
                ip = self.q.get_nowait()
            except Queue.Empty:
                return
            try:
                res = requests.get("http://www.baidu.com", proxies={"http": ip}, timeout=5)
            except Exception:
                # Deliberate best effort: an unreachable or misbehaving
                # proxy is simply not recorded.
                continue
            if res.status_code == 200:
                # ``with`` releases the lock even if the body raises.
                with self.Lock:
                    self.ips.append(ip)
                    print(ip)

    def main(self):
        """Scrape, verify with 40 threads, and return the working proxies.

        Returns:
            The list of verified ``http://ip:port`` strings (``self.ips``).
        """
        self.get_ips()
        threads = [threading.Thread(target=self.review_ips) for _ in range(40)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return self.ips
def get_ip():
    """Run a ``Get_ips`` job over four listing pages and return its result list."""
    return Get_ips(4).main()
# Top-level call: runs whenever this file is executed or imported.
# The list returned by get_ip() is not stored.
get_ip()
# 05-11 09:22