Scrapy: using a downloader middleware to set a random User-Agent header
1. Define the downloader middleware in middlewares.py
import random


class UseragentDownloaderMiddleware:
    # Custom pool of User-Agent strings
    USER_AGENTS = [
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    ]

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        # Pick a random entry from the custom USER_AGENTS list
        user_agent = random.choice(self.USER_AGENTS)
        request.headers["User-Agent"] = user_agent
        return None
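If you prefer not to hard-code the pool, the list can also come from the project settings. A minimal sketch, assuming a USER_AGENTS list has been added to settings.py (from_crawler and crawler.settings.getlist are standard Scrapy APIs); this method goes inside the class above:

    @classmethod
    def from_crawler(cls, crawler):
        # Read USER_AGENTS from settings.py; fall back to the
        # class-level list if the setting is absent.
        mw = cls()
        mw.USER_AGENTS = crawler.settings.getlist("USER_AGENTS") or cls.USER_AGENTS
        return mw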
2. Activate the middleware in settings.py
DOWNLOADER_MIDDLEWARES = {
    'useragent.middlewares.UseragentDownloaderMiddleware': 543,
}
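The value 543 is the middleware's priority. Scrapy's built-in UserAgentMiddleware runs at priority 500, so at 543 the custom middleware's process_request runs afterwards and overwrites the default header. To make that explicit, the built-in middleware can also be disabled outright:

DOWNLOADER_MIDDLEWARES = {
    'useragent.middlewares.UseragentDownloaderMiddleware': 543,
    # optional: turn off Scrapy's default User-Agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}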
3. Write a test spider to check that the User-Agent is set randomly
# -*- coding: utf-8 -*-
import json

import scrapy


class HttpbinSpider(scrapy.Spider):
    name = 'httpbin'
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/user-agent']

    def parse(self, response):
        print(response.text)
        # json.loads() parses the JSON string into a dict
        # (safer than eval() on text fetched from the network)
        print(json.loads(response.text)["user-agent"])
        # Request the same URL again to check that each request gets a
        # randomly chosen User-Agent. Scrapy filters duplicate URLs by
        # default, so dont_filter=True disables that filter.
        yield scrapy.Request(self.start_urls[0], dont_filter=True)
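Run the spider from the project root and watch the console output:

scrapy crawl httpbin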
http://httpbin.org/user-agent returns a JSON string whose content is the User-Agent of the current request, for example:
{
  "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}
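With the middleware active, successive iterations should print user-agent values drawn at random from the custom USER_AGENTS list, for example (the order will vary between runs):

Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11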