版权声明:本文为博主原创文章,转载 请注明出处: https://blog.csdn.net/sc2079/article/details/82423865
-写在前面
本篇博客主要是爬虫伪装技术的应用,包括用户代理,IP代理,Cooikes,适合爬虫初学者。文中如有不足,请指正。
PS:网上资源整合,如有侵权,联系删除
-环境配置安装
运行环境:Python3.6、Spyder
依赖的模块:urllib,requests.random等
-正文
1、用户代理
def LoadUserAgents(uafile):
uas = []
with open(uafile, 'rb') as uaf:
for ua in uaf.readlines():
if ua:
uas.append(ua.strip()[1:-1 - 1])
random.shuffle(uas)
return uas
uas = LoadUserAgents("user_agents.txt")
下载user_agents.txt文档:链接,设置好路径后,运行即可。
2、cookies
Cookies = ['_lxsdk_s=b458c64cc9ef92adce92c199a6b7%7C%7C4; __mta=214784936.1512623224439.1512623224439.1512623226558.2; client-id=8d050bd8-51b6-46fc-be96-cb6871d91fe7; ci=89; webloc_geo=31.778615%2C119.958931%2Cwgs84; _lxsdk=1602f5eb154c8-05c63a41825c168-1b451e24-13c680-1602f5eb154c8; uuid=73a65301-f351-4cdd-bfcd-35ae40aea13c',
'_lxsdk_s=55618265813fc43043ce562acf29%7C%7C2; ci=89; __mta=252597174.1512623399814.1512623399814.1512623399814.1; _lxsdk=1602f6161c1c8-034ae66c3842d88-1b451e24-13c680-1602f6161c170; client-id=8ca2fe5f-ba96-41ae-b919-f8d3665e2f10; webloc_geo=31.778620%2C119.958935%2Cwgs84; uuid=ae73c608-8db3-4724-b4bf-91f4a8c701c5',
'_lxsdk_s=4ffb12d5a90c2361745e6d0993a2%7C%7C2; ci=89; __mta=40613533.1512623465821.1512623465821.1512623465821.1; client-id=a31f1c79-3287-4ef1-8bd3-56bfb431a7da; webloc_geo=31.778616%2C119.958936%2Cwgs84; _lxsdk=1602f6262117d-0f173d7fde38d18-1b451e24-13c680-1602f626212c8; uuid=ba1b457e-ae34-4b56-bf7b-9bf3579ac0ae',
'_lxsdk_s=4ba654e682eaf4d69efb90ae9cf1%7C%7C2; __mta=152351622.1512623525333.1512623525333.1512623525333.1; ci=89; client-id=e8101f58-a778-4332-81c2-75ba410b2bb3; webloc_geo=31.778611%2C119.958934%2Cwgs84; _lxsdk=1602f634c448d-0c74adf6d00e2e8-1b451e24-13c680-1602f634c45c8; uuid=e98a7e32-7511-4b40-ab3d-d88011a06471',
'_lxsdk_s=f6511ca715103c33b7bb4aea1e9f%7C%7C2; __mta=40629917.1512623573270.1512623573270.1512623573270.1; ci=89; client-id=c69db65e-b5d1-4fe6-aa7a-340e3608aeb6; webloc_geo=31.778617%2C119.958930%2Cwgs84; _lxsdk=1602f64072fc8-04cf33034885b08-1b451e24-13c680-1602f640730c8; uuid=33b52a9f-14ea-4f80-9a97-8386fc9fb0a2',
'_lxsdk_s=38327d7cfda78d0cfc4810838c32%7C%7C2; __mta=150795529.1512623625315.1512623625315.1512623625315.1; ci=89; client-id=221306b1-bfb7-4cfa-ab3f-2e4a85089864; webloc_geo=31.778610%2C119.958933%2Cwgs84; _lxsdk=1602f64d24531-0623a27320ba75-1b451e24-13c680-1602f64d246c8; uuid=85f7949c-284c-4ae6-b5f8-18aa1c6dc016',
'_lxsdk_s=c2f111bde916cad4d653e07a35c9%7C%7C2; __mta=42950426.1512623702441.1512623702441.1512623702441.1; _lxsdk=1602f66000dc8-0bb60c750cf17b8-1b451e24-13c680-1602f66000d7f; ci=89; client-id=38cc3a86-fe57-491b-8564-c9b1a251cd4a; webloc_geo=31.778617%2C119.958931%2Cwgs84; uuid=095c6c87-a397-4cc4-a3fc-1c3e35eca3be',
'_lxsdk_s=fd711080efee1adf6de57c1d4c88%7C%7C2; ci=89; webloc_geo=31.778612%2C119.958939%2Cwgs84; __mta=218070313.1512623752950.1512623752950.1512623752950.1; _lxsdk=1602f66c53e35-0b363d23b0e435-1b451e24-13c680-1602f66c53fc8; client-id=1a4a2af3-4201-4e61-bf26-92016709444b; uuid=9361e591-ada9-43c0-a762-b7b5559c649a',
'_lxsdk_s=810677247f21b53cd75826e60cb8%7C%7C2; __mta=146594315.1512623869611.1512623869611.1512623869611.1; _lxsdk=1602f688ce80-074a1ec97ff043-1b451e24-13c680-1602f688ce9c8; ci=89; client-id=e60488fa-51fe-4d12-99a3-257a5a0853cc; webloc_geo=31.778609%2C119.958928%2Cwgs84; uuid=948eec52-7d10-46ce-b689-92140842f3f0',
'_lxsdk_s=c44c26606c13c651ab5a20447428%7C%7C2; __mta=89516452.1512623906613.1512623906613.1512623906613.1; _lxsdk=1602f691da0c8-024ebcb15833bc-1b451e24-13c680-1602f691da04b; ci=89; client-id=a6846506-a50a-4612-8106-ff31978802a8; webloc_geo=31.778607%2C119.958929%2Cwgs84; uuid=6ee14a07-2d20-460e-87c2-001806dfdf1b',
'_lxsdk_s=071dfe26ce64d6e46d03323d2f71%7C%7C2; __mta=210099498.1512623985254.1512623985254.1512623985254.1; ci=89; client-id=f2cea0fa-0d83-479a-a863-f2b7156fb392; webloc_geo=31.778621%2C119.958925%2Cwgs84; _lxsdk=1602f6a4f88c8-0ce1602d9c121a-1b451e24-13c680-1602f6a4f88c8; uuid=122a1bf8-f588-4b2d-a509-2bb45aaf1be9',
'_lxsdk_s=833017bd50026c6c57b327f1f51a%7C%7C2; __mta=244702778.1512624031536.1512624031536.1512624031536.1; ci=89; client-id=ec7106e4-9dda-4ce0-aa8d-c12288812644; webloc_geo=31.778622%2C119.958921%2Cwgs84; _lxsdk=1602f6b0459c8-011b3f8b51c9328-1b451e24-13c680-1602f6b045ac8; uuid=04d6dbed-d679-4e9d-a0c7-4ea1b2343513',
'_lxsdk_s=9a68aaba42beb1ebdfc3e726d94e%7C%7C2; ci=89; webloc_geo=31.778629%2C119.958912%2Cwgs84; __mta=45406747.1512624277952.1512624277952.1512624277952.1; _lxsdk=1602f6ec7b84-095c2ecf9438458-1b451e24-13c680-1602f6ec7b9c8; uuid=33acafd1-4a03-4f0f-9572-1ccbfd570efc; client-id=77353c1f-ba56-4ac9-bbb3-c7a689858182',
'_lxsdk_s=c1508df449b0616ba37c5d9166f2%7C%7C2; __mta=216814121.1512624317347.1512624317347.1512624317347.1; ci=89; client-id=5d8e3ff2-5370-4c08-855e-9d26e5187040; webloc_geo=31.778633%2C119.958912%2Cwgs84; _lxsdk=1602f6f6187a2-029abec138ea508-1b451e24-13c680-1602f6f6188c8; uuid=1a808ef4-511d-44ba-acfd-d6339ddb92f8',
'_lxsdk_s=7a249ecc9a048cbb826ea3b1f4e7%7C%7C2; __mta=256753588.1512624355644.1512624355644.1512624355644.1; _lxsdk=1602f6ff822c8-01f025c7119f968-1b451e24-13c680-1602f6ff823c8; ci=89; client-id=33339c7c-4dd0-4da8-8617-add389e72438; webloc_geo=31.778634%2C119.958912%2Cwgs84; uuid=92e531c6-4620-453e-ad8d-0e67644404ad',
'_lxsdk_s=93f6847b5f37e552ae083beb48c2%7C%7C2; __mta=218397606.1512624389136.1512624389136.1512624389136.1; ci=89; client-id=21540b20-0240-48fa-ae1e-c1ce4dba892f; webloc_geo=31.778631%2C119.958925%2Cwgs84; _lxsdk=1602f707a3d3-0d7db668e6d9498-1b451e24-13c680-1602f707a3ec8; uuid=507d006a-6b37-4ff9-9299-88eae1bc0982',
'_lxsdk_s=0ffdd4e2dddd57d3c05606eb82f3%7C%7C2; __mta=141367821.1512624428164.1512624428164.1512624428164.1; ci=89; client-id=d6e8de67-48b8-4262-b3b3-47851c46db5c; webloc_geo=31.778620%2C119.958977%2Cwgs84; _lxsdk=1602f7112b9c8-09d6e858772c1d8-1b451e24-13c680-1602f7112bac8; uuid=2b6ac2bf-62cf-4ebd-bb68-bd0c258c6b29',
]
3、IP代理
proxies = ['61.155.164.108:3128',
'116.199.115.79:80',
'42.245.252.35:80',
'106.14.51.145:8118',
'116.199.115.78:80',
'123.147.165.143:8080',
'58.62.86.216:9999',
'202.201.3.121:3128',
'119.29.201.134:808',
'61.155.164.112:3128',
'123.57.76.102:80',
'116.199.115.78:80',
]
4.urllib调用
def ua(url):
req = urllib.request.Request(url)
#req.add_header("User-Agent",uas)
req.add_header("User-Agent",random.choice(uas))
req.add_header("Cookie",random.choice(Cookies))
proxy = urllib.request.ProxyHandler({"https":random.choice(proxies)})
opener = urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
#将opener安装为全局
urllib.request.install_opener(opener)
return req
def get_data(url):
req = ua(url)
try:
data = urllib.request.urlopen(req)
#print(len(data.read()))
except Exception as err:
print(err)
return data
response = get_data(url)
contents = response.read().decode('utf-8')
5.requests调用
def get_data2(url):
headers={
'User-Agent':random.choice(uas),
'proxie':'http:'+random.choice(proxies),
'Cookie':random.choice(Cookies)
}
try:
r = requests.get(url, timeout=15,headers=headers)
r.raise_for_status()
print(r.raise_for_status())
r.encoding = r.apparent_encoding
return r
except:
return ' '
response = get_data2(url)
content=response.text
-总结
对于我而言,requests比urllib相对更简洁好用一些。在爬虫实际应用中,如果远端网站有较强的反爬虫机制,当访问次数过多后,可能爬虫就不能再爬取数据了,此时应考虑更新用户、IP代理。