用urllib2抓取被限制的网站页面
# coding:utf-8
# Plain request using urllib2's default headers; the article notes that this
# target answers such a request with a 403 error.
import urllib2

url = "http://blog.csdn.net/troubleshooter"
html = urllib2.urlopen(url)
# Single-argument print(...) behaves the same as the Python 2 print statement.
print(html.read())
返回403错误
- 模拟用户访问
# coding:utf-8
# Repeat the request with browser-like headers so it looks like a normal visit.
import urllib2

url = "http://blog.csdn.net/troubleshooter"
url_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
    'Referer': 'http://www.cnblogs.com/evilxr/p/4038902.html',
    'Host': 'blog.csdn.net',
    # NOTE: the earlier version also passed a 'GET': url entry here. GET is the
    # request method, not a header field, so it was dropped; a Request without
    # a data payload is already sent as GET.
}
req = urllib2.Request(url, headers=url_headers)
html = urllib2.urlopen(req)
print(html.getcode())  # the recorded run printed: 200
[Finished in 0.4s]
获取Cookie信息
import urllib2
import cookielib

# The CookieJar collects the cookies set by responses that pass through the
# opener built around it.
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')

# Print each stored cookie as "<name> <value>".
for i in cookie:
    print('%s %s' % (i.name, i.value))
BAIDUID 4722B044786BAE8B1E484C0535706271:FG=1
BIDUPSID 4722B044786BAE8B1E484C0535706271
H_PS_PSSID 10299_16540_1430_16474_12824_10812_12868_14669_16520_16326_16662_16424_16514_15050_12386_13932
PSTM 1438398244
BDSVRTM 0
BD_HOME 0
打开调试功能
import urllib2

# debuglevel=1 makes the handlers print the raw HTTP exchange
# (the send / reply / header lines).
httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(httpHandler, httpsHandler)

# Install the opener globally so that plain urllib2.urlopen() goes through it.
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.baidu.com')
response = urllib2.urlopen('http://www.baidu.com')
send: 'GET / HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: www.baidu.com\r\nConnection: close\r\nUser-Agent: Python-urllib/2.7\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Date: Sat, 01 Aug 2015 03:14:07 GMT
header: Content-Type: text/html; charset=utf-8
header: Transfer-Encoding: chunked
header: Connection: Close
header: Vary: Accept-Encoding
header: Set-Cookie: BAIDUID=0E3FD673DED07D3DBB4D6048AB469A32:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
header: Set-Cookie: BIDUPSID=0E3FD673DED07D3DBB4D6048AB469A32; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
header: Set-Cookie: PSTM=1438398847; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
header: Set-Cookie: BDSVRTM=0; path=/
header: Set-Cookie: BD_HOME=0; path=/
header: Set-Cookie: H_PS_PSSID=13289_1441_10813_14432_12867_14667_16521_14951_16663_16427_16514_15291_12315_13932_10634; path=/; domain=.baidu.com
header: P3P: CP=" OTI DSP COR IVA OUR IND COM "
header: Cache-Control: private
header: Cxy_all: baidu+d4d7821ea11368a1cad938a4de84b7ab
header: Expires: Sat, 01 Aug 2015 03:13:12 GMT
header: X-Powered-By: HPHP
header: Server: BWS/1.1
header: X-UA-Compatible: IE=Edge,chrome=1
header: BDPAGETYPE: 1
header: BDQID: 0x8824b3dc0001bdbb
header: BDUSERID: 0