# Demo: fetch a page with a custom urllib Request carrying spoofed headers.

# 1. Target URL (placeholder).
url = "http://..."

# Anti-crawler tip: keep a pool of User-Agent strings and pick one at
# random for each request, e.g.:
#   agents = [agent1, agent2, agent3, agent4, agent5]
#   agent = random.choice(agents)
headers = {
    "User-Agent": "",  # spoofed UA to get past simple anti-bot checks
    # "User-Agent": agent,  # rotate UAs as sketched above
    "Cookie": "",  # cookie string to simulate a logged-in session
}

# 2. Build the custom Request object with the headers attached.
req = urllib.request.Request(url, headers=headers)

# 3. Send it and obtain the response object.
res = urllib.request.urlopen(req)

# 4. Read the body and decode bytes -> str.
#    decode(): bytes -> str;  encode(): str -> bytes
html = res.read().decode("utf-8")

# Steps 3-4 can be collapsed into one line:
#   html = request.urlopen(req).read().decode("utf-8")
print(html)
# Demo: build a custom opener around an HTTPHandler and install it globally.

# 1. Handler object — the component that actually performs HTTP traffic.
http_handler = request.HTTPHandler()

# 2. Build a custom opener from that handler.
opener = request.build_opener(http_handler)

# 3. Create the request object.
req = request.Request("http://www.baidu.com")

# 4a. Send directly through the opener:
#   response = opener.open(req).read()
# 4b. Or install the opener globally so plain urlopen() also routes
#     through it — that is what we do here.
request.install_opener(opener)
response = request.urlopen(req).read()
print(response)
# Demo: build a Baidu search URL from terminal input and fetch the page.

# 1. Read the search term from the user.
keyword = input("请输入要搜索的内容:")
params = {"wd": keyword}  # query-string parameters as a dict
base = "http://www.baidu.com/s?"

# 2. Percent-encode the parameters (handles CJK characters, spaces, ...).
query = urllib.parse.urlencode(params)

# 3. Assemble the final URL.
full_url = base + query

# 4. Build the request and 5. fetch + decode the response body.
req = request.Request(full_url)
page = request.urlopen(req).read().decode()
print(page)
# Demo: POST a translation request to Youdao's web API and extract the result.

# 1. Request headers — browser UA to pass basic anti-crawler checks.
header={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

# 2. Read the text to translate from the terminal.
key = input("请输入要搜索的内容:")

# Form fields captured from the browser; salt/sign/lts are a snapshot of
# one real session — presumably the server validates them, so they may
# go stale (TODO confirm against current site behavior).
formdata={
    "i": key,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16003477829589",
    "sign": "3f351e5f7e0d84706ef063ccabe3e169",
    "lts": "1600347782958",
    "bv": "cb9a601990a9118249221b303a87fd75",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME",
}

# 3. urlencode the form and encode to bytes — POST bodies must be bytes.
data = urllib.parse.urlencode(formdata).encode(encoding='utf-8')

# 4. Send the request, read and decode the JSON response text.
req = request.Request(url,data=data,headers=header)
resp = request.urlopen(req).read().decode()

# 5. Regex-extract the translation between "tgt":" and "}]]}.
pat = r'"tgt":"(.*?)"}]]}'
result = re.findall(pat,resp)

# FIX: result[0] raised IndexError whenever the API returned an error
# payload (e.g. rejected sign/salt) instead of a translation — report
# the raw response instead of crashing.
if result:
    print(result[0])
else:
    print("No translation found in response:", resp)
# Demo: request each URL in turn; a bad hostname raises inside urlopen,
# is printed, and the loop continues with the next URL.
list1 = [
    "http://www.baidu.com",
    "http://www.baidu.com",
    "http://www.baidu25234234235454254243.com",  # bogus domain: triggers the except branch
    "http://www.baidu.com",
    "http://www.baidu.com",
]

# enumerate replaces the original hand-rolled `i = 0; i += 1` counter.
for i, url in enumerate(list1, start=1):
    try:
        request.urlopen(url)
    except Exception as e:
        # DNS/connection failures land here; print and keep going.
        print(e)
    # FIX: original said "此请求完成" — typo for the measure word "次".
    print("第", i, "次请求完成")
import json  # local import: needed to detect the empty final page

# Demo: page through Douban's chart top-list API, 20 movies per request,
# until the API has no more results.
base_url = "https://movie.douban.com/j/chart/top_list?" \
           "type=11&interval_id=100%3A90&action=&start={}&limit=20"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/79.0.3928.4 Safari/537.36"
}

i = 0
while True:
    # start offset grows by 20 (one page) per iteration, e.g.
    # "...start=0...", "...start=20...", ...
    url = base_url.format(i * 20)
    req = request.Request(url, headers=header)
    res = request.urlopen(req).read().decode()
    print(res)
    # FIX: when the list is exhausted the API returns the JSON text "[]",
    # never '' or None, so the original `res == '' or res is None` check
    # could not fire and the loop ran forever. Parse the JSON and stop
    # when the page is an empty list.
    if not res or not json.loads(res):
        break
    i += 1
import ssl

# Demo: fetch an HTTPS page while skipping certificate verification
# (the site historically served a cert signed by an untrusted CA).
url = "https://www.12306.cn/mormhweb/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/79.0.3928.4 Safari/537.36"
}

req = request.Request(url, headers=header)

# Unverified context: disables certificate validation for this request.
ctx = ssl._create_unverified_context()
page = request.urlopen(req, context=ctx).read().decode()
print(page)
# Demo: scrape joke texts from qiushibaike and append them to duanzi.txt.
url = "https://www.qiushibaike.com/text/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64;"
                  " x64; rv:80.0) Gecko/20100101 "
                  "Firefox/80.0"
}

# Fetch the listing page (requests follows redirects, decodes to text).
res = requests.get(url,headers=header)
page_text = res.text

# Capture the text inside each <div class="content"><span>...</span> block.
infos = re.findall(r'<div class="content">\s*<span>\s*(.+)\s*</span>',page_text)

# FIX: the original reopened duanzi.txt once per item inside the loop
# (one open/close per joke) and shadowed the page-text variable `info`
# with the loop variable. Open the file once and write all items.
with open("duanzi.txt",'a',encoding='utf-8') as f:
    for item in infos:
        f.write(item + "\n\n\n")
print(infos)