# -*- coding:UTF-8 -*-
import threading
import time
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup as bp

# Wall-clock start; the end of the script subtracts this to report elapsed time.
t3 = time.time()
ths = []  # worker threads, one per value of num requested below

_SEARCH_URL = 'http://txl.xxx.cn/xxx/dBSearchForTxlAction.do'

# Custom request headers, copied from what a browser shows for this request.
_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
    # The HTTP request header is spelled "Referer"; the previous "Referrer"
    # key is not a standard header name.
    'Referer': 'http://txl.xxx.cn/xxx/center.do?path=txl_index',
}

# Seconds to wait on the server. Without a timeout a single stalled request
# would keep its thread alive forever and block the script's final join().
_TIMEOUT = 30


def _build_form(num):
    """Return the POST form fields for request number *num*.

    *num* is sent as the string-valued ``dbpage`` field (presumably the page
    index of the search results -- confirm against the server).
    """
    return {
        'basename': 'BASENAME11',
        'where': '2PLDYDY1',
        'dbpage': str(num),
        'pagecount': '',
        'order': 'ORDER1,ORDER2',
        'orderbytype': 'ASC',
        'searchList': 'SEARCHLIST11',
        'isKz': '',
        'id': '0.40519130290516947',
    }


def get(num):
    """POST the search form for *num* and print every ``EmailResult`` record.

    The response body is parsed as XML. For each ``EmailResult`` element the
    text of its NAME, FENJI, SHOUJI, EMAIL and ZHIWU children is printed on
    one line, the first four left-justified to fixed widths.

    This function is the thread target, so output from different calls is
    printed in whatever order the responses arrive.

    Args:
        num: value for the ``dbpage`` form field; converted with ``str``.
    """
    page = requests.post(
        _SEARCH_URL,
        headers=_HEADERS,
        data=_build_form(num),
        timeout=_TIMEOUT,
    )
    # 'xml' selects BeautifulSoup's XML parser; use 'lxml' instead for HTML.
    soup = bp(page.text, 'xml')
    # find_all returns a list of every EmailResult tag in the document.
    for info in soup.find_all('EmailResult'):
        # Child tags are reached by attribute name; .text is their content.
        print(u'%s' % info.NAME.text.ljust(10, ' '),
              info.FENJI.text.ljust(20, ' '),
              info.SHOUJI.text.ljust(30),
              info.EMAIL.text.ljust(30),
              info.ZHIWU.text)


# Create (but do not start) one worker thread per request number 0..74.
for num in range(75):
    t1 = threading.Thread(target=get, args=(num,))
    ths.append(t1)
# Start every worker before joining any of them: joining inside the start
# loop would wait for each request to finish before launching the next.
for t in ths:
    t.start()
for ttt in ths:
    ttt.join()

# All workers are done; print the total elapsed wall-clock time in seconds.
t4 = time.time()
tt = t4 - t3
print(tt)
# 05-11 13:03 (stray timestamp, presumably from the page this script was copied from)