利用 requests、BeautifulSoup、xml 写多线程爬虫

# -*- coding:UTF-8 -*-

import requests,time

from collections import OrderedDict

import threading

from bs4 import BeautifulSoup as bp

# Wall-clock start time; subtracted from the end time at the bottom of the
# script to report the total elapsed time.
t3 = time.time()

ths = []  # Holds the Thread objects so they can all be started and then joined

def get(num, timeout=10):
    """Fetch one page of directory search results and print each entry.

    Posts the search form to the ``dBSearchForTxlAction.do`` endpoint with
    ``dbpage`` set to ``num``, parses the XML response and prints one line
    per ``EmailResult`` element.

    Args:
        num: Page number to request; sent as the ``dbpage`` form field.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block this thread (and the
            final ``join()``) indefinitely.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    data = {
        'basename': 'BASENAME11',
        'where': '2PLDYDY1',
        'dbpage': str(num),
        'pagecount': '',
        'order': 'ORDER1,ORDER2',
        'orderbytype': 'ASC',
        'searchList': 'SEARCHLIST11',
        'isKz': '',
        'id': '0.40519130290516947',
    }

    # Custom request headers, copied from what the browser sends.
    header1 = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        # The HTTP header is spelled "Referer"; "Referrer" is not recognised
        # by servers and would be ignored.
        'Referer': 'http://txl.xxx.cn/xxx/center.do?path=txl_index',
    }

    page = requests.post(
        'http://txl.xxx.cn/xxx/dBSearchForTxlAction.do',
        headers=header1,
        data=data,
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page as "no results".
    page.raise_for_status()

    # Parse the response as XML with BeautifulSoup (for HTML, use 'lxml'
    # instead of 'xml').
    soup = bp(page.text, 'xml')

    # Every <EmailResult> element is one directory entry.
    for info in soup.find_all('EmailResult'):
        # Child tags are looked up by name and their text is padded into
        # columns; the name column is padded with the full-width space.
        # NOTE(review): FENJI/SHOUJI/ZHIWU look like pinyin for extension,
        # mobile and job title -- confirm against the service's schema.
        print(
            u'%s' % info.NAME.text.ljust(10, '　'),
            info.FENJI.text.ljust(20, ' '),
            info.SHOUJI.text.ljust(30),
            info.EMAIL.text.ljust(30),
            info.ZHIWU.text,
        )

# Build one worker thread per page number (0..74); nothing runs yet.
ths.extend(threading.Thread(target=get, args=(page,)) for page in range(75))

# Launch every worker before waiting on any of them so the requests overlap.
for worker in ths:
    worker.start()

# Block until all workers have finished.
for worker in ths:
    worker.join()

# Report the total wall-clock time in seconds since the script started.
print(time.time() - t3)