爬取完会自动生成csv电子表格文件,含有房价、押付、链接等信息

环境
py2.7
pip install lxml
pip install cssselect
 
 #coding:utf-8
import csv
import urllib2
import lxml.html
import time
import sys
from lxml.cssselect import CSSSelector
import threading
# Python 2 only: site.py removes sys.setdefaultencoding at startup, and
# reload(sys) brings it back. Switching the default codec to UTF-8 is
# presumably what lets the non-ASCII text scraped below be printed and
# written through the csv module without explicit encoding -- TODO confirm.
reload(sys)
sys.setdefaultencoding('utf8')

# Prompt for the city sub-domain code (e.g. "bj"); it is later interpolated
# into both the listing URL and the output CSV filename.
print("请输入要爬取得城市简称例如bj(北京):")
# Strip surrounding whitespace so a stray space cannot corrupt the URL or
# the filename built from this value.
CITY = str(raw_input(">>>")).strip()
def download(url, user_agent='Google', num_retries=2):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to request.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more attempts are allowed after an HTTP 5xx
            response.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        request failed and no retry applies (non-5xx error, non-HTTP
        error, or retries exhausted).
    """
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        return urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        # Only errors that expose an HTTP status ``code`` in the 5xx range
        # are treated as transient and worth retrying.
        server_error = hasattr(e, 'code') and 500 <= e.code < 600
        if num_retries > 0 and server_error:
            # Forward user_agent explicitly: passing the counter as the
            # second positional argument would overwrite the user agent
            # and reset num_retries to its default, recursing forever.
            return download(url, user_agent, num_retries - 1)
        return None


# get_data runs in one thread per listing and every thread appends to the
# same CSV file; the lock keeps each listing's rows contiguous.
_CSV_LOCK = threading.Lock()


def _first_text(tree, css):
    """Return the text content of the first node matching *css*, or None."""
    matches = CSSSelector(css)(tree)
    if not matches:
        return None
    return matches[0].text_content()


def get_data(url):
    """Scrape one listing page and append its fields to ``<CITY>_houses.csv``.

    Writes one row each for the title (with the listing URL), the price
    and the payment terms, followed by one row per detail field found in
    the first 7 ``li`` items (2 ``span`` cells each) of the description
    list. Detail cells that are absent from the page are skipped.

    Nothing is printed or written when the page cannot be downloaded or
    when the title, price or payment-terms element is missing.

    Args:
        url: Address of the listing detail page.
    """
    html_text_detail = download(url)
    if not html_text_detail:
        return

    tree = lxml.html.fromstring(html_text_detail)
    title = _first_text(tree, 'div.main-wrap > div.house-title > h1')
    price = _first_text(tree, 'div.house-pay-way > span:nth-child(1)')
    deposit = _first_text(tree, 'div.house-pay-way > span:nth-child(2)')
    if title is None or price is None or deposit is None:
        return

    print(title)
    print('%s|%s' % (price, deposit))

    rows = [
        ('标题 : ', title, '#', url),
        ('价格: ', price, '#'),
        ('压付: ', deposit, '#'),
    ]
    for item in range(1, 8):
        for cell in range(1, 3):
            css = ('div.house-desc-item > ul.f14 > li:nth-child(%s)'
                   ' > span:nth-child(%s)' % (item, cell))
            detail = _first_text(tree, css)
            if detail is None:
                continue
            rows.append(('详情: ', detail.replace(' ', ''), '#'))

    # Open the file once per listing and write all rows in a single call
    # while holding the lock, so concurrent threads cannot interleave.
    with _CSV_LOCK:
        with open('%s_houses.csv' % CITY, 'ab+') as csvfile:
            writer = csv.writer(csvfile, lineterminator='\n')
            writer.writerows(rows)


def get_url(html):
    """Extract the distinct listing links from an index page.

    Args:
        html: Markup of the listing index page; may be ``None`` or empty
            when the download failed.

    Returns:
        A list of ``href`` values in document order without duplicates.
        Anchors lacking an ``href`` are ignored. Empty list when *html*
        is ``None`` or empty.
    """
    if not html:
        return []

    tree = lxml.html.fromstring(html)
    sel = CSSSelector('div.mainbox > div.main > div.content > div.listBox > ul.listUl > li > div.des > h2 > a')
    seen = set()
    url_list = []
    for anchor in sel(tree):
        href = anchor.get('href')
        if href and href not in seen:
            seen.add(href)
            url_list.append(href)
    return url_list


if __name__ == '__main__':
    url_index = 'http://%s.58.com/chuzu/' % CITY
    html_text_list = download(url_index)
    if html_text_list is None:
        # Fail with a readable message instead of a TypeError from the
        # HTML parser further down.
        sys.exit('failed to download %s' % url_index)

    url_list = get_url(html_text_list)
    for url_detail in url_list:
        thr = threading.Thread(target=get_data, args=(url_detail,))
        thr.start()
        # Tiny pause between thread starts, kept from the original script.
        time.sleep(0.001)

py58.py

04-26 16:41
查看更多