Python 爬58同城城市租房信息

爬取完成后会自动生成 CSV 电子表格文件（文件名为“城市简称_houses.csv”），包含房价、押付方式、链接等信息
环境
Python 2.7（代码使用了 urllib2、raw_input 和 reload(sys)，不兼容 Python 3）
pip install lxml
pip install cssselect
 #coding:utf-8

 import csv

 import urllib2

 import lxml.html

 import time

 import sys

 from lxml.cssselect import CSSSelector

 import threading

 # Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
 # module is reloaded to get it back before the default is switched to UTF-8.
 # NOTE(review): presumably required so the non-ASCII page text can be
 # written by the csv module without explicit encoding -- confirm.
 reload(sys)
 sys.setdefaultencoding('utf8')

 # Ask for the city abbreviation; it becomes the 58.com sub-domain and the
 # prefix of the output CSV file name.
 print("请输入要爬取得城市简称例如bj（北京）：")
 CITY = str(raw_input(">>>"))

 def download(url, user_agent='Google', num_retries=2):
     """Fetch ``url`` and return the raw response body.

     Args:
         url: Address to request.
         user_agent: Value sent in the ``User-agent`` request header.
         num_retries: Number of additional attempts allowed when the server
             answers with a 5xx status code.

     Returns:
         The response body as returned by ``read()``, or ``None`` when the
         request raised ``urllib2.URLError`` and was not (or could no
         longer be) retried.
     """
     headers = {'User-agent': user_agent}
     request = urllib2.Request(url, headers=headers)
     try:
         return urllib2.urlopen(request).read()
     except urllib2.URLError as e:
         # Only errors carrying a 5xx status code are retried; any other
         # URLError gives up immediately.
         retryable = hasattr(e, 'code') and 500 <= e.code < 600
         if num_retries > 0 and retryable:
             # user_agent must be passed explicitly: passing the counter as
             # the second positional argument would overwrite the header
             # and reset num_retries to its default, retrying forever.
             return download(url, user_agent, num_retries - 1)
     return None

 def get_data(url):
     """Scrape one listing page and append its fields to the city CSV file.

     Downloads ``url``, prints the listing title and the two payment spans,
     then walks the detail list (7 items x 2 spans).  For every detail span
     found, four rows (title + url, price, deposit, detail) are appended to
     ``<CITY>_houses.csv``.

     Extraction is best-effort: nothing is written when the page cannot be
     downloaded or the title/payment elements are missing, and the first
     missing detail span ends the walk while keeping the rows collected so
     far.

     Args:
         url: Address of the listing detail page.
     """
     html_text_detail = download(url)
     if html_text_detail is None:
         # download() yields None on failure; there is nothing to parse.
         return

     rows = []
     try:
         tree = lxml.html.fromstring(html_text_detail)
         house_title = CSSSelector('div.main-wrap > div.house-title > h1')
         house_pay_way1 = CSSSelector('div.house-pay-way > span:nth-child(1)')
         house_pay_way2 = CSSSelector('div.house-pay-way > span:nth-child(2)')

         # These three values do not depend on the detail loop below, so
         # they are looked up once instead of once per detail span.
         title = house_title(tree)[0].text_content()
         price = house_pay_way1(tree)[0].text_content()
         deposit = house_pay_way2(tree)[0].text_content()

         print(title)
         print('%s|%s' % (price, deposit))

         for i in range(7):
             for j in range(2):
                 css = 'div.house-desc-item > ul.f14 > li:nth-child(%s) > span:nth-child(%s)' % (i+1, j+1)
                 house_info = CSSSelector(css)
                 detail = house_info(tree)[0].text_content().replace(' ', '')
                 # NOTE(review): the title/price/deposit rows are repeated
                 # for every detail span, as in the original output format;
                 # confirm whether one header group per listing is wanted.
                 rows.extend([
                     ('标题 ： ', title, '#', url),
                     ('价格： ', price, '#'),
                     ('压付： ', deposit, '#'),
                     ('详情： ', detail, '#'),
                 ])
     except (TypeError, IndexError):
         # A selector matched nothing (IndexError) or the document could not
         # be handled (TypeError): stop here and keep what was collected.
         pass

     if rows:
         # Single append for the whole listing instead of reopening the
         # file once per detail span.
         with open('%s_houses.csv'%CITY,'ab+') as csvfile:
             writer = csv.writer(csvfile,lineterminator='\n')
             writer.writerows(rows)

 def get_url(html):
     """Collect the detail-page links found on a listing page.

     Args:
         html: Markup of the listing page.

     Returns:
         List of the ``href`` values of the listing title anchors, in page
         order, with repeated values dropped.
     """
     listing_links = CSSSelector(
         'div.mainbox > div.main > div.content > div.listBox'
         ' > ul.listUl > li > div.des > h2 > a')
     document = lxml.html.fromstring(html)

     hrefs = []
     for anchor in listing_links(document):
         href = anchor.get('href')
         if href in hrefs:
             # Already collected; keep only the first occurrence.
             continue
         hrefs.append(href)
     return hrefs

 if __name__ == '__main__':
     # Entry point: fetch the rental listing index of the chosen city and
     # scrape every listing it links to.
     url_index = 'http://%s.58.com/chuzu/'%CITY
     html_text_list = download(url_index)
     if html_text_list is None:
         # download() returns None when the request fails; stop with a
         # clear message instead of handing None to the HTML parser.
         sys.exit('下载列表页失败: %s' % url_index)

     url_list = get_url(html_text_list)
     for url_detail in url_list:
         # One worker thread per listing page.
         thr = threading.Thread(target=get_data, args=(url_detail,))
         thr.start()
         # Brief pause between thread launches.
         time.sleep(0.001)
py58.py
信息

Python 爬58同城 城市租房信息

Python 爬58同城城市租房信息