深圳租房分析(1/2)网络爬虫

01、运行环境

操作系统:win10 专业版
pycharm professional 2019.1
python 3.8
requests == 2.23.0
lxml # 第三方库,代码中用于 xpath 解析
random # 内置的

02、主要信息

  • 房屋城区
  • 房屋面积
  • 房屋价格
  • 房屋朝向
  • 房屋布局
  • 房屋小区

03、完整代码

import csv
import os
import random

import numpy as np
import pandas as pd
import requests
from lxml import etree
# Start URLs: the first 100 listing pages of the Shenzhen rental section.
# range() excludes its upper bound, so 101 is required to include page 100.
urllist = ['https://sz.lianjia.com/zufang/pg{}/#contentList'.format(i) for i in range(1, 101)]

# Request headers: a desktop Chrome User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"
}
# Proxy pool: candidate proxy addresses in "ip:port" form.
paramslist = ['121.232.199.237:9000','125.108.67.254:9000','123.163.118.102:9999',
              '125.108.67.254:9000','171.35.172.151:9999','123.101.231.234:9999',
              '125.108.67.254:9000','123.163.118.102:9999','171.35.172.151:9999',
              '123.101.231.234:9999','113.195.16.16:9999','175.44.109.145:9999',
              '125.108.85.141:9000','175.43.32.21:9999','183.166.103.22:9999',
              '125.110.96.80:9000','123.160.69.100:9999','112.111.217.69:9999',
              '1.199.30.133:9999','123.55.102.150:9999','120.83.104.196:9999',
              '180.118.128.138:9000','163.204.95.253:9999','113.195.18.89:9999',
              '113.195.16.191:9999','175.42.129.76:9999','125.110.124.214:9000',
              '125.110.102.54:9000','36.249.119.16:9999','125.110.89.240:9000',
              '171.35.146.70:9999','124.93.201.59:42672','171.35.173.112:9999']
# Proxy address picked once for this run.
# random.choice always returns an existing element; random.randint(0, len(...))
# is inclusive at both ends and could index one position past the end of the
# list, raising IndexError.
param = {'HTTP': random.choice(paramslist)}
# Route the requests through the proxy address selected above. requests
# expects a ``proxies`` mapping keyed by lowercase URL scheme; passing the
# address through ``params`` only appends it to the query string, so the
# proxy would never actually be used.
proxy_address = param['HTTP']
proxies = {
    'http': 'http://' + proxy_address,
    'https': 'http://' + proxy_address,
}

output_path = "./data/shenzhenlianjia.csv"
# open() does not create missing directories, so make sure ./data exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# newline='' is what the csv module requires (otherwise every row is followed
# by a blank line on Windows); the explicit encoding keeps the Chinese text
# independent of the platform's default code page.
with open(output_path, 'a+', newline='', encoding='utf-8') as f:
    linajiawriter = csv.writer(f)

    # Write the header row only once: when the file is still empty.
    f.seek(0, 2)  # whence=2 -> jump to the end of the file
    if f.tell() == 0:
        linajiawriter.writerow(['城区','小区','面积','房间格局','方向','价格'])

    for url in urllist:
        # A timeout keeps one dead proxy/page from hanging the whole crawl;
        # a failed page is reported and skipped instead of aborting the run.
        try:
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('Skipping {}: {}'.format(url, exc))
            continue

        res_html = response.content.decode()
        # Parse the page so it can be queried with XPath.
        res_htmlx = etree.HTML(res_html)

        # Monthly rent of each listing.
        price = res_htmlx.xpath('//div[@class="content__list"]//span[@class="content__list--item-price"]/em/text()')
        # District of each listing.
        distance = res_htmlx.xpath('//div[@class="content__list"]//p[@class="content__list--item--des"]/a[1]/text()')
        # Residential community of each listing.
        community = res_htmlx.xpath('//div[@class="content__list"]//p[@class="content__list--item--des"]/a[3]/text()')

        # Every node that follows the first <i> separator of each description.
        area_room = res_htmlx.xpath('//div[@class="content__list"]//p[@class="content__list--item--des"]/i[1]/following-sibling::node()')

        area = []       # floor area
        room = []       # room layout
        direction = []  # orientation
        # The code relies on each listing contributing 7 sibling nodes, with
        # area / orientation / layout at offsets 0 / 2 / 4 -- an assumption
        # about the page markup that should be re-checked if the site changes.
        # Stopping at len - 4 keeps offset +4 inside the list even when the
        # last group is incomplete.
        for start in range(0, len(area_room) - 4, 7):
            area.append(area_room[start].split()[0])
            direction.append(area_room[start + 2].split()[0])
            room.append(area_room[start + 4].split()[0])

        # zip stops at the shortest column, so a page with an incomplete
        # listing no longer raises IndexError while writing.
        for row in zip(distance, community, area, room, direction, price):
            linajiawriter.writerow(row)

04、结语:

09-12 11:37