This post walks through a problem encountered while scraping remax.com and a recommended solution; it may be a useful reference for anyone hitting the same errors.

Problem Description

I am trying to scrape some data from Remax.com for information like the lot size or square feet of a property. However, I get the following errors:

---------------------------------------------------------------------------
Error                                     Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    440             try:
--> 441                 cnx.do_handshake()
    442             except OpenSSL.SSL.WantReadError:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\SSL.py in do_handshake(self)
   1715         result = _lib.SSL_do_handshake(self._ssl)
-> 1716         self._raise_ssl_error(self._ssl, result)
   1717

~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\SSL.py in _raise_ssl_error(self, ssl, result)
   1455         else:
-> 1456             _raise_current_error()
   1457

~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\_util.py in exception_from_error_queue(exception_type)
     53
---> 54     raise exception_type(errors)
     55

Error: [('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')]

During handling of the above exception, another exception occurred:

SSLError                                  Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    600                                                   body=body, headers=headers,
--> 601                                                   chunked=chunked)
    602

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    345         try:
--> 346             self._validate_conn(conn)
    347         except (SocketTimeout, BaseSSLError) as e:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
    849         if not getattr(conn, 'sock', None):  # AppEngine might not have  `.sock`
--> 850             conn.connect()
    851

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
    325             server_hostname=hostname,
--> 326             ssl_context=context)
    327

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)
    328     if HAS_SNI:  # Platform-specific: OpenSSL with enabled SNI
--> 329         return context.wrap_socket(sock, server_hostname=server_hostname)
    330

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    447             except OpenSSL.SSL.Error as e:
--> 448                 raise ssl.SSLError('bad handshake: %r' % e)
    449             break

SSLError: ("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",)

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    439                     retries=self.max_retries,
--> 440                     timeout=timeout
    441                 )

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    638             retries = retries.increment(method, url, error=e, _pool=self,
--> 639                                         _stacktrace=sys.exc_info()[2])
    640             retries.sleep()

~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    387         if new_retry.is_exhausted():
--> 388             raise MaxRetryError(_pool, url, error or ResponseError(cause))
    389

MaxRetryError: HTTPSConnectionPool(host='www.remax.com', port=443): Max retries exceeded with url: /api/listings?nwlat=33.8426971435546875&nwlong=-118.3811187744140625&selat=33.8426971435546875&selong=-118.3783721923828125&Count=100&pagenumber=1&SiteID=68000000&pageCount=10&tab=map&sh=true&forcelatlong=true&maplistings=1&maplistcards=0&sv=true&sortorder=newest&view=forsale (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))

During handling of the above exception, another exception occurred:

SSLError                                  Traceback (most recent call last)
<ipython-input-22-bcfdfdfb0a4e> in <module>()
----> 1 get_info('119 S IRENA AVE B, Redondo Beach, CA 90277')

<ipython-input-21-f3c942a87400> in get_info(address)
     32         }
     33 #         proxies = {'http': 'http://user:pass@10.10.1.10:3128/'}
---> 34         req_properties = requests.get("https://www.remax.com/api/listings", params=params)
     35         matching_properties_json = req_properties.json()
     36         for p in matching_properties_json[0]:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
     70
     71     kwargs.setdefault('allow_redirects', True)
---> 72     return request('get', url, params=params, **kwargs)
     73
     74

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
---> 58         return session.request(method=method, url=url, **kwargs)
     59
     60

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    506         }
    507         send_kwargs.update(settings)
--> 508         resp = self.send(prep, **send_kwargs)
    509
    510         return resp

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
    616
    617         # Send the request
--> 618         r = adapter.send(request, **kwargs)
    619
    620         # Total elapsed time of the request (approximately)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    504             if isinstance(e.reason, _SSLError):
    505                 # This branch is for urllib3 v1.22 and later.
--> 506                 raise SSLError(e, request=request)
    507
    508             raise ConnectionError(e, request=request)

SSLError: HTTPSConnectionPool(host='www.remax.com', port=443): Max retries exceeded with url: /api/listings?nwlat=33.8426971435546875&nwlong=-118.3811187744140625&selat=33.8426971435546875&selong=-118.3783721923828125&Count=100&pagenumber=1&SiteID=68000000&pageCount=10&tab=map&sh=true&forcelatlong=true&maplistings=1&maplistcards=0&sv=true&sortorder=newest&view=forsale (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))

Here is my code:

import urllib
from bs4 import BeautifulSoup
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import geolib
from geolib import geohash
from geopy.extra.rate_limiter import RateLimiter
import requests

geolocator = Nominatim(timeout=None)
def get_dir(address):
    location = geolocator.geocode(address)
    lat = location.latitude
    lng = location.longitude
    h = geolib.geohash.encode(lat, lng, 7)
    hashes = geolib.geohash.neighbours(h)
    NW = geohash.decode(hashes.nw)
    SE = geohash.decode(hashes.ne)
    nwlat = NW.lat
    nwlon = NW.lon
    selat = SE.lat
    selon = SE.lon
    return nwlat, nwlon, selat, selon

def get_info(address):
    try:
        nwlat, nwlon, selat, selon = get_dir(address)
        params = {
        "nwlat" : nwlat,
        "nwlong" : nwlon,
        "selat" : selat,
        "selong" : selon,
        "Count" : 100,
        "pagenumber" : 1,
        "SiteID" : "68000000",
        "pageCount" : "10",
        "tab" : "map",
        "sh" : "true",
        "forcelatlong" : "true",
        "maplistings" : "1",
        "maplistcards" : "0",
        "sv" : "true",
        "sortorder" : "newest",
        "view" : "homeestimates",
        }
        proxies = {'http': 'http://user:pass@10.10.1.10:3128/'}
        req_properties = requests.get("https://www.remax.com/api/listings", params=params, proxies=proxies, verify=False)
        matching_properties_json = req_properties.json()
        for p in matching_properties_json[0]:
            print(f"{p['Address']:<40}  {p.get('BedRooms', 0)} beds | {int(p.get('BathRooms',0))} baths | {p['SqFt']} sqft")
    except (AttributeError):
        return 'NaN'

x = get_info('693 Bluebird Canyon Drive, Laguna Beach CA, 92651')
print(x)

I am not sure how to fix this problem, as I am new to web scraping. I tried adding a proxy in the code, but I still get the same errors shown above.
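Since the underlying error is "certificate verify failed", there are two common ways to deal with it in requests: keep verification on but point it at an explicit CA bundle, or turn verification off. A minimal sketch, assuming the failure comes from the trust chain (the bounding-box values below are placeholders; the real ones come from get_dir()):

import certifi           # CA bundle that ships with, and is used by, requests
import requests

URL = "https://www.remax.com/api/listings"
# Placeholder bounding box, purely for illustration.
params = {"nwlat": 33.8427, "nwlong": -118.3811,
          "selat": 33.8427, "selong": -118.3784,
          "Count": 100, "pagenumber": 1}

# Option 1: keep certificate verification enabled, but pass an explicit,
# up-to-date CA bundle (or a corporate CA file if traffic is intercepted).
r = requests.get(URL, params=params, verify=certifi.where(), timeout=30)

# Option 2: disable verification entirely (insecure; only for quick testing).
r = requests.get(URL, params=params, verify=False, timeout=30)

print(r.status_code)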

Update:

Adding

proxies = {'http': 'http://user:pass@10.10.1.10:3128/'}
req_properties = requests.get("https://www.remax.com/api/listings", params=params, proxies=proxies, verify=False)

yields no errors, but also no output at all.
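One way to see why nothing is printed is to inspect the raw response before looping over it. A minimal sketch, assuming the same endpoint and that the JSON body is a list whose first element holds the listings (as the loop in the code above assumes); the bounding-box values are placeholders:

import json
import requests
import urllib3

# Silence the InsecureRequestWarning triggered by verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Placeholder bounding box around Laguna Beach, for illustration only.
params = {"nwlat": 33.55, "nwlong": -117.80, "selat": 33.53, "selong": -117.77,
          "Count": 100, "pagenumber": 1, "tab": "map", "sh": "true",
          "maplistings": "1", "sortorder": "newest", "view": "forsale"}

resp = requests.get("https://www.remax.com/api/listings", params=params, verify=False)

print(resp.status_code)   # 200 means the request itself succeeded
print(resp.url)           # the exact URL that was queried, with every parameter
listings = resp.json()
print(len(listings[0]) if listings else 0)   # how many listings came back

# Dump one raw record to see which fields the API actually returns.
if listings and listings[0]:
    print(json.dumps(listings[0][0], indent=2)[:500])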

Recommended Answer

There seem to be a number of issues:

  1. The proxy is not the problem, since you said the previous question worked without one being configured.

  2. Your geohash.decode(hashes.ne) call uses ne where it should use se.

  3. The returned coordinates do not match any valid properties; in that case the API appears to return a different kind of response which does not include the values you want (it does include the price, though).

  4. Make sure verify=False is configured for the get call; the resulting certificate warning can be suppressed.

If the search square is increased slightly in size, the API does return results:

import urllib
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import geolib
from geolib import geohash
from geopy.extra.rate_limiter import RateLimiter
import requests


# Disable the certificate warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
geolocator = Nominatim(timeout=None)


def get_dir(address):
    location = geolocator.geocode(address)
    lat = location.latitude
    lng = location.longitude
    h = geolib.geohash.encode(lat, lng, 7)
    hashes = geolib.geohash.neighbours(h)
    NW = geohash.decode(hashes.nw)
    SE = geohash.decode(hashes.se)

    return NW, SE


def get_info(address):
    try:
        NW, SE = get_dir(address)
        square_size = 0.001

        params = {
            "nwlat" : float(NW.lat) + square_size,
            "nwlong" : float(NW.lon) - square_size,
            "selat" : float(SE.lat) - square_size,
            "selong" : float(SE.lon) + square_size,
            "Count" : 100,
            "pagenumber" : 1,
            "SiteID" : "68000000",
            "pageCount" : "10",
            "tab" : "map",
            "sh" : "true",
            "forcelatlong" : "true",
            "maplistings" : "1",
            "maplistcards" : "0",
            "sv" : "true",
            "sortorder" : "newest",
            "view" : "homeestimates",
        }

        req_properties = requests.get("https://www.remax.com/api/listings", params=params, verify=False)
        matching_properties_json = req_properties.json()

        for p in matching_properties_json[0]:
            address = f"{p['Address']}, {p['City']}, {p['State']}, {p['Zip']}"

            try:
                print(f"  {address:<50} | {p.get('BedRooms', 0)} beds | {int(p.get('BathRooms',0))} baths | {p['SqFt']} sqft")
            except KeyError:
                print(f"None found - {address} - ${p['PriceFormatted']}")

    except (AttributeError):
        return 'NaN'

get_info('693 Bluebird Canyon Drive, Laguna Beach CA, 92651')

This displays:

  1566 Glenneyre Street, Laguna Beach, CA, 92651     | 0 beds | 0 baths |  sqft
  1585 S Coast 4, Laguna Beach, CA, 92651            | 3 beds | 2 baths | 1448 sqft
  429 Shadow Lane, Laguna Beach, CA, 92651           | 2 beds | 2 baths | 1102 sqft
  243 Calliope Street 1, Laguna Beach, CA, 92651     | 2 beds | 2 baths | 1350 sqft
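pandas is imported in both snippets but never used; if the goal is to analyse the results rather than print them, a hypothetical helper along these lines (listings_to_frame is not part of the original answer) could collect the same fields into a DataFrame:

import pandas as pd

def listings_to_frame(listings):
    """Collect the fields printed above into a DataFrame (hypothetical helper)."""
    rows = []
    for p in listings:
        rows.append({
            "address": f"{p.get('Address', '')}, {p.get('City', '')}, {p.get('State', '')}, {p.get('Zip', '')}",
            "beds": p.get("BedRooms", 0),
            "baths": p.get("BathRooms", 0),
            "sqft": p.get("SqFt"),
            "price": p.get("PriceFormatted"),
        })
    return pd.DataFrame(rows)

# Usage inside get_info(), after the JSON has been fetched:
# df = listings_to_frame(matching_properties_json[0])
# print(df.sort_values("sqft", ascending=False).head())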

That concludes this post on scraping remax.com; hopefully the recommended answer is helpful.
