I am trying to scrape some data from Remax.com, such as the lot size or square footage of a property. However, I am getting the following errors:
Error Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
440 try:
--> 441 cnx.do_handshake()
442 except OpenSSL.SSL.WantReadError:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\SSL.py in do_handshake(self)
1715 result = _lib.SSL_do_handshake(self._ssl)
-> 1716 self._raise_ssl_error(self._ssl, result)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\SSL.py in _raise_ssl_error(self, ssl, result)
1455 else:
-> 1456 _raise_current_error()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\OpenSSL\_util.py in exception_from_error_queue(exception_type)
---> 54 raise exception_type(errors)
Error: [('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')]
During handling of the above exception, another exception occurred:
SSLError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
600 body=body, headers=headers,
--> 601 chunked=chunked)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
345 try:
--> 346 self._validate_conn(conn)
347 except (SocketTimeout, BaseSSLError) as e:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
849 if not getattr(conn, 'sock', None): # AppEngine might not have `.sock`
--> 850 conn.connect()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
325 server_hostname=hostname,
--> 326 ssl_context=context)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)
328 if HAS_SNI: # Platform-specific: OpenSSL with enabled SNI
--> 329 return context.wrap_socket(sock, server_hostname=server_hostname)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
447 except OpenSSL.SSL.Error as e:
--> 448 raise ssl.SSLError('bad handshake: %r' % e)
449 break
SSLError: ("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",)
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
439 retries=self.max_retries,
--> 440 timeout=timeout
441 )
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
638 retries = retries.increment(method, url, error=e, _pool=self,
--> 639 _stacktrace=sys.exc_info()[2])
640 retries.sleep()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
387 if new_retry.is_exhausted():
--> 388 raise MaxRetryError(_pool, url, error or ResponseError(cause))
MaxRetryError: HTTPSConnectionPool(host='www.remax.com', port=443): Max retries exceeded with url: /api/listings?nwlat=33.8426971435546875&nwlong=-118.3811187744140625&selat=33.8426971435546875&selong=-118.3783721923828125&Count=100&pagenumber=1&SiteID=68000000&pageCount=10&tab=map&sh=true&forcelatlong=true&maplistings=1&maplistcards=0&sv=true&sortorder=newest&view=forsale (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))
During handling of the above exception, another exception occurred:
SSLError Traceback (most recent call last)
<ipython-input-22-bcfdfdfb0a4e> in <module>()
----> 1 get_info('119 S IRENA AVE B, Redondo Beach, CA 90277')
<ipython-input-21-f3c942a87400> in get_info(address)
32 }
33 # proxies = {'http': 'http://user:pass@'}
---> 34 req_properties = requests.get("https://www.remax.com/api/listings", params=params)
35 matching_properties_json = req_properties.json()
36 for p in matching_properties_json[0]:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
71 kwargs.setdefault('allow_redirects', True)
---> 72 return request('get', url, params=params, **kwargs)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
56 # cases, and look like a memory leak in others.
57 with sessions.Session() as session:
---> 58 return session.request(method=method, url=url, **kwargs)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
506 }
507 send_kwargs.update(settings)
--> 508 resp = self.send(prep, **send_kwargs)
510 return resp
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
617 # Send the request
--> 618 r = adapter.send(request, **kwargs)
620 # Total elapsed time of the request (approximately)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
504 if isinstance(e.reason, _SSLError):
505 # This branch is for urllib3 v1.22 and later.
--> 506 raise SSLError(e, request=request)
508 raise ConnectionError(e, request=request)
SSLError: HTTPSConnectionPool(host='www.remax.com', port=443): Max retries exceeded with url: /api/listings?nwlat=33.8426971435546875&nwlong=-118.3811187744140625&selat=33.8426971435546875&selong=-118.3783721923828125&Count=100&pagenumber=1&SiteID=68000000&pageCount=10&tab=map&sh=true&forcelatlong=true&maplistings=1&maplistcards=0&sv=true&sortorder=newest&view=forsale (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import geolib
from geolib import geohash
from geopy.extra.rate_limiter import RateLimiter
import requests
# Module-level Nominatim geocoder used by get_dir(); created once at import time.
geolocator = Nominatim(timeout=None)
def get_dir(address):
    """Geocode *address* and return the corner coordinates of a search box.

    The address is geocoded, encoded as a precision-7 geohash, and two of
    the neighbouring geohash cells are decoded to obtain the corners.

    Returns:
        A ``(nwlat, nwlon, selat, selon)`` tuple.
    """
    place = geolocator.geocode(address)
    cell = geolib.geohash.encode(place.latitude, place.longitude, 7)
    hashes = geolib.geohash.neighbours(cell)

    north_west = geohash.decode(hashes.nw)
    # NOTE(review): the "SE" corner is decoded from the north-east neighbour.
    south_east = geohash.decode(hashes.ne)

    return north_west.lat, north_west.lon, south_east.lat, south_east.lon
def get_info(address):
    """Print the listings the Remax API returns for the area around *address*.

    The search box comes from get_dir(); each listing in the first element
    of the JSON response is printed as one line (address, beds, baths, sqft).

    Returns:
        ``'NaN'`` if an AttributeError is raised anywhere in the lookup,
        otherwise ``None``.
    """
    # The whole lookup is guarded: an AttributeError (for instance from
    # get_dir(), presumably when the geocoder returns no location) is
    # reported as 'NaN' instead of propagating.
    try:
        nwlat, nwlon, selat, selon = get_dir(address)
        params = {
            "nwlat" : nwlat,
            "nwlong" : nwlon,
            "selat" : selat,
            "selong" : selon,
            "Count" : 100,
            "pagenumber" : 1,
            "SiteID" : "68000000",
            "pageCount" : "10",
            "tab" : "map",
            "sh" : "true",
            "forcelatlong" : "true",
            "maplistings" : "1",
            "maplistcards" : "0",
            "sv" : "true",
            "sortorder" : "newest",
            "view" : "homeestimates",
        }
        proxies = {'http': 'http://user:pass@'}
        # verify=False skips TLS certificate verification for this request.
        req_properties = requests.get("https://www.remax.com/api/listings", params=params, proxies=proxies, verify=False)
        matching_properties_json = req_properties.json()
        for p in matching_properties_json[0]:
            print(f"{p['Address']:<40} {p.get('BedRooms', 0)} beds | {int(p.get('BathRooms',0))} baths | {p['SqFt']} sqft")
    except (AttributeError):
        return 'NaN'
# Example call for one address; the function's return value is stored in x.
x = get_info('693 Bluebird Canyon Drive, Laguna Beach CA, 92651')
I am not sure how to fix this problem, as I am new to web scraping. I tried adding a proxy in the code, but I still get the same errors as in the traceback above. With verify=False added as well, the call
proxies = {'http': 'http://user:pass@'}
req_properties = requests.get("https://www.remax.com/api/listings", params=params, proxies=proxies, verify=False)
yields no errors but also no output at all.
The proxy is not the issue: as you said, the code in your previous question works without one being configured.
Your geohash.decode(hashes.ne) call is using ne (north-east) instead of se (south-east), so the second corner of the search box is wrong.
The resulting coordinates do not match any valid properties. In this case the API appears to return a different kind of response, which does not include the values you want. It does include the price, though.
Make sure that verify=False is passed to requests.get(). The resulting warning message can be suppressed.
If the search square is increased slightly in size, it does return results:
import urllib
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import geolib
from geolib import geohash
from geopy.extra.rate_limiter import RateLimiter
import requests
# Disable the certificate warning that urllib3 emits for requests made with
# verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Module-level Nominatim geocoder used by get_dir(); created once at import time.
geolocator = Nominatim(timeout=None)
def get_dir(address):
    """Geocode *address* and return the corners of a surrounding search box.

    The address is geocoded and encoded as a precision-7 geohash; the
    north-west and south-east neighbouring cells are then decoded.

    Returns:
        A ``(NW, SE)`` pair of decoded geohash points, each exposing
        ``lat`` and ``lon`` attributes.
    """
    place = geolocator.geocode(address)
    cell = geolib.geohash.encode(place.latitude, place.longitude, 7)
    around = geolib.geohash.neighbours(cell)
    return geohash.decode(around.nw), geohash.decode(around.se)
def get_info(address):
    """Print the listings the Remax API returns for the area around *address*.

    The search box from get_dir() is widened by ``square_size`` degrees on
    every side before querying. Each listing in the first element of the
    JSON response is printed as one line; a listing missing one of the
    printed keys is reported with its formatted price instead.

    Returns:
        ``'NaN'`` if an AttributeError is raised anywhere in the lookup,
        otherwise ``None``.
    """
    # The whole lookup is guarded: an AttributeError (for instance from
    # get_dir(), presumably when the geocoder returns no location) is
    # reported as 'NaN' instead of propagating.
    try:
        NW, SE = get_dir(address)
        # Grow the box outwards: north/west edges move up/left, south/east
        # edges move down/right.
        square_size = 0.001
        params = {
            "nwlat" : float(NW.lat) + square_size,
            "nwlong" : float(NW.lon) - square_size,
            "selat" : float(SE.lat) - square_size,
            "selong" : float(SE.lon) + square_size,
            "Count" : 100,
            "pagenumber" : 1,
            "SiteID" : "68000000",
            "pageCount" : "10",
            "tab" : "map",
            "sh" : "true",
            "forcelatlong" : "true",
            "maplistings" : "1",
            "maplistcards" : "0",
            "sv" : "true",
            "sortorder" : "newest",
            "view" : "homeestimates",
        }
        # verify=False skips TLS certificate verification for this request.
        req_properties = requests.get("https://www.remax.com/api/listings", params=params, verify=False)
        matching_properties_json = req_properties.json()
        for p in matching_properties_json[0]:
            address = f"{p['Address']}, {p['City']}, {p['State']}, {p['Zip']}"
            try:
                print(f" {address:<50} | {p.get('BedRooms', 0)} beds | {int(p.get('BathRooms',0))} baths | {p['SqFt']} sqft")
            except KeyError:
                # The entry lacks a key used above (e.g. 'SqFt'); show its price.
                print(f"None found - {address} - ${p['PriceFormatted']}")
    except (AttributeError):
        return 'NaN'
# Example call: print the listings found around this address.
get_info('693 Bluebird Canyon Drive, Laguna Beach CA, 92651')
1566 Glenneyre Street, Laguna Beach, CA, 92651 | 0 beds | 0 baths | sqft
1585 S Coast 4, Laguna Beach, CA, 92651 | 3 beds | 2 baths | 1448 sqft
429 Shadow Lane, Laguna Beach, CA, 92651 | 2 beds | 2 baths | 1102 sqft
243 Calliope Street 1, Laguna Beach, CA, 92651 | 2 beds | 2 baths | 1350 sqft