From 2f5ea1ca886f62c401031ec95aea0d102d778898 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sun, 17 Sep 2023 15:06:31 -0500 Subject: [PATCH] feat(scrapers): add zillow --- .gitignore | 5 +- homeharvest/__init__.py | 19 +- homeharvest/core/scrapers/__init__.py | 14 +- .../core/scrapers/{types.py => models.py} | 21 +- homeharvest/core/scrapers/realtor/__init__.py | 47 ++-- homeharvest/core/scrapers/redfin/__init__.py | 88 ++++---- homeharvest/core/scrapers/zillow/__init__.py | 205 ++++++++++++++++++ homeharvest/exceptions.py | 10 +- tests/test_realtor.py | 5 +- tests/test_redfin.py | 20 +- tests/test_zillow.py | 12 + 11 files changed, 349 insertions(+), 97 deletions(-) rename homeharvest/core/scrapers/{types.py => models.py} (62%) create mode 100644 homeharvest/core/scrapers/zillow/__init__.py create mode 100644 tests/test_zillow.py diff --git a/.gitignore b/.gitignore index d6e2a37..8c31f0c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ /.idea -dist \ No newline at end of file +**/dist/ +**/__pycache__/ +**/.pytest_cache/ +*.pyc \ No newline at end of file diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index 2086c6d..aba1fd5 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -1,26 +1,31 @@ from .core.scrapers.redfin import RedfinScraper from .core.scrapers.realtor import RealtorScraper -from .core.scrapers.types import ListingType, Property +from .core.scrapers.zillow import ZillowScraper +from .core.scrapers.models import ListingType, Property, Building from .core.scrapers import ScraperInput from .exceptions import InvalidSite, InvalidListingType +from typing import Union _scrapers = { "redfin": RedfinScraper, - "realtor.com": RealtorScraper + "realtor.com": RealtorScraper, + "zillow": ZillowScraper, } def scrape_property( - location: str, - site_name: str, - listing_type: str = "for_sale", #: for_sale, for_rent, sold -) -> list[Property]: #: eventually, return pandas dataframe + location: str, + site_name: str, + 
listing_type: str = "for_sale", #: for_sale, for_rent, sold +) -> Union[list[Building], list[Property]]: #: eventually, return pandas dataframe if site_name.lower() not in _scrapers: raise InvalidSite(f"Provided site, '{site_name}', does not exist.") if listing_type.upper() not in ListingType.__members__: - raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.") + raise InvalidListingType( + f"Provided listing type, '{listing_type}', does not exist." + ) scraper_input = ScraperInput( location=location, diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 5d9addc..9b2f67a 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -1,6 +1,6 @@ from dataclasses import dataclass import requests -from .types import Property, ListingType +from .models import Property, ListingType @dataclass @@ -11,9 +11,12 @@ class ScraperInput: class Scraper: + listing_type = ListingType.FOR_SALE + def __init__(self, scraper_input: ScraperInput): self.location = scraper_input.location self.session = requests.Session() + Scraper.listing_type = scraper_input.listing_type if scraper_input.proxy_url: self.session.proxies = { @@ -21,9 +24,12 @@ def __init__(self, scraper_input: ScraperInput): "https": scraper_input.proxy_url, } - def search(self) -> list[Property]: ... + def search(self) -> list[Property]: + ... @staticmethod - def parse_home(home) -> Property: ... + def _parse_home(home) -> Property: + ... - def handle_location(self): ... + def handle_location(self): + ... 
diff --git a/homeharvest/core/scrapers/types.py b/homeharvest/core/scrapers/models.py similarity index 62% rename from homeharvest/core/scrapers/types.py rename to homeharvest/core/scrapers/models.py index 0c5edeb..3c52acf 100644 --- a/homeharvest/core/scrapers/types.py +++ b/homeharvest/core/scrapers/models.py @@ -24,14 +24,29 @@ class Property: url: str beds: int | None = None - baths: int | None = None + baths: float | None = None stories: int | None = None agent_name: str | None = None - description: str | None = None year_built: int | None = None square_feet: int | None = None price_per_square_foot: int | None = None + year_built: int | None = None price: int | None = None mls_id: str | None = None - property_type: str | None = None + listing_type: ListingType | None = None + lot_size: int | None = None + description: str | None = None + + +@dataclass +class Building: + address: Address + url: str + + num_units: int | None = None + min_unit_price: int | None = None + max_unit_price: int | None = None + avg_unit_price: int | None = None + + listing_type: str | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 96a6aa7..8e4fbd8 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -1,5 +1,5 @@ import json -from ..types import Property, Address +from ..models import Property, Address from .. 
import Scraper from typing import Any @@ -10,39 +10,42 @@ def __init__(self, scraper_input): def handle_location(self): headers = { - 'authority': 'parser-external.geo.moveaws.com', - 'accept': '*/*', - 'accept-language': 'en-US,en;q=0.9', - 'origin': 'https://www.realtor.com', - 'referer': 'https://www.realtor.com/', - 'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"Windows"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'cross-site', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', + "authority": "parser-external.geo.moveaws.com", + "accept": "*/*", + "accept-language": "en-US,en;q=0.9", + "origin": "https://www.realtor.com", + "referer": "https://www.realtor.com/", + "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"Windows"', + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "cross-site", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", } params = { - 'input': self.location, - 'client_id': 'for-sale', - 'limit': '1', - 'area_types': 'city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park', + "input": self.location, + "client_id": "for-sale", + "limit": "1", + "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", } - response = self.session.get('https://parser-external.geo.moveaws.com/suggest', params=params, headers=headers) + response = self.session.get( + "https://parser-external.geo.moveaws.com/suggest", + params=params, + headers=headers, + ) response_json = response.json() - return response_json['autocomplete'][0] - + return response_json["autocomplete"][0] def 
search(self): location_info = self.handle_location() - location_type = location_info['area_type'] + location_type = location_info["area_type"] """ property types: apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes """ - print('a') + print("a") diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index e44a46d..3c70325 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -1,5 +1,5 @@ import json -from ..types import Property, Address +from ..models import Property, Address from .. import Scraper from typing import Any @@ -8,11 +8,13 @@ class RedfinScraper(Scraper): def __init__(self, scraper_input): super().__init__(scraper_input) - def handle_location(self): - url = 'https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}'.format(self.location) + def _handle_location(self): + url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format( + self.location + ) response = self.session.get(url) - response_json = json.loads(response.text.replace('{}&&', '')) + response_json = json.loads(response.text.replace("{}&&", "")) def get_region_type(match_type: str): if match_type == "4": @@ -22,51 +24,53 @@ def get_region_type(match_type: str): elif match_type == "1": return "address" #: address, needs to be handled differently - if response_json['payload']['exactMatch'] is not None: - target = response_json['payload']['exactMatch'] + if response_json["payload"]["exactMatch"] is not None: + target = response_json["payload"]["exactMatch"] else: - target = response_json['payload']['sections'][0]['rows'][0] + target = response_json["payload"]["sections"][0]["rows"][0] - return target['id'].split('_')[1], get_region_type(target['type']) + return target["id"].split("_")[1], 
get_region_type(target["type"]) @staticmethod - def parse_home(home: dict, single_search: bool = False) -> Property: + def _parse_home(home: dict, single_search: bool = False) -> Property: def get_value(key: str) -> Any | None: - if key in home and 'value' in home[key]: - return home[key]['value'] + if key in home and "value" in home[key]: + return home[key]["value"] if not single_search: address = Address( - address_one=get_value('streetLine'), - city=home['city'], - state=home['state'], - zip_code=home['zip'] + address_one=get_value("streetLine"), + city=home["city"], + state=home["state"], + zip_code=home["zip"], ) else: - address_info = home['streetAddress'] + address_info = home["streetAddress"] address = Address( - address_one=address_info['assembledAddress'], - city=home['city'], - state=home['state'], - zip_code=home['zip'] + address_one=address_info["assembledAddress"], + city=home["city"], + state=home["state"], + zip_code=home["zip"], ) - url = 'https://www.redfin.com{}'.format(home['url']) + url = "https://www.redfin.com{}".format(home["url"]) return Property( address=address, url=url, - beds=home['beds'] if 'beds' in home else None, - baths=home['baths'] if 'baths' in home else None, - stories=home['stories'] if 'stories' in home else None, - agent_name=get_value('listingAgent'), - description=home['listingRemarks'] if 'listingRemarks' in home else None, - year_built=get_value('yearBuilt') if not single_search else home['yearBuilt'], - square_feet=get_value('sqFt'), - price_per_square_foot=get_value('pricePerSqFt'), - price=get_value('price'), - mls_id=get_value('mlsId') + beds=home["beds"] if "beds" in home else None, + baths=home["baths"] if "baths" in home else None, + stories=home["stories"] if "stories" in home else None, + agent_name=get_value("listingAgent"), + description=home["listingRemarks"] if "listingRemarks" in home else None, + year_built=get_value("yearBuilt") + if not single_search + else home["yearBuilt"], + 
square_feet=get_value("sqFt"), + price_per_square_foot=get_value("pricePerSqFt"), + price=get_value("price"), + mls_id=get_value("mlsId"), ) def handle_address(self, home_id: str): @@ -78,25 +82,33 @@ def handle_address(self, home_id: str): https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3 """ - url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(home_id) + url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format( + home_id + ) response = self.session.get(url) - response_json = json.loads(response.text.replace('{}&&', '')) + response_json = json.loads(response.text.replace("{}&&", "")) - parsed_home = self.parse_home(response_json['payload']['addressSectionInfo'], single_search=True) + parsed_home = self._parse_home( + response_json["payload"]["addressSectionInfo"], single_search=True + ) return [parsed_home] def search(self): - region_id, region_type = self.handle_location() + region_id, region_type = self._handle_location() if region_type == "address": home_id = region_id return self.handle_address(home_id) - url = 'https://www.redfin.com/stingray/api/gis?al=1®ion_id={}®ion_type={}'.format(region_id, region_type) + url = "https://www.redfin.com/stingray/api/gis?al=1®ion_id={}®ion_type={}".format( + region_id, region_type + ) response = self.session.get(url) - response_json = json.loads(response.text.replace('{}&&', '')) + response_json = json.loads(response.text.replace("{}&&", "")) - homes = [self.parse_home(home) for home in response_json['payload']['homes']] #: support buildings + homes = [ + self._parse_home(home) for home in response_json["payload"]["homes"] + ] #: support buildings return homes diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py new file mode 100644 index 0000000..a167e2a --- /dev/null +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -0,0 
+1,205 @@ +import re +import json +from ..models import Property, Address, Building, ListingType +from ....exceptions import NoResultsFound, PropertyNotFound +from .. import Scraper + + +class ZillowScraper(Scraper): + listing_type = ListingType.FOR_SALE + + def __init__(self, scraper_input): + super().__init__(scraper_input) + if self.listing_type == ListingType.FOR_SALE: + self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/" + elif self.listing_type == ListingType.FOR_RENT: + self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/" + + def search(self): + resp = self.session.get(self.url, headers=self._get_headers()) + resp.raise_for_status() + content = resp.text + + match = re.search( + r'', + content, + re.DOTALL, + ) + if not match: + raise NoResultsFound( + "No results were found for Zillow with the given Location." + ) + + json_str = match.group(1) + data = json.loads(json_str) + + if "searchPageState" in data["props"]["pageProps"]: + houses = data["props"]["pageProps"]["searchPageState"]["cat1"][ + "searchResults" + ]["listResults"] + return [self._parse_home(house) for house in houses] + elif "gdpClientCache" in data["props"]["pageProps"]: + gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"]) + main_key = list(gdp_client_cache.keys())[0] + + property_data = gdp_client_cache[main_key]["property"] + property = self._get_single_property_page(property_data) + + return [property] + raise PropertyNotFound("Specific property data not found in the response.") + + @classmethod + def _parse_home(cls, home: dict): + """ + This method is used when a user enters a generic location & zillow returns more than one property + """ + url = ( + f"https://www.zillow.com{home['detailUrl']}" + if "zillow.com" not in home["detailUrl"] + else home["detailUrl"] + ) + + if "hdpData" in home and "homeInfo" in home["hdpData"]: + price_data = cls._extract_price(home) + address = cls._extract_address(home) + agent_name = 
cls._extract_agent_name(home) + beds = home["hdpData"]["homeInfo"]["bedrooms"] + baths = home["hdpData"]["homeInfo"]["bathrooms"] + listing_type = home["hdpData"]["homeInfo"].get("homeType") + + return Property( + address=address, + agent_name=agent_name, + url=url, + beds=beds, + baths=baths, + listing_type=listing_type, + **price_data, + ) + else: + keys = ("addressStreet", "addressCity", "addressState", "addressZipcode") + address_one, city, state, zip_code = (home[key] for key in keys) + address_one, address_two = cls._parse_address_two(address_one) + address = Address(address_one, city, state, zip_code, address_two) + + building_info = cls._extract_building_info(home) + return Building(address=address, url=url, **building_info) + + @classmethod + def _get_single_property_page(cls, property_data: dict): + """ + This method is used when a user enters the exact location & zillow returns just one property + """ + url = ( + f"https://www.zillow.com{property_data['hdpUrl']}" + if "zillow.com" not in property_data["hdpUrl"] + else property_data["hdpUrl"] + ) + address_data = property_data["address"] + address_one, address_two = cls._parse_address_two(address_data["streetAddress"]) + address = Address( + address_one=address_one, + address_two=address_two, + city=address_data["city"], + state=address_data["state"], + zip_code=address_data["zipcode"], + ) + + return Property( + address=address, + url=url, + beds=property_data.get("bedrooms", None), + baths=property_data.get("bathrooms", None), + year_built=property_data.get("yearBuilt", None), + price=property_data.get("price", None), + lot_size=property_data.get("lotSize", None), + agent_name=property_data.get("attributionInfo", {}).get("agentName", None), + stories=property_data.get("resoFacts", {}).get("stories", None), + description=property_data.get("description", None), + mls_id=property_data.get("attributionInfo", {}).get("mlsId", None), + price_per_square_foot=property_data.get("resoFacts", {}).get( + 
"pricePerSquareFoot", None + ), + square_feet=property_data.get("livingArea", None), + listing_type=property_data.get("homeType", None), + ) + + @classmethod + def _extract_building_info(cls, home: dict) -> dict: + num_units = len(home["units"]) + prices = [ + int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) + for unit in home["units"] + ] + return { + "listing_type": cls.listing_type, + "num_units": len(home["units"]), + "min_unit_price": min( + ( + int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) + for unit in home["units"] + ) + ), + "max_unit_price": max( + ( + int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) + for unit in home["units"] + ) + ), + "avg_unit_price": sum(prices) // len(prices) if num_units else None, + } + + @staticmethod + def _extract_price(home: dict) -> dict: + price = int(home["hdpData"]["homeInfo"]["priceForHDP"]) + square_feet = home["hdpData"]["homeInfo"].get("livingArea") + + lot_size = home["hdpData"]["homeInfo"].get("lotAreaValue") + price_per_square_foot = price // square_feet if square_feet and price else None + + return { + k: v + for k, v in locals().items() + if k in ["price", "square_feet", "lot_size", "price_per_square_foot"] + } + + @staticmethod + def _extract_agent_name(home: dict) -> str | None: + broker_str = home.get("brokerName", "") + match = re.search(r"Listing by: (.+)", broker_str) + return match.group(1) if match else None + + @staticmethod + def _parse_address_two(address_one: str): + apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I) + address_two = apt_match.group().strip() if apt_match else None + address_one = ( + address_one.replace(address_two, "").strip() if address_two else address_one + ) + return address_one, address_two + + @staticmethod + def _extract_address(home: dict) -> Address: + keys = ("streetAddress", "city", "state", "zipcode") + address_one, city, state, zip_code = ( + home["hdpData"]["homeInfo"][key] for key in keys + ) + 
address_one, address_two = ZillowScraper._parse_address_two(address_one) + return Address(address_one, city, state, zip_code, address_two=address_two) + + @staticmethod + def _get_headers(): + return { + "authority": "parser-external.geo.moveaws.com", + "accept": "*/*", + "accept-language": "en-US,en;q=0.9", + "origin": "https://www.zillow.com", + "referer": "https://www.zillow.com/", + "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"Windows"', + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "cross-site", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", + } diff --git a/homeharvest/exceptions.py b/homeharvest/exceptions.py index 7cad9ec..99df9ef 100644 --- a/homeharvest/exceptions.py +++ b/homeharvest/exceptions.py @@ -1,8 +1,14 @@ class InvalidSite(Exception): """Raised when a provided site is does not exist.""" - pass class InvalidListingType(Exception): """Raised when a provided listing type is does not exist.""" - pass + + +class NoResultsFound(Exception): + """Raised when no results are found for the given location""" + + +class PropertyNotFound(Exception): + """Raised when no property is found for the given address""" diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 665db8b..2649177 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -3,10 +3,7 @@ def test_realtor(): results = [ - scrape_property( - location="85281", - site_name="realtor.com" - ), + scrape_property(location="85281", site_name="realtor.com"), ] assert all([result is not None for result in results]) diff --git a/tests/test_redfin.py b/tests/test_redfin.py index 4c0d12d..78fa541 100644 --- a/tests/test_redfin.py +++ b/tests/test_redfin.py @@ -3,22 +3,10 @@ def test_redfin(): results = [ - scrape_property( - location="2530 Al Lipscomb Way", - site_name="redfin" - ), - 
scrape_property( - location="Phoenix, AZ, USA", - site_name="redfin" - ), - scrape_property( - location="Dallas, TX, USA", - site_name="redfin" - ), - scrape_property( - location="85281", - site_name="redfin" - ), + scrape_property(location="2530 Al Lipscomb Way", site_name="redfin"), + scrape_property(location="Phoenix, AZ, USA", site_name="redfin"), + scrape_property(location="Dallas, TX, USA", site_name="redfin"), + scrape_property(location="85281", site_name="redfin"), ] assert all([result is not None for result in results]) diff --git a/tests/test_zillow.py b/tests/test_zillow.py new file mode 100644 index 0000000..d9a56dc --- /dev/null +++ b/tests/test_zillow.py @@ -0,0 +1,12 @@ +from homeharvest import scrape_property + + +def test_zillow(): + results = [ + scrape_property(location="2530 Al Lipscomb Way", site_name="zillow"), + scrape_property(location="Phoenix, AZ, USA", site_name="zillow"), + scrape_property(location="Dallas, TX, USA", site_name="zillow"), + scrape_property(location="85281", site_name="zillow"), + ] + + assert all([result is not None for result in results])