diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index f817806..009aee6 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -1,7 +1,7 @@ from .core.scrapers.redfin import RedfinScraper from .core.scrapers.realtor import RealtorScraper from .core.scrapers.zillow import ZillowScraper -from .core.scrapers.models import ListingType, Property, Building, SiteName +from .core.scrapers.models import ListingType, Property, SiteName from .core.scrapers import ScraperInput from .exceptions import InvalidSite, InvalidListingType from typing import Union @@ -25,60 +25,65 @@ def validate_input(site_name: str, listing_type: str) -> None: ) -def get_ordered_properties(result: Union[Building, Property]) -> list[str]: - if isinstance(result, Property): - return [ - "listing_type", - "address_one", - "city", - "state", - "zip_code", - "address_two", - "url", - "property_type", - "price", - "beds", - "baths", - "square_feet", - "price_per_square_foot", - "lot_size", - "stories", - "year_built", - "agent_name", - "mls_id", - "description", - ] - elif isinstance(result, Building): - return [ - "address_one", - "city", - "state", - "zip_code", - "address_two", - "url", - "num_units", - "min_unit_price", - "max_unit_price", - "avg_unit_price", - "listing_type", - ] - return [] - - -def process_result(result: Union[Building, Property]) -> pd.DataFrame: +def get_ordered_properties(result: Property) -> list[str]: + """Return the DataFrame column order used for scraped properties.""" + return [ + "property_url", + "site_name", + "listing_type", + "property_type", + "status_text", + "currency", + "price", + "apt_min_price", + "tax_assessed_value", + "square_feet", + "price_per_sqft", + "beds", + "baths", + "lot_area_value", + "lot_area_unit", + "street_address", + "unit", + "city", + "state", + "zip_code", + "country", + "posted_time", + "bldg_min_beds", + "bldg_min_baths", + "bldg_min_area", + "bldg_unit_count", + "bldg_name", + "stories", + "year_built", + "agent_name", + "mls_id", + "description", + "img_src", + "latitude", + 
"longitude", + ] + + +def process_result(result: Property) -> pd.DataFrame: + """Flatten one scraped Property into a single-row DataFrame.""" - prop_data = result.__dict__ + # Copy so that flattening does not mutate the caller's Property. + prop_data = dict(result.__dict__) - address_data = prop_data["address"] prop_data["site_name"] = prop_data["site_name"].value - prop_data["listing_type"] = prop_data["listing_type"].value + prop_data["listing_type"] = prop_data["listing_type"].value.lower() prop_data["property_type"] = prop_data["property_type"].value.lower() - prop_data["address_one"] = address_data.address_one - prop_data["city"] = address_data.city - prop_data["state"] = address_data.state - prop_data["zip_code"] = address_data.zip_code - prop_data["address_two"] = address_data.address_two + if "address" in prop_data: + address_data = prop_data["address"] + prop_data["street_address"] = address_data.street_address + prop_data["unit"] = address_data.unit + prop_data["city"] = address_data.city + prop_data["state"] = address_data.state + prop_data["zip_code"] = address_data.zip_code + prop_data["country"] = address_data.country - del prop_data["address"] + del prop_data["address"] properties_df = pd.DataFrame([prop_data]) properties_df = properties_df[get_ordered_properties(result)] @@ -90,7 +95,7 @@ def scrape_property( location: str, site_name: str, listing_type: str = "for_sale", #: for_sale, for_rent, sold -) -> Union[list[Building], list[Property]]: +) -> pd.DataFrame: validate_input(site_name, listing_type) scraper_input = ScraperInput( @@ -103,5 +108,7 @@ def scrape_property( results = site.search() properties_dfs = [process_result(result) for result in results] + if not properties_dfs: + return pd.DataFrame() return pd.concat(properties_dfs, ignore_index=True) diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 1a3db97..b3075c5 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -9,22 +9,28 @@ class SiteName(Enum): class ListingType(Enum): - FOR_SALE = "for_sale" - FOR_RENT = "for_rent" - SOLD = "sold" + FOR_SALE = "FOR_SALE" + FOR_RENT 
= "FOR_RENT" + SOLD = "SOLD" class PropertyType(Enum): HOUSE = "HOUSE" + BUILDING = "BUILDING" CONDO = "CONDO" TOWNHOUSE = "TOWNHOUSE" SINGLE_FAMILY = "SINGLE_FAMILY" MULTI_FAMILY = "MULTI_FAMILY" MANUFACTURED = "MANUFACTURED" + NEW_CONSTRUCTION = "NEW_CONSTRUCTION" APARTMENT = "APARTMENT" + APARTMENTS = "APARTMENTS" LAND = "LAND" + LOT = "LOT" OTHER = "OTHER" + BLANK = "BLANK" + @classmethod def from_int_code(cls, code): mapping = { @@ -38,48 +44,56 @@ def from_int_code(cls, code): 13: cls.SINGLE_FAMILY, } - return mapping.get(code, cls.OTHER) + return mapping.get(code, cls.BLANK) @dataclass class Address: - address_one: str + street_address: str city: str state: str zip_code: str - - address_two: str | None = None + unit: str + country: str | None = None -@dataclass() -class Realty: +@dataclass +class Property: + property_url: str site_name: SiteName + listing_type: ListingType + property_type: PropertyType address: Address - url: str - listing_type: ListingType | None = None - -@dataclass -class Property(Realty): + # house for sale price: int | None = None + tax_assessed_value: int | None = None + currency: str | None = None + square_feet: int | None = None beds: int | None = None baths: float | None = None + lot_area_value: float | None = None + lot_area_unit: str | None = None stories: int | None = None year_built: int | None = None - square_feet: int | None = None - price_per_square_foot: int | None = None + price_per_sqft: int | None = None year_built: int | None = None mls_id: str | None = None agent_name: str | None = None - property_type: PropertyType | None = None - lot_size: int | None = None + img_src: str | None = None description: str | None = None - - -@dataclass -class Building(Realty): - num_units: int | None = None - min_unit_price: int | None = None - max_unit_price: int | None = None - avg_unit_price: int | None = None + status_text: str | None = None + latitude: float | None = None + longitude: float | None = None + posted_time: str | None = 
None + + # building for sale + bldg_name: str | None = None + bldg_unit_count: int | None = None + bldg_min_beds: int | None = None + bldg_min_baths: float | None = None + bldg_min_area: int | None = None + + # apt + apt_min_price: int | None = None diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 1b25c16..5843b40 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -1,6 +1,6 @@ import re import json -from ..models import Property, Address, Building, ListingType, PropertyType +from ..models import Property, Address, ListingType, PropertyType, SiteName -from ....exceptions import NoResultsFound, PropertyNotFound +from ....exceptions import NoResultsFound, PropertyNotFound, BoxBoundsNotFound from .. import Scraper @@ -13,6 +13,8 @@ def __init__(self, scraper_input): self.url = f"https://www.zillow.com/homes/for_sale/{self.location}_rb/" elif self.listing_type == ListingType.FOR_RENT: self.url = f"https://www.zillow.com/homes/for_rent/{self.location}_rb/" + else: + self.url = f"https://www.zillow.com/homes/recently_sold/{self.location}_rb/" def search(self): resp = self.session.get(self.url, headers=self._get_headers()) @@ -33,10 +35,17 @@ def search(self): data = json.loads(json_str) if "searchPageState" in data["props"]["pageProps"]: - houses = data["props"]["pageProps"]["searchPageState"]["cat1"][ - "searchResults" - ]["listResults"] - return [self._parse_home(house) for house in houses] + pattern = r'window\.mapBounds = \{\s*"west":\s*(-?\d+\.\d+),\s*"east":\s*(-?\d+\.\d+),\s*"south":\s*(-?\d+\.\d+),\s*"north":\s*(-?\d+\.\d+)\s*\};' + + match = re.search(pattern, content) + + if match: + coords = [float(coord) for coord in match.groups()] + return self._fetch_properties_backend(coords) + + else: + raise BoxBoundsNotFound("Box bounds could not be located.") + elif "gdpClientCache" in data["props"]["pageProps"]: gdp_client_cache = json.loads(data["props"]["pageProps"]["gdpClientCache"]) main_key = 
list(gdp_client_cache.keys())[0] @@ -47,45 +56,188 @@ def search(self): return [property] raise PropertyNotFound("Specific property data not found in the response.") - def _parse_home(self, home: dict): - """ - This method is used when a user enters a generic location & zillow returns more than one property - """ - url = ( - f"https://www.zillow.com{home['detailUrl']}" - if "zillow.com" not in home["detailUrl"] - else home["detailUrl"] + def _fetch_properties_backend(self, coords): + url = "https://www.zillow.com/async-create-search-page-state" + + filter_state_for_sale = { + "sortSelection": { + # "value": "globalrelevanceex" + "value": "days" + }, + "isAllHomes": {"value": True}, + } + + filter_state_for_rent = { + "isForRent": {"value": True}, + "isForSaleByAgent": {"value": False}, + "isForSaleByOwner": {"value": False}, + "isNewConstruction": {"value": False}, + "isComingSoon": {"value": False}, + "isAuction": {"value": False}, + "isForSaleForeclosure": {"value": False}, + "isAllHomes": {"value": True}, + } + + filter_state_sold = { + "isRecentlySold": {"value": True}, + "isForSaleByAgent": {"value": False}, + "isForSaleByOwner": {"value": False}, + "isNewConstruction": {"value": False}, + "isComingSoon": {"value": False}, + "isAuction": {"value": False}, + "isForSaleForeclosure": {"value": False}, + "isAllHomes": {"value": True}, + } + + selected_filter = ( + filter_state_for_rent + if self.listing_type == ListingType.FOR_RENT + else filter_state_for_sale + if self.listing_type == ListingType.FOR_SALE + else filter_state_sold ) - if "hdpData" in home and "homeInfo" in home["hdpData"]: - price_data = self._extract_price(home) - address = self._extract_address(home) - agent_name = self._extract_agent_name(home) - beds = home["hdpData"]["homeInfo"]["bedrooms"] - baths = home["hdpData"]["homeInfo"]["bathrooms"] - property_type = home["hdpData"]["homeInfo"].get("homeType") - - return Property( - site_name=self.site_name, - address=address, - agent_name=agent_name, 
- url=url, - beds=beds, - baths=baths, - listing_type=self.listing_type, - property_type=PropertyType(property_type), - **price_data, - ) - else: - keys = ("addressStreet", "addressCity", "addressState", "addressZipcode") - address_one, city, state, zip_code = (home[key] for key in keys) - address_one, address_two = self._parse_address_two(address_one) - address = Address(address_one, city, state, zip_code, address_two) - - building_info = self._extract_building_info(home) - return Building( - site_name=self.site_name, address=address, url=url, **building_info + payload = json.dumps( + { + "searchQueryState": { + "pagination": {}, + "isMapVisible": True, + "mapBounds": { + "west": coords[0], + "east": coords[1], + "south": coords[2], + "north": coords[3], + }, + "filterState": selected_filter, + "isListVisible": True, + "mapZoom": 11, + }, + "wants": {"cat1": ["mapResults"]}, + "isDebugRequest": False, + } + ) + # Zillow's search backend takes the query state as a JSON PUT body. + resp = self.session.put(url, headers=self._get_headers(), data=payload) + resp.raise_for_status() + return self._parse_properties(resp.json()) + + def _parse_properties(self, property_data: dict): + """Build Property objects from the mapResults of a search response.""" + mapresults = property_data["cat1"]["searchResults"]["mapResults"] + + properties_list = [] + + for result in mapresults: + try: + if "hdpData" in result: + home_info = result["hdpData"]["homeInfo"] + address_data = { + "street_address": home_info["streetAddress"], + "unit": home_info.get("unit"), + "city": home_info["city"], + "state": home_info["state"], + "zip_code": home_info["zipcode"], + "country": home_info["country"], + } + property_data = { + "site_name": self.site_name, + "address": Address(**address_data), + "property_url": f"https://www.zillow.com{result['detailUrl']}", + "beds": int(home_info["bedrooms"]) + if "bedrooms" in home_info + else None, + "baths": home_info.get("bathrooms"), + "square_feet": int(home_info["livingArea"]) + if "livingArea" in home_info + else None, + "currency": home_info["currency"], + 
"price": home_info.get("price"), + "tax_assessed_value": int(home_info["taxAssessedValue"]) + if "taxAssessedValue" in home_info + else None, + "property_type": PropertyType(home_info["homeType"]), + "listing_type": ListingType( + home_info["statusType"] + if "statusType" in home_info + else self.listing_type + ), + "lot_area_value": round(home_info["lotAreaValue"], 2) + if "lotAreaValue" in home_info + else None, + "lot_area_unit": home_info.get("lotAreaUnit"), + "latitude": result["latLong"]["latitude"], + "longitude": result["latLong"]["longitude"], + "status_text": result.get("statusText"), + "posted_time": result["variableData"]["text"] + if "variableData" in result + and "text" in result["variableData"] + and result["variableData"]["type"] == "TIME_ON_INFO" + else None, + "img_src": result.get("imgSrc"), + # Missing, None or zero price/area yields None instead of raising. + "price_per_sqft": int( + home_info["price"] // home_info["livingArea"] + ) + if home_info.get("livingArea") and home_info.get("price") + else None, + } + property_obj = Property(**property_data) + properties_list.append(property_obj) + + elif "isBuilding" in result: + price = result["price"] + building_data = { + "property_url": f"https://www.zillow.com{result['detailUrl']}", + "site_name": self.site_name, + "property_type": PropertyType("BUILDING"), + "listing_type": ListingType(result["statusType"]), + "img_src": result["imgSrc"], + "price": int(price.replace("From $", "").replace(",", "")) + if "From $" in price + else None, + "apt_min_price": int( + price.replace("$", "").replace(",", "").replace("+/mo", "") + ) + if "+/mo" in price + else None, + "address": self._extract_address(result["address"]), + "bldg_min_beds": result["minBeds"], + "currency": "USD", + "bldg_min_baths": result["minBaths"], + "bldg_min_area": result.get("minArea"), + "bldg_unit_count": result["unitCount"], + "bldg_name": result.get("communityName"), + "status_text": result["statusText"], + 
"latitude": result["latLong"]["latitude"], + "longitude": result["latLong"]["longitude"], + } + building_obj = Property(**building_data) + properties_list.append(building_obj) + + except (KeyError, TypeError, ValueError) as e: + # Surface the offending record to the caller instead of printing it + # and calling sys.exit(), which would terminate the host process. + raise ValueError(f"Unparseable Zillow result: {result!r}") from e + + return properties_list + + def _extract_units(self, result: dict): + """Summarise the unit prices of a building result as apt_* fields.""" + units = {} + if "units" in result: + num_units = result.get("availabilityCount", len(result["units"])) + prices = [ + int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) + for unit in result["units"] + ] + units["apt_availability_count"] = num_units + units["apt_min_unit_price"] = min(prices, default=None) + units["apt_max_unit_price"] = max(prices, default=None) + # Average over the prices actually listed, not availabilityCount. + units["apt_avg_unit_price"] = ( + sum(prices) // len(prices) if prices else None ) + return units def _get_single_property_page(self, property_data: dict): """ @@ -97,32 +249,38 @@ def _get_single_property_page(self, property_data: dict): else property_data["hdpUrl"] ) address_data = property_data["address"] - address_one, address_two = self._parse_address_two( - address_data["streetAddress"] - ) + unit = self._parse_address_two(address_data["streetAddress"]) address = Address( - address_one=address_one, - address_two=address_two, + street_address=address_data["streetAddress"], + unit=unit, city=address_data["city"], state=address_data["state"], zip_code=address_data["zipcode"], + country=property_data.get("country"), ) property_type = property_data.get("homeType", None) - return Property( site_name=self.site_name, address=address, - url=url, + property_url=url, beds=property_data.get("bedrooms", None), baths=property_data.get("bathrooms", None), year_built=property_data.get("yearBuilt", None), price=property_data.get("price", None), - lot_size=property_data.get("lotSize", None), + tax_assessed_value=property_data.get("taxAssessedValue", None), + latitude=property_data.get("latitude"), + longitude=property_data.get("longitude"), + 
img_src=property_data.get("streetViewTileImageUrlMediumAddress"), + currency=property_data.get("currency", None), + lot_area_value=property_data.get("lotAreaValue"), + lot_area_unit=property_data["lotAreaUnits"].lower() + if "lotAreaUnits" in property_data + else None, agent_name=property_data.get("attributionInfo", {}).get("agentName", None), stories=property_data.get("resoFacts", {}).get("stories", None), description=property_data.get("description", None), mls_id=property_data.get("attributionInfo", {}).get("mlsId", None), - price_per_square_foot=property_data.get("resoFacts", {}).get( + price_per_sqft=property_data.get("resoFacts", {}).get( "pricePerSquareFoot", None ), square_feet=property_data.get("livingArea", None), @@ -130,81 +288,59 @@ def _get_single_property_page(self, property_data: dict): listing_type=self.listing_type, ) - def _extract_building_info(self, home: dict) -> dict: - num_units = len(home["units"]) - prices = [ - int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) - for unit in home["units"] - ] - return { - "listing_type": self.listing_type, - "num_units": len(home["units"]), - "min_unit_price": min( - ( - int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) - for unit in home["units"] - ) - ), - "max_unit_price": max( - ( - int(unit["price"].replace("$", "").replace(",", "").split("+")[0]) - for unit in home["units"] - ) - ), - "avg_unit_price": sum(prices) // len(prices) if num_units else None, - } - @staticmethod - def _extract_price(home: dict) -> dict: - price = int(home["hdpData"]["homeInfo"]["priceForHDP"]) - square_feet = home["hdpData"]["homeInfo"].get("livingArea") + def _parse_address_two(address_one: str): + apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I) + return apt_match.group().strip() if apt_match else None - lot_size = home["hdpData"]["homeInfo"].get("lotAreaValue") - price_per_square_foot = price // square_feet if square_feet and price else None + def _extract_address(self, 
address_str): + """ + Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX', + and return an Address object. + """ + parts = address_str.split(", ") - return { - k: v - for k, v in locals().items() - if k in ["price", "square_feet", "lot_size", "price_per_square_foot"] - } + if len(parts) != 3: + raise ValueError(f"Unexpected address format: {address_str}") - @staticmethod - def _extract_agent_name(home: dict) -> str | None: - broker_str = home.get("brokerName", "") - match = re.search(r"Listing by: (.+)", broker_str) - return match.group(1) if match else None + street_address = parts[0].strip() + city = parts[1].strip() + state_zip = parts[2].split(" ") - @staticmethod - def _parse_address_two(address_one: str): - apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I) - address_two = apt_match.group().strip() if apt_match else None - address_one = ( - address_one.replace(address_two, "").strip() if address_two else address_one - ) - return address_one, address_two + if len(state_zip) == 1: + state = state_zip[0].strip() + zip_code = None + elif len(state_zip) == 2: + state = state_zip[0].strip() + zip_code = state_zip[1].strip() + else: + raise ValueError(f"Unexpected state/zip format in address: {address_str}") - @staticmethod - def _extract_address(home: dict) -> Address: - keys = ("streetAddress", "city", "state", "zipcode") - address_one, city, state, zip_code = ( - home["hdpData"]["homeInfo"][key] for key in keys + unit = self._parse_address_two(street_address) + return Address( + street_address=street_address, + city=city, + unit=unit, + state=state, + zip_code=zip_code, + country="USA", ) - address_one, address_two = ZillowScraper._parse_address_two(address_one) - return Address(address_one, city, state, zip_code, address_two=address_two) @staticmethod def _get_headers(): return { - "authority": "parser-external.geo.moveaws.com", + "authority": "www.zillow.com", "accept": "*/*", "accept-language": "en-US,en;q=0.9", 
+ "content-type": "application/json", + "cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09', "origin": "https://www.zillow.com", - "referer": "https://www.zillow.com/", + "referer": "https://www.zillow.com/homes/Dallas,-TX_rb/", "sec-ch-ua": '"Chromium";v="116", 
"Not)A;Brand";v="24", "Google Chrome";v="116"', "sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": '"Windows"', "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", - "sec-fetch-site": "cross-site", + "sec-fetch-site": "same-origin", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", } diff --git a/homeharvest/exceptions.py b/homeharvest/exceptions.py index 99df9ef..299e02b 100644 --- a/homeharvest/exceptions.py +++ b/homeharvest/exceptions.py @@ -12,3 +12,7 @@ class NoResultsFound(Exception): class PropertyNotFound(Exception): """Raised when no property is found for the given address""" + + +class BoxBoundsNotFound(Exception): + """Raised when no property is found for the given address"""