diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index b3075c5..6ae6955 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -53,7 +53,7 @@ class Address: city: str state: str zip_code: str - unit: str + unit: str | None = None country: str | None = None diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py index 29855a7..f1d9c29 100644 --- a/homeharvest/core/scrapers/redfin/__init__.py +++ b/homeharvest/core/scrapers/redfin/__init__.py @@ -1,7 +1,8 @@ import json -from ..models import Property, Address, PropertyType -from .. import Scraper from typing import Any +from .. import Scraper +from ....utils import parse_address_two +from ..models import Property, Address, PropertyType class RedfinScraper(Scraper): @@ -38,20 +39,26 @@ def get_value(key: str) -> Any | None: return home[key]["value"] if not single_search: + unit = parse_address_two(get_value("streetLine")) address = Address( - address_one=get_value("streetLine"), + street_address=get_value("streetLine"), city=home["city"], state=home["state"], zip_code=home["zip"], + unit=unit, + country="USA", ) else: address_info = home["streetAddress"] + unit = parse_address_two(address_info["assembledAddress"]) address = Address( - address_one=address_info["assembledAddress"], + street_address=address_info["assembledAddress"], city=home["city"], state=home["state"], zip_code=home["zip"], + unit=unit, + country="USA", ) url = "https://www.redfin.com{}".format(home["url"]) property_type = home["propertyType"] if "propertyType" in home else None @@ -69,7 +76,7 @@ def get_value(key: str) -> Any | None: site_name=self.site_name, listing_type=self.listing_type, address=address, - url=url, + property_url=url, beds=home["beds"] if "beds" in home else None, baths=home["baths"] if "baths" in home else None, stories=home["stories"] if "stories" in home else None, @@ -79,9 +86,9 @@ def get_value(key: 
str) -> Any | None: if not single_search else home["yearBuilt"], square_feet=get_value("sqFt"), - lot_size=lot_size, + lot_area_value=lot_size, property_type=PropertyType.from_int_code(home.get("propertyType")), - price_per_square_foot=get_value("pricePerSqFt"), + price_per_sqft=get_value("pricePerSqFt"), price=get_value("price"), mls_id=get_value("mlsId"), ) diff --git a/homeharvest/core/scrapers/zillow/__init__.py b/homeharvest/core/scrapers/zillow/__init__.py index 5843b40..6c36196 100644 --- a/homeharvest/core/scrapers/zillow/__init__.py +++ b/homeharvest/core/scrapers/zillow/__init__.py @@ -1,8 +1,9 @@ import re import json -from ..models import Property, Address, ListingType, PropertyType, SiteName -from ....exceptions import NoResultsFound, PropertyNotFound from .. import Scraper +from ....utils import parse_address_two +from ....exceptions import NoResultsFound, PropertyNotFound +from ..models import Property, Address, ListingType, PropertyType, SiteName class ZillowScraper(Scraper): @@ -120,7 +121,7 @@ def _fetch_properties_backend(self, coords): resp = self.session.put(url, headers=self._get_headers(), data=payload) resp.raise_for_status() a = resp.json() - return self._parse_properties(resp.json()) + return self._parse_properties(resp.json()) def _parse_properties(self, property_data: dict): mapresults = property_data["cat1"]["searchResults"]["mapResults"] @@ -249,7 +250,7 @@ def _get_single_property_page(self, property_data: dict): else property_data["hdpUrl"] ) address_data = property_data["address"] - unit = self._parse_address_two(address_data["streetAddress"]) + unit = parse_address_two(address_data["streetAddress"]) address = Address( street_address=address_data["streetAddress"], unit=unit, @@ -288,11 +289,6 @@ def _get_single_property_page(self, property_data: dict): listing_type=self.listing_type, ) - @staticmethod - def _parse_address_two(address_one: str): - apt_match = re.search(r"(APT\s*.+|#[\s\S]+)$", address_one, re.I) - return
apt_match.group().strip() if apt_match else None - def _extract_address(self, address_str): """ Extract address components from a string formatted like '555 Wedglea Dr, Dallas, TX', @@ -309,14 +305,14 @@ def _extract_address(self, address_str): if len(state_zip) == 1: state = state_zip[0].strip() - zip_code = None + zip_code = None elif len(state_zip) == 2: state = state_zip[0].strip() zip_code = state_zip[1].strip() else: raise ValueError(f"Unexpected state/zip format in address: {address_str}") - unit = self._parse_address_two(street_address) + unit = parse_address_two(street_address) return Address( street_address=street_address, city=city, @@ -335,7 +331,7 @@ def _get_headers(): "content-type": "application/json", "cookie": 'zjs_user_id=null; zg_anonymous_id=%220976ab81-2950-4013-98f0-108b15a554d2%22; zguid=24|%246b1bc625-3955-4d1e-a723-e59602e4ed08; g_state={"i_p":1693611172520,"i_l":1}; zgsession=1|d48820e2-1659-4d2f-b7d2-99a8127dd4f3; zjs_anonymous_id=%226b1bc625-3955-4d1e-a723-e59602e4ed08%22; JSESSIONID=82E8274D3DC8AF3AB9C8E613B38CF861; search=6|1697585860120%7Crb%3DDallas%252C-TX%26rect%3D33.016646%252C-96.555516%252C32.618763%252C-96.999347%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%263dhome%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0938128%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09; AWSALB=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; AWSALBCORS=gAlFj5Ngnd4bWP8k7CME/+YlTtX9bHK4yEkdPHa3VhL6K523oGyysFxBEpE1HNuuyL+GaRPvt2i/CSseAb+zEPpO4SNjnbLAJzJOOO01ipnWN3ZgPaa5qdv+fAki; 
search=6|1697587741808%7Crect%3D33.37188814545521%2C-96.34484483007813%2C32.260490641365685%2C-97.21001816992188%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26sort%3Ddays%26z%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26featuredMultiFamilyBuilding%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%09%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09', "origin": "https://www.zillow.com", - "referer": "https://www.zillow.com/homes/Dallas,-TX_rb/", + "referer": "https://www.zillow.com", "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', "sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": '"Windows"',
diff --git a/homeharvest/utils.py b/homeharvest/utils.py
new file mode 100644
index 0000000..a22cdcf
--- /dev/null
+++ b/homeharvest/utils.py
@@ -0,0 +1,20 @@
+import re
+
+
+def parse_address_two(address_one: str | None) -> str | None:
+    """Return the trailing unit designator of a street address, if any.
+
+    Matches an "APT ..." suffix (case-insensitive, only when "APT" is a
+    standalone token, so street names such as "Baptist" or "Aptos" are not
+    mistaken for units) or a "#..." suffix.
+
+    :param address_one: street address line; None or "" is accepted because
+        callers may pass a missing street line straight through.
+    :return: the matched suffix with surrounding whitespace stripped, or None
+        when there is no address or no unit suffix.
+    """
+    if not address_one:
+        # re.search() raises TypeError on None; treat a missing address as "no unit".
+        return None
+    apt_match = re.search(r"(\bAPT(?![a-z])\s*.+|#[\s\S]+)$", address_one, re.I)
+    return apt_match.group().strip() if apt_match else None