From ba9fe806a7240e4e7df09a0defd7935e7a4ff4e9 Mon Sep 17 00:00:00 2001 From: Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:16:59 -0700 Subject: [PATCH] - finished realtor --- homeharvest/__init__.py | 8 +- homeharvest/core/scrapers/__init__.py | 4 +- homeharvest/core/scrapers/models.py | 3 +- homeharvest/core/scrapers/realtor/__init__.py | 233 +++++++++++++++++- tests/test_realtor.py | 3 + 5 files changed, 236 insertions(+), 15 deletions(-) diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index f817806..4afbbc5 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -69,9 +69,9 @@ def process_result(result: Union[Building, Property]) -> pd.DataFrame: prop_data = result.__dict__ address_data = prop_data["address"] - prop_data["site_name"] = prop_data["site_name"].value + prop_data["site_name"] = prop_data["site_name"] prop_data["listing_type"] = prop_data["listing_type"].value - prop_data["property_type"] = prop_data["property_type"].value.lower() + prop_data["property_type"] = prop_data["property_type"].value.lower() if prop_data["property_type"] else None prop_data["address_one"] = address_data.address_one prop_data["city"] = address_data.city prop_data["state"] = address_data.state @@ -90,13 +90,13 @@ def scrape_property( location: str, site_name: str, listing_type: str = "for_sale", #: for_sale, for_rent, sold -) -> Union[list[Building], list[Property]]: +) -> pd.DataFrame: validate_input(site_name, listing_type) scraper_input = ScraperInput( location=location, listing_type=ListingType[listing_type.upper()], - site_name=SiteName[site_name.upper()], + site_name=site_name.lower(), ) site = _scrapers[site_name.lower()](scraper_input) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 873bf76..e985eec 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -7,13 +7,15 @@ class ScraperInput: location: str listing_type: ListingType - site_name: SiteName + site_name: str proxy_url: str | None = None class Scraper: def __init__(self, scraper_input: ScraperInput): self.location = scraper_input.location + self.listing_type = scraper_input.listing_type + self.session = requests.Session() self.listing_type = scraper_input.listing_type self.site_name = scraper_input.site_name diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 1a3db97..b715fbd 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -53,7 +53,7 @@ class Address: @dataclass() class Realty: - site_name: SiteName + site_name: str address: Address url: str listing_type: ListingType | None = None @@ -68,7 +68,6 @@ class Property(Realty): year_built: int | None = None square_feet: int | None = None price_per_square_foot: int | None = None - year_built: int | None = None mls_id: str | None = None agent_name: str | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 8e4fbd8..d3660f6 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -1,12 +1,15 @@ import json from ..models import Property, Address from .. import Scraper -from typing import Any +from typing import Any, Generator +from ....exceptions import NoResultsFound +from concurrent.futures import ThreadPoolExecutor, as_completed class RealtorScraper(Scraper): def __init__(self, scraper_input): super().__init__(scraper_input) + self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta" def handle_location(self): headers = { @@ -26,7 +29,7 @@ def handle_location(self): params = { "input": self.location, - "client_id": "for-sale", + "client_id": self.listing_type.value.replace('_', '-'), "limit": "1", "area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park", } @@ -38,14 +41,228 @@ def handle_location(self): ) response_json = response.json() - return response_json["autocomplete"][0] + result = response_json["autocomplete"] + + if result is None: + raise NoResultsFound("No results found for location: " + self.location) + + return result[0] + + def handle_address(self, property_id: str) -> list[Property]: + query = """query Property($property_id: ID!) { + property(id: $property_id) { + property_id + details { + date_updated + garage + permalink + year_built + stories + } + address { + address_validation_code + city + country + county + line + postal_code + state_code + street_direction + street_name + street_number + street_suffix + street_post_direction + unit_value + unit + unit_descriptor + zip + } + basic { + baths + beds + price + sqft + lot_sqft + type + sold_price + } + public_record { + lot_size + sqft + stories + units + year_built + } + } + }""" + + variables = { + 'property_id': property_id + } + + payload = { + 'query': query, + 'variables': variables, + } + + response = self.session.post(self.search_url, json=payload) + response_json = response.json() + + property_info = response_json['data']['property'] + + return [Property( + site_name=self.site_name, + address=Address( + address_one=property_info['address']['line'], + city=property_info['address']['city'], + state=property_info['address']['state_code'], + zip_code=property_info['address']['postal_code'], + ), + url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'], + beds=property_info['basic']['beds'], + baths=property_info['basic']['baths'], + stories=property_info['details']['stories'], + year_built=property_info['details']['year_built'], + square_feet=property_info['basic']['sqft'], + price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft'] + if property_info['basic']['sqft'] is not None and + property_info['basic']['price'] is not None + else None, + price=property_info['basic']['price'], + mls_id=property_id, + listing_type=self.listing_type, + lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None, + )] + + def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int: + query = """query Home_search( + $city: String, + $county: [String], + $state_code: String, + $postal_code: String + $offset: Int, + ) { + home_search( + query: { + city: $city + county: $county + postal_code: $postal_code + state_code: $state_code + status: %s + } + limit: 200 + offset: $offset + ) { + count + total + results { + property_id + description { + baths + beds + lot_sqft + sqft + text + sold_price + stories + year_built + garage + unit_number + floor_number + } + location { + address { + city + country + line + postal_code + state_code + state + street_direction + street_name + street_number + street_post_direction + street_suffix + unit + } + } + list_price + price_per_sqft + source { + id + } + } + } + }""" % self.listing_type.value + + payload = { + 'query': query, + 'variables': variables, + } + + response = self.session.post(self.search_url, json=payload) + response_json = response.json() + + if return_total: + return response_json['data']['home_search']['total'] + + properties: list[Property] = [] + + for result in response_json['data']['home_search']['results']: + realty_property = Property( + address=Address( + address_one=result['location']['address']['line'], + city=result['location']['address']['city'], + state=result['location']['address']['state_code'], + zip_code=result['location']['address']['postal_code'], + address_two=result['location']['address']['unit'], + ), + site_name=self.site_name, + url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'], + beds=result['description']['beds'], + baths=result['description']['baths'], + stories=result['description']['stories'], + year_built=result['description']['year_built'], + square_feet=result['description']['sqft'], + price_per_square_foot=result['price_per_sqft'], + price=result['list_price'], + mls_id=result['property_id'], + listing_type=self.listing_type, + lot_size=result['description']['lot_sqft'], + ) + + properties.append(realty_property) + + return properties def search(self): location_info = self.handle_location() location_type = location_info["area_type"] - """ - property types: - apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes - """ - print("a") + if location_type == 'address': + property_id = location_info['mpr_id'] + return self.handle_address(property_id) + + offset = 0 + search_variables = { + 'city': location_info.get('city'), + 'county': location_info.get('county'), + 'state_code': location_info.get('state_code'), + 'postal_code': location_info.get('postal_code'), + 'offset': offset, + } + + total = self.handle_area(search_variables, return_total=True) + + homes = [] + with ThreadPoolExecutor(max_workers=10) as executor: + futures = [ + executor.submit( + self.handle_area, variables=search_variables | {'offset': i}, return_total=False + ) for i in range(0, total, 200) + ] + + for future in as_completed(futures): + homes.extend(future.result()) + + return homes diff --git a/tests/test_realtor.py b/tests/test_realtor.py index 2649177..291eb12 100644 --- a/tests/test_realtor.py +++ b/tests/test_realtor.py @@ -3,6 +3,9 @@ def test_realtor(): results = [ + scrape_property(location="2530 Al Lipscomb Way", site_name="realtor.com"), + scrape_property(location="Phoenix, AZ", site_name="realtor.com"), #: does not support "city, state, USA" format + scrape_property(location="Dallas, TX", site_name="realtor.com"), #: does not support "city, state, USA" format scrape_property(location="85281", site_name="realtor.com"), ]