From fbcd3c88bf48a986dd926fd35fbdc7942bc0b02e Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 3 Nov 2023 18:35:41 -0500 Subject: [PATCH] [enh] date_to and date_from --- README.md | 68 +++++++++++-------- homeharvest/__init__.py | 11 ++- homeharvest/core/scrapers/__init__.py | 4 ++ homeharvest/core/scrapers/realtor/__init__.py | 30 +++++--- homeharvest/exceptions.py | 5 +- homeharvest/utils.py | 20 +++++- pyproject.toml | 2 +- tests/test_realtor.py | 32 ++++++--- 8 files changed, 112 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 1504240..7df35bd 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,11 @@ filename = f"HomeHarvest_{current_timestamp}.csv" properties = scrape_property( location="San Diego, CA", listing_type="sold", # or (for_sale, for_rent, pending) - past_days=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent) + past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent) + + # date_from="2023-05-01", # alternative to past_days + # date_to="2023-05-28", + # mls_only=True, # only fetch MLS listings # proxy="http://user:pass@host:port" # use a proxy to change your IP address ) @@ -57,34 +61,6 @@ properties.to_csv(filename, index=False) print(properties.head()) ``` -### CLI - -``` -usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location - -Home Harvest Property Scraper - -positional arguments: - location Location to scrape (e.g., San Francisco, CA) - -options: - -l {for_sale,for_rent,sold,pending}, --listing_type {for_sale,for_rent,sold,pending} - Listing type to scrape - -o {excel,csv}, --output {excel,csv} - Output format - -f FILENAME, --filename FILENAME - Name of the output file (without extension) - -p PROXY, --proxy PROXY - Proxy to use for scraping - -d DAYS, --days DAYS Sold/listed in last _ days filter. - -r RADIUS, --radius RADIUS - Get comparable properties within _ (e.g., 0.0) miles. 
Only applicable for individual addresses. - -m, --mls_only If set, fetches only MLS listings. -``` -```bash -homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest -``` - ## Output ```plaintext @@ -115,11 +91,45 @@ Optional ├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale). │ Example: 30 (fetches properties listed/sold in the last 30 days) │ +├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required. +│ (use this to get properties in chunks as there's a 10k result limit) +│ Format for both must be "YYYY-MM-DD". +│ Example: "2023-05-01", "2023-05-15" (fetches properties listed/sold between these dates) +│ ├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings) │ └── proxy (string): In format 'http://user:pass@host:port' ``` + +### CLI + +``` +usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location + +Home Harvest Property Scraper + +positional arguments: + location Location to scrape (e.g., San Francisco, CA) + +options: + -l {for_sale,for_rent,sold,pending}, --listing_type {for_sale,for_rent,sold,pending} + Listing type to scrape + -o {excel,csv}, --output {excel,csv} + Output format + -f FILENAME, --filename FILENAME + Name of the output file (without extension) + -p PROXY, --proxy PROXY + Proxy to use for scraping + -d DAYS, --days DAYS Sold/listed in last _ days filter. + -r RADIUS, --radius RADIUS + Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses. + -m, --mls_only If set, fetches only MLS listings. 
+``` +```bash +homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest +``` + ### Property Schema ```plaintext Property diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py index da478fb..44107eb 100644 --- a/homeharvest/__init__.py +++ b/homeharvest/__init__.py @@ -1,10 +1,9 @@ import warnings import pandas as pd from .core.scrapers import ScraperInput -from .utils import process_result, ordered_properties, validate_input +from .utils import process_result, ordered_properties, validate_input, validate_dates from .core.scrapers.realtor import RealtorScraper from .core.scrapers.models import ListingType -from .exceptions import InvalidListingType, NoResultsFound def scrape_property( @@ -14,6 +13,8 @@ def scrape_property( mls_only: bool = False, past_days: int = None, proxy: str = None, + date_from: str = None, + date_to: str = None, ) -> pd.DataFrame: """ Scrape properties from Realtor.com based on a given location and listing type. @@ -22,9 +23,11 @@ def scrape_property( :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses. :param mls_only: If set, fetches only listings with MLS IDs. :param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days. + :param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. 
format: 2021-01-28 :param proxy: Proxy to use for scraping """ validate_input(listing_type) + validate_dates(date_from, date_to) scraper_input = ScraperInput( location=location, @@ -33,6 +36,8 @@ def scrape_property( radius=radius, mls_only=mls_only, last_x_days=past_days, + date_from=date_from, + date_to=date_to, ) site = RealtorScraper(scraper_input) @@ -40,7 +45,7 @@ def scrape_property( properties_dfs = [process_result(result) for result in results] if not properties_dfs: - raise NoResultsFound("no results found for the query") + return pd.DataFrame() with warnings.catch_warnings(): warnings.simplefilter("ignore", category=FutureWarning) diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py index 2871e2d..29dc08f 100644 --- a/homeharvest/core/scrapers/__init__.py +++ b/homeharvest/core/scrapers/__init__.py @@ -11,6 +11,8 @@ class ScraperInput: mls_only: bool | None = None proxy: str | None = None last_x_days: int | None = None + date_from: str | None = None + date_to: str | None = None class Scraper: @@ -36,6 +38,8 @@ def __init__( self.radius = scraper_input.radius self.last_x_days = scraper_input.last_x_days self.mls_only = scraper_input.mls_only + self.date_from = scraper_input.date_from + self.date_to = scraper_input.date_to def search(self) -> list[Property]: ... diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index 1023bf2..dc87920 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -9,7 +9,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from .. 
import Scraper -from ....exceptions import NoResultsFound from ..models import Property, Address, ListingType, Description @@ -38,7 +37,7 @@ def handle_location(self): result = response_json["autocomplete"] if not result: - raise NoResultsFound("No results found for location: " + self.location) + return None return result[0] @@ -336,15 +335,21 @@ def general_search( } }""" - date_param = ( - 'sold_date: { min: "$today-%sD" }' % self.last_x_days - if self.listing_type == ListingType.SOLD and self.last_x_days - else ( - 'list_date: { min: "$today-%sD" }' % self.last_x_days - if self.last_x_days - else "" - ) - ) + date_param = "" + if self.listing_type == ListingType.SOLD: + if self.date_from and self.date_to: + # Use DateStringRange for sold_date + date_param = f'sold_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}' + elif self.last_x_days: + # Use last_x_days for sold_date + date_param = f'sold_date: {{ min: "$today-{self.last_x_days}D" }}' + else: + if self.date_from and self.date_to: + # Use DateStringRange for list_date + date_param = f'list_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}' + elif self.last_x_days: + # Use last_x_days for list_date + date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}' sort_param = ( "sort: [{ field: sold_date, direction: desc }]" @@ -509,6 +514,9 @@ def general_search( def search(self): location_info = self.handle_location() + if not location_info: + return [] + location_type = location_info["area_type"] search_variables = { diff --git a/homeharvest/exceptions.py b/homeharvest/exceptions.py index f018c97..0d71398 100644 --- a/homeharvest/exceptions.py +++ b/homeharvest/exceptions.py @@ -1,6 +1,5 @@ class InvalidListingType(Exception): """Raised when a provided listing type is does not exist.""" - -class NoResultsFound(Exception): - """Raised when no results are found for the given location""" +class InvalidDate(Exception): + """Raised when only one of date_from or date_to is provided or not in 
the correct format. ex: 2023-10-23 """ diff --git a/homeharvest/utils.py b/homeharvest/utils.py index a03ad14..27effef 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -1,6 +1,7 @@ -from .core.scrapers.models import Property, ListingType import pandas as pd -from .exceptions import InvalidListingType +from datetime import datetime +from .core.scrapers.models import Property, ListingType +from .exceptions import InvalidListingType, InvalidDate ordered_properties = [ "property_url", @@ -70,3 +71,18 @@ def validate_input(listing_type: str) -> None: raise InvalidListingType( f"Provided listing type, '{listing_type}', does not exist." ) + + +def validate_dates(date_from: str | None, date_to: str | None) -> None: + if (date_from is not None and date_to is None) or (date_from is None and date_to is not None): + raise InvalidDate("Both date_from and date_to must be provided.") + + if date_from and date_to: + try: + date_from_obj = datetime.strptime(date_from, "%Y-%m-%d") + date_to_obj = datetime.strptime(date_to, "%Y-%m-%d") + + if date_to_obj < date_from_obj: + raise InvalidDate("date_to must be after date_from.") + except ValueError as e: + raise InvalidDate(f"Invalid date format or range") diff --git a/pyproject.toml b/pyproject.toml index 6a2212c..2cb393f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.3.7" +version = "0.3.8" description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin." 
def test_realtor_date_range_sold():
    """A wider sold-date window must return more rows than a narrower window inside it."""
    narrow_window = scrape_property(
        location="Dallas, TX", listing_type="sold", date_from="2023-05-01", date_to="2023-05-28"
    )

    wide_window = scrape_property(
        location="Dallas, TX", listing_type="sold", date_from="2023-04-01", date_to="2023-06-10"
    )

    # Separate asserts so a failure points at the exact condition that broke.
    assert narrow_window is not None
    assert wide_window is not None
    assert len(narrow_window) < len(wide_window)


def test_realtor_bad_address():
    """A location that cannot be resolved yields an empty result rather than an exception."""
    bad_results = scrape_property(
        location="abceefg ju098ot498hh9",
        listing_type="for_sale",
    )

    # The previous `if len(...) == 0: assert True` could never fail; assert the
    # expected outcome directly so a non-empty result is reported.
    assert bad_results is not None
    assert len(bad_results) == 0