Skip to content

Commit

Permalink
[enh] date_to and date_from
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Nov 3, 2023
1 parent c597a78 commit 4edad90
Show file tree
Hide file tree
Showing 8 changed files with 108 additions and 60 deletions.
68 changes: 39 additions & 29 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,11 @@ filename = f"HomeHarvest_{current_timestamp}.csv"
properties = scrape_property(
location="San Diego, CA",
listing_type="sold", # or (for_sale, for_rent, pending)
past_days=30, # sold in last 30 days - listed in last x days if (for_sale, for_rent)
past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)

# date_from="2023-05-01", # alternative to past_days
# date_to="2023-05-28",

# mls_only=True, # only fetch MLS listings
# proxy="http://user:pass@host:port" # use a proxy to change your IP address
)
Expand All @@ -57,34 +61,6 @@ properties.to_csv(filename, index=False)
print(properties.head())
```

### CLI

```
usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location
Home Harvest Property Scraper
positional arguments:
location Location to scrape (e.g., San Francisco, CA)
options:
-l {for_sale,for_rent,sold,pending}, --listing_type {for_sale,for_rent,sold,pending}
Listing type to scrape
-o {excel,csv}, --output {excel,csv}
Output format
-f FILENAME, --filename FILENAME
Name of the output file (without extension)
-p PROXY, --proxy PROXY
Proxy to use for scraping
-d DAYS, --days DAYS Sold/listed in last _ days filter.
-r RADIUS, --radius RADIUS
Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses.
-m, --mls_only If set, fetches only MLS listings.
```
```bash
homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest
```


## Output
```plaintext
Expand Down Expand Up @@ -115,11 +91,45 @@ Optional
├── past_days (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).
│ Example: 30 (fetches properties listed/sold in the last 30 days)
├── date_from, date_to (string): Start and end dates to filter properties listed or sold, both dates are required.
│ (use this to get properties in chunks as there's a 10k result limit)
│ Format for both must be "YYYY-MM-DD".
│ Example: date_from="2023-05-01", date_to="2023-05-15" (fetches properties listed/sold between these dates)
├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
└── proxy (string): In format 'http://user:pass@host:port'
```

### CLI

```
usage: homeharvest [-l {for_sale,for_rent,sold,pending}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] [-c] location
Home Harvest Property Scraper
positional arguments:
location Location to scrape (e.g., San Francisco, CA)
options:
-l {for_sale,for_rent,sold,pending}, --listing_type {for_sale,for_rent,sold,pending}
Listing type to scrape
-o {excel,csv}, --output {excel,csv}
Output format
-f FILENAME, --filename FILENAME
Name of the output file (without extension)
-p PROXY, --proxy PROXY
Proxy to use for scraping
-d DAYS, --days DAYS Sold/listed in last _ days filter.
-r RADIUS, --radius RADIUS
Get comparable properties within _ (e.g., 0.0) miles. Only applicable for individual addresses.
-m, --mls_only If set, fetches only MLS listings.
```
```bash
homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest
```

### Property Schema
```plaintext
Property
Expand Down
11 changes: 8 additions & 3 deletions homeharvest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import warnings
import pandas as pd
from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input
from .utils import process_result, ordered_properties, validate_input, validate_dates
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType
from .exceptions import InvalidListingType, NoResultsFound


def scrape_property(
Expand All @@ -14,6 +13,8 @@ def scrape_property(
mls_only: bool = False,
past_days: int = None,
proxy: str = None,
date_from: str = None,
date_to: str = None,
) -> pd.DataFrame:
"""
Scrape properties from Realtor.com based on a given location and listing type.
Expand All @@ -22,9 +23,11 @@ def scrape_property(
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param proxy: Proxy to use for scraping
"""
validate_input(listing_type)
validate_dates(date_from, date_to)

scraper_input = ScraperInput(
location=location,
Expand All @@ -33,14 +36,16 @@ def scrape_property(
radius=radius,
mls_only=mls_only,
last_x_days=past_days,
date_from=date_from,
date_to=date_to,
)

site = RealtorScraper(scraper_input)
results = site.search()

properties_dfs = [process_result(result) for result in results]
if not properties_dfs:
raise NoResultsFound("no results found for the query")
return pd.DataFrame()

with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
Expand Down
4 changes: 4 additions & 0 deletions homeharvest/core/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ class ScraperInput:
mls_only: bool | None = None
proxy: str | None = None
last_x_days: int | None = None
date_from: str | None = None
date_to: str | None = None


class Scraper:
Expand All @@ -36,6 +38,8 @@ def __init__(
self.radius = scraper_input.radius
self.last_x_days = scraper_input.last_x_days
self.mls_only = scraper_input.mls_only
self.date_from = scraper_input.date_from
self.date_to = scraper_input.date_to

def search(self) -> list[Property]:
...
Expand Down
26 changes: 15 additions & 11 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from concurrent.futures import ThreadPoolExecutor, as_completed

from .. import Scraper
from ....exceptions import NoResultsFound
from ..models import Property, Address, ListingType, Description


Expand Down Expand Up @@ -38,7 +37,7 @@ def handle_location(self):
result = response_json["autocomplete"]

if not result:
raise NoResultsFound("No results found for location: " + self.location)
return None

return result[0]

Expand Down Expand Up @@ -336,15 +335,17 @@ def general_search(
}
}"""

date_param = (
'sold_date: { min: "$today-%sD" }' % self.last_x_days
if self.listing_type == ListingType.SOLD and self.last_x_days
else (
'list_date: { min: "$today-%sD" }' % self.last_x_days
if self.last_x_days
else ""
)
)
date_param = ""
if self.listing_type == ListingType.SOLD:
if self.date_from and self.date_to:
date_param = f'sold_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
elif self.last_x_days:
date_param = f'sold_date: {{ min: "$today-{self.last_x_days}D" }}'
else:
if self.date_from and self.date_to:
date_param = f'list_date: {{ min: "{self.date_from}", max: "{self.date_to}" }}'
elif self.last_x_days:
date_param = f'list_date: {{ min: "$today-{self.last_x_days}D" }}'

sort_param = (
"sort: [{ field: sold_date, direction: desc }]"
Expand Down Expand Up @@ -509,6 +510,9 @@ def general_search(

def search(self):
location_info = self.handle_location()
if not location_info:
return []

location_type = location_info["area_type"]

search_variables = {
Expand Down
5 changes: 2 additions & 3 deletions homeharvest/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
class InvalidListingType(Exception):
    """Raised when a provided listing type does not exist."""


class NoResultsFound(Exception):
"""Raised when no results are found for the given location"""
class InvalidDate(Exception):
    """Raised when only one of date_from or date_to is provided, a date is not in YYYY-MM-DD format (e.g. 2023-10-23), or date_to is earlier than date_from."""
20 changes: 18 additions & 2 deletions homeharvest/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .core.scrapers.models import Property, ListingType
import pandas as pd
from .exceptions import InvalidListingType
from datetime import datetime
from .core.scrapers.models import Property, ListingType
from .exceptions import InvalidListingType, InvalidDate

ordered_properties = [
"property_url",
Expand Down Expand Up @@ -70,3 +71,18 @@ def validate_input(listing_type: str) -> None:
raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist."
)


def validate_dates(date_from: str | None, date_to: str | None) -> None:
if (date_from is not None and date_to is None) or (date_from is None and date_to is not None):
raise InvalidDate("Both date_from and date_to must be provided.")

if date_from and date_to:
try:
date_from_obj = datetime.strptime(date_from, "%Y-%m-%d")
date_to_obj = datetime.strptime(date_to, "%Y-%m-%d")

if date_to_obj < date_from_obj:
raise InvalidDate("date_to must be after date_from.")
except ValueError as e:
raise InvalidDate(f"Invalid date format or range")
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.7"
version = "0.3.8"
description = "Real estate scraping library supporting Zillow, Realtor.com & Redfin."
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/HomeHarvest"
Expand Down
32 changes: 21 additions & 11 deletions tests/test_realtor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from homeharvest import scrape_property
from homeharvest.exceptions import (
InvalidListingType,
NoResultsFound,
)


Expand Down Expand Up @@ -85,6 +84,20 @@ def test_realtor_last_x_days_sold():
) and len(days_result_30) != len(days_result_10)


def test_realtor_date_range_sold():
    """A wider sold-date window should return more listings than a narrower one inside it."""
    narrow_window = scrape_property(
        location="Dallas, TX",
        listing_type="sold",
        date_from="2023-05-01",
        date_to="2023-05-28",
    )
    wide_window = scrape_property(
        location="Dallas, TX",
        listing_type="sold",
        date_from="2023-04-01",
        date_to="2023-06-10",
    )

    # Check for missing results first so len() is never called on None.
    assert narrow_window is not None
    assert wide_window is not None
    assert len(narrow_window) < len(wide_window)


def test_realtor_single_property():
results = [
scrape_property(
Expand Down Expand Up @@ -117,15 +130,12 @@ def test_realtor():

assert all([result is not None for result in results])

bad_results = []
try:
bad_results += [
scrape_property(
location="abceefg ju098ot498hh9",
listing_type="for_sale",
)
]
except (InvalidListingType, NoResultsFound):

def test_realtor_bad_address():
    """An unresolvable location should yield an empty result set."""
    bad_results = scrape_property(
        location="abceefg ju098ot498hh9",
        listing_type="for_sale",
    )
    # Assert directly so the test fails if any listings come back.
    assert len(bad_results) == 0

assert all([result is None for result in bad_results])

0 comments on commit 4edad90

Please sign in to comment.