Skip to content

Commit

Permalink
Merge pull request #2 from ZacharyHampton/all_3_sites
Browse files Browse the repository at this point in the history
feat: run all 3 sites with one call
  • Loading branch information
ZacharyHampton committed Sep 18, 2023
2 parents fe351ab + 8e140a0 commit d0a6a66
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 40 deletions.
67 changes: 52 additions & 15 deletions homeharvest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import pandas as pd
from typing import Union
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

from .core.scrapers import ScraperInput
from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, SiteName
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType
from typing import Union
import pandas as pd


_scrapers = {
Expand Down Expand Up @@ -91,21 +94,12 @@ def process_result(result: Property) -> pd.DataFrame:
return properties_df


def scrape_property(
location: str,
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
def _scrape_single_site(
location: str, site_name: str, listing_type: str
) -> pd.DataFrame:
"""
Scrape property from various sites from a given location and listing type.
:returns: pd.DataFrame
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties
Helper function to scrape a single site.
"""

validate_input(site_name, listing_type)

scraper_input = ScraperInput(
Expand All @@ -122,3 +116,46 @@ def scrape_property(
return pd.DataFrame()

return pd.concat(properties_dfs, ignore_index=True)


def scrape_property(
    location: str,
    site_name: Union[str, list[str], None] = None,
    listing_type: str = "for_sale",
) -> pd.DataFrame:
    """
    Scrape property from various sites from a given location and listing type.

    :param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
    :param site_name: Site name or list of site names (e.g. ['realtor.com', 'zillow'], 'redfin').
        When omitted (or None) every site registered in ``_scrapers`` is scraped.
    :param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
    :return: pd.DataFrame containing properties, de-duplicated on
        ``street_address`` (first occurrence kept). An empty DataFrame is
        returned when an empty list of sites is given.
    """
    # Resolve the default at call time rather than baking a list into the
    # signature when the module is imported.
    if site_name is None:
        site_name = list(_scrapers.keys())
    elif not isinstance(site_name, list):
        site_name = [site_name]

    if len(site_name) == 1:
        # A single site does not need a thread pool.
        final_df = _scrape_single_site(location, site_name[0], listing_type)
    else:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(_scrape_single_site, location, s_name, listing_type)
                for s_name in site_name
            ]
            # Collect in submission order (not completion order) so that the
            # row order, and therefore which duplicate "keep='first'" retains,
            # does not depend on thread timing. result() re-raises any
            # exception raised inside the worker.
            results = [future.result() for future in futures]

        if not results:
            # pd.concat raises on an empty sequence.
            return pd.DataFrame()
        final_df = pd.concat(results, ignore_index=True)

    return final_df.drop_duplicates(subset="street_address", keep="first")
2 changes: 1 addition & 1 deletion homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def handle_area(
city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"],
unit=result["location"]["address"]["unit"],
unit=parse_address_two(result["location"]["address"]["unit"]),
country="USA",
),
site_name=self.site_name,
Expand Down
26 changes: 5 additions & 21 deletions homeharvest/core/scrapers/zillow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,9 @@ def _parse_properties(self, property_data: dict):
home_info = result["hdpData"]["homeInfo"]
address_data = {
"street_address": home_info["streetAddress"],
"unit": home_info.get("unit"),
"unit": parse_address_two(home_info["unit"])
if "unit" in home_info
else None,
"city": home_info["city"],
"state": home_info["state"],
"zip_code": home_info["zipcode"],
Expand Down Expand Up @@ -213,22 +215,6 @@ def _parse_properties(self, property_data: dict):

return properties_list

def _extract_units(self, result: dict):
    """
    Summarise the unit prices found under ``result["units"]``.

    :param result: listing dict; only the ``"units"`` and
        ``"availabilityCount"`` keys are read here
    :return: empty dict when ``result`` has no ``"units"`` key, otherwise a
        dict with ``apt_availability_count``, ``apt_min_unit_price``,
        ``apt_max_unit_price`` and ``apt_avg_unit_price``. The three price
        fields are None when the unit list is empty.
    """
    summary = {}
    if "units" not in result:
        return summary

    listed_units = result["units"]
    # Strip "$" and "," and drop everything from the first "+" onward before
    # converting each price string to an int.
    prices = [
        int(unit["price"].replace("$", "").replace(",", "").split("+")[0])
        for unit in listed_units
    ]

    summary["apt_availability_count"] = result.get(
        "availabilityCount", len(listed_units)
    )
    # default=None keeps an empty unit list from raising ValueError.
    summary["apt_min_unit_price"] = min(prices, default=None)
    summary["apt_max_unit_price"] = max(prices, default=None)
    # Average over the prices actually summed; the availability count can
    # differ from the number of price entries.
    summary["apt_avg_unit_price"] = sum(prices) // len(prices) if prices else None
    return summary

def _get_single_property_page(self, property_data: dict):
"""
This method is used when a user enters the exact location & zillow returns just one property
Expand All @@ -239,10 +225,9 @@ def _get_single_property_page(self, property_data: dict):
else property_data["hdpUrl"]
)
address_data = property_data["address"]
unit = parse_address_two(address_data["streetAddress"])
address = Address(
street_address=address_data["streetAddress"],
unit=unit,
unit=parse_address_two(address_data["streetAddress"]),
city=address_data["city"],
state=address_data["state"],
zip_code=address_data["zipcode"],
Expand Down Expand Up @@ -301,11 +286,10 @@ def _extract_address(self, address_str):
else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}")

unit = parse_address_two(street_address)
return Address(
street_address=street_address,
city=city,
unit=unit,
unit=parse_address_two(street_address),
state=state,
zip_code=zip_code,
country="USA",
Expand Down
25 changes: 22 additions & 3 deletions homeharvest/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
import re


def parse_address_two(address_one: str):
    """
    Return the trailing apartment / "#" portion of a street address.

    :param address_one: street address line (may be None or empty)
    :return: the matched suffix, stripped of surrounding whitespace (e.g.
        'APT 32', '#4 B'), or None when there is no such suffix or the
        input is empty
    """
    if not address_one:
        # re.search raises TypeError on None.
        return None
    # \b keeps "APT" from matching inside a longer word such as "Baptist".
    apt_match = re.search(r"(\bAPT\s*.+|#[\s\S]+)$", address_one, re.I)
    return apt_match.group().strip() if apt_match else None
def parse_address_two(street_address: str):
    """
    Extract a trailing unit designator from a street address.

    Recognises a suffix of the form ``APT <id>``, ``UNIT <id>``, ``LOT <id>``
    or ``#<id>`` (case-insensitive, ``<id>`` being letters and/or digits) at
    the very end of the string and normalises it to ``#<id>``.

    :param street_address: street address line (may be None or empty)
    :return: the unit as '#<id>' (e.g. '#32', '#3A'), or None when the
        address is empty or has no recognised unit suffix
    """
    if not street_address:
        return None

    # \b stops the keywords from matching inside ordinary words, so that
    # "1234 Camelot Dr" is not read as "LOT Dr". Capturing only the
    # identifier avoids rewriting keyword look-alikes inside the id itself.
    apt_match = re.search(
        r"(?:\b(?:APT|UNIT|LOT)\s*|#)([\dA-Z]+)$",
        street_address,
        re.I,
    )
    if apt_match is None:
        return None
    return f"#{apt_match.group(1)}"


if __name__ == "__main__":
    # Manual smoke check: print the parsed unit for a few sample addresses.
    for sample_address in (
        "810 E Colter St APT 32",
        "1234 Elm Street apt 2B",
        "1234 Elm Street UNIT 3A",
        "1234 Elm Street unit 3A",
    ):
        print(parse_address_two(sample_address))

0 comments on commit d0a6a66

Please sign in to comment.