
Commit

[enh]: clean data
cullenwatson committed Oct 4, 2023
1 parent 8388d47 commit 1464b4f
Showing 13 changed files with 362 additions and 906 deletions.
123 changes: 22 additions & 101 deletions homeharvest/__init__.py
@@ -3,139 +3,60 @@
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

from .utils import process_result, ordered_properties
from .core.scrapers import ScraperInput
from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, SiteName
from .core.scrapers.models import Status, Property, SiteName
from .exceptions import InvalidSite, InvalidListingType


_scrapers = {
"redfin": RedfinScraper,
"realtor.com": RealtorScraper,
"zillow": ZillowScraper,
}


def _validate_input(site_name: str, listing_type: str) -> None:
def _validate_input(site_name: str, status: str) -> None:
if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")

if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")


def _get_ordered_properties(result: Property) -> list[str]:
return [
"property_url",
"site_name",
"listing_type",
"property_type",
"status_text",
"baths_min",
"baths_max",
"beds_min",
"beds_max",
"sqft_min",
"sqft_max",
"price_min",
"price_max",
"unit_count",
"tax_assessed_value",
"price_per_sqft",
"lot_area_value",
"lot_area_unit",
"address_one",
"address_two",
"city",
"state",
"zip_code",
"posted_time",
"area_min",
"bldg_name",
"stories",
"year_built",
"agent_name",
"agent_phone",
"agent_email",
"days_on_market",
"sold_date",
"mls_id",
"img_src",
"latitude",
"longitude",
"description",
]


def _process_result(result: Property) -> pd.DataFrame:
prop_data = result.__dict__

prop_data["site_name"] = prop_data["site_name"].value
prop_data["listing_type"] = prop_data["listing_type"].value.lower()
if "property_type" in prop_data and prop_data["property_type"] is not None:
prop_data["property_type"] = prop_data["property_type"].value.lower()
else:
prop_data["property_type"] = None
if "address" in prop_data:
address_data = prop_data["address"]
prop_data["address_one"] = address_data.address_one
prop_data["address_two"] = address_data.address_two
prop_data["city"] = address_data.city
prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip_code

del prop_data["address"]

if "agent" in prop_data and prop_data["agent"] is not None:
agent_data = prop_data["agent"]
prop_data["agent_name"] = agent_data.name
prop_data["agent_phone"] = agent_data.phone
prop_data["agent_email"] = agent_data.email

del prop_data["agent"]
else:
prop_data["agent_name"] = None
prop_data["agent_phone"] = None
prop_data["agent_email"] = None

properties_df = pd.DataFrame([prop_data])
properties_df = properties_df[_get_ordered_properties(result)]
if status.upper() not in Status.__members__:
raise InvalidListingType(f"Provided listing type, '{status}', does not exist.")

return properties_df


def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy: str = None) -> pd.DataFrame:
def _scrape_single_site(
location: str, site_name: str, status: str, proxy: str = None, timeframe: str = None
) -> pd.DataFrame:
"""
Helper function to scrape a single site.
"""
_validate_input(site_name, listing_type)
print(status)
_validate_input(site_name, status)

scraper_input = ScraperInput(
location=location,
listing_type=ListingType[listing_type.upper()],
status=status,
site_name=SiteName.get_by_value(site_name.lower()),
proxy=proxy,
timeframe=timeframe,
)

site = _scrapers[site_name.lower()](scraper_input)
results = site.search()
print(f"Found {len(results)} results for {site_name}")

properties_dfs = [_process_result(result) for result in results]
properties_dfs = [df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty]
properties_dfs = [process_result(result) for result in results]
if not properties_dfs:
return pd.DataFrame()

return pd.concat(properties_dfs, ignore_index=True)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]


def scrape_property(
location: str,
timeframe: str,
site_name: Union[str, list[str]] = None,
listing_type: str = "for_sale",
status: str = "sale",
proxy: str = None,
keep_duplicates: bool = False
) -> pd.DataFrame:
"""
Scrape property from various sites from a given location and listing type.
@@ -155,12 +76,14 @@ def scrape_property(
results = []

if len(site_name) == 1:
final_df = _scrape_single_site(location, site_name[0], listing_type, proxy)
final_df = _scrape_single_site(location, site_name[0], status, proxy, timeframe)
results.append(final_df)
else:
with ThreadPoolExecutor() as executor:
futures = {
executor.submit(_scrape_single_site, location, s_name, listing_type, proxy): s_name
executor.submit(
_scrape_single_site, location, s_name, status, proxy, timeframe
): s_name
for s_name in site_name
}

@@ -175,13 +98,11 @@

final_df = pd.concat(results, ignore_index=True)

columns_to_track = ["address_one", "address_two", "city"]
columns_to_track = ["Street", "Unit", "Zip"]

#: validate they exist, otherwise create them
for col in columns_to_track:
if col not in final_df.columns:
final_df[col] = None

if not keep_duplicates:
final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
return final_df
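
For orientation, a minimal usage sketch of the reworked scrape_property signature in this file. The concrete values "for_sale" and "1Y" are assumptions drawn from the VALID_STATUSES and VALID_TIMEFRAMES lists in the scrapers module further down, and the exact accepted statuses depend on the Status enum; this is not documented usage from the commit itself.

from homeharvest import scrape_property

# Hypothetical call against the new signature: status/timeframe replace listing_type.
df = scrape_property(
    location="San Francisco, CA",
    timeframe="1Y",                    # validated against VALID_TIMEFRAMES (1W, 1M, 3M, 6M, 1Y)
    site_name=["redfin", "zillow"],
    status="for_sale",                 # validated against the Status enum / VALID_STATUSES
    proxy=None,
    keep_duplicates=False,
)
# "Street", "Unit", "Zip" are created by the de-duplication step above if missing.
print(df[["Street", "Unit", "Zip"]].head())
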
18 changes: 14 additions & 4 deletions homeharvest/cli.py
@@ -5,7 +5,9 @@

def main():
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")
parser.add_argument(
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
)

parser.add_argument(
"-s",
@@ -46,14 +48,22 @@ def main():
"-k",
"--keep_duplicates",
action="store_true",
help="Keep duplicate properties based on address"
help="Keep duplicate properties based on address",
)

parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
parser.add_argument(
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
)

args = parser.parse_args()

result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)
result = scrape_property(
args.location,
args.site_name,
args.listing_type,
proxy=args.proxy,
keep_duplicates=args.keep_duplicates,
)

if not args.filename:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
31 changes: 24 additions & 7 deletions homeharvest/core/scrapers/__init__.py
@@ -1,21 +1,38 @@
from dataclasses import dataclass
import requests
import tls_client
from .models import Property, ListingType, SiteName
from typing import Optional
from .models import Property, SiteName, Status
from ...exceptions import InvalidTimeFrame

VALID_TIMEFRAMES = ["1W", "1M", "3M", "6M", "1Y"]
VALID_STATUSES = ["sold", "for_sale", "for_rent"]


@dataclass
class ScraperInput:
location: str
listing_type: ListingType
site_name: SiteName
proxy: str | None = None
status: str
site_name: str
proxy: Optional[str] = None
timeframe: Optional[str] = None

def __post_init__(self):
if self.timeframe and self.timeframe not in VALID_TIMEFRAMES:
raise InvalidTimeFrame(f"Invalid timeframe provided: {self.timeframe}")
if self.status and self.status not in VALID_STATUSES:
raise InvalidTimeFrame(f"Invalid status provided: {self.status}")


class Scraper:
def __init__(self, scraper_input: ScraperInput, session: requests.Session | tls_client.Session = None):
def __init__(
self,
scraper_input: ScraperInput,
session: requests.Session | tls_client.Session = None,
):
self.location = scraper_input.location
self.listing_type = scraper_input.listing_type
self.status = scraper_input.status
self.timeframe = scraper_input.timeframe

if not session:
self.session = requests.Session()
@@ -27,7 +44,7 @@ def __init__(self, scraper_input: ScraperInput, session: requests.Session | tls_
proxies = {"http": proxy_url, "https": proxy_url}
self.session.proxies.update(proxies)

self.listing_type = scraper_input.listing_type
self.listing_type = scraper_input.status
self.site_name = scraper_input.site_name

def search(self) -> list[Property]:
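
A minimal sketch of the new ScraperInput validation, assuming the imports resolve as the relative paths in this hunk suggest; SiteName.get_by_value mirrors the usage shown in the __init__.py hunk above.

from homeharvest.core.scrapers import ScraperInput
from homeharvest.core.scrapers.models import SiteName
from homeharvest.exceptions import InvalidTimeFrame

# Valid input: status and timeframe are checked in __post_init__.
scraper_input = ScraperInput(
    location="Dallas, TX",
    status="sold",                                   # must be in VALID_STATUSES
    site_name=SiteName.get_by_value("realtor.com"),  # mirrors _scrape_single_site above
    timeframe="6M",                                  # must be in VALID_TIMEFRAMES
)

# An out-of-range timeframe raises InvalidTimeFrame.
try:
    ScraperInput(location="Dallas, TX", status="sold", site_name="realtor.com", timeframe="2Y")
except InvalidTimeFrame as err:
    print(err)  # Invalid timeframe provided: 2Y
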
