Skip to content

Commit

Permalink
fix: simplify fields
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Sep 20, 2023
1 parent e8d9235 commit f6054e8
Show file tree
Hide file tree
Showing 11 changed files with 277 additions and 330 deletions.
59 changes: 20 additions & 39 deletions homeharvest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ def _validate_input(site_name: str, listing_type: str) -> None:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")

if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist."
)
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")


def _get_ordered_properties(result: Property) -> list[str]:
Expand All @@ -35,34 +33,26 @@ def _get_ordered_properties(result: Property) -> list[str]:
"listing_type",
"property_type",
"status_text",
"currency",
"price",
"apt_min_price",
"apt_max_price",
"apt_min_sqft",
"apt_max_sqft",
"apt_min_beds",
"apt_max_beds",
"apt_min_baths",
"apt_max_baths",
"baths_min",
"baths_max",
"beds_min",
"beds_max",
"sqft_min",
"sqft_max",
"price_min",
"price_max",
"unit_count",
"tax_assessed_value",
"square_feet",
"price_per_sqft",
"beds",
"baths",
"lot_area_value",
"lot_area_unit",
"street_address",
"unit",
"address_one",
"address_two",
"city",
"state",
"zip_code",
"country",
"posted_time",
"bldg_min_beds",
"bldg_min_baths",
"bldg_min_area",
"bldg_unit_count",
"area_min",
"bldg_name",
"stories",
"year_built",
Expand All @@ -86,12 +76,11 @@ def _process_result(result: Property) -> pd.DataFrame:
prop_data["property_type"] = None
if "address" in prop_data:
address_data = prop_data["address"]
prop_data["street_address"] = address_data.street_address
prop_data["unit"] = address_data.unit
prop_data["address_one"] = address_data.address_one
prop_data["address_two"] = address_data.address_two
prop_data["city"] = address_data.city
prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip_code
prop_data["country"] = address_data.country

del prop_data["address"]

Expand All @@ -101,9 +90,7 @@ def _process_result(result: Property) -> pd.DataFrame:
return properties_df


def _scrape_single_site(
location: str, site_name: str, listing_type: str, proxy: str = None
) -> pd.DataFrame:
def _scrape_single_site(location: str, site_name: str, listing_type: str, proxy: str = None) -> pd.DataFrame:
"""
Helper function to scrape a single site.
"""
Expand All @@ -120,9 +107,7 @@ def _scrape_single_site(
results = site.search()

properties_dfs = [_process_result(result) for result in results]
properties_dfs = [
df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty
]
properties_dfs = [df.dropna(axis=1, how="all") for df in properties_dfs if not df.empty]
if not properties_dfs:
return pd.DataFrame()

Expand Down Expand Up @@ -158,9 +143,7 @@ def scrape_property(
else:
with ThreadPoolExecutor() as executor:
futures = {
executor.submit(
_scrape_single_site, location, s_name, listing_type, proxy
): s_name
executor.submit(_scrape_single_site, location, s_name, listing_type, proxy): s_name
for s_name in site_name
}

Expand All @@ -175,14 +158,12 @@ def scrape_property(

final_df = pd.concat(results, ignore_index=True)

columns_to_track = ["street_address", "city", "unit"]
columns_to_track = ["address_one", "address_two", "city"]

#: validate they exist, otherwise create them
for col in columns_to_track:
if col not in final_df.columns:
final_df[col] = None

final_df = final_df.drop_duplicates(
subset=["street_address", "city", "unit"], keep="first"
)
final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
return final_df
12 changes: 3 additions & 9 deletions homeharvest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@

def main():
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
parser.add_argument(
"location", type=str, help="Location to scrape (e.g., San Francisco, CA)"
)
parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")

parser.add_argument(
"-s",
Expand Down Expand Up @@ -44,15 +42,11 @@ def main():
help="Name of the output file (without extension)",
)

parser.add_argument(
"-p", "--proxy", type=str, default=None, help="Proxy to use for scraping"
)
parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")

args = parser.parse_args()

result = scrape_property(
args.location, args.site_name, args.listing_type, proxy=args.proxy
)
result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)

if not args.filename:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
Expand Down
5 changes: 1 addition & 4 deletions homeharvest/core/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@ def __init__(self, scraper_input: ScraperInput):
self.session = requests.Session()
if scraper_input.proxy:
proxy_url = scraper_input.proxy
proxies = {
"http": proxy_url,
"https": proxy_url
}
proxies = {"http": proxy_url, "https": proxy_url}
self.session.proxies.update(proxies)
self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name
Expand Down
51 changes: 24 additions & 27 deletions homeharvest/core/scrapers/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass
from enum import Enum
from typing import Tuple


class SiteName(Enum):
Expand Down Expand Up @@ -56,12 +57,11 @@ def from_int_code(cls, code):

@dataclass
class Address:
street_address: str
city: str
state: str
zip_code: str
unit: str | None = None
country: str | None = None
address_one: str | None = None
address_two: str | None = "#"
city: str | None = None
state: str | None = None
zip_code: str | None = None


@dataclass
Expand All @@ -73,12 +73,7 @@ class Property:
property_type: PropertyType | None = None

# house for sale
price: int | None = None
tax_assessed_value: int | None = None
currency: str | None = None
square_feet: int | None = None
beds: int | None = None
baths: float | None = None
lot_area_value: float | None = None
lot_area_unit: str | None = None
stories: int | None = None
Expand All @@ -90,23 +85,25 @@ class Property:
img_src: str | None = None
description: str | None = None
status_text: str | None = None
latitude: float | None = None
longitude: float | None = None
posted_time: str | None = None

# building for sale
bldg_name: str | None = None
bldg_unit_count: int | None = None
bldg_min_beds: int | None = None
bldg_min_baths: float | None = None
bldg_min_area: int | None = None

# apt
apt_min_beds: int | None = None
apt_max_beds: int | None = None
apt_min_baths: float | None = None
apt_max_baths: float | None = None
apt_min_price: int | None = None
apt_max_price: int | None = None
apt_min_sqft: int | None = None
apt_max_sqft: int | None = None
area_min: int | None = None

beds_min: int | None = None
beds_max: int | None = None

baths_min: float | None = None
baths_max: float | None = None

sqft_min: int | None = None
sqft_max: int | None = None

price_min: int | None = None
price_max: int | None = None

unit_count: int | None = None

latitude: float | None = None
longitude: float | None = None
76 changes: 44 additions & 32 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
import json
"""
homeharvest.core.scrapers.realtor.__init__
~~~~~~~~~~~~
This module implements the scraper for realtor.com
"""
from ..models import Property, Address
from .. import Scraper
from typing import Any, Generator
from ....exceptions import NoResultsFound
from ....utils import parse_address_two, parse_unit
from ....utils import parse_address_one, parse_address_two
from concurrent.futures import ThreadPoolExecutor, as_completed


class RealtorScraper(Scraper):
def __init__(self, scraper_input):
self.counter = 1
super().__init__(scraper_input)
self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
self.search_url = (
"https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"
)

def handle_location(self):
headers = {
Expand Down Expand Up @@ -50,6 +57,9 @@ def handle_location(self):
return result[0]

def handle_address(self, property_id: str) -> list[Property]:
"""
Handles a specific address & returns a list containing that one property
"""
query = """query Property($property_id: ID!) {
property(id: $property_id) {
property_id
Expand Down Expand Up @@ -108,43 +118,45 @@ def handle_address(self, property_id: str) -> list[Property]:
response_json = response.json()

property_info = response_json["data"]["property"]
street_address, unit = parse_address_two(property_info["address"]["line"])
address_one, address_two = parse_address_one(property_info["address"]["line"])

return [
Property(
site_name=self.site_name,
address=Address(
street_address=street_address,
address_one=address_one,
address_two=address_two,
city=property_info["address"]["city"],
state=property_info["address"]["state_code"],
zip_code=property_info["address"]["postal_code"],
unit=unit,
country="USA",
),
property_url="https://www.realtor.com/realestateandhomes-detail/"
+ property_info["details"]["permalink"],
beds=property_info["basic"]["beds"],
baths=property_info["basic"]["baths"],
stories=property_info["details"]["stories"],
year_built=property_info["details"]["year_built"],
square_feet=property_info["basic"]["sqft"],
price_per_sqft=property_info["basic"]["price"]
// property_info["basic"]["sqft"]
if property_info["basic"]["sqft"] is not None
and property_info["basic"]["price"] is not None
price_per_sqft=property_info["basic"]["price"] // property_info["basic"]["sqft"]
if property_info["basic"]["sqft"] is not None and property_info["basic"]["price"] is not None
else None,
price=property_info["basic"]["price"],
mls_id=property_id,
listing_type=self.listing_type,
lot_area_value=property_info["public_record"]["lot_size"]
if property_info["public_record"] is not None
else None,
beds_min=property_info["basic"]["beds"],
beds_max=property_info["basic"]["beds"],
baths_min=property_info["basic"]["baths"],
baths_max=property_info["basic"]["baths"],
sqft_min=property_info["basic"]["sqft"],
sqft_max=property_info["basic"]["sqft"],
price_min=property_info["basic"]["price"],
price_max=property_info["basic"]["price"],
)
]

def handle_area(
self, variables: dict, return_total: bool = False
) -> list[Property] | int:
def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
"""
Handles a location area & returns a list of properties
"""
query = (
"""query Home_search(
$city: String,
Expand Down Expand Up @@ -237,17 +249,15 @@ def handle_area(
return []

for result in response_json["data"]["home_search"]["results"]:
street_address, unit = parse_address_two(
result["location"]["address"]["line"]
)
self.counter += 1
address_one, _ = parse_address_one(result["location"]["address"]["line"])
realty_property = Property(
address=Address(
street_address=street_address,
address_one=address_one,
city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"],
unit=parse_unit(result["location"]["address"]["unit"]),
country="USA",
address_two=parse_address_two(result["location"]["address"]["unit"]),
),
latitude=result["location"]["address"]["coordinate"]["lat"]
if result
Expand All @@ -264,20 +274,22 @@ def handle_area(
and "lon" in result["location"]["address"]["coordinate"]
else None,
site_name=self.site_name,
property_url="https://www.realtor.com/realestateandhomes-detail/"
+ result["property_id"],
beds=result["description"]["beds"],
baths=result["description"]["baths"],
property_url="https://www.realtor.com/realestateandhomes-detail/" + result["property_id"],
stories=result["description"]["stories"],
year_built=result["description"]["year_built"],
square_feet=result["description"]["sqft"],
price_per_sqft=result["price_per_sqft"],
price=result["list_price"],
mls_id=result["property_id"],
listing_type=self.listing_type,
lot_area_value=result["description"]["lot_sqft"],
beds_min=result["description"]["beds"],
beds_max=result["description"]["beds"],
baths_min=result["description"]["baths"],
baths_max=result["description"]["baths"],
sqft_min=result["description"]["sqft"],
sqft_max=result["description"]["sqft"],
price_min=result["list_price"],
price_max=result["list_price"],
)

properties.append(realty_property)

return properties
Expand Down
Loading

0 comments on commit f6054e8

Please sign in to comment.