Skip to content

Commit

Permalink
[fix] scrape property params
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Oct 4, 2023
1 parent 1464b4f commit bd33c3b
Show file tree
Hide file tree
Showing 3 changed files with 233 additions and 7 deletions.
4 changes: 1 addition & 3 deletions homeharvest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def _scrape_single_site(
"""
Helper function to scrape a single site.
"""
print(status)
_validate_input(site_name, status)

scraper_input = ScraperInput(
Expand All @@ -42,7 +41,6 @@ def _scrape_single_site(

site = _scrapers[site_name.lower()](scraper_input)
results = site.search()
print(f"Found {len(results)} results for {site_name}")

properties_dfs = [process_result(result) for result in results]
if not properties_dfs:
Expand All @@ -53,7 +51,7 @@ def _scrape_single_site(

def scrape_property(
location: str,
timeframe: str,
timeframe: str = None,
site_name: Union[str, list[str]] = None,
status: str = "sale",
proxy: str = None,
Expand Down
4 changes: 4 additions & 0 deletions homeharvest/core/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ class ScraperInput:
timeframe: Optional[str] = None

def __post_init__(self):
    """Validate ``status`` and ``timeframe`` on the freshly built instance.

    Raises:
        InvalidTimeFrame: if ``status`` is ``"sold"`` and no timeframe was
            given, if a timeframe was given that is not in
            ``VALID_TIMEFRAMES``, or if a status was given that is not in
            ``VALID_STATUSES`` (the status check reuses this exception type).
    """
    timeframe, status = self.timeframe, self.status

    if not timeframe:
        # A missing timeframe is only an error for sold searches.
        if status == "sold":
            raise InvalidTimeFrame("Timeframe is required when status is 'sold'")
    elif timeframe not in VALID_TIMEFRAMES:
        raise InvalidTimeFrame(f"Invalid timeframe provided: {timeframe}")

    # An empty/None status is let through; only a non-empty unknown one fails.
    if status and status not in VALID_STATUSES:
        raise InvalidTimeFrame(f"Invalid status provided: {status}")

Expand Down
232 changes: 228 additions & 4 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,8 @@ def handle_area(self, variables: dict) -> Dict[str, Union[int, list[Property]]]:
self.status,
f'"$nowUTC-{self.timeframe}"',
)

payload = {
"query": query,
"query": self.get_query(),
"variables": variables,
}
response = self.session.post(self.endpoint, json=payload)
Expand Down Expand Up @@ -314,7 +313,6 @@ def handle_area(self, variables: dict) -> Dict[str, Union[int, list[Property]]]:
+ result["property_id"],
mls=mls,
mls_id=mls_id,
# status=(result["source"]["raw"].get("status").upper() if 'source' in result and isinstance(result["source"], dict) and "raw" in result["source"] and isinstance(result["source"]["raw"], dict) else None),
status=result["status"].upper(),
style=result["description"]["type"].upper(),
beds=result["description"]["beds"],
Expand All @@ -323,7 +321,9 @@ def handle_area(self, variables: dict) -> Dict[str, Union[int, list[Property]]]:
est_sf=result["description"]["sqft"],
lot_sf=result["description"]["lot_sqft"],
list_price=result["list_price"],
list_date=result["list_date"].split("T")[0],
list_date=result["list_date"].split("T")[0]
if result["list_date"]
else None,
sold_price=result["description"]["sold_price"],
prc_sqft=result["price_per_sqft"],
last_sold_date=result["last_sold_date"],
Expand Down Expand Up @@ -363,6 +363,230 @@ def handle_area(self, variables: dict) -> Dict[str, Union[int, list[Property]]]:
"properties": properties,
}

def get_query(self):
    """Build the GraphQL ``Home_search`` query document for this search.

    The document is identical for every status except that a ``"sold"``
    search additionally restricts ``sold_date`` to values no older than
    ``$nowUTC-<timeframe>``.  ``self.status`` and ``self.timeframe`` are
    interpolated into the text verbatim; the ``$city``, ``$county``,
    ``$state_code``, ``$postal_code`` and ``$offset`` GraphQL variables are
    left unresolved in the returned text.

    Returns:
        str: The complete query document.
    """
    # Only a "sold" search carries a date window.  For any other status the
    # filter is omitted altogether, so ``self.timeframe`` is never read.
    sold_date_filter = ""
    if self.status == "sold":
        min_sold_date = f'"$nowUTC-{self.timeframe}"'
        sold_date_filter = """
            sold_date: {
                min: %s
            }""" % (min_sold_date,)

    # One shared template instead of two copies of the same ~100 lines, so the
    # sold and non-sold queries cannot drift apart.
    #
    # NOTE(review): the field list is kept exactly as it was so the request on
    # the wire does not change.  That includes ``sold_price`` appearing twice
    # under ``description``, ``source { id }`` being requested again after the
    # full ``source`` selection, and sorting by ``sold_date`` for non-sold
    # searches too -- candidates for a follow-up cleanup once checked against
    # the API.
    query_template = """query Home_search(
    $city: String,
    $county: [String],
    $state_code: String,
    $postal_code: String,
    $offset: Int
) {
    home_search(
        query: {
            city: $city
            county: $county
            postal_code: $postal_code
            state_code: $state_code
            status: %(status)s%(sold_date_filter)s
        }
        limit: 200
        offset: $offset
        sort: [
            {
                field: sold_date,
                direction: desc
            }
        ]
    ) {
        count
        total
        results {
            property_id
            list_date
            status
            last_sold_price
            last_sold_date
            hoa {
                fee
            }
            description {
                baths_full
                baths_half
                beds
                lot_sqft
                sqft
                sold_price
                year_built
                garage
                sold_price
                type
                sub_type
                name
                stories
            }
            source {
                raw {
                    area
                    status
                    style
                }
                last_update_date
                contract_date
                id
                listing_id
                name
                type
                listing_href
                community_id
                management_id
                corporation_id
                subdivision_status
                spec_id
                plan_id
                tier_rank
                feed_type
            }
            location {
                address {
                    city
                    country
                    line
                    postal_code
                    state_code
                    state
                    coordinate {
                        lon
                        lat
                    }
                    street_direction
                    street_name
                    street_number
                    street_post_direction
                    street_suffix
                    unit
                }
                neighborhoods {
                    name
                }
            }
            list_price
            price_per_sqft
            style_category_tags {
                exterior
            }
            source {
                id
            }
        }
    }
}"""

    return query_template % {
        "status": self.status,
        "sold_date_filter": sold_date_filter,
    }

def search(self):
location_info = self.handle_location()
location_type = location_info["area_type"]
Expand Down

0 comments on commit bd33c3b

Please sign in to comment.