Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enh: assessed/estimated value #77

Merged
merged 4 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 19 additions & 8 deletions homeharvest/core/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import uuid
from dataclasses import dataclass
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid
from .models import Property, ListingType, SiteName

Expand All @@ -19,24 +20,30 @@ class ScraperInput:


class Scraper:
session = None

def __init__(
self,
scraper_input: ScraperInput,
session: requests.Session = None,
):
self.location = scraper_input.location
self.listing_type = scraper_input.listing_type

if not session:
self.session = requests.Session()
self.session.headers.update(
if not self.session:
Scraper.session = requests.Session()
retries = Retry(
total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
)

adapter = HTTPAdapter(max_retries=retries)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{
"auth": f"Bearer {self.get_access_token()}",
"apollographql-client-name": "com.move.Realtor-apollo-ios",
}
)
else:
self.session = session

if scraper_input.proxy:
proxy_url = scraper_input.proxy
Expand Down Expand Up @@ -73,4 +80,8 @@ def get_access_token(self):
response = requests.post(url, headers=headers, data=payload)

data = response.json()
return data["access_token"]
try:
access_token = data["access_token"]
except Exception:
raise Exception("Could not get access token, use a proxy/vpn or wait")
return access_token
2 changes: 2 additions & 0 deletions homeharvest/core/scrapers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,5 @@ class Property:
fips_code: Optional[str] = None
agents: list[Agent] = None
nearby_schools: list[str] = None
assessed_value: int | None = None
estimated_value: int | None = None
49 changes: 31 additions & 18 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
This module implements the scraper for realtor.com
"""

from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Dict, Union, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

from .. import Scraper
from ..models import Property, Address, ListingType, Description, PropertyType, Agent
Expand Down Expand Up @@ -142,7 +142,7 @@ def handle_listing(self, listing_id: str) -> list[Property]:
days_on_mls = None

property_id = property_info["details"]["permalink"]
agents_schools = self.get_agents_schools(property_id)
prop_details = self.get_prop_details(property_id)
listing = Property(
mls=mls,
mls_id=(
Expand Down Expand Up @@ -176,11 +176,13 @@ def handle_listing(self, listing_id: str) -> list[Property]:
year_built=property_info["details"].get("year_built"),
garage=property_info["details"].get("garage"),
stories=property_info["details"].get("stories"),
text=property_info["description"].get("text"),
text=property_info.get("description", {}).get("text"),
),
days_on_mls=days_on_mls,
agents=agents_schools["agents"],
nearby_schools=agents_schools["schools"],
agents=prop_details.get("agents"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
)

return [listing]
Expand Down Expand Up @@ -274,7 +276,7 @@ def handle_address(self, property_id: str) -> list[Property]:
}"""

variables = {"property_id": property_id}
agents_schools = self.get_agents_schools(property_id)
prop_details = self.get_prop_details(property_id)

payload = {
"query": query,
Expand All @@ -292,8 +294,10 @@ def handle_address(self, property_id: str) -> list[Property]:
property_url=f"{self.PROPERTY_URL}{property_info['details']['permalink']}",
address=self._parse_address(property_info, search_type="handle_address"),
description=self._parse_description(property_info),
agents=agents_schools["agents"],
nearby_schools=agents_schools["schools"],
agents=prop_details.get("agents"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
)
]

Expand Down Expand Up @@ -486,7 +490,6 @@ def general_search(self, variables: dict, search_type: str) -> Dict[str, Union[i
}

response = self.session.post(self.SEARCH_GQL_URL, json=payload)
response.raise_for_status()
response_json = response.json()
search_key = "home_search" if "home_search" in query else "property_search"

Expand Down Expand Up @@ -521,7 +524,7 @@ def process_property(result: dict) -> Property | None:
return

property_id = result["property_id"]
agents_schools = self.get_agents_schools(property_id)
prop_details = self.get_prop_details(property_id)

realty_property = Property(
mls=mls,
Expand All @@ -546,11 +549,13 @@ def process_property(result: dict) -> Property | None:
address=self._parse_address(result, search_type="general_search"),
description=self._parse_description(result),
neighborhoods=self._parse_neighborhoods(result),
county=result["location"]["county"].get("name"),
fips_code=result["location"]["county"].get("fips_code"),
county=result["location"]["county"].get("name") if result["location"]["county"] else None,
fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None,
days_on_mls=self.calculate_days_on_mls(result),
agents=agents_schools["agents"],
nearby_schools=agents_schools["schools"],
agents=prop_details.get("agents"),
nearby_schools=prop_details.get("schools"),
assessed_value=prop_details.get("assessed_value"),
estimated_value=prop_details.get("estimated_value"),
)
return realty_property

Expand Down Expand Up @@ -645,8 +650,8 @@ def search(self):

return homes

def get_agents_schools(self, property_id: str) -> dict:
payload = f'{{"query":"query GetHome($property_id: ID!) {{\\n home(property_id: $property_id) {{\\n __typename\\n\\n consumerAdvertisers: consumer_advertisers {{\\n __typename\\n type\\n advertiserId: advertiser_id\\n name\\n phone\\n type\\n href\\n slogan\\n photo {{\\n __typename\\n href\\n }}\\n showRealtorLogo: show_realtor_logo\\n hours\\n }}\\n\\n\\n nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {{ __typename schools {{ district {{ __typename id name }} }} }}}}\\n}}\\n","variables":{{"property_id":"{property_id}"}}}}'
def get_prop_details(self, property_id: str) -> dict:
payload = f'{{"query":"query GetHome($property_id: ID!) {{\\n home(property_id: $property_id) {{\\n __typename\\n\\n consumerAdvertisers: consumer_advertisers {{\\n __typename\\n type\\n advertiserId: advertiser_id\\n name\\n phone\\n type\\n href\\n slogan\\n photo {{\\n __typename\\n href\\n }}\\n showRealtorLogo: show_realtor_logo\\n hours\\n }}\\n\\n\\n nearbySchools: nearby_schools(radius: 5.0, limit_per_level: 3) {{ __typename schools {{ district {{ __typename id name }} }} }} taxHistory: tax_history {{ __typename tax year assessment {{ __typename building land total }} }}estimates {{ __typename currentValues: current_values {{ __typename source {{ __typename type name }} estimate estimateHigh: estimate_high estimateLow: estimate_low date isBestHomeValue: isbest_homevalue }} }} }}\\n}}\\n","variables":{{"property_id":"{property_id}"}}}}'
response = self.session.post(self.PROPERTY_GQL, data=payload)

def get_key(keys: list):
Expand All @@ -656,14 +661,22 @@ def get_key(keys: list):
data = data[key]
return data
except (KeyError, TypeError):
return []
return {}

ads = get_key(["data", "home", "consumerAdvertisers"])
schools = get_key(["data", "home", "nearbySchools", "schools"])
assessed_value = get_key(["data", "home", "taxHistory", 0, "assessment", "total"])
estimated_value = get_key(["data", "home", "estimates", "currentValues", 0, "estimate"])

agents = [Agent(name=ad["name"], phone=ad["phone"]) for ad in ads]

schools = [school["district"]["name"] for school in schools]
return {"agents": agents, "schools": schools}
return {
"agents": agents if agents else None,
"schools": schools if schools else None,
"assessed_value": assessed_value if assessed_value else None,
"estimated_value": estimated_value if estimated_value else None,
}

@staticmethod
def _parse_neighborhoods(result: dict) -> Optional[str]:
Expand Down
5 changes: 3 additions & 2 deletions homeharvest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
"list_date",
"sold_price",
"last_sold_date",
"assessed_value",
"estimated_value",
"lot_sqft",
"price_per_sqft",
"latitude",
Expand Down Expand Up @@ -71,7 +73,7 @@ def process_result(result: Property) -> pd.DataFrame:
description = result.description
prop_data["primary_photo"] = description.primary_photo
prop_data["alt_photos"] = ", ".join(description.alt_photos)
prop_data["style"] = description.style.value
prop_data["style"] = description.style if type(description.style) == str else description.style.value
prop_data["beds"] = description.beds
prop_data["full_baths"] = description.baths_full
prop_data["half_baths"] = description.baths_half
Expand All @@ -83,7 +85,6 @@ def process_result(result: Property) -> pd.DataFrame:
prop_data["stories"] = description.stories
prop_data["text"] = description.text


properties_df = pd.DataFrame([prop_data])
properties_df = properties_df.reindex(columns=ordered_properties)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "homeharvest"
version = "0.3.19"
version = "0.3.20"
description = "Real estate scraping library"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/HomeHarvest"
Expand Down
Loading