Skip to content

Commit

Permalink
refactor(realtor): fit to updated models
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Sep 18, 2023
1 parent ffd3ce6 commit 869d7e7
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 109 deletions.
7 changes: 5 additions & 2 deletions homeharvest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ def process_result(result: Property) -> pd.DataFrame:

prop_data["site_name"] = prop_data["site_name"].value
prop_data["listing_type"] = prop_data["listing_type"].value.lower()
prop_data["property_type"] = prop_data["property_type"].value.lower()
if "property_type" in prop_data and prop_data["property_type"] is not None:
prop_data["property_type"] = prop_data["property_type"].value.lower()
else:
prop_data["property_type"] = None
if "address" in prop_data:
address_data = prop_data["address"]
prop_data["street_address"] = address_data.street_address
Expand Down Expand Up @@ -108,7 +111,7 @@ def scrape_property(
scraper_input = ScraperInput(
location=location,
listing_type=ListingType[listing_type.upper()],
site_name=SiteName[site_name.upper()],
site_name=SiteName.get_by_value(site_name.lower()),
)

site = _scrapers[site_name.lower()](scraper_input)
Expand Down
11 changes: 8 additions & 3 deletions homeharvest/core/scrapers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ class SiteName(Enum):
REDFIN = "redfin"
REALTOR = "realtor.com"

@classmethod
def get_by_value(cls, value: str) -> "SiteName":
    """Return the member of this enum whose ``.value`` equals *value*.

    Lookup is by value rather than by member name, since a value such as
    ``"realtor.com"`` differs from its member name (``REALTOR``).

    Args:
        value: The enum value to search for. An existing member of this
            enum is also accepted and returned unchanged.

    Returns:
        The matching enum member.

    Raises:
        ValueError: If no member of this enum has the given value.
    """
    try:
        # Enum's built-in by-value lookup replaces the hand-written scan.
        return cls(value)
    except ValueError:
        # Re-raise with this method's own message; ``from None`` drops the
        # redundant chained traceback from Enum's internal error.
        raise ValueError(f"{value} not found in {cls}") from None


class ListingType(Enum):
FOR_SALE = "FOR_SALE"
Expand Down Expand Up @@ -57,14 +64,13 @@ class Address:
country: str | None = None



@dataclass
class Property:
property_url: str
site_name: SiteName
listing_type: ListingType
property_type: PropertyType
address: Address
property_type: PropertyType | None = None

# house for sale
price: int | None = None
Expand All @@ -78,7 +84,6 @@ class Property:
stories: int | None = None
year_built: int | None = None
price_per_sqft: int | None = None
year_built: int | None = None
mls_id: str | None = None

agent_name: str | None = None
Expand Down
152 changes: 91 additions & 61 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .. import Scraper
from typing import Any, Generator
from ....exceptions import NoResultsFound
from ....utils import parse_address_two
from concurrent.futures import ThreadPoolExecutor, as_completed


Expand All @@ -29,7 +30,7 @@ def handle_location(self):

params = {
"input": self.location,
"client_id": self.listing_type.value.replace('_', '-'),
"client_id": self.listing_type.value.lower().replace("_", "-"),
"limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
}
Expand Down Expand Up @@ -96,46 +97,57 @@ def handle_address(self, property_id: str) -> list[Property]:
}
}"""

variables = {
'property_id': property_id
}
variables = {"property_id": property_id}

payload = {
'query': query,
'variables': variables,
"query": query,
"variables": variables,
}

response = self.session.post(self.search_url, json=payload)
response_json = response.json()

property_info = response_json['data']['property']
property_info = response_json["data"]["property"]
street_address = property_info["address"]["line"]
unit = parse_address_two(street_address)

return [Property(
site_name=self.site_name,
address=Address(
address_one=property_info['address']['line'],
city=property_info['address']['city'],
state=property_info['address']['state_code'],
zip_code=property_info['address']['postal_code'],
),
url="https://www.realtor.com/realestateandhomes-detail/" + property_info['details']['permalink'],
beds=property_info['basic']['beds'],
baths=property_info['basic']['baths'],
stories=property_info['details']['stories'],
year_built=property_info['details']['year_built'],
square_feet=property_info['basic']['sqft'],
price_per_square_foot=property_info['basic']['price'] / property_info['basic']['sqft']
if property_info['basic']['sqft'] is not None and
property_info['basic']['price'] is not None
else None,
price=property_info['basic']['price'],
mls_id=property_id,
listing_type=self.listing_type,
lot_size=property_info['public_record']['lot_size'] if property_info['public_record'] is not None else None,
)]
return [
Property(
site_name=self.site_name,
address=Address(
street_address=street_address,
city=property_info["address"]["city"],
state=property_info["address"]["state_code"],
zip_code=property_info["address"]["postal_code"],
unit=unit,
country="USA",
),
property_url="https://www.realtor.com/realestateandhomes-detail/"
+ property_info["details"]["permalink"],
beds=property_info["basic"]["beds"],
baths=property_info["basic"]["baths"],
stories=property_info["details"]["stories"],
year_built=property_info["details"]["year_built"],
square_feet=property_info["basic"]["sqft"],
price_per_sqft=property_info["basic"]["price"]
// property_info["basic"]["sqft"]
if property_info["basic"]["sqft"] is not None
and property_info["basic"]["price"] is not None
else None,
price=property_info["basic"]["price"],
mls_id=property_id,
listing_type=self.listing_type,
lot_area_value=property_info["public_record"]["lot_size"]
if property_info["public_record"] is not None
else None,
)
]

def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
query = """query Home_search(
def handle_area(
self, variables: dict, return_total: bool = False
) -> list[Property] | int:
query = (
"""query Home_search(
$city: String,
$county: [String],
$state_code: String,
Expand Down Expand Up @@ -193,42 +205,57 @@ def handle_area(self, variables: dict, return_total: bool = False) -> list[Prope
}
}
}
}""" % self.listing_type.value
}"""
% self.listing_type.value.lower()
)

payload = {
'query': query,
'variables': variables,
"query": query,
"variables": variables,
}

response = self.session.post(self.search_url, json=payload)
response.raise_for_status()
response_json = response.json()

if return_total:
return response_json['data']['home_search']['total']
return response_json["data"]["home_search"]["total"]

properties: list[Property] = []

for result in response_json['data']['home_search']['results']:
if (
response_json is None
or "data" not in response_json
or response_json["data"] is None
or "home_search" not in response_json["data"]
or response_json["data"]["home_search"] is None
or "results" not in response_json["data"]["home_search"]
):
return []

for result in response_json["data"]["home_search"]["results"]:
realty_property = Property(
address=Address(
address_one=result['location']['address']['line'],
city=result['location']['address']['city'],
state=result['location']['address']['state_code'],
zip_code=result['location']['address']['postal_code'],
address_two=result['location']['address']['unit'],
street_address=result["location"]["address"]["line"],
city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"],
unit=result["location"]["address"]["unit"],
country="USA",
),
site_name=self.site_name,
url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'],
beds=result['description']['beds'],
baths=result['description']['baths'],
stories=result['description']['stories'],
year_built=result['description']['year_built'],
square_feet=result['description']['sqft'],
price_per_square_foot=result['price_per_sqft'],
price=result['list_price'],
mls_id=result['property_id'],
property_url="https://www.realtor.com/realestateandhomes-detail/"
+ result["property_id"],
beds=result["description"]["beds"],
baths=result["description"]["baths"],
stories=result["description"]["stories"],
year_built=result["description"]["year_built"],
square_feet=result["description"]["sqft"],
price_per_sqft=result["price_per_sqft"],
price=result["list_price"],
mls_id=result["property_id"],
listing_type=self.listing_type,
lot_size=result['description']['lot_sqft'],
lot_area_value=result["description"]["lot_sqft"],
)

properties.append(realty_property)
Expand All @@ -239,17 +266,17 @@ def search(self):
location_info = self.handle_location()
location_type = location_info["area_type"]

if location_type == 'address':
property_id = location_info['mpr_id']
if location_type == "address":
property_id = location_info["mpr_id"]
return self.handle_address(property_id)

offset = 0
search_variables = {
'city': location_info.get('city'),
'county': location_info.get('county'),
'state_code': location_info.get('state_code'),
'postal_code': location_info.get('postal_code'),
'offset': offset,
"city": location_info.get("city"),
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
"offset": offset,
}

total = self.handle_area(search_variables, return_total=True)
Expand All @@ -258,8 +285,11 @@ def search(self):
with ThreadPoolExecutor(max_workers=10) as executor:
futures = [
executor.submit(
self.handle_area, variables=search_variables | {'offset': i}, return_total=False
) for i in range(0, total, 200)
self.handle_area,
variables=search_variables | {"offset": i},
return_total=False,
)
for i in range(0, total, 200)
]

for future in as_completed(futures):
Expand Down
24 changes: 12 additions & 12 deletions homeharvest/core/scrapers/redfin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,28 +100,27 @@ def _parse_building(self, building: dict) -> Property:
address=Address(
street_address=" ".join(
[
building['address']['streetNumber'],
building['address']['directionalPrefix'],
building['address']['streetName'],
building['address']['streetType'],
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
),
city=building['address']['city'],
state=building['address']['stateOrProvinceCode'],
zip_code=building['address']['postalCode'],
city=building["address"]["city"],
state=building["address"]["stateOrProvinceCode"],
zip_code=building["address"]["postalCode"],
unit=" ".join(
[
building['address']['unitType'],
building['address']['unitValue'],
building["address"]["unitType"],
building["address"]["unitValue"],
]
)
),
),
property_url="https://www.redfin.com{}".format(building["url"]),
listing_type=self.listing_type,
bldg_unit_count=building["numUnitsForSale"],
)


def handle_address(self, home_id: str):
"""
EPs:
Expand Down Expand Up @@ -160,7 +159,8 @@ def search(self):
homes = [
self._parse_home(home) for home in response_json["payload"]["homes"]
] + [
self._parse_building(building) for building in response_json["payload"]["buildings"].values()
self._parse_building(building)
for building in response_json["payload"]["buildings"].values()
]

return homes
40 changes: 18 additions & 22 deletions homeharvest/core/scrapers/zillow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,26 +98,24 @@ def _fetch_properties_backend(self, coords):
else filter_state_sold
)

payload = json.dumps(
{
"searchQueryState": {
"pagination": {},
"isMapVisible": True,
"mapBounds": {
"west": coords[0],
"east": coords[1],
"south": coords[2],
"north": coords[3],
},
"filterState": selected_filter,
"isListVisible": True,
"mapZoom": 11,
payload = {
"searchQueryState": {
"pagination": {},
"isMapVisible": True,
"mapBounds": {
"west": coords[0],
"east": coords[1],
"south": coords[2],
"north": coords[3],
},
"wants": {"cat1": ["mapResults"]},
"isDebugRequest": False,
}
)
resp = self.session.put(url, headers=self._get_headers(), data=payload)
"filterState": selected_filter,
"isListVisible": True,
"mapZoom": 11,
},
"wants": {"cat1": ["mapResults"]},
"isDebugRequest": False,
}
resp = self.session.put(url, headers=self._get_headers(), json=payload)
resp.raise_for_status()
a = resp.json()
return self._parse_properties(resp.json())
Expand Down Expand Up @@ -176,9 +174,7 @@ def _parse_properties(self, property_data: dict):
and result["variableData"]["type"] == "TIME_ON_INFO"
else None,
"img_src": result.get("imgSrc"),
"price_per_sqft": int(
home_info["price"] // home_info["livingArea"]
)
"price_per_sqft": int(home_info["price"] // home_info["livingArea"])
if "livingArea" in home_info and "price" in home_info
else None,
}
Expand Down
Loading

0 comments on commit 869d7e7

Please sign in to comment.