Skip to content

Commit

Permalink
[fix] add back zillow/redfin
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Oct 4, 2023
1 parent bd33c3b commit ff95ca0
Show file tree
Hide file tree
Showing 5 changed files with 573 additions and 11 deletions.
20 changes: 17 additions & 3 deletions homeharvest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,12 @@ def _validate_input(site_name: str, status: str) -> None:


def _scrape_single_site(
location: str, site_name: str, status: str, proxy: str = None, timeframe: str = None
location: str,
site_name: str,
status: str,
radius: float,
proxy: str = None,
timeframe: str = None,
) -> pd.DataFrame:
"""
Helper function to scrape a single site.
Expand All @@ -36,6 +41,7 @@ def _scrape_single_site(
status=status,
site_name=SiteName.get_by_value(site_name.lower()),
proxy=proxy,
radius=radius,
timeframe=timeframe,
)

Expand All @@ -53,7 +59,8 @@ def scrape_property(
location: str,
timeframe: str = None,
site_name: Union[str, list[str]] = None,
status: str = "sale",
listing_type: str = "for_sale",
radius: float = None,
proxy: str = None,
) -> pd.DataFrame:
"""
Expand All @@ -65,6 +72,7 @@ def scrape_property(
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties
"""
status = listing_type
if site_name is None:
site_name = list(_scrapers.keys())

Expand All @@ -80,7 +88,13 @@ def scrape_property(
with ThreadPoolExecutor() as executor:
futures = {
executor.submit(
_scrape_single_site, location, s_name, status, proxy, timeframe
_scrape_single_site,
location,
s_name,
status,
radius,
proxy,
timeframe,
): s_name
for s_name in site_name
}
Expand Down
2 changes: 2 additions & 0 deletions homeharvest/core/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class ScraperInput:
site_name: str
proxy: Optional[str] = None
timeframe: Optional[str] = None
radius: float | None = None

def __post_init__(self):
if self.status == "sold" and not self.timeframe:
Expand Down Expand Up @@ -50,6 +51,7 @@ def __init__(

self.listing_type = scraper_input.status
self.site_name = scraper_input.site_name
self.radius = scraper_input.radius

def search(self) -> list[Property]:
...
Expand Down
26 changes: 18 additions & 8 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,19 +590,29 @@ def get_query(self):
def search(self):
location_info = self.handle_location()
location_type = location_info["area_type"]
is_for_comps = self.radius is not None and location_type == "address"

if location_type == "address":
if location_type == "address" and not is_for_comps:
property_id = location_info["mpr_id"]
return self.handle_address(property_id)

offset = 0
search_variables = {
"city": location_info.get("city"),
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
"offset": offset,
}

if not is_for_comps:
search_variables = {
"city": location_info.get("city"),
"county": location_info.get("county"),
"state_code": location_info.get("state_code"),
"postal_code": location_info.get("postal_code"),
"offset": offset,
}
else:
coordinates = list(location_info["centroid"].values())
search_variables = {
"coordinates": coordinates,
"radius": "{}mi".format(self.radius),
"offset": offset,
}

result = self.handle_area(search_variables)
total = result["total"]
Expand Down
228 changes: 228 additions & 0 deletions homeharvest/core/scrapers/redfin/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
"""
homeharvest.core.scrapers.redfin.__init__
~~~~~~~~~~~~
This module implements the scraper for redfin.com
"""
import json
from typing import Any
from .. import Scraper
from ..models import Property, Address, Status
from ....exceptions import NoResultsFound, SearchTooBroad
from datetime import datetime


class RedfinScraper(Scraper):
    """Scraper for redfin.com built on Redfin's ``stingray`` JSON endpoints.

    The entry point is :meth:`search`. It resolves ``self.location`` through
    the location-autocomplete endpoint and then dispatches to a single-home
    lookup, a rentals search, or a for-sale / sold area search.
    """

    # Marker that is removed from stingray response bodies before JSON parsing.
    _JSON_PREFIX = "{}&&"

    # Autocomplete match ``type`` -> region type handed back to the caller.
    # "2" and "6" are the ``region_type`` query values used for zip and city;
    # "address" and "state" are sentinels that ``search`` handles specially.
    _REGION_TYPES = {
        "4": "2",  #: zip
        "2": "6",  #: city
        "1": "address",  #: address, needs to be handled differently
        "11": "state",
    }

    def __init__(self, scraper_input):
        """Initialise the base scraper and record the requested listing type.

        :param scraper_input: search configuration; its ``status`` attribute
            is stored as ``self.listing_type``.
        """
        super().__init__(scraper_input)
        self.listing_type = scraper_input.status

    @classmethod
    def _load_json(cls, response) -> Any:
        """Decode a stingray response body after dropping the ``{}&&`` marker."""
        # Only the first occurrence is removed so that a literal "{}&&" inside
        # a JSON string value further down the payload is left untouched.
        return json.loads(response.text.replace(cls._JSON_PREFIX, "", 1))

    def _is_listing_type(self, status) -> bool:
        """Return True when ``self.listing_type`` denotes ``status``.

        ``self.listing_type`` may be a plain string such as ``"for_rent"``
        while ``status`` is a ``Status`` member, in which case ``==`` alone
        may never match. Fall back to a case-insensitive comparison of the
        member names.
        """
        current = self.listing_type
        if current == status:
            return True
        current_name = getattr(current, "name", current)
        wanted_name = getattr(status, "name", status)
        return (
            isinstance(current_name, str)
            and isinstance(wanted_name, str)
            and current_name.upper() == wanted_name.upper()
        )

    def _handle_location(self):
        """Resolve ``self.location`` through Redfin's autocomplete endpoint.

        :return: ``(region_id, region_type)`` where ``region_id`` is the part
            of the match id after the first underscore and ``region_type`` is
            looked up in ``_REGION_TYPES`` (``None`` for unknown match types).
        :raises NoResultsFound: if the response carries no ``exactMatch`` key,
            or ``exactMatch`` is null and there is no suggestion row to fall
            back to.
        """
        from urllib.parse import quote

        # Percent-encode the user-supplied location so characters such as
        # "&", "#" or "+" cannot alter or truncate the query string.
        url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
            quote(str(self.location), safe="")
        )

        response = self.session.get(url)
        payload = self._load_json(response)["payload"]

        if "exactMatch" not in payload:
            raise NoResultsFound(
                "No results found for location: {}".format(self.location)
            )

        target = payload["exactMatch"]
        if target is None:
            # No exact match: use the first suggestion row, if there is one.
            try:
                target = payload["sections"][0]["rows"][0]
            except (KeyError, IndexError):
                raise NoResultsFound(
                    "No results found for location: {}".format(self.location)
                ) from None

        return target["id"].split("_")[1], self._REGION_TYPES.get(target["type"])

    def _parse_home(self, home: dict, single_search: bool = False) -> Property:
        """Build a ``Property`` from one Redfin home record.

        :param home: home dictionary taken from a stingray response.
        :param single_search: True when ``home`` comes from the single-home
            details endpoint, where the street is read from
            ``streetAddress.assembledAddress`` and ``yearBuilt`` is read
            directly instead of from a ``{"value": ...}`` wrapper.
        :return: the populated ``Property``.
        :raises KeyError: if ``home`` has no ``url`` entry.
        """

        def get_value(key: str) -> Any | None:
            # Wrapped fields look like {"value": ...}; anything else (missing,
            # null, scalar) yields None instead of raising.
            field = home.get(key)
            if isinstance(field, dict):
                return field.get("value")
            return None

        if single_search:
            # "streetAddress" may be absent or null; treat that as no street.
            address_info = home.get("streetAddress") or {}
            street = address_info.get("assembledAddress")
        else:
            street = get_value("streetLine")

        address = Address(
            street=street,
            city=home.get("city"),
            state=home.get("state"),
            zip=home.get("zip"),
        )

        url = "https://www.redfin.com{}".format(home["url"])

        # Lot size arrives either as a bare int or wrapped in {"value": ...}.
        lot_size_data = home.get("lotSize")
        if isinstance(lot_size_data, int):
            lot_size = lot_size_data
        elif isinstance(lot_size_data, dict):
            lot_size = lot_size_data.get("value", None)
        else:
            lot_size = None

        # Price per square foot has the same int-or-wrapper duality.
        raw_price_per_sqft = home.get("pricePerSqFt")
        if isinstance(raw_price_per_sqft, int):
            price_per_sqft = raw_price_per_sqft
        else:
            price_per_sqft = get_value("pricePerSqFt")

        year_built = home.get("yearBuilt") if single_search else get_value("yearBuilt")

        lat_long = get_value("latLong")

        # The value is divided by 1000 before conversion (presumably epoch
        # milliseconds); the result is a naive local-time datetime.
        sold_timestamp = home.get("soldDate")
        last_sold_date = (
            datetime.fromtimestamp(sold_timestamp / 1000)
            if sold_timestamp is not None
            else None
        )

        return Property(
            status=self.listing_type,
            address=address,
            property_url=url,
            beds=home.get("beds"),
            baths_full=home.get("baths"),
            list_price=get_value("price"),
            est_sf=get_value("sqFt"),
            stories=home.get("stories"),
            yr_blt=year_built,
            lot_sf=lot_size,
            prc_sqft=price_per_sqft,
            mls_id=get_value("mlsId"),
            latitude=lat_long.get("latitude") if lat_long else None,
            longitude=lat_long.get("longitude") if lat_long else None,
            last_sold_date=last_sold_date,
        )

    def _handle_rentals(self, region_id, region_type):
        """Fetch rental listings for a region.

        For the ranged fields (price, beds, baths, square feet) the ``min``
        end of the range is reported.

        :param region_id: region identifier from ``_handle_location``.
        :param region_type: region type from ``_handle_location``.
        :return: non-empty list of ``Property`` objects.
        :raises NoResultsFound: if the response contains no homes.
        """
        url = f"https://www.redfin.com/stingray/api/v1/search/rentals?al=1&isRentals=true&region_id={region_id}&region_type={region_type}&num_homes=100000"

        response = self.session.get(url)
        response.raise_for_status()
        homes = response.json()

        properties_list = []

        for home in homes["homes"]:
            home_data = home["homeData"]
            rental_data = home["rentalExtension"]

            address_info = home_data.get("addressInfo", {})
            # The coordinates sit under a doubly nested "centroid" key.
            centroid = address_info.get("centroid", {}).get("centroid", {})

            price_range = rental_data.get("rentPriceRange", {"min": None, "max": None})
            bed_range = rental_data.get("bedRange", {"min": None, "max": None})
            bath_range = rental_data.get("bathRange", {"min": None, "max": None})
            sqft_range = rental_data.get("sqftRange", {"min": None, "max": None})

            properties_list.append(
                Property(
                    property_url=f"https://www.redfin.com{home_data.get('url', '')}",
                    status=Status.FOR_RENT.value,
                    address=Address(
                        street=address_info.get("formattedStreetLine"),
                        city=address_info.get("city"),
                        state=address_info.get("state"),
                        zip=address_info.get("zip"),
                    ),
                    latitude=centroid.get("latitude"),
                    longitude=centroid.get("longitude"),
                    baths_full=bath_range.get("min"),
                    beds=bed_range.get("min"),
                    list_price=price_range.get("min"),
                    est_sf=sqft_range.get("min"),
                )
            )

        if not properties_list:
            raise NoResultsFound("No rentals found for the given location.")

        return properties_list

    def _parse_building(self, building: dict) -> Property:
        """Build a ``Property`` from a ``buildings`` entry of an area search.

        :param building: building dictionary with an ``address`` mapping and
            a ``url``.
        :return: ``Property`` carrying only status, address and URL.
        :raises KeyError: if ``address``, ``url``, or the city / state /
            postal code entries are missing.
        """
        building_address = building["address"]

        # Missing or empty components (commonly the directional prefix) are
        # skipped so the street neither fails to join nor gets double spaces.
        components = (
            building_address.get(key)
            for key in (
                "streetNumber",
                "directionalPrefix",
                "streetName",
                "streetType",
            )
        )
        street_address = " ".join(str(part) for part in components if part)

        return Property(
            # Same attribute _parse_home uses; it is assigned in __init__.
            status=self.listing_type,
            address=Address(
                street=street_address,
                city=building_address["city"],
                state=building_address["stateOrProvinceCode"],
                zip=building_address["postalCode"],
            ),
            property_url="https://www.redfin.com{}".format(building["url"]),
        )

    def handle_address(self, home_id: str):
        """Look up a single home by its Redfin property id.

        EPs:
        https://www.redfin.com/stingray/api/home/details/initialInfo?al=1&path=/TX/Austin/70-Rainey-St-78701/unit-1608/home/147337694
        https://www.redfin.com/stingray/api/home/details/mainHouseInfoPanelInfo?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId=147337694&accessLevel=3
        https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3

        Only the ``aboveTheFold`` endpoint is queried here.

        :param home_id: Redfin property id.
        :return: one-element list holding the parsed ``Property``.
        """
        url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
            home_id
        )

        response_json = self._load_json(self.session.get(url))

        parsed_home = self._parse_home(
            response_json["payload"]["addressSectionInfo"], single_search=True
        )
        return [parsed_home]

    def search(self):
        """Run the search described by the scraper input.

        :return: list of ``Property`` objects; an empty list when an area
            search response has no ``payload``.
        :raises SearchTooBroad: if the location resolves to a whole state.
        :raises NoResultsFound: if the location cannot be resolved or a rental
            search returns nothing.
        """
        region_id, region_type = self._handle_location()

        if region_type == "state":
            raise SearchTooBroad(
                "State searches are not supported, please use a more specific location."
            )

        if region_type == "address":
            # For address matches the region id is the home id.
            return self.handle_address(region_id)

        if self._is_listing_type(Status.FOR_RENT):
            return self._handle_rentals(region_id, region_type)

        url = f"https://www.redfin.com/stingray/api/gis?al=1&region_id={region_id}&region_type={region_type}"
        if not self._is_listing_type(Status.FOR_SALE):
            # Every other listing type is treated as a sold search.
            url += "&sold_within_days=30"
        url += "&num_homes=100000"

        response_json = self._load_json(self.session.get(url))

        if "payload" not in response_json:
            return []

        payload = response_json["payload"]
        homes = [self._parse_home(home) for home in payload.get("homes", [])]
        buildings = [
            self._parse_building(building)
            for building in payload.get("buildings", {}).values()
        ]
        return homes + buildings
Loading

0 comments on commit ff95ca0

Please sign in to comment.