feat(scrapers): add zillow
cullenwatson committed Sep 17, 2023
1 parent 2f3b012 commit 2f5ea1c
Showing 11 changed files with 349 additions and 97 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,2 +1,5 @@
/.idea
dist
**/dist/
**/__pycache__/
**/.pytest_cache/
*.pyc
19 changes: 12 additions & 7 deletions homeharvest/__init__.py
@@ -1,26 +1,31 @@
from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.types import ListingType, Property
from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, Building
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType
from typing import Union


_scrapers = {
"redfin": RedfinScraper,
"realtor.com": RealtorScraper
"realtor.com": RealtorScraper,
"zillow": ZillowScraper,
}


def scrape_property(
location: str,
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> list[Property]: #: eventually, return pandas dataframe
location: str,
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> Union[list[Building], list[Property]]: #: eventually, return pandas dataframe
if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")

if listing_type.upper() not in ListingType.__members__:
raise InvalidListingType(f"Provided listing type, '{listing_type}', does not exist.")
raise InvalidListingType(
f"Provided listing type, '{listing_type}', does not exist."
)

scraper_input = ScraperInput(
location=location,
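A minimal usage sketch of the updated scrape_property entry point, assuming the package is importable as homeharvest; the "zillow" site name, the listing_type values, and the exceptions come from the diff above, while the location string is illustrative:

from homeharvest import scrape_property
from homeharvest.exceptions import InvalidSite, InvalidListingType

try:
    # "zillow" now joins "redfin" and "realtor.com" in the _scrapers registry
    results = scrape_property(
        location="Austin, TX",  # illustrative location
        site_name="zillow",
        listing_type="for_sale",  #: for_sale, for_rent, sold
    )
except (InvalidSite, InvalidListingType) as err:
    print(err)
else:
    # results is Union[list[Building], list[Property]]; both models carry address and url
    for result in results:
        print(result.url)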
14 changes: 10 additions & 4 deletions homeharvest/core/scrapers/__init__.py
@@ -1,6 +1,6 @@
from dataclasses import dataclass
import requests
from .types import Property, ListingType
from .models import Property, ListingType


@dataclass
@@ -11,19 +11,25 @@ class ScraperInput:


class Scraper:
listing_type = ListingType.FOR_SALE

def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location
self.session = requests.Session()
Scraper.listing_type = scraper_input.listing_type

if scraper_input.proxy_url:
self.session.proxies = {
"http": scraper_input.proxy_url,
"https": scraper_input.proxy_url,
}

def search(self) -> list[Property]: ...
def search(self) -> list[Property]:
...

@staticmethod
def parse_home(home) -> Property: ...
def _parse_home(home) -> Property:
...

def handle_location(self): ...
def handle_location(self):
...
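A minimal sketch of how a concrete site scraper is expected to build on this base class, assuming the package layout above; ExampleScraper, its endpoint, and its payload field names are illustrative and not part of the commit:

from homeharvest.core.scrapers import Scraper, ScraperInput
from homeharvest.core.scrapers.models import Address, Property


class ExampleScraper(Scraper):  # hypothetical subclass, for illustration only
    def __init__(self, scraper_input: ScraperInput):
        # base __init__ sets self.location, self.session (with optional proxies),
        # and the shared Scraper.listing_type
        super().__init__(scraper_input)

    def search(self) -> list[Property]:
        response = self.session.get(
            "https://example.com/api/listings",  # placeholder endpoint
            params={"q": self.location},
        )
        return [self._parse_home(home) for home in response.json().get("homes", [])]

    @staticmethod
    def _parse_home(home) -> Property:
        # illustrative mapping; real field names depend on the site's payload
        return Property(
            address=Address(
                address_one=home["street"],
                city=home["city"],
                state=home["state"],
                zip_code=home["zip"],
            ),
            url=home["url"],
            price=home.get("price"),
        )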
homeharvest/core/scrapers/models.py
@@ -24,14 +24,29 @@ class Property:
url: str

beds: int | None = None
baths: int | None = None
baths: float | None = None
stories: int | None = None
agent_name: str | None = None
description: str | None = None
year_built: int | None = None
square_feet: int | None = None
price_per_square_foot: int | None = None
year_built: int | None = None
price: int | None = None
mls_id: str | None = None

property_type: str | None = None
listing_type: ListingType | None = None
lot_size: int | None = None
description: str | None = None


@dataclass
class Building:
address: Address
url: str

num_units: int | None = None
min_unit_price: int | None = None
max_unit_price: int | None = None
avg_unit_price: int | None = None

listing_type: str | None = None
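A short construction sketch for the new Building model, assuming Address accepts the same keyword arguments used by the Redfin scraper below (address_one, city, state, zip_code); all values are illustrative:

from homeharvest.core.scrapers.models import Address, Building

address = Address(
    address_one="123 Main St",  # illustrative address
    city="Austin",
    state="TX",
    zip_code="78701",
)

building = Building(
    address=address,
    url="https://example.com/buildings/123-main-st",  # placeholder URL
    num_units=24,
    min_unit_price=1500,
    max_unit_price=2400,
    avg_unit_price=1900,
    listing_type="for_rent",
)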
47 changes: 25 additions & 22 deletions homeharvest/core/scrapers/realtor/__init__.py
@@ -1,5 +1,5 @@
import json
from ..types import Property, Address
from ..models import Property, Address
from .. import Scraper
from typing import Any

@@ -10,39 +10,42 @@ def __init__(self, scraper_input):

def handle_location(self):
headers = {
'authority': 'parser-external.geo.moveaws.com',
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9',
'origin': 'https://www.realtor.com',
'referer': 'https://www.realtor.com/',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'cross-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
"authority": "parser-external.geo.moveaws.com",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9",
"origin": "https://www.realtor.com",
"referer": "https://www.realtor.com/",
"sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "cross-site",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}

params = {
'input': self.location,
'client_id': 'for-sale',
'limit': '1',
'area_types': 'city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park',
"input": self.location,
"client_id": "for-sale",
"limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
}

response = self.session.get('https://parser-external.geo.moveaws.com/suggest', params=params, headers=headers)
response = self.session.get(
"https://parser-external.geo.moveaws.com/suggest",
params=params,
headers=headers,
)
response_json = response.json()

return response_json['autocomplete'][0]

return response_json["autocomplete"][0]

def search(self):
location_info = self.handle_location()
location_type = location_info['area_type']
location_type = location_info["area_type"]

"""
property types:
apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes
"""
print('a')
print("a")
88 changes: 50 additions & 38 deletions homeharvest/core/scrapers/redfin/__init__.py
@@ -1,5 +1,5 @@
import json
from ..types import Property, Address
from ..models import Property, Address
from .. import Scraper
from typing import Any

@@ -8,11 +8,13 @@ class RedfinScraper(Scraper):
def __init__(self, scraper_input):
super().__init__(scraper_input)

def handle_location(self):
url = 'https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}'.format(self.location)
def _handle_location(self):
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
self.location
)

response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))

def get_region_type(match_type: str):
if match_type == "4":
@@ -22,51 +24,53 @@ def get_region_type(match_type: str):
elif match_type == "1":
return "address" #: address, needs to be handled differently

if response_json['payload']['exactMatch'] is not None:
target = response_json['payload']['exactMatch']
if response_json["payload"]["exactMatch"] is not None:
target = response_json["payload"]["exactMatch"]
else:
target = response_json['payload']['sections'][0]['rows'][0]
target = response_json["payload"]["sections"][0]["rows"][0]

return target['id'].split('_')[1], get_region_type(target['type'])
return target["id"].split("_")[1], get_region_type(target["type"])

@staticmethod
def parse_home(home: dict, single_search: bool = False) -> Property:
def _parse_home(home: dict, single_search: bool = False) -> Property:
def get_value(key: str) -> Any | None:
if key in home and 'value' in home[key]:
return home[key]['value']
if key in home and "value" in home[key]:
return home[key]["value"]

if not single_search:
address = Address(
address_one=get_value('streetLine'),
city=home['city'],
state=home['state'],
zip_code=home['zip']
address_one=get_value("streetLine"),
city=home["city"],
state=home["state"],
zip_code=home["zip"],
)
else:
address_info = home['streetAddress']
address_info = home["streetAddress"]

address = Address(
address_one=address_info['assembledAddress'],
city=home['city'],
state=home['state'],
zip_code=home['zip']
address_one=address_info["assembledAddress"],
city=home["city"],
state=home["state"],
zip_code=home["zip"],
)

url = 'https://www.redfin.com{}'.format(home['url'])
url = "https://www.redfin.com{}".format(home["url"])

return Property(
address=address,
url=url,
beds=home['beds'] if 'beds' in home else None,
baths=home['baths'] if 'baths' in home else None,
stories=home['stories'] if 'stories' in home else None,
agent_name=get_value('listingAgent'),
description=home['listingRemarks'] if 'listingRemarks' in home else None,
year_built=get_value('yearBuilt') if not single_search else home['yearBuilt'],
square_feet=get_value('sqFt'),
price_per_square_foot=get_value('pricePerSqFt'),
price=get_value('price'),
mls_id=get_value('mlsId')
beds=home["beds"] if "beds" in home else None,
baths=home["baths"] if "baths" in home else None,
stories=home["stories"] if "stories" in home else None,
agent_name=get_value("listingAgent"),
description=home["listingRemarks"] if "listingRemarks" in home else None,
year_built=get_value("yearBuilt")
if not single_search
else home["yearBuilt"],
square_feet=get_value("sqFt"),
price_per_square_foot=get_value("pricePerSqFt"),
price=get_value("price"),
mls_id=get_value("mlsId"),
)

def handle_address(self, home_id: str):
@@ -78,25 +82,33 @@ def handle_address(self, home_id: str):
https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=147337694&accessLevel=3
"""

url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(home_id)
url = "https://www.redfin.com/stingray/api/home/details/aboveTheFold?propertyId={}&accessLevel=3".format(
home_id
)

response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))

parsed_home = self.parse_home(response_json['payload']['addressSectionInfo'], single_search=True)
parsed_home = self._parse_home(
response_json["payload"]["addressSectionInfo"], single_search=True
)
return [parsed_home]

def search(self):
region_id, region_type = self.handle_location()
region_id, region_type = self._handle_location()

if region_type == "address":
home_id = region_id
return self.handle_address(home_id)

url = 'https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}'.format(region_id, region_type)
url = "https://www.redfin.com/stingray/api/gis?al=1&region_id={}&region_type={}".format(
region_id, region_type
)

response = self.session.get(url)
response_json = json.loads(response.text.replace('{}&&', ''))
response_json = json.loads(response.text.replace("{}&&", ""))

homes = [self.parse_home(home) for home in response_json['payload']['homes']] #: support buildings
homes = [
self._parse_home(home) for home in response_json["payload"]["homes"]
] #: support buildings
return homes
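The repeated response.text.replace("{}&&", "") calls reflect a quirk of Redfin's stingray endpoints, which prefix their JSON bodies with "{}&&"; a tiny sketch of the stripping step with a made-up payload:

import json

raw = '{}&&{"payload": {"homes": []}}'  # illustrative stingray-style response body
data = json.loads(raw.replace("{}&&", ""))
print(data["payload"]["homes"])  # -> []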