Skip to content

Commit

Permalink
enh: estimate/assessed
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Apr 30, 2024
1 parent c5b15e9 commit 3f994ef
Show file tree
Hide file tree
Showing 8 changed files with 1,245 additions and 0 deletions.
54 changes: 54 additions & 0 deletions homeharvest/homeharvest/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import warnings
import pandas as pd
from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType


def scrape_property(
location: str,
listing_type: str = "for_sale",
radius: float = None,
mls_only: bool = False,
past_days: int = None,
proxy: str = None,
date_from: str = None,
date_to: str = None,
foreclosure: bool = None,
) -> pd.DataFrame:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param proxy: Proxy to use for scraping
"""
validate_input(listing_type)
validate_dates(date_from, date_to)

scraper_input = ScraperInput(
location=location,
listing_type=ListingType[listing_type.upper()],
proxy=proxy,
radius=radius,
mls_only=mls_only,
last_x_days=past_days,
date_from=date_from,
date_to=date_to,
foreclosure=foreclosure,
)

site = RealtorScraper(scraper_input)
results = site.search()

properties_dfs = [process_result(result) for result in results]
if not properties_dfs:
return pd.DataFrame()

with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
85 changes: 85 additions & 0 deletions homeharvest/homeharvest/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import argparse
import datetime
from homeharvest import scrape_property


def main():
parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")

parser.add_argument(
"-l",
"--listing_type",
type=str,
default="for_sale",
choices=["for_sale", "for_rent", "sold", "pending"],
help="Listing type to scrape",
)

parser.add_argument(
"-o",
"--output",
type=str,
default="excel",
choices=["excel", "csv"],
help="Output format",
)

parser.add_argument(
"-f",
"--filename",
type=str,
default=None,
help="Name of the output file (without extension)",
)

parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
parser.add_argument(
"-d",
"--days",
type=int,
default=None,
help="Sold/listed in last _ days filter.",
)

parser.add_argument(
"-r",
"--radius",
type=float,
default=None,
help="Get comparable properties within _ (eg. 0.0) miles. Only applicable for individual addresses.",
)
parser.add_argument(
"-m",
"--mls_only",
action="store_true",
help="If set, fetches only MLS listings.",
)

args = parser.parse_args()

result = scrape_property(
args.location,
args.listing_type,
radius=args.radius,
proxy=args.proxy,
mls_only=args.mls_only,
past_days=args.days,
)

if not args.filename:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
args.filename = f"HomeHarvest_{timestamp}"

if args.output == "excel":
output_filename = f"{args.filename}.xlsx"
result.to_excel(output_filename, index=False)
print(f"Excel file saved as {output_filename}")
elif args.output == "csv":
output_filename = f"{args.filename}.csv"
result.to_csv(output_filename, index=False)
print(f"CSV file saved as {output_filename}")


if __name__ == "__main__":
main()
Empty file.
88 changes: 88 additions & 0 deletions homeharvest/homeharvest/core/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from dataclasses import dataclass
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid
from .models import Property, ListingType, SiteName


@dataclass
class ScraperInput:
location: str
listing_type: ListingType
radius: float | None = None
mls_only: bool | None = None
proxy: str | None = None
last_x_days: int | None = None
date_from: str | None = None
date_to: str | None = None
foreclosure: bool | None = None


class Scraper:
session = None

def __init__(
self,
scraper_input: ScraperInput,
):
self.location = scraper_input.location
self.listing_type = scraper_input.listing_type

if not self.session:
Scraper.session = requests.Session()
print("Session created")
retries = Retry(
total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
)

adapter = HTTPAdapter(max_retries=retries)
Scraper.session.mount("http://", adapter)
Scraper.session.mount("https://", adapter)
Scraper.session.headers.update(
{
"auth": f"Bearer {self.get_access_token()}",
"apollographql-client-name": "com.move.Realtor-apollo-ios",
}
)

if scraper_input.proxy:
proxy_url = scraper_input.proxy
proxies = {"http": proxy_url, "https": proxy_url}
self.session.proxies.update(proxies)

self.listing_type = scraper_input.listing_type
self.radius = scraper_input.radius
self.last_x_days = scraper_input.last_x_days
self.mls_only = scraper_input.mls_only
self.date_from = scraper_input.date_from
self.date_to = scraper_input.date_to
self.foreclosure = scraper_input.foreclosure

def search(self) -> list[Property]: ...

@staticmethod
def _parse_home(home) -> Property: ...

def handle_location(self): ...

def get_access_token(self):
url = "https://graph.realtor.com/auth/token"

payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}'
headers = {
"Host": "graph.realtor.com",
"x-client-version": "24.20.4.149916",
"accept": "*/*",
"content-type": "Application/json",
"user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0",
"accept-language": "en-US,en;q=0.9",
}
response = requests.post(url, headers=headers, data=payload)

data = response.json()
try:
access_token = data["access_token"]
except Exception:
raise Exception("Could not get access token, use a proxy/vpn or wait")
return access_token
110 changes: 110 additions & 0 deletions homeharvest/homeharvest/core/scrapers/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class SiteName(Enum):
ZILLOW = "zillow"
REDFIN = "redfin"
REALTOR = "realtor.com"

@classmethod
def get_by_value(cls, value):
for item in cls:
if item.value == value:
return item
raise ValueError(f"{value} not found in {cls}")


class ListingType(Enum):
FOR_SALE = "FOR_SALE"
FOR_RENT = "FOR_RENT"
PENDING = "PENDING"
SOLD = "SOLD"


@dataclass
class Agent:
name: str | None = None
phone: str | None = None


class PropertyType(Enum):
APARTMENT = "APARTMENT"
BUILDING = "BUILDING"
COMMERCIAL = "COMMERCIAL"
CONDO_TOWNHOME = "CONDO_TOWNHOME"
CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
CONDO = "CONDO"
CONDOS = "CONDOS"
COOP = "COOP"
DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
FARM = "FARM"
INVESTMENT = "INVESTMENT"
LAND = "LAND"
MOBILE = "MOBILE"
MULTI_FAMILY = "MULTI_FAMILY"
RENTAL = "RENTAL"
SINGLE_FAMILY = "SINGLE_FAMILY"
TOWNHOMES = "TOWNHOMES"
OTHER = "OTHER"


@dataclass
class Address:
street: str | None = None
unit: str | None = None
city: str | None = None
state: str | None = None
zip: str | None = None


@dataclass
class Description:
primary_photo: str | None = None
alt_photos: list[str] | None = None
style: PropertyType | None = None
beds: int | None = None
baths_full: int | None = None
baths_half: int | None = None
sqft: int | None = None
lot_sqft: int | None = None
sold_price: int | None = None
year_built: int | None = None
garage: float | None = None
stories: int | None = None
text: str | None = None


@dataclass
class Agent:
name: str | None = None
phone: str | None = None


@dataclass
class Property:
property_url: str
mls: str | None = None
mls_id: str | None = None
status: str | None = None
address: Address | None = None

list_price: int | None = None
list_date: str | None = None
pending_date: str | None = None
last_sold_date: str | None = None
prc_sqft: int | None = None
hoa_fee: int | None = None
days_on_mls: int | None = None
description: Description | None = None

latitude: float | None = None
longitude: float | None = None
neighborhoods: Optional[str] = None
county: Optional[str] = None
fips_code: Optional[str] = None
agents: list[Agent] = None
nearby_schools: list[str] = None
assessed_value: int | None = None
estimated_value: int | None = None
Loading

0 comments on commit 3f994ef

Please sign in to comment.