enh: estimate/assessed #76

Closed
54 changes: 54 additions & 0 deletions homeharvest/homeharvest/__init__.py
@@ -0,0 +1,54 @@
import warnings
import pandas as pd
from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType


def scrape_property(
    location: str,
    listing_type: str = "for_sale",
    radius: float = None,
    mls_only: bool = False,
    past_days: int = None,
    proxy: str = None,
    date_from: str = None,
    date_to: str = None,
    foreclosure: bool = None,
) -> pd.DataFrame:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param proxy: Proxy to use for scraping
"""
    validate_input(listing_type)
    validate_dates(date_from, date_to)

    scraper_input = ScraperInput(
        location=location,
        listing_type=ListingType[listing_type.upper()],
        proxy=proxy,
        radius=radius,
        mls_only=mls_only,
        last_x_days=past_days,
        date_from=date_from,
        date_to=date_to,
        foreclosure=foreclosure,
    )

    site = RealtorScraper(scraper_input)
    results = site.search()

    properties_dfs = [process_result(result) for result in results]
    if not properties_dfs:
        return pd.DataFrame()

    # pd.concat emits a FutureWarning for frames with all-NA columns;
    # suppress it and return the combined frame with a stable column order.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
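
For reference, a minimal usage sketch of this entry point, assuming the package is importable as homeharvest (the CLI below imports it under that name) and that ordered_properties includes the PR's new assessed_value/estimated_value columns; the address and filter values are illustrative:

from homeharvest import scrape_property

# Sold listings from the last 30 days within 1 mile of a single address.
properties = scrape_property(
    location="2530 Al Lipscomb Way, Dallas, TX",
    listing_type="sold",
    radius=1.0,
    past_days=30,
)
# Assumes ordered_properties exposes the PR's new valuation columns.
print(properties[["list_price", "assessed_value", "estimated_value"]].head())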
85 changes: 85 additions & 0 deletions homeharvest/homeharvest/cli.py
@@ -0,0 +1,85 @@
import argparse
import datetime
from homeharvest import scrape_property


def main():
    parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
    parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")

    parser.add_argument(
        "-l",
        "--listing_type",
        type=str,
        default="for_sale",
        choices=["for_sale", "for_rent", "sold", "pending"],
        help="Listing type to scrape",
    )

    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="excel",
        choices=["excel", "csv"],
        help="Output format",
    )

    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        default=None,
        help="Name of the output file (without extension)",
    )

    parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
    parser.add_argument(
        "-d",
        "--days",
        type=int,
        default=None,
        help="Sold/listed in last _ days filter.",
    )

    parser.add_argument(
        "-r",
        "--radius",
        type=float,
        default=None,
        help="Get comparable properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.",
    )
    parser.add_argument(
        "-m",
        "--mls_only",
        action="store_true",
        help="If set, fetches only MLS listings.",
    )

    args = parser.parse_args()

    result = scrape_property(
        args.location,
        args.listing_type,
        radius=args.radius,
        proxy=args.proxy,
        mls_only=args.mls_only,
        past_days=args.days,
    )

    # Default the output filename to a timestamped HomeHarvest_* name.
    if not args.filename:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        args.filename = f"HomeHarvest_{timestamp}"

    if args.output == "excel":
        output_filename = f"{args.filename}.xlsx"
        result.to_excel(output_filename, index=False)
        print(f"Excel file saved as {output_filename}")
    elif args.output == "csv":
        output_filename = f"{args.filename}.csv"
        result.to_csv(output_filename, index=False)
        print(f"CSV file saved as {output_filename}")


if __name__ == "__main__":
    main()
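
A quick way to smoke-test this CLI without installing a console entry point is to drive main() with a patched sys.argv; a sketch, assuming homeharvest.cli is importable and the network is reachable:

import sys
from homeharvest.cli import main

# Equivalent to: homeharvest "Phoenix, AZ" -l sold -d 30 -o csv
sys.argv = ["homeharvest", "Phoenix, AZ", "-l", "sold", "-d", "30", "-o", "csv"]
main()  # writes HomeHarvest_<timestamp>.csv to the working directory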
88 changes: 88 additions & 0 deletions homeharvest/homeharvest/core/scrapers/__init__.py
@@ -0,0 +1,88 @@
from dataclasses import dataclass
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid
from .models import Property, ListingType, SiteName


@dataclass
class ScraperInput:
    location: str
    listing_type: ListingType
    radius: float | None = None
    mls_only: bool | None = None
    proxy: str | None = None
    last_x_days: int | None = None
    date_from: str | None = None
    date_to: str | None = None
    foreclosure: bool | None = None


class Scraper:
    session = None

    def __init__(
        self,
        scraper_input: ScraperInput,
    ):
        self.location = scraper_input.location
        self.listing_type = scraper_input.listing_type

        if not self.session:
            # Build one shared session for all scraper instances, with exponential
            # backoff retries on rate-limit (429) and forbidden (403) responses.
            Scraper.session = requests.Session()
            print("Session created")
            retries = Retry(
                total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
            )

            adapter = HTTPAdapter(max_retries=retries)
            Scraper.session.mount("http://", adapter)
            Scraper.session.mount("https://", adapter)
            Scraper.session.headers.update(
                {
                    "auth": f"Bearer {self.get_access_token()}",
                    "apollographql-client-name": "com.move.Realtor-apollo-ios",
                }
            )

        if scraper_input.proxy:
            proxy_url = scraper_input.proxy
            proxies = {"http": proxy_url, "https": proxy_url}
            self.session.proxies.update(proxies)

        self.radius = scraper_input.radius
        self.last_x_days = scraper_input.last_x_days
        self.mls_only = scraper_input.mls_only
        self.date_from = scraper_input.date_from
        self.date_to = scraper_input.date_to
        self.foreclosure = scraper_input.foreclosure

    def search(self) -> list[Property]: ...

    @staticmethod
    def _parse_home(home) -> Property: ...

    def handle_location(self): ...

    def get_access_token(self):
        # Request a device token the same way the Realtor.com iOS app does.
        url = "https://graph.realtor.com/auth/token"

        payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}'
        headers = {
            "Host": "graph.realtor.com",
            "x-client-version": "24.20.4.149916",
            "accept": "*/*",
            "content-type": "Application/json",
            "user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0",
            "accept-language": "en-US,en;q=0.9",
        }
        response = requests.post(url, headers=headers, data=payload)

        data = response.json()
        try:
            access_token = data["access_token"]
        except Exception:
            raise Exception("Could not get access token, use a proxy/vpn or wait")
        return access_token
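
The base class leaves search and _parse_home as stubs for site-specific scrapers to fill in; a hedged sketch of what a subclass looks like (DemoScraper and its endpoint are hypothetical, and constructing any Scraper performs a live token request against graph.realtor.com):

from homeharvest.core.scrapers import Scraper
from homeharvest.core.scrapers.models import Property

class DemoScraper(Scraper):  # hypothetical subclass, for illustration only
    SEARCH_URL = "https://example.com/search"  # placeholder endpoint, not a real API

    def search(self) -> list[Property]:
        # self.session is the shared retry-wrapped session built in Scraper.__init__;
        # it already carries the bearer token and Apollo client headers.
        response = self.session.post(self.SEARCH_URL, json={"query": self.location})
        response.raise_for_status()
        # A real implementation would map each raw result through its own _parse_home.
        return [self._parse_home(home) for home in response.json().get("homes", [])]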
110 changes: 110 additions & 0 deletions homeharvest/homeharvest/core/scrapers/models.py
@@ -0,0 +1,110 @@
from dataclasses import dataclass
from enum import Enum


class SiteName(Enum):
    ZILLOW = "zillow"
    REDFIN = "redfin"
    REALTOR = "realtor.com"

    @classmethod
    def get_by_value(cls, value):
        for item in cls:
            if item.value == value:
                return item
        raise ValueError(f"{value} not found in {cls}")


class ListingType(Enum):
    FOR_SALE = "FOR_SALE"
    FOR_RENT = "FOR_RENT"
    PENDING = "PENDING"
    SOLD = "SOLD"


class PropertyType(Enum):
    APARTMENT = "APARTMENT"
    BUILDING = "BUILDING"
    COMMERCIAL = "COMMERCIAL"
    CONDO_TOWNHOME = "CONDO_TOWNHOME"
    CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
    CONDO = "CONDO"
    CONDOS = "CONDOS"
    COOP = "COOP"
    DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
    FARM = "FARM"
    INVESTMENT = "INVESTMENT"
    LAND = "LAND"
    MOBILE = "MOBILE"
    MULTI_FAMILY = "MULTI_FAMILY"
    RENTAL = "RENTAL"
    SINGLE_FAMILY = "SINGLE_FAMILY"
    TOWNHOMES = "TOWNHOMES"
    OTHER = "OTHER"


@dataclass
class Agent:
    name: str | None = None
    phone: str | None = None


@dataclass
class Address:
    street: str | None = None
    unit: str | None = None
    city: str | None = None
    state: str | None = None
    zip: str | None = None


@dataclass
class Description:
    primary_photo: str | None = None
    alt_photos: list[str] | None = None
    style: PropertyType | None = None
    beds: int | None = None
    baths_full: int | None = None
    baths_half: int | None = None
    sqft: int | None = None
    lot_sqft: int | None = None
    sold_price: int | None = None
    year_built: int | None = None
    garage: float | None = None
    stories: int | None = None
    text: str | None = None


@dataclass
class Property:
    property_url: str
    mls: str | None = None
    mls_id: str | None = None
    status: str | None = None
    address: Address | None = None

    list_price: int | None = None
    list_date: str | None = None
    pending_date: str | None = None
    last_sold_date: str | None = None
    prc_sqft: int | None = None
    hoa_fee: int | None = None
    days_on_mls: int | None = None
    description: Description | None = None

    latitude: float | None = None
    longitude: float | None = None
    neighborhoods: str | None = None
    county: str | None = None
    fips_code: str | None = None
    agents: list[Agent] | None = None
    nearby_schools: list[str] | None = None
    assessed_value: int | None = None
    estimated_value: int | None = None
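
To ground the PR's namesake change, a small sketch constructing a Property with the two new valuation fields populated (the URL and dollar amounts are illustrative):

from homeharvest.core.scrapers.models import Address, Property

prop = Property(
    property_url="https://www.realtor.com/realestateandhomes-detail/example",  # placeholder URL
    address=Address(street="2530 Al Lipscomb Way", city="Dallas", state="TX"),
    assessed_value=250_000,   # illustrative county tax-assessed value
    estimated_value=275_000,  # illustrative model-estimated market value
)
print(prop.estimated_value - prop.assessed_value)  # 25000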