enh: estimate/assessed #76

Closed
54 changes: 54 additions & 0 deletions homeharvest/homeharvest/__init__.py
@@ -0,0 +1,54 @@
import warnings
import pandas as pd
from .core.scrapers import ScraperInput
from .utils import process_result, ordered_properties, validate_input, validate_dates
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.models import ListingType


def scrape_property(
    location: str,
    listing_type: str = "for_sale",
    radius: float = None,
    mls_only: bool = False,
    past_days: int = None,
    proxy: str = None,
    date_from: str = None,
    date_to: str = None,
    foreclosure: bool = None,
) -> pd.DataFrame:
"""
Scrape properties from Realtor.com based on a given location and listing type.
:param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
:param listing_type: Listing Type (for_sale, for_rent, sold)
:param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
:param mls_only: If set, fetches only listings with MLS IDs.
:param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
:param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
:param proxy: Proxy to use for scraping
"""
    validate_input(listing_type)
    validate_dates(date_from, date_to)

    scraper_input = ScraperInput(
        location=location,
        listing_type=ListingType[listing_type.upper()],
        proxy=proxy,
        radius=radius,
        mls_only=mls_only,
        last_x_days=past_days,
        date_from=date_from,
        date_to=date_to,
        foreclosure=foreclosure,
    )

    site = RealtorScraper(scraper_input)
    results = site.search()

    properties_dfs = [process_result(result) for result in results]
    if not properties_dfs:
        return pd.DataFrame()

    # pd.concat emits a FutureWarning for frames with all-NA columns;
    # suppress it and return the combined frame with a stable column order.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
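
For reference, a minimal usage sketch of this entry point, assuming the package is importable as homeharvest (the CLI below imports it under that name) and that ordered_properties includes the PR's new assessed_value/estimated_value columns; the address and filter values are illustrative:

from homeharvest import scrape_property

# Sold listings from the last 30 days within 1 mile of a single address.
properties = scrape_property(
    location="2530 Al Lipscomb Way, Dallas, TX",
    listing_type="sold",
    radius=1.0,
    past_days=30,
)
# Assumes ordered_properties exposes the PR's new valuation columns.
print(properties[["list_price", "assessed_value", "estimated_value"]].head())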
85 changes: 85 additions & 0 deletions homeharvest/homeharvest/cli.py
@@ -0,0 +1,85 @@
import argparse
import datetime
from homeharvest import scrape_property


def main():
    parser = argparse.ArgumentParser(description="Home Harvest Property Scraper")
    parser.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")

    parser.add_argument(
        "-l",
        "--listing_type",
        type=str,
        default="for_sale",
        choices=["for_sale", "for_rent", "sold", "pending"],
        help="Listing type to scrape",
    )

    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="excel",
        choices=["excel", "csv"],
        help="Output format",
    )

    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        default=None,
        help="Name of the output file (without extension)",
    )

    parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
    parser.add_argument(
        "-d",
        "--days",
        type=int,
        default=None,
        help="Sold/listed in last _ days filter.",
    )

    parser.add_argument(
        "-r",
        "--radius",
        type=float,
        default=None,
        help="Get comparable properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.",
    )
    parser.add_argument(
        "-m",
        "--mls_only",
        action="store_true",
        help="If set, fetches only MLS listings.",
    )

    args = parser.parse_args()

    result = scrape_property(
        args.location,
        args.listing_type,
        radius=args.radius,
        proxy=args.proxy,
        mls_only=args.mls_only,
        past_days=args.days,
    )

    # Default the output filename to a timestamped HomeHarvest_* name.
    if not args.filename:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        args.filename = f"HomeHarvest_{timestamp}"

    if args.output == "excel":
        output_filename = f"{args.filename}.xlsx"
        result.to_excel(output_filename, index=False)
        print(f"Excel file saved as {output_filename}")
    elif args.output == "csv":
        output_filename = f"{args.filename}.csv"
        result.to_csv(output_filename, index=False)
        print(f"CSV file saved as {output_filename}")


if __name__ == "__main__":
    main()
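
A quick way to smoke-test this CLI without installing a console entry point is to drive main() with a patched sys.argv; a sketch, assuming homeharvest.cli is importable and the network is reachable:

import sys
from homeharvest.cli import main

# Equivalent to: homeharvest "Phoenix, AZ" -l sold -d 30 -o csv
sys.argv = ["homeharvest", "Phoenix, AZ", "-l", "sold", "-d", "30", "-o", "csv"]
main()  # writes HomeHarvest_<timestamp>.csv to the working directory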
88 changes: 88 additions & 0 deletions homeharvest/homeharvest/core/scrapers/__init__.py
@@ -0,0 +1,88 @@
from dataclasses import dataclass
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import uuid
from .models import Property, ListingType, SiteName


@dataclass
class ScraperInput:
    location: str
    listing_type: ListingType
    radius: float | None = None
    mls_only: bool | None = None
    proxy: str | None = None
    last_x_days: int | None = None
    date_from: str | None = None
    date_to: str | None = None
    foreclosure: bool | None = None


class Scraper:
    session = None

    def __init__(
        self,
        scraper_input: ScraperInput,
    ):
        self.location = scraper_input.location
        self.listing_type = scraper_input.listing_type

        if not self.session:
            # Build one shared session for all scraper instances, with exponential
            # backoff retries on rate-limit (429) and forbidden (403) responses.
            Scraper.session = requests.Session()
            print("Session created")
            retries = Retry(
                total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
            )

            adapter = HTTPAdapter(max_retries=retries)
            Scraper.session.mount("http://", adapter)
            Scraper.session.mount("https://", adapter)
            Scraper.session.headers.update(
                {
                    "auth": f"Bearer {self.get_access_token()}",
                    "apollographql-client-name": "com.move.Realtor-apollo-ios",
                }
            )

        if scraper_input.proxy:
            proxy_url = scraper_input.proxy
            proxies = {"http": proxy_url, "https": proxy_url}
            self.session.proxies.update(proxies)

        self.radius = scraper_input.radius
        self.last_x_days = scraper_input.last_x_days
        self.mls_only = scraper_input.mls_only
        self.date_from = scraper_input.date_from
        self.date_to = scraper_input.date_to
        self.foreclosure = scraper_input.foreclosure

    def search(self) -> list[Property]: ...

    @staticmethod
    def _parse_home(home) -> Property: ...

    def handle_location(self): ...

    def get_access_token(self):
        # Request a device token the same way the Realtor.com iOS app does.
        url = "https://graph.realtor.com/auth/token"

        payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}'
        headers = {
            "Host": "graph.realtor.com",
            "x-client-version": "24.20.4.149916",
            "accept": "*/*",
            "content-type": "Application/json",
            "user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0",
            "accept-language": "en-US,en;q=0.9",
        }
        response = requests.post(url, headers=headers, data=payload)

        data = response.json()
        try:
            access_token = data["access_token"]
        except Exception:
            raise Exception("Could not get access token, use a proxy/vpn or wait")
        return access_token
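
The base class leaves search and _parse_home as stubs for site-specific scrapers to fill in; a hedged sketch of what a subclass looks like (DemoScraper and its endpoint are hypothetical, and constructing any Scraper performs a live token request against graph.realtor.com):

from homeharvest.core.scrapers import Scraper
from homeharvest.core.scrapers.models import Property

class DemoScraper(Scraper):  # hypothetical subclass, for illustration only
    SEARCH_URL = "https://example.com/search"  # placeholder endpoint, not a real API

    def search(self) -> list[Property]:
        # self.session is the shared retry-wrapped session built in Scraper.__init__;
        # it already carries the bearer token and Apollo client headers.
        response = self.session.post(self.SEARCH_URL, json={"query": self.location})
        response.raise_for_status()
        # A real implementation would map each raw result through its own _parse_home.
        return [self._parse_home(home) for home in response.json().get("homes", [])]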
110 changes: 110 additions & 0 deletions homeharvest/homeharvest/core/scrapers/models.py
@@ -0,0 +1,110 @@
from dataclasses import dataclass
from enum import Enum


class SiteName(Enum):
    ZILLOW = "zillow"
    REDFIN = "redfin"
    REALTOR = "realtor.com"

    @classmethod
    def get_by_value(cls, value):
        for item in cls:
            if item.value == value:
                return item
        raise ValueError(f"{value} not found in {cls}")


class ListingType(Enum):
    FOR_SALE = "FOR_SALE"
    FOR_RENT = "FOR_RENT"
    PENDING = "PENDING"
    SOLD = "SOLD"


class PropertyType(Enum):
    APARTMENT = "APARTMENT"
    BUILDING = "BUILDING"
    COMMERCIAL = "COMMERCIAL"
    CONDO_TOWNHOME = "CONDO_TOWNHOME"
    CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
    CONDO = "CONDO"
    CONDOS = "CONDOS"
    COOP = "COOP"
    DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
    FARM = "FARM"
    INVESTMENT = "INVESTMENT"
    LAND = "LAND"
    MOBILE = "MOBILE"
    MULTI_FAMILY = "MULTI_FAMILY"
    RENTAL = "RENTAL"
    SINGLE_FAMILY = "SINGLE_FAMILY"
    TOWNHOMES = "TOWNHOMES"
    OTHER = "OTHER"


@dataclass
class Agent:
    name: str | None = None
    phone: str | None = None


@dataclass
class Address:
    street: str | None = None
    unit: str | None = None
    city: str | None = None
    state: str | None = None
    zip: str | None = None


@dataclass
class Description:
    primary_photo: str | None = None
    alt_photos: list[str] | None = None
    style: PropertyType | None = None
    beds: int | None = None
    baths_full: int | None = None
    baths_half: int | None = None
    sqft: int | None = None
    lot_sqft: int | None = None
    sold_price: int | None = None
    year_built: int | None = None
    garage: float | None = None
    stories: int | None = None
    text: str | None = None


@dataclass
class Property:
    property_url: str
    mls: str | None = None
    mls_id: str | None = None
    status: str | None = None
    address: Address | None = None

    list_price: int | None = None
    list_date: str | None = None
    pending_date: str | None = None
    last_sold_date: str | None = None
    prc_sqft: int | None = None
    hoa_fee: int | None = None
    days_on_mls: int | None = None
    description: Description | None = None

    latitude: float | None = None
    longitude: float | None = None
    neighborhoods: str | None = None
    county: str | None = None
    fips_code: str | None = None
    agents: list[Agent] | None = None
    nearby_schools: list[str] | None = None
    assessed_value: int | None = None
    estimated_value: int | None = None
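
To ground the PR's namesake change, a small sketch constructing a Property with the two new valuation fields populated (the URL and dollar amounts are illustrative):

from homeharvest.core.scrapers.models import Address, Property

prop = Property(
    property_url="https://www.realtor.com/realestateandhomes-detail/example",  # placeholder URL
    address=Address(street="2530 Al Lipscomb Way", city="Dallas", state="TX"),
    assessed_value=250_000,   # illustrative county tax-assessed value
    estimated_value=275_000,  # illustrative model-estimated market value
)
print(prop.estimated_value - prop.assessed_value)  # 25000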