feat: add pandas

Bunsly · Sep 17, 2023 · 3697b7c · 3697b7c
1 parent b76c659
commit 3697b7c
Show file tree

Hide file tree

Showing 9 changed files with 393 additions and 30 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,5 @@
 **/dist/
 **/__pycache__/
 **/.pytest_cache/
-*.pyc
+*.pyc
+/.ipynb_checkpoints/
diff --git a/HomeHarvest_Demo.ipynb b/HomeHarvest_Demo.ipynb
@@ -0,0 +1,73 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb48903e-5021-49fe-9688-45cd0bc05d0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from homeharvest import scrape_property\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "156488ce-0d5f-43c5-87f4-c33e9c427860",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option('display.max_columns', None)  # Show all columns\n",
+    "pd.set_option('display.max_rows', None)     # Show all rows\n",
+    "pd.set_option('display.width', None)        # Auto-adjust display width to fit console\n",
+    "pd.set_option('display.max_colwidth', 50)   # Limit max column width to 50 characters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c8b9744-8606-4e9b-8add-b90371a249a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scrape_property(\n",
+    "    location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab7b4c21-da1d-4713-9df4-d7425d8ce21e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scrape_property(\n",
+    "    location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
@@ -1,10 +1,11 @@
 from .core.scrapers.redfin import RedfinScraper
 from .core.scrapers.realtor import RealtorScraper
 from .core.scrapers.zillow import ZillowScraper
-from .core.scrapers.models import ListingType, Property, Building
+from .core.scrapers.models import ListingType, Property, Building, SiteName
 from .core.scrapers import ScraperInput
 from .exceptions import InvalidSite, InvalidListingType
 from typing import Union
+import pandas as pd
 
 
 _scrapers = {
@@ -18,7 +19,7 @@ def scrape_property(
     location: str,
     site_name: str,
     listing_type: str = "for_sale",  #: for_sale, for_rent, sold
-) -> Union[list[Building], list[Property]]:  #: eventually, return pandas dataframe
+) -> Union[list[Building], list[Property]]:
     if site_name.lower() not in _scrapers:
         raise InvalidSite(f"Provided site, '{site_name}', does not exist.")
 
@@ -30,8 +31,69 @@ def scrape_property(
     scraper_input = ScraperInput(
         location=location,
         listing_type=ListingType[listing_type.upper()],
+        site_name=SiteName[site_name.upper()],
     )
 
     site = _scrapers[site_name.lower()](scraper_input)
+    results = site.search()
 
-    return site.search()
+    # Column order of the output frame for each kind of result.
+    property_columns = [
+        "listing_type",
+        "address_one",
+        "city",
+        "state",
+        "zip_code",
+        "address_two",
+        "url",
+        "property_type",
+        "price",
+        "beds",
+        "baths",
+        "square_feet",
+        "price_per_square_foot",
+        "lot_size",
+        "stories",
+        "year_built",
+        "agent_name",
+        "mls_id",
+        "description",
+    ]
+    building_columns = [
+        "address_one",
+        "city",
+        "state",
+        "zip_code",
+        "address_two",
+        "url",
+        "num_units",
+        "min_unit_price",
+        "max_unit_price",
+        "avg_unit_price",
+        "listing_type",
+    ]
+
+    properties_dfs = []
+    for result in results:
+        # Copy first: editing result.__dict__ in place would strip `address` off the object.
+        prop_data = dict(vars(result))
+        address_data = prop_data.pop("address")
+        # Enum fields may be None, and Building rows carry no property_type key at all.
+        for key in ("site_name", "listing_type", "property_type"):
+            if prop_data.get(key) is not None:
+                prop_data[key] = prop_data[key].value
+        if prop_data.get("property_type") is not None:
+            prop_data["property_type"] = prop_data["property_type"].lower()
+
+        prop_data["address_one"] = address_data.address_one
+        prop_data["city"] = address_data.city
+        prop_data["state"] = address_data.state
+        prop_data["zip_code"] = address_data.zip_code
+        prop_data["address_two"] = address_data.address_two
+
+        columns = building_columns if isinstance(result, Building) else property_columns
+        properties_dfs.append(pd.DataFrame([prop_data])[columns])
+
+    if not properties_dfs:  # pd.concat raises ValueError on an empty list
+        return pd.DataFrame()
+    return pd.concat(properties_dfs, ignore_index=True)
diff --git a/homeharvest/core/scrapers/__init__.py b/homeharvest/core/scrapers/__init__.py
@@ -1,19 +1,22 @@
 from dataclasses import dataclass
 import requests
-from .models import Property, ListingType
+from .models import Property, ListingType, SiteName
 
 
 @dataclass
 class ScraperInput:
     location: str
     listing_type: ListingType
+    site_name: SiteName
     proxy_url: str | None = None
 
 
 class Scraper:
     def __init__(self, scraper_input: ScraperInput):
         self.location = scraper_input.location
         self.session = requests.Session()
+        self.listing_type = scraper_input.listing_type
+        self.site_name = scraper_input.site_name
 
         if scraper_input.proxy_url:
             self.session.proxies = {

diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py
@@ -2,12 +2,43 @@
 from enum import Enum
 
 
+class SiteName(Enum):  # Listing sites; note REALTOR's value is "realtor.com", unlike its name.
+    ZILLOW = "zillow"
+    REDFIN = "redfin"
+    REALTOR = "realtor.com"
+
+
 class ListingType(Enum):
     FOR_SALE = "for_sale"
     FOR_RENT = "for_rent"
     SOLD = "sold"
 
 
+class PropertyType(Enum):  # Property categories; each value is the upper-case member name.
+    HOUSE = "HOUSE"
+    CONDO = "CONDO"
+    TOWNHOUSE = "TOWNHOUSE"
+    SINGLE_FAMILY = "SINGLE_FAMILY"
+    MULTI_FAMILY = "MULTI_FAMILY"
+    LAND = "LAND"
+    OTHER = "OTHER"
+
+    @classmethod
+    def from_int_code(cls, code):
+        """Return the member for an integer code; unlisted codes (and None) give OTHER."""
+        mapping = {
+            1: cls.HOUSE,
+            2: cls.CONDO,
+            3: cls.TOWNHOUSE,
+            4: cls.MULTI_FAMILY,
+            5: cls.LAND,
+            6: cls.OTHER,
+            8: cls.SINGLE_FAMILY,
+            13: cls.SINGLE_FAMILY,
+        }
+        return mapping.get(code, cls.OTHER)
+
+
 @dataclass
 class Address:
     address_one: str
@@ -18,35 +49,35 @@ class Address:
     address_two: str | None = None
 
 
-@dataclass
-class Property:
+@dataclass
+class Realty:  # Fields shared by every listing; Property and Building extend this.
+    site_name: SiteName
+    address: Address
+    url: str
+    listing_type: ListingType | None = None
 
+
+@dataclass
+class Property(Realty):  # NOTE(review): year_built is annotated twice below; the repeat looks redundant.
+    price: int | None = None
     beds: int | None = None
     baths: float | None = None
     stories: int | None = None
-    agent_name: str | None = None
     year_built: int | None = None
     square_feet: int | None = None
     price_per_square_foot: int | None = None
     year_built: int | None = None
-    price: int | None = None
     mls_id: str | None = None
 
-    listing_type: ListingType | None = None
+    agent_name: str | None = None
+    property_type: PropertyType | None = None
     lot_size: int | None = None
     description: str | None = None
 
 
 @dataclass
-class Building:
-    address: Address
-    url: str
-
+class Building(Realty):  # site_name, address, url and listing_type are inherited from Realty.
     num_units: int | None = None
     min_unit_price: int | None = None
     max_unit_price: int | None = None
     avg_unit_price: int | None = None
-
-    listing_type: str | None = None
diff --git a/homeharvest/core/scrapers/redfin/__init__.py b/homeharvest/core/scrapers/redfin/__init__.py
@@ -1,12 +1,13 @@
 import json
-from ..models import Property, Address
+from ..models import Property, Address, PropertyType
 from .. import Scraper
 from typing import Any
 
 
 class RedfinScraper(Scraper):
     def __init__(self, scraper_input):
         super().__init__(scraper_input)
+        self.listing_type = scraper_input.listing_type  # NOTE(review): Scraper.__init__ already assigns this; likely redundant
 
     def _handle_location(self):
         url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
@@ -31,8 +32,7 @@ def get_region_type(match_type: str):
 
         return target["id"].split("_")[1], get_region_type(target["type"])
 
-    @staticmethod
-    def _parse_home(home: dict, single_search: bool = False) -> Property:
+    def _parse_home(self, home: dict, single_search: bool = False) -> Property:
         def get_value(key: str) -> Any | None:
             if key in home and "value" in home[key]:
                 return home[key]["value"]
@@ -53,10 +53,12 @@ def get_value(key: str) -> Any | None:
                 state=home["state"],
                 zip_code=home["zip"],
             )
-
         url = "https://www.redfin.com{}".format(home["url"])
+        property_type = home["propertyType"] if "propertyType" in home else None
 
         return Property(
+            site_name=self.site_name,
+            listing_type=self.listing_type,
             address=address,
             url=url,
             beds=home["beds"] if "beds" in home else None,
@@ -68,6 +70,8 @@ def get_value(key: str) -> Any | None:
             if not single_search
             else home["yearBuilt"],
             square_feet=get_value("sqFt"),
+            lot_size=home.get("lotSize", {}).get("value", None),
+            property_type=PropertyType.from_int_code(home.get("propertyType")),
             price_per_square_foot=get_value("pricePerSqFt"),
             price=get_value("price"),
             mls_id=get_value("mlsId"),