Skip to content

Commit

Permalink
feat: add pandas
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Sep 17, 2023
1 parent b76c659 commit 3697b7c
Show file tree
Hide file tree
Showing 9 changed files with 393 additions and 30 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
**/dist/
**/__pycache__/
**/.pytest_cache/
*.pyc
*.pyc
/.ipynb_checkpoints/
73 changes: 73 additions & 0 deletions HomeHarvest_Demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "cb48903e-5021-49fe-9688-45cd0bc05d0f",
"metadata": {},
"outputs": [],
"source": [
"from homeharvest import scrape_property\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "156488ce-0d5f-43c5-87f4-c33e9c427860",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None) # Show all columns\n",
"pd.set_option('display.max_rows', None) # Show all rows\n",
"pd.set_option('display.width', None) # Auto-adjust display width to fit console\n",
"pd.set_option('display.max_colwidth', 50) # Limit max column width to 50 characters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c8b9744-8606-4e9b-8add-b90371a249a7",
"metadata": {},
"outputs": [],
"source": [
"scrape_property(\n",
" location=\"dallas\", site_name=\"zillow\", listing_type=\"for_sale\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab7b4c21-da1d-4713-9df4-d7425d8ce21e",
"metadata": {},
"outputs": [],
"source": [
"scrape_property(\n",
" location=\"dallas\", site_name=\"redfin\", listing_type=\"for_sale\"\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
68 changes: 65 additions & 3 deletions homeharvest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from .core.scrapers.redfin import RedfinScraper
from .core.scrapers.realtor import RealtorScraper
from .core.scrapers.zillow import ZillowScraper
from .core.scrapers.models import ListingType, Property, Building
from .core.scrapers.models import ListingType, Property, Building, SiteName
from .core.scrapers import ScraperInput
from .exceptions import InvalidSite, InvalidListingType
from typing import Union
import pandas as pd


_scrapers = {
Expand All @@ -18,7 +19,7 @@ def scrape_property(
location: str,
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> Union[list[Building], list[Property]]: #: eventually, return pandas dataframe
) -> Union[list[Building], list[Property]]:
if site_name.lower() not in _scrapers:
raise InvalidSite(f"Provided site, '{site_name}', does not exist.")

Expand All @@ -30,8 +31,69 @@ def scrape_property(
scraper_input = ScraperInput(
location=location,
listing_type=ListingType[listing_type.upper()],
site_name=SiteName[site_name.upper()],
)

site = _scrapers[site_name.lower()](scraper_input)
results = site.search()

return site.search()
properties_dfs = []

for result in results:
prop_data = result.__dict__

address_data = prop_data["address"]
prop_data["site_name"] = prop_data["site_name"].value
prop_data["listing_type"] = prop_data["listing_type"].value
prop_data["property_type"] = prop_data["property_type"].value.lower()
prop_data["address_one"] = address_data.address_one
prop_data["city"] = address_data.city
prop_data["state"] = address_data.state
prop_data["zip_code"] = address_data.zip_code
prop_data["address_two"] = address_data.address_two

del prop_data["address"]

if isinstance(result, Property):
desired_order = [
"listing_type",
"address_one",
"city",
"state",
"zip_code",
"address_two",
"url",
"property_type",
"price",
"beds",
"baths",
"square_feet",
"price_per_square_foot",
"lot_size",
"stories",
"year_built",
"agent_name",
"mls_id",
"description",
]

elif isinstance(result, Building):
desired_order = [
"address_one",
"city",
"state",
"zip_code",
"address_two",
"url",
"num_units",
"min_unit_price",
"max_unit_price",
"avg_unit_price",
"listing_type",
]

properties_df = pd.DataFrame([prop_data])
properties_df = properties_df[desired_order]
properties_dfs.append(properties_df)

return pd.concat(properties_dfs, ignore_index=True)
5 changes: 4 additions & 1 deletion homeharvest/core/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
from dataclasses import dataclass
import requests
from .models import Property, ListingType
from .models import Property, ListingType, SiteName


@dataclass
class ScraperInput:
    """Bundle of search parameters handed to a scraper.

    Attributes:
        location: Free-text location to search for.
        listing_type: Which kind of listing to fetch.
        site_name: The site the search targets.
        proxy_url: Optional proxy URL; defaults to ``None``.
    """

    # Required search parameters (no defaults, so they must come first).
    location: str
    listing_type: ListingType
    site_name: SiteName

    # Optional settings.
    proxy_url: str | None = None


class Scraper:
def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location
self.session = requests.Session()
self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name

if scraper_input.proxy_url:
self.session.proxies = {
Expand Down
53 changes: 42 additions & 11 deletions homeharvest/core/scrapers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,43 @@
from enum import Enum


class SiteName(Enum):
    """Listing sites, keyed by an upper-case member name.

    Each value is the site's lower-case label. Note that ``REALTOR`` is the
    one member whose value ("realtor.com") is not simply its name lower-cased,
    so a by-name lookup must use ``"REALTOR"`` rather than ``"REALTOR.COM"``.
    """

    ZILLOW = "zillow"
    REDFIN = "redfin"
    # Value carries the ".com" suffix; the member name does not.
    REALTOR = "realtor.com"


class ListingType(Enum):
    """Listing categories a search can be restricted to.

    Every value is the member name in lower case, so a user-supplied string
    such as ``"for_sale"`` maps to a member via ``ListingType[text.upper()]``
    or ``ListingType(text)``.
    """

    FOR_SALE = "for_sale"
    FOR_RENT = "for_rent"
    SOLD = "sold"


class PropertyType(Enum):
HOUSE = "HOUSE"
CONDO = "CONDO"
TOWNHOUSE = "townhousE"
SINGLE_FAMILY = "SINGLE_FAMILY"
MULTI_FAMILY = "MULTI_FAMILY"
LAND = "LAND"
OTHER = "OTHER"

@classmethod
def from_int_code(cls, code):
mapping = {
1: cls.HOUSE,
2: cls.CONDO,
3: cls.TOWNHOUSE,
4: cls.MULTI_FAMILY,
5: cls.LAND,
6: cls.OTHER,
8: cls.SINGLE_FAMILY,
13: cls.SINGLE_FAMILY,
}

return mapping.get(code, cls.OTHER)


@dataclass
class Address:
address_one: str
Expand All @@ -18,35 +49,35 @@ class Address:
address_two: str | None = None


@dataclass
class Property:
@dataclass()
class Realty:
site_name: SiteName
address: Address
url: str
listing_type: ListingType | None = None


@dataclass
class Property(Realty):
price: int | None = None
beds: int | None = None
baths: float | None = None
stories: int | None = None
agent_name: str | None = None
year_built: int | None = None
square_feet: int | None = None
price_per_square_foot: int | None = None
year_built: int | None = None
price: int | None = None
mls_id: str | None = None

listing_type: ListingType | None = None
agent_name: str | None = None
property_type: PropertyType | None = None
lot_size: int | None = None
description: str | None = None


@dataclass
class Building:
address: Address
url: str

class Building(Realty):
num_units: int | None = None
min_unit_price: int | None = None
max_unit_price: int | None = None
avg_unit_price: int | None = None

listing_type: str | None = None
12 changes: 8 additions & 4 deletions homeharvest/core/scrapers/redfin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import json
from ..models import Property, Address
from ..models import Property, Address, PropertyType
from .. import Scraper
from typing import Any


class RedfinScraper(Scraper):
def __init__(self, scraper_input):
super().__init__(scraper_input)
self.listing_type = scraper_input.listing_type

def _handle_location(self):
url = "https://www.redfin.com/stingray/do/location-autocomplete?v=2&al=1&location={}".format(
Expand All @@ -31,8 +32,7 @@ def get_region_type(match_type: str):

return target["id"].split("_")[1], get_region_type(target["type"])

@staticmethod
def _parse_home(home: dict, single_search: bool = False) -> Property:
def _parse_home(self, home: dict, single_search: bool = False) -> Property:
def get_value(key: str) -> Any | None:
if key in home and "value" in home[key]:
return home[key]["value"]
Expand All @@ -53,10 +53,12 @@ def get_value(key: str) -> Any | None:
state=home["state"],
zip_code=home["zip"],
)

url = "https://www.redfin.com{}".format(home["url"])
property_type = home["propertyType"] if "propertyType" in home else None

return Property(
site_name=self.site_name,
listing_type=self.listing_type,
address=address,
url=url,
beds=home["beds"] if "beds" in home else None,
Expand All @@ -68,6 +70,8 @@ def get_value(key: str) -> Any | None:
if not single_search
else home["yearBuilt"],
square_feet=get_value("sqFt"),
lot_size=home.get("lotSize", {}).get("value", None),
property_type=PropertyType.from_int_code(home.get("propertyType")),
price_per_square_foot=get_value("pricePerSqFt"),
price=get_value("price"),
mls_id=get_value("mlsId"),
Expand Down
Loading

0 comments on commit 3697b7c

Please sign in to comment.