Merge pull request #31 from ZacharyHampton/v0.3
v0.3
ZacharyHampton committed Oct 5, 2023
2 parents 8388d47 + 2d092c5 commit 4a11164
Showing 17 changed files with 794 additions and 1,284 deletions.
208 changes: 113 additions & 95 deletions README.md
@@ -1,6 +1,6 @@
<img src="https://github.com/ZacharyHampton/HomeHarvest/assets/78247585/d1a2bf8b-09f5-4c57-b33a-0ada8a34f12d" width="400">

**HomeHarvest** is a simple, yet comprehensive, real estate scraping library that extracts and formats data in the style of MLS listings.

[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)

Expand All @@ -11,10 +11,14 @@

Check out another project we wrote: ***[JobSpy](https://github.com/cullenwatson/JobSpy)** – a Python package for job scraping*

## HomeHarvest Features

- **Source**: Fetches properties directly from **Realtor.com**.
- **Data Format**: Structures data to resemble MLS listings.
- **Export Flexibility**: Options to save as either CSV or Excel.
- **Usage Modes**:
- **CLI**: For users who prefer command-line operations.
- **Python**: For those who'd like to integrate scraping into their Python scripts.

[Video Guide for HomeHarvest](https://youtu.be/JnV7eR2Ve2o) - _updated for release v0.2.7_

Expand All @@ -31,136 +35,150 @@ pip install homeharvest

### CLI

```
usage: homeharvest [-l {for_sale,for_rent,sold}] [-o {excel,csv}] [-f FILENAME] [-p PROXY] [-d DAYS] [-r RADIUS] [-m] location

Home Harvest Property Scraper

positional arguments:
  location              Location to scrape (e.g., San Francisco, CA)

options:
  -l {for_sale,for_rent,sold}, --listing_type {for_sale,for_rent,sold}
                        Listing type to scrape
  -o {excel,csv}, --output {excel,csv}
                        Output format
  -f FILENAME, --filename FILENAME
                        Name of the output file (without extension)
  -p PROXY, --proxy PROXY
                        Proxy to use for scraping
  -d DAYS, --days DAYS  Sold/listed in last _ days filter.
  -r RADIUS, --radius RADIUS
                        Get comparable properties within _ (eg. 0.0) miles. Only applicable for individual addresses.
  -m, --mls_only        If set, fetches only MLS listings.
```
```bash
> homeharvest "San Francisco, CA" -l for_rent -o excel -f HomeHarvest
```

This will scrape properties for the given location and listing type, and save the results to an Excel file named `HomeHarvest.xlsx`.

By default:
- If `-l` or `--listing_type` is left blank, the default is `for_sale`. Other options are `for_rent` or `sold`.
- The `-o` or `--output` default format is `excel`. Options are `csv` or `excel`.
- If `-f` or `--filename` is left blank, the default is `HomeHarvest_<current_timestamp>`.
- If `-p` or `--proxy` is not provided, the scraper uses the local IP.
### Python

```py
from homeharvest import scrape_property
from datetime import datetime

# Generate filename based on current timestamp
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"output/{current_timestamp}.csv"

properties = scrape_property(
    location="San Diego, CA",
    listing_type="sold",  # or (for_sale, for_rent)
    property_younger_than=30,  # sold in last 30 days - listed in last x days if (for_sale, for_rent)
    mls_only=True,  # only fetch MLS listings
)
print(f"Number of properties: {len(properties)}")

# Export to csv
properties.to_csv(filename, index=False)
print(properties.head())
```

## Output
```plaintext
>>> properties.head()
    MLS       MLS #  Status          Style  ...     COEDate LotSFApx PrcSqft Stories
0  SDCA   230018348    SOLD         CONDOS  ...  2023-10-03   290110     803       2
1  SDCA   230016614    SOLD      TOWNHOMES  ...  2023-10-03     None     838       3
2  SDCA   230016367    SOLD         CONDOS  ...  2023-10-03    30056     649       1
3  MRCA  NDP2306335    SOLD  SINGLE_FAMILY  ...  2023-10-03     7519     661       2
4  SDCA   230014532    SOLD         CONDOS  ...  2023-10-03     None     752       1
[5 rows x 22 columns]
```
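
Because `scrape_property` returns a Pandas DataFrame, the usual DataFrame operations apply. Below is a minimal sketch; the column name `sold_price` is borrowed from the schema further down and may differ between versions, and Excel export assumes an engine such as `openpyxl` is installed:

```py
from homeharvest import scrape_property

properties = scrape_property(location="San Diego, CA", listing_type="sold", property_younger_than=30)

# Inspect the columns returned by your installed version before relying on any of them
print(list(properties.columns))

# Keep rows with a sold price and sort them, assuming a 'sold_price' column exists
if "sold_price" in properties.columns:
    recent_sales = properties.dropna(subset=["sold_price"]).sort_values("sold_price", ascending=False)
    print(recent_sales.head())

# Excel export (requires an engine such as openpyxl: pip install openpyxl)
properties.to_excel("HomeHarvest.xlsx", index=False)
```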

### Parameters for `scrape_property()`
```plaintext
Required
├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc.
└── listing_type (option): Choose the type of listing.
    - 'for_rent'
    - 'for_sale'
    - 'sold'

Optional
├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
│    Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
├── property_younger_than (integer): Number of past days to filter properties. Utilizes 'last_sold_date' for 'sold' listing types, and 'list_date' for others (for_rent, for_sale).
│    Example: 30 (fetches properties listed/sold in the last 30 days)
├── mls_only (True/False): If set, fetches only MLS listings (mainly applicable to 'sold' listings)
└── proxy (string): In format 'http://user:pass@host:port'
```
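
The radius option is handy for pulling comparables around one address. A short sketch using only the parameters listed above (the address is borrowed from the bundled demo notebook; adjust the values to taste):

```py
from homeharvest import scrape_property

# Sold comparables around a single address; radius is ignored for broader
# locations such as a city or zip code (see the parameter notes above)
comps = scrape_property(
    location="2530 Al Lipscomb Way",
    listing_type="sold",
    radius=0.5,                  # comparables within half a mile
    property_younger_than=180,   # sold within the last 180 days
)
print(f"Number of comparables: {len(comps)}")
```
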
### Property Schema
```plaintext
Property
├── Basic Information:
│ ├── property_url
│ ├── mls
│ ├── mls_id
│ └── status
├── Address Details:
│ ├── street
│ ├── unit
│ ├── city
│ ├── state
│ └── zip_code
├── Property Description:
│ ├── style
│ ├── beds
│ ├── full_baths
│ ├── half_baths
│ ├── sqft
│ ├── year_built
│ ├── stories
│ └── lot_sqft
├── Property Listing Details:
│ ├── list_price
│ ├── list_date
│ ├── sold_price
│ ├── last_sold_date
│ ├── price_per_sqft
│ └── hoa_fee
├── Location Details:
│ ├── latitude
│ └── longitude
└── Parking Details:
    └── parking_garage
```
## Supported Countries for Property Scraping

* **Realtor.com**: mainly from the **US** but also has international listings

### Exceptions
The following exceptions may be raised when using HomeHarvest:

- `InvalidListingType` - valid options: `for_sale`, `for_rent`, `sold`
- `NoResultsFound` - no properties found from your search

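These can be caught like any other Python exception. A brief sketch follows; the import path `homeharvest.exceptions` is an assumption, so check where the classes live in your installed version:

```py
from homeharvest import scrape_property
# Assumed import path for the exception classes; adjust if your version differs
from homeharvest.exceptions import InvalidListingType, NoResultsFound

try:
    properties = scrape_property(location="Tempe, AZ", listing_type="for_rent")
except InvalidListingType:
    print("listing_type must be one of: for_sale, for_rent, sold")
except NoResultsFound:
    print("No properties matched this search - try broadening the location or filters")
```
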
## Frequently Asked Questions

---

**Q: Encountering issues with your searches?**
**A:** Try to broaden the parameters you're using. If problems persist, [submit an issue](https://github.com/ZacharyHampton/HomeHarvest/issues).

---

**Q: Received a Forbidden 403 response code?**
**A:** This indicates that you have been blocked by Realtor.com for sending too many requests. We recommend:

- Waiting a few seconds between requests.
- Trying a VPN or using a proxy as a parameter to `scrape_property()` to change your IP address, as sketched below.
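
A minimal sketch of the proxy route, using the format given in the parameters section (the credentials and host below are placeholders):

```py
from homeharvest import scrape_property

properties = scrape_property(
    location="San Francisco, CA",
    listing_type="for_rent",
    proxy="http://user:pass@host:port",  # placeholder - substitute your own proxy
)
```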

---

5 changes: 1 addition & 4 deletions HomeHarvest_Demo.ipynb → examples/HomeHarvest_Demo.ipynb
@@ -31,7 +31,7 @@
"metadata": {},
"outputs": [],
"source": [
"# scrapes all 3 sites by default\n",
"# check for sale properties\n",
"scrape_property(\n",
" location=\"dallas\",\n",
" listing_type=\"for_sale\"\n",
@@ -53,7 +53,6 @@
"# search a specific address\n",
"scrape_property(\n",
" location=\"2530 Al Lipscomb Way\",\n",
" site_name=\"zillow\",\n",
" listing_type=\"for_sale\"\n",
")"
]
@@ -68,7 +67,6 @@
"# check rentals\n",
"scrape_property(\n",
" location=\"chicago, illinois\",\n",
" site_name=[\"redfin\", \"zillow\"],\n",
" listing_type=\"for_rent\"\n",
")"
]
@@ -88,7 +86,6 @@
"# check sold properties\n",
"scrape_property(\n",
" location=\"90210\",\n",
" site_name=[\"redfin\"],\n",
" listing_type=\"sold\"\n",
")"
]
18 changes: 18 additions & 0 deletions examples/HomeHarvest_Demo.py
@@ -0,0 +1,18 @@
from homeharvest import scrape_property
from datetime import datetime

# Generate filename based on current timestamp
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"output/{current_timestamp}.csv"

properties = scrape_property(
location="San Diego, CA",
listing_type="sold", # for_sale, for_rent
property_younger_than=30, # sold/listed in last 30 days
mls_only=True, # only fetch MLS listings
)
print(f"Number of properties: {len(properties)}")

# Export to csv
properties.to_csv(filename, index=False)
print(properties.head())
