Skip to content

Commit

Permalink
reactor(redfin)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Sep 18, 2023
2 parents 471e531 + 10c01f3 commit ffd3ce6
Show file tree
Hide file tree
Showing 9 changed files with 392 additions and 127 deletions.
51 changes: 29 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,41 @@

**HomeHarvest** aims to be the top Python real estate scraping library.

## RoadMap
_**Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience._

- **Supported Sites**: Currently, we support scraping from sites such as `Zillow` and `RedFin`.
- **Output**: Provides the option to return the scraped data as a Pandas dataframe.
- **Under Consideration**: We're looking into the possibility of an Excel plugin to cater to a broader audience.
[![Try with Replit](https://replit.com/badge?caption=Try%20with%20Replit)](https://replit.com/@ZacharyHampton/HomeHarvestDemo)

## Site Name Options

- `zillow`
- `redfin`

## Listing Types

- `for_rent`
- `for_sale`

### Installation
## Installation

```bash
pip install --upgrade homeharvest
```

### Example Usage
## Example Usage
```py
>>> from homeharvest import scrape_property
>>> properties = scrape_property(
... location="85281", site_name="zillow", listing_type="for_rent"
... )

>>> properties.head()
address_one city ... mls_id description
0 420 N Scottsdale Rd Tempe ... NaN NaN
1 1255 E University Dr Tempe ... NaN NaN
2 1979 E Rio Salado Pkwy Tempe ... NaN NaN
3 548 S Wilson St Tempe ... None None
4 945 E Playa Del Norte Dr Unit 4027 Tempe ... NaN NaN
[5 rows x 23 columns]
```
from homeharvest import scrape_property

properties = scrape_property(
location="85281", site_name="zillow", listing_type="for_rent"
)
print(properties)
```
### Site Name Options

- `zillow`
- `redfin`
- `realtor.com`

### Listing Types

- `for_rent`
- `for_sale`
- `sold`
12 changes: 11 additions & 1 deletion homeharvest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,17 @@ def scrape_property(
location: str,
site_name: str,
listing_type: str = "for_sale", #: for_sale, for_rent, sold
) -> list[Property]:
) -> pd.DataFrame:
"""
Scrape property from various sites from a given location and listing type.
:returns: pd.DataFrame
:param location: US Location (e.g. 'San Francisco, CA', 'Cook County, IL', '85281', '2530 Al Lipscomb Way')
:param site_name: Site name (e.g. 'realtor.com', 'zillow', 'redfin')
:param listing_type: Listing type (e.g. 'for_sale', 'for_rent', 'sold')
:return: pd.DataFrame containing properties
"""

validate_input(site_name, listing_type)

scraper_input = ScraperInput(
Expand Down
2 changes: 2 additions & 0 deletions homeharvest/core/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ class ScraperInput:
class Scraper:
def __init__(self, scraper_input: ScraperInput):
self.location = scraper_input.location
self.listing_type = scraper_input.listing_type

self.session = requests.Session()
self.listing_type = scraper_input.listing_type
self.site_name = scraper_input.site_name
Expand Down
1 change: 1 addition & 0 deletions homeharvest/core/scrapers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class Address:
country: str | None = None



@dataclass
class Property:
property_url: str
Expand Down
233 changes: 225 additions & 8 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import json
from ..models import Property, Address
from .. import Scraper
from typing import Any
from typing import Any, Generator
from ....exceptions import NoResultsFound
from concurrent.futures import ThreadPoolExecutor, as_completed


class RealtorScraper(Scraper):
def __init__(self, scraper_input) -> None:
    """
    Initialise the base scraper state and record the realtor.com search endpoint.

    :param scraper_input: forwarded unchanged to ``Scraper.__init__``
    """
    super().__init__(scraper_input)
    # Endpoint the GraphQL payloads of handle_address / handle_area are POSTed to.
    self.search_url = "https://www.realtor.com/api/v1/rdc_search_srp?client_id=rdc-search-new-communities&schema=vesta"

def handle_location(self):
headers = {
Expand All @@ -26,7 +29,7 @@ def handle_location(self):

params = {
"input": self.location,
"client_id": "for-sale",
"client_id": self.listing_type.value.replace('_', '-'),
"limit": "1",
"area_types": "city,state,county,postal_code,address,street,neighborhood,school,school_district,university,park",
}
Expand All @@ -38,14 +41,228 @@ def handle_location(self):
)
response_json = response.json()

return response_json["autocomplete"][0]
result = response_json["autocomplete"]

if result is None:
raise NoResultsFound("No results found for location: " + self.location)

return result[0]

def handle_address(self, property_id: str) -> list[Property]:
    """
    Look up a single property by id and convert it to a ``Property``.

    Posts a GraphQL ``Property`` query to ``self.search_url`` with
    ``property_id`` as the only variable and maps the ``data.property``
    object of the JSON response onto the model.

    :param property_id: value sent as the ``$property_id`` query variable;
        also stored as the resulting property's ``mls_id``
    :return: a one-element list containing the property
    :raises NoResultsFound: if the response holds no property for the id
    """
    query = """query Property($property_id: ID!) {
property(id: $property_id) {
property_id
details {
date_updated
garage
permalink
year_built
stories
}
address {
address_validation_code
city
country
county
line
postal_code
state_code
street_direction
street_name
street_number
street_suffix
street_post_direction
unit_value
unit
unit_descriptor
zip
}
basic {
baths
beds
price
sqft
lot_sqft
type
sold_price
}
public_record {
lot_size
sqft
stories
units
year_built
}
}
}"""

    payload = {
        'query': query,
        'variables': {'property_id': property_id},
    }

    response = self.session.post(self.search_url, json=payload)
    property_info = response.json()['data']['property']

    # A null ``property`` means the id matched nothing; report it the same
    # way handle_location reports an unknown location instead of failing
    # later with an opaque TypeError.
    if property_info is None:
        raise NoResultsFound("No results found for property id: " + str(property_id))

    address = property_info['address']
    basic = property_info['basic']
    details = property_info['details']
    public_record = property_info['public_record']

    price = basic['price']
    sqft = basic['sqft']
    # Truthiness check on sqft (not just ``is not None``) so a reported
    # square footage of 0 yields None rather than a ZeroDivisionError.
    if price is not None and sqft:
        price_per_square_foot = price / sqft
    else:
        price_per_square_foot = None

    return [Property(
        site_name=self.site_name,
        address=Address(
            address_one=address['line'],
            city=address['city'],
            state=address['state_code'],
            zip_code=address['postal_code'],
        ),
        # NOTE(review): keyword is ``url`` here - confirm it matches the
        # field name declared on the Property dataclass.
        url="https://www.realtor.com/realestateandhomes-detail/" + details['permalink'],
        beds=basic['beds'],
        baths=basic['baths'],
        stories=details['stories'],
        year_built=details['year_built'],
        square_feet=sqft,
        price_per_square_foot=price_per_square_foot,
        price=price,
        mls_id=property_id,
        listing_type=self.listing_type,
        # public_record may be null in the response; fall back to no lot size.
        lot_size=public_record['lot_size'] if public_record is not None else None,
    )]

def handle_area(self, variables: dict, return_total: bool = False) -> list[Property] | int:
    """
    Run one page of the ``Home_search`` GraphQL query for an area.

    The query is posted to ``self.search_url``; its ``status`` filter is
    filled in from ``self.listing_type.value`` and each page is limited to
    200 results.

    :param variables: GraphQL variables (``city``, ``county``, ``state_code``,
        ``postal_code``, ``offset``) sent with the query
    :param return_total: when true, return only the ``total`` reported by
        the search instead of the page's properties
    :return: the total as an ``int`` if ``return_total`` is set, otherwise
        the list of ``Property`` objects built from the page's results;
        a null ``home_search`` / ``results`` / ``total`` in the response is
        treated as an empty search (``[]`` or ``0``)
    """
    query = """query Home_search(
$city: String,
$county: [String],
$state_code: String,
$postal_code: String
$offset: Int,
) {
home_search(
query: {
city: $city
county: $county
postal_code: $postal_code
state_code: $state_code
status: %s
}
limit: 200
offset: $offset
) {
count
total
results {
property_id
description {
baths
beds
lot_sqft
sqft
text
sold_price
stories
year_built
garage
unit_number
floor_number
}
location {
address {
city
country
line
postal_code
state_code
state
street_direction
street_name
street_number
street_post_direction
street_suffix
unit
}
}
list_price
price_per_sqft
source {
id
}
}
}
}""" % self.listing_type.value

    payload = {
        'query': query,
        'variables': variables,
    }

    response = self.session.post(self.search_url, json=payload)
    # Treat a null ``home_search`` as "nothing found" rather than letting
    # the subscripts below fail with a TypeError.
    home_search = response.json()['data']['home_search'] or {}

    if return_total:
        # Callers feed this into range(); never hand them None.
        return home_search.get('total') or 0

    properties: list[Property] = []

    for result in home_search.get('results') or []:
        address = result['location']['address']
        description = result['description']

        properties.append(Property(
            address=Address(
                address_one=address['line'],
                city=address['city'],
                state=address['state_code'],
                zip_code=address['postal_code'],
                address_two=address['unit'],
            ),
            site_name=self.site_name,
            url="https://www.realtor.com/realestateandhomes-detail/" + result['property_id'],
            beds=description['beds'],
            baths=description['baths'],
            stories=description['stories'],
            year_built=description['year_built'],
            square_feet=description['sqft'],
            price_per_square_foot=result['price_per_sqft'],
            price=result['list_price'],
            mls_id=result['property_id'],
            listing_type=self.listing_type,
            lot_size=description['lot_sqft'],
        ))

    return properties

def search(self) -> list[Property]:
    """
    Resolve ``self.location`` and return every matching property.

    The location is first resolved through ``handle_location``. An
    ``address`` match is fetched directly with ``handle_address``; any other
    area type is searched page by page with ``handle_area``, with the pages
    requested concurrently.

    :return: list of properties; for area searches the pages are
        concatenated in ascending offset order
    """
    location_info = self.handle_location()

    # An exact address resolves to one property id - no paging needed.
    if location_info["area_type"] == 'address':
        return self.handle_address(location_info['mpr_id'])

    # property types (not applied as a filter by this search):
    # apartment + building + commercial + condo_townhome + condo_townhome_rowhome_coop + condos + coop + duplex_triplex + farm + investment + land + mobile + multi_family + rental + single_family + townhomes

    search_variables = {
        'city': location_info.get('city'),
        'county': location_info.get('county'),
        'state_code': location_info.get('state_code'),
        'postal_code': location_info.get('postal_code'),
        'offset': 0,
    }

    # Guard against a null total so range() below always gets an int.
    total = self.handle_area(search_variables, return_total=True) or 0

    homes = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Step of 200 mirrors the ``limit: 200`` page size of the area query.
        futures = [
            executor.submit(
                self.handle_area, variables=search_variables | {'offset': page_offset}, return_total=False
            ) for page_offset in range(0, total, 200)
        ]

        # Collect in submission order (not completion order) so the result
        # keeps page order and is identical from run to run.
        for future in futures:
            homes.extend(future.result())

    return homes
Loading

0 comments on commit ffd3ce6

Please sign in to comment.