Skip to content

Commit

Permalink
Merge pull request #3 from ZacharyHampton/all_3_sites
Browse files Browse the repository at this point in the history
Check dups with city, street_address, unit
  • Loading branch information
ZacharyHampton committed Sep 18, 2023
2 parents 94e5b09 + 086bcfd commit d5b4d80
Show file tree
Hide file tree
Showing 8 changed files with 68 additions and 35 deletions.
4 changes: 2 additions & 2 deletions homeharvest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,5 +158,5 @@ def scrape_property(
return pd.DataFrame()

final_df = pd.concat(results, ignore_index=True)
final_df = final_df.drop_duplicates(subset="street_address", keep="first")
return final_df
final_df = final_df.drop_duplicates(subset=["street_address", "city", "unit"], keep="first")
return final_df
12 changes: 7 additions & 5 deletions homeharvest/core/scrapers/realtor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .. import Scraper
from typing import Any, Generator
from ....exceptions import NoResultsFound
from ....utils import parse_address_two
from ....utils import parse_address_two, parse_unit
from concurrent.futures import ThreadPoolExecutor, as_completed


Expand Down Expand Up @@ -108,8 +108,7 @@ def handle_address(self, property_id: str) -> list[Property]:
response_json = response.json()

property_info = response_json["data"]["property"]
street_address = property_info["address"]["line"]
unit = parse_address_two(street_address)
street_address, unit = parse_address_two(property_info["address"]["line"])

return [
Property(
Expand Down Expand Up @@ -234,13 +233,16 @@ def handle_area(
return []

for result in response_json["data"]["home_search"]["results"]:
street_address, unit = parse_address_two(
result["location"]["address"]["line"]
)
realty_property = Property(
address=Address(
street_address=result["location"]["address"]["line"],
street_address=street_address,
city=result["location"]["address"]["city"],
state=result["location"]["address"]["state_code"],
zip_code=result["location"]["address"]["postal_code"],
unit=parse_address_two(result["location"]["address"]["unit"]),
unit=parse_unit(result["location"]["address"]["unit"]),
country="USA",
),
site_name=self.site_name,
Expand Down
40 changes: 23 additions & 17 deletions homeharvest/core/scrapers/redfin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
from typing import Any
from .. import Scraper
from ....utils import parse_address_two
from ....utils import parse_address_two, parse_unit
from ..models import Property, Address, PropertyType


Expand Down Expand Up @@ -39,9 +39,10 @@ def get_value(key: str) -> Any | None:
return home[key]["value"]

if not single_search:
unit = parse_address_two(get_value("streetLine"))
street_address, unit = parse_address_two(get_value("streetLine"))
unit = parse_unit(get_value("streetLine"))
address = Address(
street_address=get_value("streetLine"),
street_address=street_address,
city=home["city"],
state=home["state"],
zip_code=home["zip"],
Expand All @@ -50,10 +51,11 @@ def get_value(key: str) -> Any | None:
)
else:
address_info = home["streetAddress"]
street_address, unit = parse_address_two(address_info["assembledAddress"])
unit = parse_address_two(address_info["assembledAddress"])

address = Address(
street_address=address_info["assembledAddress"],
street_address=street_address,
city=home["city"],
state=home["state"],
zip_code=home["zip"],
Expand Down Expand Up @@ -94,26 +96,30 @@ def get_value(key: str) -> Any | None:
)

def _parse_building(self, building: dict) -> Property:
street_address = " ".join(
[
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
)
street_address, unit = parse_address_two(street_address)
return Property(
site_name=self.site_name,
property_type=PropertyType("BUILDING"),
address=Address(
street_address=" ".join(
[
building["address"]["streetNumber"],
building["address"]["directionalPrefix"],
building["address"]["streetName"],
building["address"]["streetType"],
]
),
street_address=street_address,
city=building["address"]["city"],
state=building["address"]["stateOrProvinceCode"],
zip_code=building["address"]["postalCode"],
unit=" ".join(
[
building["address"]["unitType"],
building["address"]["unitValue"],
]
unit=parse_unit(
" ".join(
[
building["address"]["unitType"],
building["address"]["unitValue"],
]
)
),
),
property_url="https://www.redfin.com{}".format(building["url"]),
Expand Down
14 changes: 8 additions & 6 deletions homeharvest/core/scrapers/zillow/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
import json
from .. import Scraper
from ....utils import parse_address_two
from ....utils import parse_address_two, parse_unit
from ....exceptions import NoResultsFound, PropertyNotFound
from ..models import Property, Address, ListingType, PropertyType, SiteName

Expand Down Expand Up @@ -129,8 +129,8 @@ def _parse_properties(self, property_data: dict):
if "hdpData" in result:
home_info = result["hdpData"]["homeInfo"]
address_data = {
"street_address": home_info["streetAddress"],
"unit": parse_address_two(home_info["unit"])
"street_address": parse_address_two(home_info["streetAddress"])[0],
"unit": parse_unit(home_info["unit"])
if "unit" in home_info
else None,
"city": home_info["city"],
Expand Down Expand Up @@ -225,9 +225,10 @@ def _get_single_property_page(self, property_data: dict):
else property_data["hdpUrl"]
)
address_data = property_data["address"]
street_address, unit = parse_address_two(address_data["streetAddress"])
address = Address(
street_address=address_data["streetAddress"],
unit=parse_address_two(address_data["streetAddress"]),
street_address=street_address,
unit=unit,
city=address_data["city"],
state=address_data["state"],
zip_code=address_data["zipcode"],
Expand Down Expand Up @@ -286,10 +287,11 @@ def _extract_address(self, address_str):
else:
raise ValueError(f"Unexpected state/zip format in address: {address_str}")

street_address, unit = parse_address_two(street_address)
return Address(
street_address=street_address,
city=city,
unit=parse_address_two(street_address),
unit=unit,
state=state,
zip_code=zip_code,
country="USA",
Expand Down
27 changes: 25 additions & 2 deletions homeharvest/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
import re


def parse_address_two(street_address: str):
def parse_address_two(street_address: str) -> tuple:
    """Split a trailing unit designator off a street address.

    A unit is recognised only at the very end of the string, written either
    as ``#<value>`` or as one of the keywords APT / UNIT / LOT / SUITE
    (case-insensitive) followed by an alphanumeric value.

    :param street_address: full street line, e.g. ``"810 E Colter St APT 32"``.
        May be empty or ``None``.
    :return: ``(main_address, unit)``. ``unit`` is normalised to ``"#<value>"``
        (e.g. ``"#32"``) or is ``None`` when no unit is found, in which case
        ``main_address`` is the input unchanged.
    """
    if not street_address:
        return street_address, None

    # ``\b`` keeps keywords from matching inside ordinary words, so that
    # "123 Charlotte" is not read as "LOT te". The value is captured on its
    # own so the designator never has to be stripped out afterwards.
    apt_match = re.search(
        r"(?:\b(?:APT|UNIT|LOT|SUITE)\s*|#)([\dA-Z]+)$",
        street_address,
        re.I,
    )
    if apt_match is None:
        return street_address, None

    # Cut at the match position instead of str.replace(): replace() would
    # also delete earlier occurrences of the same text in the address.
    main_address = street_address[: apt_match.start()].strip()
    return main_address, f"#{apt_match.group(1)}"


def parse_unit(street_address: str):
if not street_address:
return None
apt_match = re.search(
Expand All @@ -19,7 +41,8 @@ def parse_address_two(street_address: str):


if __name__ == "__main__":
print(parse_address_two("810 E Colter St APT 32"))
print(parse_address_two("4303 E Cactus Rd Apt 126"))
print(parse_address_two("1234 Elm Street apt 2B"))
print(parse_address_two("1234 Elm Street UNIT 3A"))
print(parse_address_two("1234 Elm Street unit 3A"))
print(parse_address_two("1234 Elm Street SuIte 3A"))
2 changes: 1 addition & 1 deletion tests/test_realtor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def test_realtor():
listing_type="for_sale",
),
scrape_property(
location="Phoenix, AZ", site_name="realtor.com", listing_type="for_rent"
location="Phoenix, AZ", site_name=["realtor.com"], listing_type="for_rent"
), #: does not support "city, state, USA" format
scrape_property(
location="Dallas, TX", site_name="realtor.com", listing_type="sold"
Expand Down
2 changes: 1 addition & 1 deletion tests/test_redfin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def test_redfin():
location="2530 Al Lipscomb Way", site_name="redfin", listing_type="for_sale"
),
scrape_property(
location="Phoenix, AZ, USA", site_name="redfin", listing_type="for_rent"
location="Phoenix, AZ, USA", site_name=["redfin"], listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="redfin", listing_type="sold"
Expand Down
2 changes: 1 addition & 1 deletion tests/test_zillow.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def test_zillow():
location="2530 Al Lipscomb Way", site_name="zillow", listing_type="for_sale"
),
scrape_property(
location="Phoenix, AZ, USA", site_name="zillow", listing_type="for_rent"
location="Phoenix, AZ, USA", site_name=["zillow"], listing_type="for_rent"
),
scrape_property(
location="Dallas, TX, USA", site_name="zillow", listing_type="sold"
Expand Down

0 comments on commit d5b4d80

Please sign in to comment.