Commit
feat: keep duplicates flag
cullenwatson committed Sep 21, 2023
1 parent e9ddc6d commit 644f16b
Showing 2 changed files with 11 additions and 2 deletions.
4 changes: 3 additions & 1 deletion homeharvest/__init__.py
@@ -119,6 +119,7 @@ def scrape_property(
     site_name: Union[str, list[str]] = None,
     listing_type: str = "for_sale",
     proxy: str = None,
+    keep_duplicates: bool = False
 ) -> pd.DataFrame:
     """
     Scrape property from various sites from a given location and listing type.
@@ -165,5 +166,6 @@ def scrape_property(
         if col not in final_df.columns:
             final_df[col] = None
 
-    final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
+    if not keep_duplicates:
+        final_df = final_df.drop_duplicates(subset=columns_to_track, keep="first")
     return final_df
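
As a rough usage sketch of the new keyword (the location string, site choice, and positional argument order below are illustrative assumptions, not part of this commit):

    from homeharvest import scrape_property

    # Pass keep_duplicates=True to skip the address-based dedupe step guarded
    # in this commit; left at its default of False, duplicates are dropped as before.
    properties = scrape_property(
        "Dallas, TX",      # hypothetical location
        ["redfin"],        # hypothetical site_name value
        "for_sale",
        keep_duplicates=True,
    )
    print(len(properties))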
9 changes: 8 additions & 1 deletion homeharvest/cli.py
@@ -42,11 +42,18 @@ def main():
         help="Name of the output file (without extension)",
     )
 
+    parser.add_argument(
+        "-k",
+        "--keep_duplicates",
+        action="store_true",
+        help="Keep duplicate properties based on address"
+    )
+
     parser.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
 
     args = parser.parse_args()
 
-    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy)
+    result = scrape_property(args.location, args.site_name, args.listing_type, proxy=args.proxy, keep_duplicates=args.keep_duplicates)
 
     if not args.filename:
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
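
To see how the new store_true flag parses on its own, here is a minimal, self-contained sketch using a throwaway parser (only the option strings and help text come from the diff; everything else is for illustration):

    import argparse

    # Rebuild just the new option to show its behavior:
    # flag absent -> False, flag present (-k or --keep_duplicates) -> True.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-k",
        "--keep_duplicates",
        action="store_true",
        help="Keep duplicate properties based on address",
    )

    print(parser.parse_args([]).keep_duplicates)       # False
    print(parser.parse_args(["-k"]).keep_duplicates)   # True

The parsed default of False matches the keep_duplicates: bool = False default on scrape_property, so omitting the flag preserves the previous deduplication behavior.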
