
Commit 24d7a01

clean up filtered keys
Signed-off-by: John Seekins <[email protected]>
2 parents: 7b63df1 + 753e554

File tree

7 files changed: +75 -36 lines changed


file_utils.py

Lines changed: 17 additions & 28 deletions
@@ -1,11 +1,11 @@
 import copy
-import csv
 import json
 from schemas import enrichment_print_schema
 from utils import (
-    _flatdict,
+    convert_to_dataframe,
     logger,
 )
+import xlsxwriter  # type: ignore [import-untyped]


 def export_to_file(
@@ -18,30 +18,20 @@ def export_to_file(
         return ""

     full_name = f"{filename}.{file_type}"
-    csv_filtered_keys = [
-        "_repaired_record",
-        "raw_scrape",
-        "wikipedia.search_query",
-        "wikidata.search_query",
-        "osm.search_query",
-        "source_urls",
-    ]
-    try:
-        with open(full_name, "w", newline="", encoding="utf-8") as f_out:
-            if file_type == "csv":
-                flatdata = [_flatdict(f) for _, f in facilities_data["facilities"].items()]
-                fieldnames = [k for k in flatdata[0].keys() if k not in csv_filtered_keys]
-
-                writer = csv.DictWriter(f_out, fieldnames=fieldnames)
-                writer.writeheader()
-                for facility in flatdata:
-                    row_data = {field: facility.get(field, None) for field in fieldnames}
-                    writer.writerow(row_data)
-            elif file_type == "json":
-                json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)
-    except Exception as e:
-        logger.error("Error writing %s file: %s", file_type, e)
-        return ""
+    if file_type in ["csv", "xlsx", "parquet"]:
+        writer = convert_to_dataframe(facilities_data["facilities"])
+        match file_type:
+            case "xlsx":
+                with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
+                    writer.write_excel(workbook=wb, include_header=True, autofit=True)
+            case "csv":
+                with open(full_name, "w", newline="", encoding="utf-8") as f_out:
+                    writer.write_csv(file=f_out, include_header=True)
+            case "parquet":
+                writer.write_parquet(full_name, use_pyarrow=True)
+    elif file_type == "json":
+        with open(full_name, "w", encoding="utf-8") as f_out:
+            json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)

     logger.info(
         "%s file '%s.%s' created successfully with %s facilities.",
@@ -68,8 +58,7 @@ def print_summary(facilities_data: dict) -> None:
     # Count by field office
     field_offices: dict = {}
     for facility_id, facility in facilities_data["facilities"].items():
-        office = facility.get("field_office", "Unknown")
-        field_offices[office] = field_offices.get(office, 0) + 1
+        field_offices[facility["field_office"]] = field_offices.get(facility["field_office"], 0) + 1

     logger.info("\nFacilities by Field Office:")
     for office, count in sorted(field_offices.items(), key=lambda x: x[1], reverse=True):
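For reference, the new dataframe-backed dispatch can be exercised on its own. A minimal sketch, with invented sample data and output paths, but the same polars/xlsxwriter calls the diff introduces:

import polars
import xlsxwriter  # type: ignore [import-untyped]

# invented sample row; the real code builds the frame via convert_to_dataframe()
df = polars.from_dicts([{"name": "Facility A", "field_office": "Denver"}])

file_type = "xlsx"  # one of "csv", "xlsx", "parquet"
match file_type:
    case "xlsx":
        # remove_timezone strips tz info, which xlsx cells cannot store
        with xlsxwriter.Workbook("facilities.xlsx", {"remove_timezone": True}) as wb:
            df.write_excel(workbook=wb, include_header=True, autofit=True)
    case "csv":
        with open("facilities.csv", "w", newline="", encoding="utf-8") as f_out:
            df.write_csv(file=f_out, include_header=True)
    case "parquet":
        df.write_parquet("facilities.parquet", use_pyarrow=True)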

main.py

Lines changed: 2 additions & 1 deletion
@@ -24,6 +24,7 @@
 from file_utils import export_to_file, print_summary
 import default_data
 from enricher import ExternalDataEnricher
+from schemas import supported_output_types
 from scraper import ICEGovFacilityScraper
 from utils import logger
 # CLI, argument parsing, script orchestration
@@ -58,7 +59,7 @@ def main() -> None:
     parser.add_argument(
         "--file-type",
         default="csv",
-        choices=["csv", "json"],
+        choices=supported_output_types,
         help="type of file to export",
     )
     parser.add_argument(
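The --file-type flag now takes its choices from schemas.supported_output_types, so adding an output format is a one-line schema change. A minimal standalone sketch of the wiring:

import argparse

supported_output_types = ["csv", "json", "xlsx", "parquet"]  # from schemas.py

parser = argparse.ArgumentParser()
parser.add_argument(
    "--file-type",
    default="csv",
    choices=supported_output_types,
    help="type of file to export",
)
args = parser.parse_args(["--file-type", "parquet"])
print(args.file_type)  # -> parquet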

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ dependencies = [
     "polars>=1.33.0",
     "pyarrow>=21.0.0",
     "requests>=2.32.5",
+    "xlsxwriter>=3.2.5",
 ]

 [dependency-groups]

schemas.py

Lines changed: 11 additions & 0 deletions
@@ -22,6 +22,7 @@
         "postal_code": "",
         "street": "",
     },
+    "address_str": "",
     "_repaired_record": False,
     "field_office": "",
     "image_url": "",
@@ -91,10 +92,18 @@
         "expanded_name": "United States Marshal Service Intergovernmental Agreement",
         "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
     },
+    "USMS IGA": {
+        "expanded_name": "United States Marshal Service Intergovernmental Agreement",
+        "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
+    },
     "USMS CDF": {
         "expanded_name": "United States Marshal Service Central Detention Facility",
         "description": "Name guessed at from searching",
     },
+    "CDF": {
+        "expanded_name": "Central Detention Facility",
+        "description": "Name guessed at from searching",
+    },
 }

 # enrichment response object
@@ -115,3 +124,5 @@
 }

 default_field_office = "(Possibly) Not managed by DHS field office"
+
+supported_output_types = ["csv", "json", "xlsx", "parquet"]
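The new "USMS IGA" and "CDF" entries make the abbreviation map tolerant of both spellings seen in scraped data. A minimal sketch of the lookup; the mapping's variable name is not shown in this hunk, so facility_agreement_types below is hypothetical:

facility_agreement_types = {  # hypothetical name; the real dict lives in schemas.py
    "USMS IGA": {
        "expanded_name": "United States Marshal Service Intergovernmental Agreement",
    },
    "CDF": {
        "expanded_name": "Central Detention Facility",
    },
}

raw = "USMS IGA"  # value as scraped
# fall back to the raw value when the abbreviation is unknown
expanded = facility_agreement_types.get(raw, {}).get("expanded_name", raw)
print(expanded)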

scraper.py

Lines changed: 3 additions & 0 deletions
@@ -281,6 +281,7 @@ def _load_sheet(self) -> dict:
             details["avg_stay_length"] = row["FY25 ALOS"]
             details["inspection_date"] = row["Last Inspection End Date"]
             details["source_urls"].append(self.sheet_url)
+            details["address_str"] = full_address
             details["field_office"] = default_field_office
             results[full_address] = details
         return results
@@ -341,6 +342,8 @@ def scrape_facilities(self):
             addr["locality"] = locality
             facility["_repaired_record"] = True
             full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper()
+            if not facility["address_str"]:
+                facility["address_str"] = full_address
             if full_address in self.facilities_data["facilities"].keys():
                 self.facilities_data["facilities"][full_address] = self._update_facility(
                     self.facilities_data["facilities"][full_address], facility
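The intent of the guarded assignment is that an address_str set by the spreadsheet loader wins over the address reconstructed during scraping. A tiny sketch of that precedence, with invented values:

facility = {"address_str": ""}  # default from schemas.py

sheet_address = "123 MAIN ST,DENVER,CO,80202"  # set by _load_sheet when present
if sheet_address:
    facility["address_str"] = sheet_address

full_address = "123 MAIN STREET,DENVER,CO,80202"  # rebuilt while scraping
if not facility["address_str"]:  # only fill the gap, never overwrite
    facility["address_str"] = full_address

print(facility["address_str"])  # -> 123 MAIN ST,DENVER,CO,80202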

utils.py

Lines changed: 24 additions & 1 deletion
@@ -1,5 +1,6 @@
 # For general helpers, regexes, or shared logic (e.g. phone/address parsing functions).
 import logging
+import polars
 import requests
 from requests.adapters import HTTPAdapter
 import urllib3
@@ -53,9 +54,20 @@
     "Last Final Rating",
 ]

+# all values that will only complicate workbook output types
+flatdata_filtered_keys = [
+    "_repaired_record",
+    "address_str",
+    "osm.search_query",
+    "raw_scrape",
+    "source_urls",
+    "wikipedia.search_query",
+    "wikidata.search_query",
+]
+

 def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
-    """flatten a nested dictionary for nicer printing in CSV"""
+    """flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.)"""
     items: list = []
     for k, v in d.items():
         new_key = parent_key + sep + str(k) if parent_key else str(k)
@@ -64,3 +76,14 @@ def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
         else:
             items.append((new_key, v))
     return dict(items)
+
+
+def convert_to_dataframe(d: dict) -> polars.DataFrame:
+    """facilities internal dict to dataframe"""
+    flatdata = [_flatdict(f) for f in d.values()]
+    fieldnames = [k for k in flatdata[0].keys() if k not in flatdata_filtered_keys]
+    # https://docs.pola.rs/api/python/stable/reference/api/polars.from_dicts.html
+    df = polars.from_dicts(flatdata, schema=fieldnames)
+    logger.debug("Dataframe: %s", df)
+    logger.debug("All header fields: %s", fieldnames)
+    return df
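A quick demonstration of the two helpers together, assuming the repo is on the import path (the sample facility is invented):

from utils import _flatdict, convert_to_dataframe

facilities = {
    "123 MAIN ST,DENVER,CO,80202": {
        "name": "Facility A",
        "address": {"locality": "Denver", "postal_code": "80202"},
        "_repaired_record": False,  # in flatdata_filtered_keys, so dropped below
    },
}

print(_flatdict(facilities["123 MAIN ST,DENVER,CO,80202"]))
# {'name': 'Facility A', 'address.locality': 'Denver',
#  'address.postal_code': '80202', '_repaired_record': False}

df = convert_to_dataframe(facilities)
print(df.columns)
# ['name', 'address.locality', 'address.postal_code']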

uv.lock

Lines changed: 17 additions & 6 deletions
Some generated files are not rendered by default.
