add parquet output as well (potential support for s3-style storage?)

johnseekins · johnseekins · commit 36ced9c314b0 · 2025-09-17T07:22:20.000-06:00
Signed-off-by: John Seekins <john@robot-house.us>
diff --git a/file_utils.py b/file_utils.py
@@ -18,16 +18,19 @@ def export_to_file(
         return ""
 
     full_name = f"{filename}.{file_type}"
-    if file_type in ["csv", "xlsx"]:
+    if file_type in ["csv", "xlsx", "parquet"]:
         writer = convert_to_dataframe(facilities_data["facilities"])
-        if file_type == "xlsx":
-            with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
-                writer.write_excel(workbook=wb, include_header=True, autofit=True)
-        elif file_type == "csv":
-            with open(full_name, "w", newline="", encoding="utf-8") as f_out:
-                writer.write_csv(file=f_out, include_header=True)
+        match file_type:
+            case "xlsx":
+                with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
+                    writer.write_excel(workbook=wb, include_header=True, autofit=True)
+            case "csv":
+                with open(full_name, "w", newline="", encoding="utf-8") as f_out:
+                    writer.write_csv(file=f_out, include_header=True)
+            case "parquet":
+                writer.write_parquet(full_name, use_pyarrow=True)
     elif file_type == "json":
-        with open(full_name, "w", newline="", encoding="utf-8") as f_out:
+        with open(full_name, "w", encoding="utf-8") as f_out:
             json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)
 
     logger.info(
diff --git a/main.py b/main.py
@@ -24,6 +24,7 @@
 from file_utils import export_to_file, print_summary
 import default_data
 from enricher import ExternalDataEnricher
+from schemas import supported_output_types
 from scraper import ICEGovFacilityScraper
 from utils import logger
 # CLI, argument parsing, script orchestration
@@ -58,7 +59,7 @@ def main() -> None:
     parser.add_argument(
         "--file-type",
         default="csv",
-        choices=["csv", "json", "xlsx"],
+        choices=supported_output_types,
         help="type of file to export",
     )
     parser.add_argument(
diff --git a/schemas.py b/schemas.py
@@ -106,3 +106,5 @@
 }
 
 default_field_office = "(Possibly) Not managed by DHS field office"
+
+supported_output_types = ["csv", "json", "xlsx", "parquet"]
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -106,3 +106,5 @@`
`106`	`106`	`}`
`107`	`107`
`108`	`108`	`default_field_office = "(Possibly) Not managed by DHS field office"`
	`109`	`+`
	`110`	`+supported_output_types = ["csv", "json", "xlsx", "parquet"]`