
Commit 24d7a01

clean up filtered keys
Signed-off-by: John Seekins <[email protected]>
2 parents: 7b63df1 + 753e554

File tree

7 files changed: +75 -36 lines changed


file_utils.py

Lines changed: 17 additions & 28 deletions
@@ -1,11 +1,11 @@
 import copy
-import csv
 import json
 from schemas import enrichment_print_schema
 from utils import (
-    _flatdict,
+    convert_to_dataframe,
     logger,
 )
+import xlsxwriter  # type: ignore [import-untyped]


 def export_to_file(
@@ -18,30 +18,20 @@ def export_to_file(
         return ""

     full_name = f"{filename}.{file_type}"
-    csv_filtered_keys = [
-        "_repaired_record",
-        "raw_scrape",
-        "wikipedia.search_query",
-        "wikidata.search_query",
-        "osm.search_query",
-        "source_urls",
-    ]
-    try:
-        with open(full_name, "w", newline="", encoding="utf-8") as f_out:
-            if file_type == "csv":
-                flatdata = [_flatdict(f) for _, f in facilities_data["facilities"].items()]
-                fieldnames = [k for k in flatdata[0].keys() if k not in csv_filtered_keys]
-
-                writer = csv.DictWriter(f_out, fieldnames=fieldnames)
-                writer.writeheader()
-                for facility in flatdata:
-                    row_data = {field: facility.get(field, None) for field in fieldnames}
-                    writer.writerow(row_data)
-            elif file_type == "json":
-                json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)
-    except Exception as e:
-        logger.error("Error writing %s file: %s", file_type, e)
-        return ""
+    if file_type in ["csv", "xlsx", "parquet"]:
+        writer = convert_to_dataframe(facilities_data["facilities"])
+        match file_type:
+            case "xlsx":
+                with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
+                    writer.write_excel(workbook=wb, include_header=True, autofit=True)
+            case "csv":
+                with open(full_name, "w", newline="", encoding="utf-8") as f_out:
+                    writer.write_csv(file=f_out, include_header=True)
+            case "parquet":
+                writer.write_parquet(full_name, use_pyarrow=True)
+    elif file_type == "json":
+        with open(full_name, "w", encoding="utf-8") as f_out:
+            json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)

     logger.info(
         "%s file '%s.%s' created successfully with %s facilities.",
@@ -68,8 +58,7 @@ def print_summary(facilities_data: dict) -> None:
     # Count by field office
     field_offices: dict = {}
     for facility_id, facility in facilities_data["facilities"].items():
-        office = facility.get("field_office", "Unknown")
-        field_offices[office] = field_offices.get(office, 0) + 1
+        field_offices[facility["field_office"]] = field_offices.get(facility["field_office"], 0) + 1

     logger.info("\nFacilities by Field Office:")
     for office, count in sorted(field_offices.items(), key=lambda x: x[1], reverse=True):
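For reference, the new dataframe-backed dispatch can be exercised on its own. A minimal sketch, with invented sample data and output paths, but the same polars/xlsxwriter calls the diff introduces:

import polars
import xlsxwriter  # type: ignore [import-untyped]

# invented sample row; the real code builds the frame via convert_to_dataframe()
df = polars.from_dicts([{"name": "Facility A", "field_office": "Denver"}])

file_type = "xlsx"  # one of "csv", "xlsx", "parquet"
match file_type:
    case "xlsx":
        # remove_timezone strips tz info, which xlsx cells cannot store
        with xlsxwriter.Workbook("facilities.xlsx", {"remove_timezone": True}) as wb:
            df.write_excel(workbook=wb, include_header=True, autofit=True)
    case "csv":
        with open("facilities.csv", "w", newline="", encoding="utf-8") as f_out:
            df.write_csv(file=f_out, include_header=True)
    case "parquet":
        df.write_parquet("facilities.parquet", use_pyarrow=True)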

main.py

Lines changed: 2 additions & 1 deletion
@@ -24,6 +24,7 @@
 from file_utils import export_to_file, print_summary
 import default_data
 from enricher import ExternalDataEnricher
+from schemas import supported_output_types
 from scraper import ICEGovFacilityScraper
 from utils import logger
 # CLI, argument parsing, script orchestration
@@ -58,7 +59,7 @@ def main() -> None:
     parser.add_argument(
         "--file-type",
         default="csv",
-        choices=["csv", "json"],
+        choices=supported_output_types,
         help="type of file to export",
     )
     parser.add_argument(
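The --file-type flag now takes its choices from schemas.supported_output_types, so adding an output format is a one-line schema change. A minimal standalone sketch of the wiring:

import argparse

supported_output_types = ["csv", "json", "xlsx", "parquet"]  # from schemas.py

parser = argparse.ArgumentParser()
parser.add_argument(
    "--file-type",
    default="csv",
    choices=supported_output_types,
    help="type of file to export",
)
args = parser.parse_args(["--file-type", "parquet"])
print(args.file_type)  # -> parquet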

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ dependencies = [
     "polars>=1.33.0",
     "pyarrow>=21.0.0",
     "requests>=2.32.5",
+    "xlsxwriter>=3.2.5",
 ]

 [dependency-groups]

schemas.py

Lines changed: 11 additions & 0 deletions
@@ -22,6 +22,7 @@
         "postal_code": "",
         "street": "",
     },
+    "address_str": "",
     "_repaired_record": False,
     "field_office": "",
     "image_url": "",
@@ -91,10 +92,18 @@
         "expanded_name": "United States Marshal Service Intergovernmental Agreement",
         "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
     },
+    "USMS IGA": {
+        "expanded_name": "United States Marshal Service Intergovernmental Agreement",
+        "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
+    },
     "USMS CDF": {
         "expanded_name": "United States Marshal Service Central Detention Facility",
         "description": "Name guessed at from searching",
     },
+    "CDF": {
+        "expanded_name": "Central Detention Facility",
+        "description": "Name guessed at from searching",
+    },
 }

 # enrichment response object
@@ -115,3 +124,5 @@
 }

 default_field_office = "(Possibly) Not managed by DHS field office"
+
+supported_output_types = ["csv", "json", "xlsx", "parquet"]
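The new "USMS IGA" and "CDF" entries make the abbreviation map tolerant of both spellings seen in scraped data. A minimal sketch of the lookup; the mapping's variable name is not shown in this hunk, so facility_agreement_types below is hypothetical:

facility_agreement_types = {  # hypothetical name; the real dict lives in schemas.py
    "USMS IGA": {
        "expanded_name": "United States Marshal Service Intergovernmental Agreement",
    },
    "CDF": {
        "expanded_name": "Central Detention Facility",
    },
}

raw = "USMS IGA"  # value as scraped
# fall back to the raw value when the abbreviation is unknown
expanded = facility_agreement_types.get(raw, {}).get("expanded_name", raw)
print(expanded)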

scraper.py

Lines changed: 3 additions & 0 deletions
@@ -281,6 +281,7 @@ def _load_sheet(self) -> dict:
             details["avg_stay_length"] = row["FY25 ALOS"]
             details["inspection_date"] = row["Last Inspection End Date"]
             details["source_urls"].append(self.sheet_url)
+            details["address_str"] = full_address
             details["field_office"] = default_field_office
             results[full_address] = details
         return results
@@ -341,6 +342,8 @@ def scrape_facilities(self):
             addr["locality"] = locality
             facility["_repaired_record"] = True
             full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper()
+            if not facility["address_str"]:
+                facility["address_str"] = full_address
             if full_address in self.facilities_data["facilities"].keys():
                 self.facilities_data["facilities"][full_address] = self._update_facility(
                     self.facilities_data["facilities"][full_address], facility
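The intent of the guarded assignment is that an address_str set by the spreadsheet loader wins over the address reconstructed during scraping. A tiny sketch of that precedence, with invented values:

facility = {"address_str": ""}  # default from schemas.py

sheet_address = "123 MAIN ST,DENVER,CO,80202"  # set by _load_sheet when present
if sheet_address:
    facility["address_str"] = sheet_address

full_address = "123 MAIN STREET,DENVER,CO,80202"  # rebuilt while scraping
if not facility["address_str"]:  # only fill the gap, never overwrite
    facility["address_str"] = full_address

print(facility["address_str"])  # -> 123 MAIN ST,DENVER,CO,80202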

utils.py

Lines changed: 24 additions & 1 deletion
@@ -1,5 +1,6 @@
 # For general helpers, regexes, or shared logic (e.g. phone/address parsing functions).
 import logging
+import polars
 import requests
 from requests.adapters import HTTPAdapter
 import urllib3
@@ -53,9 +54,20 @@
     "Last Final Rating",
 ]

+# all values that will only complicate workbook output types
+flatdata_filtered_keys = [
+    "_repaired_record",
+    "address_str",
+    "osm.search_query",
+    "raw_scrape",
+    "source_urls",
+    "wikipedia.search_query",
+    "wikidata.search_query",
+]
+

 def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
-    """flatten a nested dictionary for nicer printing in CSV"""
+    """flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.)"""
     items: list = []
     for k, v in d.items():
         new_key = parent_key + sep + str(k) if parent_key else str(k)
@@ -64,3 +76,14 @@ def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
         else:
             items.append((new_key, v))
     return dict(items)
+
+
+def convert_to_dataframe(d: dict) -> polars.DataFrame:
+    """facilities internal dict to dataframe"""
+    flatdata = [_flatdict(f) for f in d.values()]
+    fieldnames = [k for k in flatdata[0].keys() if k not in flatdata_filtered_keys]
+    # https://docs.pola.rs/api/python/stable/reference/api/polars.from_dicts.html
+    df = polars.from_dicts(flatdata, schema=fieldnames)
+    logger.debug("Dataframe: %s", df)
+    logger.debug("All header fields: %s", fieldnames)
+    return df
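A quick demonstration of the two helpers together, assuming the repo is on the import path (the sample facility is invented):

from utils import _flatdict, convert_to_dataframe

facilities = {
    "123 MAIN ST,DENVER,CO,80202": {
        "name": "Facility A",
        "address": {"locality": "Denver", "postal_code": "80202"},
        "_repaired_record": False,  # in flatdata_filtered_keys, so dropped below
    },
}

print(_flatdict(facilities["123 MAIN ST,DENVER,CO,80202"]))
# {'name': 'Facility A', 'address.locality': 'Denver',
#  'address.postal_code': '80202', '_repaired_record': False}

df = convert_to_dataframe(facilities)
print(df.columns)
# ['name', 'address.locality', 'address.postal_code']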

uv.lock

Lines changed: 17 additions & 6 deletions
Some generated files are not rendered by default.
