
Commit 753e554

Merge pull request #23 from johnseekins/xlsx-output: support xlsx writing for excel outputs, and more dataframes in data structures.
2 parents 1197e65 + fa3b0db commit 753e554

File tree: 7 files changed, +73 -29 lines

file_utils.py

Lines changed: 17 additions & 21 deletions
@@ -1,11 +1,11 @@
 import copy
-import csv
 import json
 from schemas import enrichment_print_schema
 from utils import (
-    _flatdict,
+    convert_to_dataframe,
     logger,
 )
+import xlsxwriter  # type: ignore [import-untyped]


 def export_to_file(
@@ -18,23 +18,20 @@ def export_to_file(
         return ""

     full_name = f"{filename}.{file_type}"
-    csv_filtered_keys = ["raw_scrape", "wikipedia_search_query", "wikidata_search_query", "osm_search_query"]
-    try:
-        with open(full_name, "w", newline="", encoding="utf-8") as f_out:
-            if file_type == "csv":
-                flatdata = [_flatdict(f) for _, f in facilities_data["facilities"].items()]
-                fieldnames = [k for k in flatdata[0].keys() if k not in csv_filtered_keys]
-
-                writer = csv.DictWriter(f_out, fieldnames=fieldnames)
-                writer.writeheader()
-                for facility in flatdata:
-                    row_data = {field: facility.get(field, None) for field in fieldnames}
-                    writer.writerow(row_data)
-            elif file_type == "json":
-                json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)
-    except Exception as e:
-        logger.error("Error writing %s file: %s", file_type, e)
-        return ""
+    if file_type in ["csv", "xlsx", "parquet"]:
+        writer = convert_to_dataframe(facilities_data["facilities"])
+        match file_type:
+            case "xlsx":
+                with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
+                    writer.write_excel(workbook=wb, include_header=True, autofit=True)
+            case "csv":
+                with open(full_name, "w", newline="", encoding="utf-8") as f_out:
+                    writer.write_csv(file=f_out, include_header=True)
+            case "parquet":
+                writer.write_parquet(full_name, use_pyarrow=True)
+    elif file_type == "json":
+        with open(full_name, "w", encoding="utf-8") as f_out:
+            json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)

     logger.info(
         "%s file '%s.%s' created successfully with %s facilities.",
@@ -61,8 +58,7 @@ def print_summary(facilities_data: dict) -> None:
     # Count by field office
     field_offices: dict = {}
     for facility_id, facility in facilities_data["facilities"].items():
-        office = facility.get("field_office", "Unknown")
-        field_offices[office] = field_offices.get(office, 0) + 1
+        field_offices[facility["field_office"]] = field_offices.get(facility["field_office"], 0) + 1

     logger.info("\nFacilities by Field Office:")
     for office, count in sorted(field_offices.items(), key=lambda x: x[1], reverse=True):
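Reviewer note: for anyone unfamiliar with polars' writers, the following is a minimal, self-contained sketch of the pattern the new export path follows. The sample rows and output filenames are invented for illustration; in the real code the DataFrame comes from convert_to_dataframe (see utils.py below).

    import polars
    import xlsxwriter

    # invented two-row sample; real rows come from the facilities dict
    rows = [
        {"name": "Facility A", "field_office": "Denver", "population": 120},
        {"name": "Facility B", "field_office": "Phoenix", "population": 85},
    ]
    df = polars.from_dicts(rows)

    # xlsx: polars writes the sheet through an xlsxwriter Workbook;
    # remove_timezone strips tz info, since Excel cannot store tz-aware datetimes
    with xlsxwriter.Workbook("facilities.xlsx", {"remove_timezone": True}) as wb:
        df.write_excel(workbook=wb, include_header=True, autofit=True)

    # csv and parquet reuse the same DataFrame
    with open("facilities.csv", "w", newline="", encoding="utf-8") as f_out:
        df.write_csv(file=f_out, include_header=True)
    df.write_parquet("facilities.parquet", use_pyarrow=True)

One behavioral change worth flagging: the old implementation wrapped writes in try/except and returned "" on failure; the new code lets write errors propagate to the caller.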

main.py

Lines changed: 2 additions & 1 deletion
@@ -24,6 +24,7 @@
 from file_utils import export_to_file, print_summary
 import default_data
 from enricher import ExternalDataEnricher
+from schemas import supported_output_types
 from scraper import ICEGovFacilityScraper
 from utils import logger
 # CLI, argument parsing, script orchestration
@@ -58,7 +59,7 @@ def main() -> None:
     parser.add_argument(
         "--file-type",
         default="csv",
-        choices=["csv", "json"],
+        choices=supported_output_types,
         help="type of file to export",
     )
     parser.add_argument(
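With this change the new formats are exposed on the CLI. Assuming the script is run directly as main.py, an invocation such as

    python main.py --file-type xlsx

routes the export through the dataframe-backed writers above, and argparse rejects anything outside supported_output_types via the choices list.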

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ dependencies = [
     "polars>=1.33.0",
     "pyarrow>=21.0.0",
     "requests>=2.32.5",
+    "xlsxwriter>=3.2.5",
 ]

 [dependency-groups]

schemas.py

Lines changed: 11 additions & 0 deletions
@@ -16,6 +16,7 @@
         "postal_code": "",
         "street": "",
     },
+    "address_str": "",
     "_repaired_record": False,
     "field_office": "",
     "image_url": "",
@@ -74,10 +75,18 @@
         "expanded_name": "United States Marshal Service Intergovernmental Agreement",
         "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
     },
+    "USMS IGA": {
+        "expanded_name": "United States Marshal Service Intergovernmental Agreement",
+        "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
+    },
     "USMS CDF": {
         "expanded_name": "United States Marshal Service Central Detention Facility",
         "description": "Name guessed at from searching",
     },
+    "CDF": {
+        "expanded_name": "Central Detention Facility",
+        "description": "Name guessed at from searching",
+    },
 }

 # enrichment response object
@@ -97,3 +106,5 @@
 }

 default_field_office = "(Possibly) Not managed by DHS field office"
+
+supported_output_types = ["csv", "json", "xlsx", "parquet"]
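The two new entries register additional abbreviation keys for existing expansions, so lookups succeed for either spelling. A minimal sketch of such a lookup, where detention_types is a hypothetical stand-in for the mapping's real name (which lies outside this hunk):

    # detention_types is a hypothetical stand-in for the actual dict in
    # schemas.py that holds the "USMS IGA"/"CDF" entries
    def expand_detention_type(detention_types: dict, abbrev: str) -> str:
        entry = detention_types.get(abbrev)
        # fall back to the raw abbreviation when no expansion is known
        return entry["expanded_name"] if entry else abbrev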

scraper.py

Lines changed: 3 additions & 0 deletions
@@ -276,6 +276,7 @@ def _load_sheet(self) -> dict:
             details["avg_stay_length"] = row["FY25 ALOS"]
             details["inspection_date"] = row["Last Inspection End Date"]
             details["source_urls"].append(self.sheet_url)
+            details["address_str"] = full_address
             details["field_office"] = default_field_office
             results[full_address] = details
         return results
@@ -335,6 +336,8 @@ def scrape_facilities(self):
             addr["locality"] = locality
             facility["_repaired_record"] = True
             full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper()
+            if not facility["address_str"]:
+                facility["address_str"] = full_address
             if full_address in self.facilities_data["facilities"].keys():
                 self.facilities_data["facilities"][full_address] = self._update_facility(
                     self.facilities_data["facilities"][full_address], facility

utils.py

Lines changed: 22 additions & 1 deletion
@@ -1,5 +1,6 @@
 # For general helpers, regexes, or shared logic (e.g. phone/address parsing functions).
 import logging
+import polars
 import requests
 from requests.adapters import HTTPAdapter
 import urllib3
@@ -53,9 +54,18 @@
     "Last Final Rating",
 ]

+# all values that will only complicate workbook output types
+flatdata_filtered_keys = [
+    "raw_scrape",
+    "wikipedia_search_query",
+    "wikidata_search_query",
+    "osm_search_query",
+    "source_urls",
+]
+

 def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
-    """flatten a nested dictionary for nicer printing in CSV"""
+    """flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.)"""
     items: list = []
     for k, v in d.items():
         new_key = parent_key + sep + str(k) if parent_key else str(k)
@@ -64,3 +74,14 @@ def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
         else:
             items.append((new_key, v))
     return dict(items)
+
+
+def convert_to_dataframe(d: dict) -> polars.DataFrame:
+    """facilities internal dict to dataframe"""
+    flatdata = [_flatdict(f) for f in d.values()]
+    fieldnames = [k for k in flatdata[0].keys() if k not in flatdata_filtered_keys]
+    # https://docs.pola.rs/api/python/stable/reference/api/polars.from_dicts.html
+    df = polars.from_dicts(flatdata, schema=fieldnames)
+    logger.debug("Dataframe: %s", df)
+    logger.debug("All header fields: %s", fieldnames)
+    return df
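A quick illustration of how _flatdict and convert_to_dataframe compose. The sample facility below is invented, but the key names mirror the schema; filtered keys such as raw_scrape drop out of the frame because from_dicts is given only the surviving fieldnames as its schema:

    from utils import _flatdict, convert_to_dataframe

    facility = {
        "name": "Example Facility",
        "address": {"street": "1 Main St", "postal_code": "80202"},
        "raw_scrape": "<html>...</html>",  # listed in flatdata_filtered_keys
    }
    flat = _flatdict(facility)
    # nested keys are joined with ".":
    # {"name": ..., "address.street": ..., "address.postal_code": ..., "raw_scrape": ...}

    df = convert_to_dataframe({"1 MAIN ST,DENVER,CO,80202": facility})
    assert "address.street" in df.columns
    assert "raw_scrape" not in df.columns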

uv.lock

Lines changed: 17 additions & 6 deletions
Some generated files are not rendered by default.
