
Commit c3341c1

start actually matching records between excel and web page

Signed-off-by: John Seekins <[email protected]>

1 parent: c725183

File tree: 5 files changed, +199 −99 lines

.gitignore
Lines changed: 1 addition & 0 deletions

@@ -2,3 +2,4 @@ __pycache__/
 *.csv
 *.json
 *.xlsx
+*.xlsx#

enricher.py
Lines changed: 6 additions & 70 deletions

@@ -1,20 +1,16 @@
 import copy
-import os
-import polars
 from schemas import (
     facilities_schema,
     resp_info_schema,
 )
 import time
 from urllib.parse import quote
 from utils import (
-    facility_sheet_header,
     logger,
     session,
 )
 # ExternalDataEnricher class for enrichment logic

-SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 # Rate limiting for API calls
 NOMINATIM_DELAY = 1.0  # 1 second between requests as per OSM policy
 WIKIPEDIA_DELAY = 0.5  # Be respectful to Wikipedia
@@ -23,80 +19,19 @@

 class ExternalDataEnricher(object):
     def __init__(self):
-        self.sheet_url = "https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx"
-        self.filename = f"{SCRIPT_DIR}{os.sep}detentionstats.xlsx"
-        self.adp_sheet_data = self._load_sheet()
-
-    def _download_sheet(self) -> None:
-        if not os.path.isfile(self.filename) or os.path.getsize(self.filename) < 1:
-            logger.info("Downloading sheet from %s", self.sheet_url)
-            resp = session.get(self.sheet_url, timeout=120)
-            with open(self.filename, "wb") as f:
-                for chunk in resp.iter_content(chunk_size=1024):
-                    if chunk:
-                        f.write(chunk)
-
-    def _load_sheet(self) -> dict:
-        """Convert the detentionstats sheet data into something we can update our facilities with"""
-        self._download_sheet()
-        df = polars.read_excel(
-            drop_empty_rows=True,
-            has_header=False,
-            # because we're manually defining the header...
-            read_options={"skip_rows": 7, "column_names": facility_sheet_header},
-            sheet_name="Facilities FY25",
-            source=open(self.filename, "rb"),
-        )
-        results: dict = {}
-        for row in df.iter_rows(named=True):
-            full_address = f"{row['Address']} {row['City']}, {row['State']} {row['Zip']}".upper()
-            results[full_address] = row
-        return results
-
-    def _update_from_sheet(self, base: dict, row: dict) -> dict:
-        base["population"]["male"]["criminal"] = row["Male Crim"]
-        base["population"]["male"]["non_criminal"] = row["Male Non-Crim"]
-        base["population"]["female"]["criminal"] = row["Female Crim"]
-        base["population"]["female"]["non_criminal"] = row["Female Non-Crim"]
-        if "/" in row["Male/Female"]:
-            base["population"]["female"]["allowed"] = True
-            base["population"]["male"]["allowed"] = True
-        elif "Female" in row["Male/Female"]:
-            base["population"]["female"]["allowed"] = True
-        else:
-            base["population"]["male"]["allowed"] = True
-
-        base["base_type"] = row["Type Detailed"]
-        base["avg_stay_length"] = row["FY25 ALOS"]
-        base["inspection_date"] = row["Last Inspection End Date"]
-        logger.debug("Updated facility: %s", base)
-        return base
+        pass

     def enrich_facility_data(self, facilities_data: dict) -> dict:
         start_time = time.time()
         logger.info("Starting data enrichment with external sources...")
         enriched_data = copy.deepcopy(facilities_schema)
         total = len(facilities_data["facilities"])
+        processed = 0

-        for index, facility in enumerate(facilities_data["facilities"]):
+        for facility_id, facility in enumerate(facilities_data["facilities"]):
             facility_name = facility["name"]
-            logger.info("Processing facility %s/%s: %s...", index + 1, total, facility_name)
+            logger.info("Processing facility %s/%s: %s...", processed + 1, total, facility_name)
             enriched_facility = copy.deepcopy(facility)
-            addr = facility["address"]
-            full_address = (
-                f"{addr['street']} {addr['locality']}, {addr['administrative_area']} {addr['postal_code']}".upper()
-            )
-            if full_address in self.adp_sheet_data:
-                row = self.adp_sheet_data[full_address]
-                logger.debug("Found additional data in the ADP sheet for %s", facility_name)
-                enriched_facility = self._update_from_sheet(facility, row)
-            else:
-                logger.debug("Just making sure no other facilities match...")
-                for sheet_row in self.adp_sheet_data.values():
-                    if facility_name.upper() == sheet_row["Name"].upper():
-                        logger.debug("Matching facility for %s", facility_name)
-                        enriched_facility = self._update_from_sheet(facility, sheet_row)
-                        break

             # Wikipedia search # todo refactor to method
             try:
@@ -131,7 +66,8 @@ def enrich_facility_data(self, facilities_data: dict) -> dict:
                 enriched_facility["osm_result_url"] = ""
                 enriched_facility["osm_search_query"] = str(e)

-            enriched_data["facilities"].append(enriched_facility)  # type: ignore [attr-defined]
+            enriched_data["facilities"][facility_id] = enriched_facility  # type: ignore [index]
+            processed += 1

         logger.info("Data enrichment completed!")
         enriched_data["enrich_runtime"] = time.time() - start_time

file_utils.py
Lines changed: 18 additions & 22 deletions

@@ -22,7 +22,7 @@ def export_to_file(
     try:
         with open(full_name, "w", newline="", encoding="utf-8") as f_out:
             if file_type == "csv":
-                flatdata = [_flatdict(f) for f in facilities_data["facilities"]]
+                flatdata = [_flatdict(f) for _, f in facilities_data["facilities"].items()]
                 fieldnames = [k for k in flatdata[0].keys() if k not in csv_filtered_keys]

                 writer = csv.DictWriter(f_out, fieldnames=fieldnames)
@@ -60,7 +60,7 @@ def print_summary(facilities_data: dict) -> None:

     # Count by field office
     field_offices: dict = {}
-    for facility in facilities_data["facilities"]:
+    for facility_id, facility in facilities_data["facilities"].items():
         office = facility.get("field_office", "Unknown")
         field_offices[office] = field_offices.get(office, 0) + 1

@@ -70,9 +70,13 @@ def print_summary(facilities_data: dict) -> None:

     # Check enrichment data if available
     enrich_data = copy.deepcopy(enrichment_print_schema)
-    enrich_data["wiki_found"] = sum(1 for f in facilities_data["facilities"] if f.get("wikipedia_page_url", None))
-    enrich_data["wikidata_found"] = sum(1 for f in facilities_data["facilities"] if f.get("wikidata_page_url", None))
-    enrich_data["osm_found"] = sum(1 for f in facilities_data["facilities"] if f.get("osm_result_url", None))
+    enrich_data["wiki_found"] = sum(
+        1 for f in facilities_data["facilities"].values() if f.get("wikipedia_page_url", None)
+    )
+    enrich_data["wikidata_found"] = sum(
+        1 for f in facilities_data["facilities"].values() if f.get("wikidata_page_url", None)
+    )
+    enrich_data["osm_found"] = sum(1 for f in facilities_data["facilities"].values() if f.get("osm_result_url", None))

     if any(v > 0 for v in enrich_data.values()):
         logger.info("\n=== External Data Enrichment Results ===")
@@ -96,25 +100,17 @@ def print_summary(facilities_data: dict) -> None:
         )

     # Debug information if available
-    if facilities_data["facilities"][0].get("wikipedia_search_query", None):
-        logger.info("\n=== Wikipedia Debug Information ===")
-        false_positives = 0
-        errors = 0
-        for facility in facilities_data["facilities"]:
-            query = facility.get("wikipedia_search_query", "")
-            if "REJECTED" in query:
-                false_positives += 1
-            elif "ERROR" in query:
-                errors += 1
+    logger.info("\n=== Wikipedia Debug Information ===")
+    false_positives = 0
+    errors = 0
+    for facility in facilities_data["facilities"].values():
+        query = facility.get("wikipedia_search_query", "")
+        if "REJECTED" in query:
+            false_positives += 1
+        elif "ERROR" in query:
+            errors += 1

     logger.info("False positives detected and rejected: %s", false_positives)
     logger.info("Search errors encountered: %s", errors)
-    logger.info("Note: Review 'wikipedia_search_query' column for detailed search information")
-
-    if facilities_data["facilities"][0].get("wikidata_search_query", None):
-        logger.warning("Note: Review 'wikidata_search_query' column for detailed search information")
-
-    if facilities_data["facilities"][0].get("osm_search_query", None):
-        logger.warning("Note: Review 'osm_search_query' column for detailed search information")

     logger.info("\n=== ICE Detention Facilities Scraper: Run completed ===")

schemas.py
Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,7 @@
     "scraped_date": datetime.datetime.now(datetime.UTC),
     "scrape_runtime": 0,
     "enrich_runtime": 0,
-    "facilities": [],
+    "facilities": {},
 }

 # default keys to "false"-y values so we can merge easier
@@ -16,6 +16,7 @@
         "postal_code": "",
         "street": "",
     },
+    "_repaired_record": False,
     "facility_url": "",
     "field_office": "",
     "image_url": "",
