Commit 29e8ab6

remove some complexity around writing with nested dicts
Signed-off-by: John Seekins <[email protected]>
1 parent 068a81b commit 29e8ab6

8 files changed: +777, -72 lines

default_data.py

Lines changed: 722 additions & 2 deletions
Large diffs are not rendered by default.

enricher.py

Lines changed: 3 additions & 14 deletions
@@ -28,21 +28,10 @@ def enrich_facility_data(self, facilities_data: dict) -> dict:
         enriched_data = copy.deepcopy(facilities_schema)
         total = len(facilities_data["facilities"])
 
-        for i, facility in enumerate(facilities_data["facilities"]):
-            logger.info("Processing facility %s/%s: %s...", i + 1, total, facility["name"])
-            enriched_facility = copy.deepcopy(facility)
-            base_enrichment = {
-                "wikipedia_page_url": "",
-                "wikipedia_search_query": "",
-                "wikidata_page_url": "",
-                "wikidata_search_query": "",
-                "osm_result_url": "",
-                "osm_search_query": "",
-            }
-
-            enriched_facility.update(base_enrichment)
-
+        for index, facility in enumerate(facilities_data["facilities"]):
             facility_name = facility["name"]
+            logger.info("Processing facility %s/%s: %s...", index + 1, total, facility_name)
+            enriched_facility = copy.deepcopy(facility)
 
             # Wikipedia search # todo refactor to method
             try:

file_utils.py

Lines changed: 22 additions & 32 deletions
@@ -1,10 +1,8 @@
+import copy
 import csv
+import flatdict  # type: ignore [import-untyped]
 import json
-from schemas import (
-    debug_schema,
-    facility_schema,
-    enrichment_schema,
-)
+from schemas import enrichment_print_schema
 from utils import logger
 
 
@@ -18,22 +16,17 @@ def export_to_file(
         return ""
 
     full_name = f"{filename}.{file_type}"
+    csv_filtered_keys = ["raw_scrape", "wikipedia_search_query", "wikidata_search_query", "osm_search_query"]
     try:
         with open(full_name, "w", newline="", encoding="utf-8") as f_out:
             if file_type == "csv":
-                base_fields: list = list(facility_schema.keys())
-                fieldnames: list = base_fields.copy()
-
-                if any(field in facilities_data["facilities"][0] for field in enrichment_schema):
-                    fieldnames.extend(enrichment_schema)
-
-                if any(field in facilities_data["facilities"][0] for field in debug_schema):
-                    fieldnames.extend(debug_schema)
+                flatdata = [flatdict.FlatDict(f, delimiter=".") for f in facilities_data["facilities"]]
+                fieldnames = [k for k in flatdata[0].keys() if k not in csv_filtered_keys]
 
                 writer = csv.DictWriter(f_out, fieldnames=fieldnames)
                 writer.writeheader()
-                for facility in facilities_data["facilities"]:
-                    row_data = {field: facility.get(field, "") for field in fieldnames}
+                for facility in flatdata:
+                    row_data = {field: facility.get(field, None) for field in fieldnames}
                     writer.writerow(row_data)
             elif file_type == "json":
                 json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)
@@ -75,37 +68,34 @@ def print_summary(facilities_data: dict) -> None:
         logger.info(" %s: %s", office, count)
 
     # Check enrichment data if available
-    if "wikipedia_page_url" in facilities_data["facilities"][0]:
-        wiki_found = sum(
-            1 for f in facilities_data["facilities"] if f.get("wikipedia_page_url") and f["wikipedia_page_url"]
-        )
-        wikidata_found = sum(
-            1 for f in facilities_data["facilities"] if f.get("wikidata_page_url") and f["wikidata_page_url"]
-        )
-        osm_found = sum(1 for f in facilities_data["facilities"] if f.get("osm_result_url") and f["osm_result_url"])
+    enrich_data = copy.deepcopy(enrichment_print_schema)
+    enrich_data["wiki_found"] = sum(1 for f in facilities_data["facilities"] if f.get("wikipedia_page_url", None))
+    enrich_data["wikidata_found"] = sum(1 for f in facilities_data["facilities"] if f.get("wikidata_page_url", None))
+    enrich_data["osm_found"] = sum(1 for f in facilities_data["facilities"] if f.get("osm_result_url", None))
 
+    if any(v > 0 for v in enrich_data.values()):
         logger.info("\n=== External Data Enrichment Results ===")
         logger.info(
             "Wikipedia pages found: %s/%s (%s%%)",
-            wiki_found,
+            enrich_data["wiki_found"],
             total_facilities,
-            wiki_found / total_facilities * 100,
+            enrich_data["wiki_found"] / total_facilities * 100,
         )
         logger.info(
             "Wikidata entries found: %s/%s (%s%%)",
-            wikidata_found,
+            enrich_data["wikidata_found"],
             total_facilities,
-            wikidata_found / total_facilities * 100,
+            enrich_data["wikidata_found"] / total_facilities * 100,
        )
         logger.info(
             "OpenStreetMap results found: %s/%s (%s%%)",
-            osm_found,
+            enrich_data["osm_found"],
             total_facilities,
-            osm_found / total_facilities * 100,
+            enrich_data["osm_found"] / total_facilities * 100,
         )
 
     # Debug information if available
-    if "wikipedia_search_query" in facilities_data["facilities"][0]:
+    if facilities_data["facilities"][0].get("wikipedia_search_query", None):
         logger.info("\n=== Wikipedia Debug Information ===")
         false_positives = 0
         errors = 0
@@ -120,10 +110,10 @@ def print_summary(facilities_data: dict) -> None:
         logger.info("Search errors encountered: %s", errors)
         logger.info("Note: Review 'wikipedia_search_query' column for detailed search information")
 
-    if "wikidata_search_query" in facilities_data["facilities"][0]:
+    if facilities_data["facilities"][0].get("wikidata_search_query", None):
         logger.warning("Note: Review 'wikidata_search_query' column for detailed search information")
 
-    if "osm_search_query" in facilities_data["facilities"][0]:
+    if facilities_data["facilities"][0].get("osm_search_query", None):
         logger.warning("Note: Review 'osm_search_query' column for detailed search information")
 
     logger.info("\n=== ICE Detention Facilities Scraper: Run completed ===")
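
For context on the CSV change above, here is a minimal sketch of how flatdict.FlatDict(..., delimiter=".") flattens a nested facility dict so csv.DictWriter can emit dotted column names without the old schema-merging logic. The sample facility record below is hypothetical; only the FlatDict and csv.DictWriter usage mirror the actual diff.

import csv
import io

import flatdict  # type: ignore [import-untyped]

# Hypothetical nested record; real facility dicts come from the scraper.
facility = {
    "name": "Example Facility",
    "address": {"city": "Somewhere", "state": "XX"},
    "raw_scrape": "<html>...</html>",  # filtered out, like csv_filtered_keys in the diff
}

flat = flatdict.FlatDict(facility, delimiter=".")  # nested keys become "address.city", etc.
fieldnames = [k for k in flat.keys() if k not in ["raw_scrape"]]

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=fieldnames)
writer.writeheader()
writer.writerow({field: flat.get(field, None) for field in fieldnames})
# buf.getvalue() now holds a header with flattened keys such as "address.city" plus one data row.

Because every nested key is flattened up front, the column list can be derived from the first record instead of being assembled from separate facility, enrichment, and debug schema lists.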

main.py

Lines changed: 2 additions & 2 deletions
@@ -24,7 +24,7 @@
 from file_utils import export_to_file, print_summary
 import default_data
 from enricher import ExternalDataEnricher
-from scraper import ICEFacilityScraper
+from scraper import ICEGovFacilityScraper
 from utils import logger
 
 # CLI, argument parsing, script orchestration
@@ -78,7 +78,7 @@ def main() -> None:
         exit(1)
 
     if args.scrape:
-        scraper = ICEFacilityScraper()
+        scraper = ICEGovFacilityScraper()
         facilities_data = scraper.scrape_facilities()
     elif args.load_existing:
         facilities_data = copy.deepcopy(default_data.facilities_data)

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "beautifulsoup4>=4.13.5",
+    "flatdict>=4.0.1",
     "lxml>=6.0.1",
     "requests>=2.32.5",
 ]

schemas.py

Lines changed: 13 additions & 11 deletions
@@ -23,18 +23,13 @@
     "phone": "",
     "raw_scrape": "",
     "source_url": "",
+    "wikipedia_page_url": "",
+    "wikidata_page_url": "",
+    "osm_result_url": "",
+    "wikipedia_search_query": "",
+    "wikidata_search_query": "",
+    "osm_search_query": "",
 }
-enrichment_schema = [
-    "wikipedia_page_url",
-    "wikidata_page_url",
-    "osm_result_url",
-]
-
-debug_schema = [
-    "wikipedia_search_query",
-    "wikidata_search_query",
-    "osm_search_query",
-]
 
 # enrichment response object
 resp_info_schema = {
@@ -44,3 +39,10 @@
     "url": "",
     "method": "none",
 }
+
+# enrichment print details
+enrichment_print_schema = {
+    "wiki_found": 0,
+    "wikidata_found": 0,
+    "osm_found": 0,
+}

scraper.py

Lines changed: 6 additions & 11 deletions
@@ -15,7 +15,7 @@
 )
 
 
-class ICEFacilityScraper(object):
+class ICEGovFacilityScraper(object):
     # All methods for scraping ICE websites
 
     def __init__(self):
@@ -72,24 +72,19 @@ def _scrape_updated(self, url: str):
         timestamp = f"{timestamp}-+{tz}"
         return datetime.datetime.strptime(timestamp, timestamp_format)
 
-    def _scrape_page(self, url: str) -> list:
+    def _scrape_page(self, page_url: str) -> list:
         """Scrape a single page of facilities using BeautifulSoup"""
-        logger.debug(" Fetching: %s", url)
+        logger.debug(" Fetching: %s", page_url)
         try:
-            response = session.get(url, timeout=30)
+            response = session.get(page_url, timeout=30)
             response.raise_for_status()
         except Exception as e:
-            logger.error(" Error parsing %s: %s", url, e)
+            logger.error(" Error parsing %s: %s", page_url, e)
             return []
 
         # Parse HTML with BeautifulSoup
         soup = BeautifulSoup(response.content, "html.parser")
 
-        # Extract facilities from the parsed HTML
-        return self._extract_facilities_from_html(soup, url)
-
-    def _extract_facilities_from_html(self, soup, page_url: str) -> list:
-        """Extract facility data from BeautifulSoup parsed HTML"""
         facilities = []
 
         # Look for the main content area - ICE uses different possible containers
@@ -122,7 +117,7 @@ def _extract_facilities_from_html(self, soup, page_url: str) -> list:
             "article",  # Article elements
             "div.node",  # Drupal node containers
         ]
-        facility_elements = []
+        facility_elements: list = []
         for selector in facility_selectors:
             elements = content_container.select(selector)
             if elements:

uv.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default.
