Commit a0307cb

update docs and make worker count flexible

Signed-off-by: John Seekins <[email protected]>
1 parent: e5734ba

3 files changed: 18 additions, 24 deletions

README.md
Lines changed: 5 additions & 5 deletions

````diff
@@ -14,8 +14,9 @@ which will help with documenting the facilities appropriately. As these entries
 your CSV results change almost immediately.
 
 You can also use `--load-existing` to leverage an existing
-scrape of the data from ICE.gov. This is stored in default_data.py and includes the official current addresses of facilities.
-(Note ICE has been renaming known "detention center" sites to "processing center", and so on.)
+scrape of the data from ICE.gov. This is stored in `default_data.py` and includes the official current addresses of facilities.
+
+> Note ICE has been renaming known "detention center" sites to "processing center", and so on.
 
 The initial scrape data also keeps a `base64` encoded string containing the original HTML that was scraped from ice.gov about the
 facility. Keeping this initial data allows us to verify the resulting extracted data if we need to.
@@ -53,7 +54,7 @@ directory.
 uv run python main.py --load-existing --enrich --debug
 
 # With custom output file
-uv run python main.py --load-existing --enrich --debug-wikipedia -o debug_facilities.csv
+uv run python main.py --load-existing --enrich --debug-wikipedia -o debug_facilities
 ```
 
 ## Requirements
@@ -102,9 +103,8 @@ in hopes of finding similarly named pages but this is too aggressive, and it vee
 that have simpler names, like the county name instead of `county + detention center`). Use the debug mode to see what
 it is doing.
 * ICE scraping is not robustly tested. The image URL extraction needs some work. (should be able to get the detention center image URLs.)
-* OSM enrichment submits to OSM Nominatim API search with an extra comma between address number and street name.
 * The user-agent for running ice.gov scrape web requests calls itself `'User-Agent': 'ICE-Facilities-Research/1.0 (Educational Research Purpose)'`.
-You can change this in scraper.py and enricher.py.
+You can change this in `utils.py`.
 * It tells some pretty inaccurate percentages in the final summary - a lot of false positives, the Wikipedia debug percent
 seems wrong.
 * The remote query rate limiting is (I think) done in series but would go faster with parallel/async processing.
````
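As a side note on the `base64` point in the README hunk above, here is a minimal sketch of how the stored page could be decoded for verification. The key name `scraped_page` and the shape of `facilities_data` are assumptions for illustration; the diff does not show the actual field layout in `default_data.py`.

```python
# Hypothetical verification sketch -- "scraped_page" and the structure of
# facilities_data are assumed, not taken from default_data.py itself.
import base64

from default_data import facilities_data  # assumed module-level dict

for facility_id, facility in facilities_data["facilities"].items():
    # Decode the stored copy of the original ice.gov HTML...
    raw_html = base64.b64decode(facility["scraped_page"]).decode("utf-8")
    # ...then compare it against the extracted fields when verifying.
    print(facility_id, f"{len(raw_html)} bytes of original HTML")
```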

enricher.py
Lines changed: 5 additions & 14 deletions

```diff
@@ -18,27 +18,19 @@
 WIKIDATA_DELAY = 0.5  # Be respectful to Wikidata
 
 
-def _smap(f):
-    """
-    map a function name to its execution
-    per https://stackoverflow.com/a/60467981
-    """
-    return f()
-
-
-def enrich_facility_data(facilities_data: dict) -> dict:
+def enrich_facility_data(facilities_data: dict, workers: int = 3) -> dict:
     """wrapper function for multiprocessing of facility enrichment"""
     start_time = time.time()
     logger.info("Starting data enrichment with external sources...")
     enriched_data = copy.deepcopy(facilities_schema)
     total = len(facilities_data["facilities"])
     processed = 0
 
-    with ProcessPoolExecutor(max_workers=3) as pool:
+    with ProcessPoolExecutor(max_workers=workers) as pool:
         for res in pool.map(enrich_facility, facilities_data["facilities"].items()):
             enriched_data["facilities"][res[0]] = res[1]  # type: ignore [index]
             processed += 1
-            logger.info("Finished %s, %s/%s completed", res[1]["name"], processed, total)
+            logger.info(" -> Finished %s, %s/%s completed", res[1]["name"], processed, total)
 
     logger.info("Data enrichment completed!")
     enriched_data["enrich_runtime"] = time.time() - start_time
@@ -64,12 +56,12 @@ def enrich_facility(facility_data: tuple) -> tuple:
     enriched_facility["osm_search_query"] = osm.get("search_query_steps", "")
 
     logger.debug(enriched_facility)
-    return (facility_id, enriched_facility)
+    return facility_id, enriched_facility
 
 
 def _search_wikipedia(facility_name: str) -> dict:
     """Search Wikipedia for facility and return final URL after redirects"""
-    facility_terms = [
+    facility_terms: list = [
         "detention",
         "prison",
         "jail",
@@ -466,5 +458,4 @@ def _clean_facility_name(name: str) -> str:
         if cleaned.endswith(suffix):
             cleaned = cleaned[: -len(suffix)].strip()
             break
-
     return cleaned
```
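The core of this change, reduced to a standalone sketch: the worker count becomes a parameter instead of the hard-coded `3`, passed straight through to `ProcessPoolExecutor`. The `enrich` stub below stands in for the real Wikipedia/OSM lookups and is not the project's implementation.

```python
# Standalone sketch of the pattern this commit introduces in enricher.py:
# a caller-supplied worker count fed to ProcessPoolExecutor.
from concurrent.futures import ProcessPoolExecutor


def enrich(item: tuple) -> tuple:
    """Stand-in for the real per-facility Wikipedia/OSM lookups."""
    key, value = item
    return key, {**value, "enriched": True}


def enrich_all(data: dict, workers: int = 3) -> dict:
    """Mirror of the commit's pattern: the caller controls the pool size."""
    out = {}
    with ProcessPoolExecutor(max_workers=workers) as pool:
        # pool.map preserves input order, so a progress counter stays meaningful
        for key, value in pool.map(enrich, data.items()):
            out[key] = value
    return out


if __name__ == "__main__":
    print(enrich_all({"a": {"name": "x"}, "b": {"name": "y"}}, workers=2))
```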

main.py
Lines changed: 8 additions & 5 deletions

```diff
@@ -15,7 +15,7 @@
 python main.py --load-existing --enrich --debug
 
 # With custom output file
-python main.py --load-existing --enrich --debug -o debug_facilities.csv
+python main.py --load-existing --enrich --debug -o debug_facilities
 """
 
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
@@ -29,9 +29,6 @@
 from utils import logger
 # CLI, argument parsing, script orchestration
 
-# argparse ArgumentParser converts hyphens to underscores.
-# @see https://docs.python.org/3/library/argparse.html
-
 
 def main() -> None:
     parser = ArgumentParser(
@@ -73,6 +70,12 @@ def main() -> None:
         action="store_true",
         help="Full debug information and logging",
     )
+    parser.add_argument(
+        "--enrich-workers",
+        type=int,
+        default=3,
+        help="Number of concurrent processes to allow while enriching data",
+    )
     # todo these need more attention, but should now be accepted as command line options now.
     parser.add_argument(
         "--debug-wikipedia",
@@ -123,7 +126,7 @@ def main() -> None:
     if not facilities_data:
         logger.warning("No facility data available for enrichment.")
         return
-    facilities_data = enrich_facility_data(facilities_data)
+    facilities_data = enrich_facility_data(facilities_data, args.enrich_workers)
 
     if facilities_data:
         output_filename = args.output_file_name
```
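Taken together, the new flag is used like the README's other examples; the worker count `8` here is just an illustrative value, the flag name comes straight from the diff:

```
uv run python main.py --load-existing --enrich --enrich-workers 8
```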
