Commit a0307cb

update docs and make worker count flexible

Signed-off-by: John Seekins <[email protected]>
1 parent: e5734ba

3 files changed: 18 additions, 24 deletions

README.md
Lines changed: 5 additions & 5 deletions

````diff
@@ -14,8 +14,9 @@ which will help with documenting the facilities appropriately. As these entries
 your CSV results change almost immediately.
 
 You can also use `--load-existing` to leverage an existing
-scrape of the data from ICE.gov. This is stored in default_data.py and includes the official current addresses of facilities.
-(Note ICE has been renaming known "detention center" sites to "processing center", and so on.)
+scrape of the data from ICE.gov. This is stored in `default_data.py` and includes the official current addresses of facilities.
+
+> Note ICE has been renaming known "detention center" sites to "processing center", and so on.
 
 The initial scrape data also keeps a `base64` encoded string containing the original HTML that was scraped from ice.gov about the
 facility. Keeping this initial data allows us to verify the resulting extracted data if we need to.
@@ -53,7 +54,7 @@ directory.
 uv run python main.py --load-existing --enrich --debug
 
 # With custom output file
-uv run python main.py --load-existing --enrich --debug-wikipedia -o debug_facilities.csv
+uv run python main.py --load-existing --enrich --debug-wikipedia -o debug_facilities
 ```
 
 ## Requirements
@@ -102,9 +103,8 @@ in hopes of finding similarly named pages but this is too aggressive, and it vee
 that have simpler names, like the county name instead of `county + detention center`). Use the debug mode to see what
 it is doing.
 * ICE scraping is not robustly tested. The image URL extraction needs some work. (should be able to get the detention center image URLs.)
-* OSM enrichment submits to OSM Nominatim API search with an extra comma between address number and street name.
 * The user-agent for running ice.gov scrape web requests calls itself `'User-Agent': 'ICE-Facilities-Research/1.0 (Educational Research Purpose)'`.
-You can change this in scraper.py and enricher.py.
+You can change this in `utils.py`.
 * It tells some pretty inaccurate percentages in the final summary - a lot of false positives, the Wikipedia debug percent
 seems wrong.
 * The remote query rate limiting is (I think) done in series but would go faster with parallel/async processing.
````
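As a side note on the `base64` point in the README hunk above, here is a minimal sketch of how the stored page could be decoded for verification. The key name `scraped_page` and the shape of `facilities_data` are assumptions for illustration; the diff does not show the actual field layout in `default_data.py`.

```python
# Hypothetical verification sketch -- "scraped_page" and the structure of
# facilities_data are assumed, not taken from default_data.py itself.
import base64

from default_data import facilities_data  # assumed module-level dict

for facility_id, facility in facilities_data["facilities"].items():
    # Decode the stored copy of the original ice.gov HTML...
    raw_html = base64.b64decode(facility["scraped_page"]).decode("utf-8")
    # ...then compare it against the extracted fields when verifying.
    print(facility_id, f"{len(raw_html)} bytes of original HTML")
```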

enricher.py
Lines changed: 5 additions & 14 deletions

```diff
@@ -18,27 +18,19 @@
 WIKIDATA_DELAY = 0.5  # Be respectful to Wikidata
 
 
-def _smap(f):
-    """
-    map a function name to its execution
-    per https://stackoverflow.com/a/60467981
-    """
-    return f()
-
-
-def enrich_facility_data(facilities_data: dict) -> dict:
+def enrich_facility_data(facilities_data: dict, workers: int = 3) -> dict:
     """wrapper function for multiprocessing of facility enrichment"""
     start_time = time.time()
     logger.info("Starting data enrichment with external sources...")
     enriched_data = copy.deepcopy(facilities_schema)
     total = len(facilities_data["facilities"])
     processed = 0
 
-    with ProcessPoolExecutor(max_workers=3) as pool:
+    with ProcessPoolExecutor(max_workers=workers) as pool:
         for res in pool.map(enrich_facility, facilities_data["facilities"].items()):
             enriched_data["facilities"][res[0]] = res[1]  # type: ignore [index]
             processed += 1
-            logger.info("Finished %s, %s/%s completed", res[1]["name"], processed, total)
+            logger.info(" -> Finished %s, %s/%s completed", res[1]["name"], processed, total)
 
     logger.info("Data enrichment completed!")
     enriched_data["enrich_runtime"] = time.time() - start_time
@@ -64,12 +56,12 @@ def enrich_facility(facility_data: tuple) -> tuple:
     enriched_facility["osm_search_query"] = osm.get("search_query_steps", "")
 
     logger.debug(enriched_facility)
-    return (facility_id, enriched_facility)
+    return facility_id, enriched_facility
 
 
 def _search_wikipedia(facility_name: str) -> dict:
     """Search Wikipedia for facility and return final URL after redirects"""
-    facility_terms = [
+    facility_terms: list = [
         "detention",
         "prison",
         "jail",
@@ -466,5 +458,4 @@ def _clean_facility_name(name: str) -> str:
         if cleaned.endswith(suffix):
             cleaned = cleaned[: -len(suffix)].strip()
             break
-
     return cleaned
```
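The core of this change, reduced to a standalone sketch: the worker count becomes a parameter instead of the hard-coded `3`, passed straight through to `ProcessPoolExecutor`. The `enrich` stub below stands in for the real Wikipedia/OSM lookups and is not the project's implementation.

```python
# Standalone sketch of the pattern this commit introduces in enricher.py:
# a caller-supplied worker count fed to ProcessPoolExecutor.
from concurrent.futures import ProcessPoolExecutor


def enrich(item: tuple) -> tuple:
    """Stand-in for the real per-facility Wikipedia/OSM lookups."""
    key, value = item
    return key, {**value, "enriched": True}


def enrich_all(data: dict, workers: int = 3) -> dict:
    """Mirror of the commit's pattern: the caller controls the pool size."""
    out = {}
    with ProcessPoolExecutor(max_workers=workers) as pool:
        # pool.map preserves input order, so a progress counter stays meaningful
        for key, value in pool.map(enrich, data.items()):
            out[key] = value
    return out


if __name__ == "__main__":
    print(enrich_all({"a": {"name": "x"}, "b": {"name": "y"}}, workers=2))
```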

main.py
Lines changed: 8 additions & 5 deletions

```diff
@@ -15,7 +15,7 @@
 python main.py --load-existing --enrich --debug
 
 # With custom output file
-python main.py --load-existing --enrich --debug -o debug_facilities.csv
+python main.py --load-existing --enrich --debug -o debug_facilities
 """
 
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
@@ -29,9 +29,6 @@
 from utils import logger
 # CLI, argument parsing, script orchestration
 
-# argparse ArgumentParser converts hyphens to underscores.
-# @see https://docs.python.org/3/library/argparse.html
-
 
 def main() -> None:
     parser = ArgumentParser(
@@ -73,6 +70,12 @@ def main() -> None:
         action="store_true",
         help="Full debug information and logging",
     )
+    parser.add_argument(
+        "--enrich-workers",
+        type=int,
+        default=3,
+        help="Number of concurrent processes to allow while enriching data",
+    )
     # todo these need more attention, but should now be accepted as command line options now.
     parser.add_argument(
         "--debug-wikipedia",
@@ -123,7 +126,7 @@ def main() -> None:
     if not facilities_data:
         logger.warning("No facility data available for enrichment.")
         return
-    facilities_data = enrich_facility_data(facilities_data)
+    facilities_data = enrich_facility_data(facilities_data, args.enrich_workers)
 
     if facilities_data:
         output_filename = args.output_file_name
```
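Taken together, the new flag is used like the README's other examples; the worker count `8` here is just an illustrative value, the flag name comes straight from the diff:

```
uv run python main.py --load-existing --enrich --enrich-workers 8
```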
