1818WIKIDATA_DELAY = 0.5 # Be respectful to Wikidata
1919
2020
21- def _smap (f ):
22- """
23- map a function name to its execution
24- per https://stackoverflow.com/a/60467981
25- """
26- return f ()
27-
28-
29- def enrich_facility_data (facilities_data : dict ) -> dict :
21+ def enrich_facility_data (facilities_data : dict , workers : int = 3 ) -> dict :
3022 """wrapper function for multiprocessing of facility enrichment"""
3123 start_time = time .time ()
3224 logger .info ("Starting data enrichment with external sources..." )
3325 enriched_data = copy .deepcopy (facilities_schema )
3426 total = len (facilities_data ["facilities" ])
3527 processed = 0
3628
37- with ProcessPoolExecutor (max_workers = 3 ) as pool :
29+ with ProcessPoolExecutor (max_workers = workers ) as pool :
3830 for res in pool .map (enrich_facility , facilities_data ["facilities" ].items ()):
3931 enriched_data ["facilities" ][res [0 ]] = res [1 ] # type: ignore [index]
4032 processed += 1
41- logger .info ("Finished %s, %s/%s completed" , res [1 ]["name" ], processed , total )
33+ logger .info (" -> Finished %s, %s/%s completed" , res [1 ]["name" ], processed , total )
4234
4335 logger .info ("Data enrichment completed!" )
4436 enriched_data ["enrich_runtime" ] = time .time () - start_time
@@ -64,12 +56,12 @@ def enrich_facility(facility_data: tuple) -> tuple:
6456 enriched_facility ["osm_search_query" ] = osm .get ("search_query_steps" , "" )
6557
6658 logger .debug (enriched_facility )
67- return ( facility_id , enriched_facility )
59+ return facility_id , enriched_facility
6860
6961
7062def _search_wikipedia (facility_name : str ) -> dict :
7163 """Search Wikipedia for facility and return final URL after redirects"""
72- facility_terms = [
64+ facility_terms : list = [
7365 "detention" ,
7466 "prison" ,
7567 "jail" ,
@@ -466,5 +458,4 @@ def _clean_facility_name(name: str) -> str:
466458 if cleaned .endswith (suffix ):
467459 cleaned = cleaned [: - len (suffix )].strip ()
468460 break
469-
470461 return cleaned
0 commit comments