1+ import copy
12import csv
3+ import flatdict # type: ignore [import-untyped]
24import json
3- from schemas import (
4- debug_schema ,
5- facility_schema ,
6- enrichment_schema ,
7- )
5+ from schemas import enrichment_print_schema
86from utils import logger
97
108
@@ -18,22 +16,17 @@ def export_to_file(
1816 return ""
1917
2018 full_name = f"{ filename } .{ file_type } "
19+ csv_filtered_keys = ["raw_scrape" , "wikipedia_search_query" , "wikidata_search_query" , "osm_search_query" ]
2120 try :
2221 with open (full_name , "w" , newline = "" , encoding = "utf-8" ) as f_out :
2322 if file_type == "csv" :
24- base_fields : list = list (facility_schema .keys ())
25- fieldnames : list = base_fields .copy ()
26-
27- if any (field in facilities_data ["facilities" ][0 ] for field in enrichment_schema ):
28- fieldnames .extend (enrichment_schema )
29-
30- if any (field in facilities_data ["facilities" ][0 ] for field in debug_schema ):
31- fieldnames .extend (debug_schema )
23+ flatdata = [flatdict .FlatDict (f , delimiter = "." ) for f in facilities_data ["facilities" ]]
24+ fieldnames = [k for k in flatdata [0 ].keys () if k not in csv_filtered_keys ]
3225
3326 writer = csv .DictWriter (f_out , fieldnames = fieldnames )
3427 writer .writeheader ()
35- for facility in facilities_data [ "facilities" ] :
36- row_data = {field : facility .get (field , "" ) for field in fieldnames }
28+ for facility in flatdata :
29+ row_data = {field : facility .get (field , None ) for field in fieldnames }
3730 writer .writerow (row_data )
3831 elif file_type == "json" :
3932 json .dump (facilities_data , f_out , indent = 2 , sort_keys = True , default = str )
@@ -75,37 +68,34 @@ def print_summary(facilities_data: dict) -> None:
7568 logger .info (" %s: %s" , office , count )
7669
7770 # Check enrichment data if available
78- if "wikipedia_page_url" in facilities_data ["facilities" ][0 ]:
79- wiki_found = sum (
80- 1 for f in facilities_data ["facilities" ] if f .get ("wikipedia_page_url" ) and f ["wikipedia_page_url" ]
81- )
82- wikidata_found = sum (
83- 1 for f in facilities_data ["facilities" ] if f .get ("wikidata_page_url" ) and f ["wikidata_page_url" ]
84- )
85- osm_found = sum (1 for f in facilities_data ["facilities" ] if f .get ("osm_result_url" ) and f ["osm_result_url" ])
71+ enrich_data = copy .deepcopy (enrichment_print_schema )
72+ enrich_data ["wiki_found" ] = sum (1 for f in facilities_data ["facilities" ] if f .get ("wikipedia_page_url" , None ))
73+ enrich_data ["wikidata_found" ] = sum (1 for f in facilities_data ["facilities" ] if f .get ("wikidata_page_url" , None ))
74+ enrich_data ["osm_found" ] = sum (1 for f in facilities_data ["facilities" ] if f .get ("osm_result_url" , None ))
8675
76+ if any (v > 0 for v in enrich_data .values ()):
8777 logger .info ("\n === External Data Enrichment Results ===" )
8878 logger .info (
8979 "Wikipedia pages found: %s/%s (%s%%)" ,
90- wiki_found ,
80+ enrich_data [ " wiki_found" ] ,
9181 total_facilities ,
92- wiki_found / total_facilities * 100 ,
82+ enrich_data [ " wiki_found" ] / total_facilities * 100 ,
9383 )
9484 logger .info (
9585 "Wikidata entries found: %s/%s (%s%%)" ,
96- wikidata_found ,
86+ enrich_data [ " wikidata_found" ] ,
9787 total_facilities ,
98- wikidata_found / total_facilities * 100 ,
88+ enrich_data [ " wikidata_found" ] / total_facilities * 100 ,
9989 )
10090 logger .info (
10191 "OpenStreetMap results found: %s/%s (%s%%)" ,
102- osm_found ,
92+ enrich_data [ " osm_found" ] ,
10393 total_facilities ,
104- osm_found / total_facilities * 100 ,
94+ enrich_data [ " osm_found" ] / total_facilities * 100 ,
10595 )
10696
10797 # Debug information if available
108- if "wikipedia_search_query" in facilities_data ["facilities" ][0 ]:
98+ if facilities_data ["facilities" ][0 ]. get ( "wikipedia_search_query" , None ) :
10999 logger .info ("\n === Wikipedia Debug Information ===" )
110100 false_positives = 0
111101 errors = 0
@@ -120,10 +110,10 @@ def print_summary(facilities_data: dict) -> None:
120110 logger .info ("Search errors encountered: %s" , errors )
121111 logger .info ("Note: Review 'wikipedia_search_query' column for detailed search information" )
122112
123- if "wikidata_search_query" in facilities_data ["facilities" ][0 ]:
113+ if facilities_data ["facilities" ][0 ]. get ( "wikidata_search_query" , None ) :
124114 logger .warning ("Note: Review 'wikidata_search_query' column for detailed search information" )
125115
126- if "osm_search_query" in facilities_data ["facilities" ][0 ]:
116+ if facilities_data ["facilities" ][0 ]. get ( "osm_search_query" , None ) :
127117 logger .warning ("Note: Review 'osm_search_query' column for detailed search information" )
128118
129119 logger .info ("\n === ICE Detention Facilities Scraper: Run completed ===" )
0 commit comments