diff --git a/scripts/world_bank/datasets/manifest.json b/scripts/world_bank/datasets/manifest.json index ca34f5b612..74776f0d0d 100644 --- a/scripts/world_bank/datasets/manifest.json +++ b/scripts/world_bank/datasets/manifest.json @@ -25,7 +25,12 @@ "cleaned_csv": "gcs_output/output/bq-results-20250423.csv" } ], - "cron_schedule": "0 9 5,25 * *" + "cron_schedule": "0 9 5,25 * *", + "resource_limits": { + "cpu": 16, + "memory": 256, + "disk": 500 + } } ] } \ No newline at end of file diff --git a/scripts/world_bank/datasets/places.csv b/scripts/world_bank/datasets/places.csv new file mode 100644 index 0000000000..60c433f6b5 --- /dev/null +++ b/scripts/world_bank/datasets/places.csv @@ -0,0 +1,265 @@ +Country code,dcid +HRV,country/HRV +HTI,country/HTI +HUN,country/HUN +IDN,country/IDN +IND,country/IND +IRL,country/IRL +IRN,country/IRN +IRQ,country/IRQ +ISL,country/ISL +ISR,country/ISR +ITA,country/ITA +JAM,country/JAM +JOR,country/JOR +JPN,country/JPN +KAZ,country/KAZ +KEN,country/KEN +KGZ,country/KGZ +KHM,country/KHM +KOR,country/KOR +KWT,country/KWT +LAO,country/LAO +LBN,country/LBN +LBR,country/LBR +LBY,country/LBY +LCA,country/LCA +LKA,country/LKA +LSO,country/LSO +LTU,country/LTU +LUX,country/LUX +LVA,country/LVA +MAC,country/MAC +MAR,country/MAR +MCO,country/MCO +MDA,country/MDA +MDG,country/MDG +MDV,country/MDV +MEX,country/MEX +MKD,country/MKD +MLI,country/MLI +MLT,country/MLT +MMR,country/MMR +MNE,country/MNE +MNG,country/MNG +MOZ,country/MOZ +MRT,country/MRT +MUS,country/MUS +MWI,country/MWI +MYS,country/MYS +NAM,country/NAM +NER,country/NER +NGA,country/NGA +NIC,country/NIC +NLD,country/NLD +NOR,country/NOR +NPL,country/NPL +NZL,country/NZL +OMN,country/OMN +PAK,country/PAK +PAN,country/PAN +PER,country/PER +PHL,country/PHL +PNG,country/PNG +POL,country/POL +PRK,country/PRK +PRT,country/PRT +PRY,country/PRY +PSE,country/PSE +QAT,country/QAT +ROU,country/ROU +RUS,country/RUS +RWA,country/RWA +SAU,country/SAU +SDN,country/SDN +SEN,country/SEN 
+SGP,country/SGP +SLE,country/SLE +SLV,country/SLV +SMR,country/SMR +SRB,country/SRB +SSD,country/SSD +SUR,country/SUR +SVK,country/SVK +SVN,country/SVN +SWE,country/SWE +SWZ,country/SWZ +SYC,country/SYC +SYR,country/SYR +TCD,country/TCD +TGO,country/TGO +THA,country/THA +TJK,country/TJK +TKM,country/TKM +TTO,country/TTO +TUN,country/TUN +TUR,country/TUR +TWN,country/TWN +TZA,country/TZA +UGA,country/UGA +UKR,country/UKR +URY,country/URY +USA,country/USA +UZB,country/UZB +VEN,country/VEN +VNM,country/VNM +WSM,country/WSM +YEM,country/YEM +ZAF,country/ZAF +ZMB,country/ZMB +ZWE,country/ZWE +ABW,country/ABW +AFG,country/AFG +AGO,country/AGO +ALB,country/ALB +AND,country/AND +ARE,country/ARE +ARG,country/ARG +ARM,country/ARM +ASM,country/ASM +ATG,country/ATG +AUS,country/AUS +AUT,country/AUT +AZE,country/AZE +BDI,country/BDI +BEL,country/BEL +BEN,country/BEN +BFA,country/BFA +BGD,country/BGD +BGR,country/BGR +BHR,country/BHR +BHS,country/BHS +BIH,country/BIH +BLR,country/BLR +BLZ,country/BLZ +BMU,country/BMU +BOL,country/BOL +BRA,country/BRA +BRB,country/BRB +BRN,country/BRN +BTN,country/BTN +BWA,country/BWA +CAF,country/CAF +CAN,country/CAN +CHE,country/CHE +CHL,country/CHL +CHN,country/CHN +CIV,country/CIV +CMR,country/CMR +COD,country/COD +COG,country/COG +COL,country/COL +COM,country/COM +CPV,country/CPV +CRI,country/CRI +CUB,country/CUB +CUW,country/CUW +CYM,country/CYM +CYP,country/CYP +CZE,country/CZE +DEU,country/DEU +DMA,country/DMA +DNK,country/DNK +DOM,country/DOM +DZA,country/DZA +ECU,country/ECU +EGY,country/EGY +ERI,country/ERI +ESP,country/ESP +EST,country/EST +ETH,country/ETH +FIN,country/FIN +FJI,country/FJI +FRA,country/FRA +FSM,country/FSM +GAB,country/GAB +GBR,country/GBR +GEO,country/GEO +GHA,country/GHA +GIB,country/GIB +GIN,country/GIN +GMB,country/GMB +GNB,country/GNB +GNQ,country/GNQ +GRC,country/GRC +GRD,country/GRD +GTM,country/GTM +GUM,country/GUM +GUY,country/GUY +HKG,country/HKG +HND,country/HND +KIR,country/KIR +KNA,country/KNA 
+LIE,country/LIE +MHL,country/MHL +NCL,country/NCL +NRU,country/NRU +PLW,country/PLW +PRI,country/PRI +PYF,country/PYF +SLB,country/SLB +SOM,country/SOM +STP,country/STP +SXM,country/SXM +TCA,country/TCA +TLS,country/TLS +TON,country/TON +TUV,country/TUV +VCT,country/VCT +VGB,country/VGB +VIR,country/VIR +VUT,country/VUT +DJI,country/DJI +YUG,country/YUG +GRL,country/GRL +IMN,country/IMN +MNP,country/MNP +FRO,country/FRO +COK,country/COK +MSR,country/MSR +MAF,country/MAF +NIU,country/NIU +TKL,country/TKL +ANT,country/ANT +GLP,country/GLP +GUF,country/GUF +MTQ,country/MTQ +MYT,country/MYT +REU,country/REU +AIA,country/AIA +BES,country/BES +ESH,country/ESH +FLK,country/FLK +SHN,country/SHN +SPM,country/SPM +WLF,country/WLF +EUU,EuropeanUnion +XKX,country/XKS +BGD_CHI,wikidataId/Q158087 +BGD_DHA,wikidataId/Q330158 +BRA_RIO,wikidataId/Q41428 +BRA_SAO,wikidataId/Q175 +CHN_BEI,wikidataId/Q956 +CHN_SHA,wikidataId/Q8686 +IDN_JAK,wikidataId/Q3630 +IDN_SUR,wikidataId/Q11462 +IND_DEL,wikidataId/Q1353 +IND_MUM,wikidataId/Q1156 +JPN_OSA,wikidataId/Q122723 +JPN_TOK,wikidataId/Q1490 +MEX_MEC,wikidataId/Q1489 +MEX_MON,wikidataId/Q81033 +NGA_KAN,wikidataId/Q182984 +NGA_LAG,wikidataId/Q815913 +PAK_KAR,wikidataId/Q8660 +PAK_LAH,wikidataId/Q3308170 +RUS_MOS,wikidataId/Q649 +RUS_STP,geoId/1263000 +USA_LA,geoId/0644000 +USA_NY,geoId/36 +NAC,northamerica +SAS,undata-geo/G00158000 +WLD,Earth +CHI,ChannelIslands +AFR,africa +AME,undata-geo/G00134000 \ No newline at end of file diff --git a/scripts/world_bank/datasets/process.py b/scripts/world_bank/datasets/process.py index e2e9bb73da..1f1e62f930 100644 --- a/scripts/world_bank/datasets/process.py +++ b/scripts/world_bank/datasets/process.py @@ -13,6 +13,9 @@ output = os.path.join(_GCS_OUTPUT_DIR, 'output') output_file_path = os.path.join(output, 'transformed_data_for_all_final.csv') +places_csv = os.path.join(_MODULE_DIR, 'places.csv') +skip_places_csv = os.path.join(_MODULE_DIR, 'skip_places.csv') + _UTIL_DIR = 
def _read_place_rows(path, required_columns):
    """Return the data rows of a CSV as dicts keyed by normalised headers.

    Header cells are stripped and lower-cased before use, so a header
    written as 'Country code' and one written as 'country code' resolve to
    the same key.

    Args:
        path: Path of the CSV file to read.
        required_columns: Normalised (lower-case) column names that must be
            present in the header.

    Returns:
        A list with one dict per data row.

    Raises:
        FileNotFoundError: If `path` does not exist.
        csv.Error: If the file cannot be parsed or a required column is
            missing from the header.
    """
    with open(path, 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # Reading `fieldnames` consumes the header row; assigning it back
        # replaces the keys used for every subsequent row.
        reader.fieldnames = [
            (name or '').strip().lower() for name in (reader.fieldnames or [])
        ]
        missing = [c for c in required_columns if c not in reader.fieldnames]
        if missing:
            raise csv.Error(f"missing required column(s) {missing}")
        return list(reader)


def _load_place_mapping():
    """Populate DCID_MAP and IGNORE_DCIDS from the place CSV files.

    `places_csv` supplies (country code -> dcid) overrides and
    `skip_places_csv` supplies the country codes that must be dropped.
    Column names are matched case-insensitively because the mapping file is
    written with a 'Country code' header while the skip list uses
    'country code'; an exact-case lookup raises KeyError on the first row.

    Rows with an empty code (or an empty dcid in the mapping file) are
    ignored rather than crashing on a missing value.

    Errors are reported through logging.fatal, as before.
    NOTE(review): if `logging` here is absl's, fatal also aborts the
    process - confirm against the file's imports.
    """
    try:
        for row in _read_place_rows(places_csv, ('country code', 'dcid')):
            # Short rows yield None for absent cells, hence the `or ''`.
            code = (row.get('country code') or '').strip()
            dcid = (row.get('dcid') or '').strip()
            if code and dcid:
                DCID_MAP[code] = dcid
    except FileNotFoundError:
        logging.fatal(f"File not found: {places_csv}")
    except csv.Error as e:
        logging.fatal(f"Error reading CSV {places_csv}: {e}")

    try:
        for row in _read_place_rows(skip_places_csv, ('country code',)):
            code = (row.get('country code') or '').strip()
            if code:
                IGNORE_DCIDS.add(code)
    except FileNotFoundError:
        logging.fatal(f"File not found: {skip_places_csv}")
    except csv.Error as e:
        logging.fatal(f"Error reading CSV {skip_places_csv}: {e}")
@@ -74,8 +101,16 @@ def transform_worldbank_csv(input_filename, country_code_column_index is not None and year_columns_start_index is not None: indicator_code = row[indicator_code_column_index].strip( ) - country_code = "country/" + row[ - country_code_column_index].strip() + code_from_csv = row[country_code_column_index] + raw_country_code = "country/" + code_from_csv + + if code_from_csv in IGNORE_DCIDS: + continue + if code_from_csv in DCID_MAP: + country_code = DCID_MAP[code_from_csv] + else: + country_code = raw_country_code + stat_var = "worldBank/" + indicator_code.replace( '.', '_') @@ -84,7 +119,6 @@ def transform_worldbank_csv(input_filename, year = header[j] value = row[j].strip() if value: - """Keeping the first occurrence and removing subsequent duplicates. Verified with source and production; the initial value from the source now is matching with the production data(checked for 4-5 samples) .""" duplicate_key = (indicator_code, stat_var, MEASUREMENT_METHOD, country_code, year, unit_value) @@ -125,10 +159,14 @@ def get_unit_by_indicator(target_indicator_code): except FileNotFoundError: return "" except Exception as e: + logging.warning( + f"Error while reading unit for indicator {target_indicator_code}: {e}" + ) return "" def main(_): + _load_place_mapping() input_files = [ os.path.join(input_directory, f) for f in os.listdir(input_directory) @@ -151,9 +189,10 @@ def main(_): logging.info( f"\nSuccessfully processed {len(input_files)} files. 
Combined output written to '{output_file_path}'" ) - # historical_file = "bq-results-20250423.csv" + file_util.file_copy(f'{FLAGS.gs_path}{FLAGS.historical_file}', f'{output}/{FLAGS.historical_file}') + expected_output_files = [ FLAGS.historical_file, 'transformed_data_for_all_final.csv' ] diff --git a/scripts/world_bank/datasets/skip_places.csv b/scripts/world_bank/datasets/skip_places.csv new file mode 100644 index 0000000000..069c032c6b --- /dev/null +++ b/scripts/world_bank/datasets/skip_places.csv @@ -0,0 +1,61 @@ +country code +FCS +HIC +HPC +IBD +IBT +IDA +IDB +IDX +LAC +LCN +LDC +LIC +LMC +LMY +LTE +MEA +MIC +OED +OSS +PRE +PSS +PST +SSA +SSF +SST +TEA +TEC +TLA +TSA +TSS +UMC +MNA +TMN +AFE +AFW +ARB +CEB +CSS +EAP +EAR +EAS +ECA +ECS +EMU +NOC +OEC +FTI +NAF +SXZ +XZN +EMD +LIX +DEA +DEC +DFS +DLA +DMN +DNF +DSA +DSS \ No newline at end of file