Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion scripts/world_bank/datasets/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@
"cleaned_csv": "gcs_output/output/bq-results-20250423.csv"
}
],
"cron_schedule": "0 9 5,25 * *"
"cron_schedule": "0 9 5,25 * *",
"resource_limits": {
"cpu": 16,
"memory": 256,
"disk": 500
}
}
]
}
265 changes: 265 additions & 0 deletions scripts/world_bank/datasets/places.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
Country code,dcid
HRV,country/HRV
HTI,country/HTI
HUN,country/HUN
IDN,country/IDN
IND,country/IND
IRL,country/IRL
IRN,country/IRN
IRQ,country/IRQ
ISL,country/ISL
ISR,country/ISR
ITA,country/ITA
JAM,country/JAM
JOR,country/JOR
JPN,country/JPN
KAZ,country/KAZ
KEN,country/KEN
KGZ,country/KGZ
KHM,country/KHM
KOR,country/KOR
KWT,country/KWT
LAO,country/LAO
LBN,country/LBN
LBR,country/LBR
LBY,country/LBY
LCA,country/LCA
LKA,country/LKA
LSO,country/LSO
LTU,country/LTU
LUX,country/LUX
LVA,country/LVA
MAC,country/MAC
MAR,country/MAR
MCO,country/MCO
MDA,country/MDA
MDG,country/MDG
MDV,country/MDV
MEX,country/MEX
MKD,country/MKD
MLI,country/MLI
MLT,country/MLT
MMR,country/MMR
MNE,country/MNE
MNG,country/MNG
MOZ,country/MOZ
MRT,country/MRT
MUS,country/MUS
MWI,country/MWI
MYS,country/MYS
NAM,country/NAM
NER,country/NER
NGA,country/NGA
NIC,country/NIC
NLD,country/NLD
NOR,country/NOR
NPL,country/NPL
NZL,country/NZL
OMN,country/OMN
PAK,country/PAK
PAN,country/PAN
PER,country/PER
PHL,country/PHL
PNG,country/PNG
POL,country/POL
PRK,country/PRK
PRT,country/PRT
PRY,country/PRY
PSE,country/PSE
QAT,country/QAT
ROU,country/ROU
RUS,country/RUS
RWA,country/RWA
SAU,country/SAU
SDN,country/SDN
SEN,country/SEN
SGP,country/SGP
SLE,country/SLE
SLV,country/SLV
SMR,country/SMR
SRB,country/SRB
SSD,country/SSD
SUR,country/SUR
SVK,country/SVK
SVN,country/SVN
SWE,country/SWE
SWZ,country/SWZ
SYC,country/SYC
SYR,country/SYR
TCD,country/TCD
TGO,country/TGO
THA,country/THA
TJK,country/TJK
TKM,country/TKM
TTO,country/TTO
TUN,country/TUN
TUR,country/TUR
TWN,country/TWN
TZA,country/TZA
UGA,country/UGA
UKR,country/UKR
URY,country/URY
USA,country/USA
UZB,country/UZB
VEN,country/VEN
VNM,country/VNM
WSM,country/WSM
YEM,country/YEM
ZAF,country/ZAF
ZMB,country/ZMB
ZWE,country/ZWE
ABW,country/ABW
AFG,country/AFG
AGO,country/AGO
ALB,country/ALB
AND,country/AND
ARE,country/ARE
ARG,country/ARG
ARM,country/ARM
ASM,country/ASM
ATG,country/ATG
AUS,country/AUS
AUT,country/AUT
AZE,country/AZE
BDI,country/BDI
BEL,country/BEL
BEN,country/BEN
BFA,country/BFA
BGD,country/BGD
BGR,country/BGR
BHR,country/BHR
BHS,country/BHS
BIH,country/BIH
BLR,country/BLR
BLZ,country/BLZ
BMU,country/BMU
BOL,country/BOL
BRA,country/BRA
BRB,country/BRB
BRN,country/BRN
BTN,country/BTN
BWA,country/BWA
CAF,country/CAF
CAN,country/CAN
CHE,country/CHE
CHL,country/CHL
CHN,country/CHN
CIV,country/CIV
CMR,country/CMR
COD,country/COD
COG,country/COG
COL,country/COL
COM,country/COM
CPV,country/CPV
CRI,country/CRI
CUB,country/CUB
CUW,country/CUW
CYM,country/CYM
CYP,country/CYP
CZE,country/CZE
DEU,country/DEU
DMA,country/DMA
DNK,country/DNK
DOM,country/DOM
DZA,country/DZA
ECU,country/ECU
EGY,country/EGY
ERI,country/ERI
ESP,country/ESP
EST,country/EST
ETH,country/ETH
FIN,country/FIN
FJI,country/FJI
FRA,country/FRA
FSM,country/FSM
GAB,country/GAB
GBR,country/GBR
GEO,country/GEO
GHA,country/GHA
GIB,country/GIB
GIN,country/GIN
GMB,country/GMB
GNB,country/GNB
GNQ,country/GNQ
GRC,country/GRC
GRD,country/GRD
GTM,country/GTM
GUM,country/GUM
GUY,country/GUY
HKG,country/HKG
HND,country/HND
KIR,country/KIR
KNA,country/KNA
LIE,country/LIE
MHL,country/MHL
NCL,country/NCL
NRU,country/NRU
PLW,country/PLW
PRI,country/PRI
PYF,country/PYF
SLB,country/SLB
SOM,country/SOM
STP,country/STP
SXM,country/SXM
TCA,country/TCA
TLS,country/TLS
TON,country/TON
TUV,country/TUV
VCT,country/VCT
VGB,country/VGB
VIR,country/VIR
VUT,country/VUT
DJI,country/DJI
YUG,country/YUG
GRL,country/GRL
IMN,country/IMN
MNP,country/MNP
FRO,country/FRO
COK,country/COK
MSR,country/MSR
MAF,country/MAF
NIU,country/NIU
TKL,country/TKL
ANT,country/ANT
GLP,country/GLP
GUF,country/GUF
MTQ,country/MTQ
MYT,country/MYT
REU,country/REU
AIA,country/AIA
BES,country/BES
ESH,country/ESH
FLK,country/FLK
SHN,country/SHN
SPM,country/SPM
WLF,country/WLF
EUU,EuropeanUnion
XKX,country/XKS
BGD_CHI,wikidataId/Q158087
BGD_DHA,wikidataId/Q330158
BRA_RIO,wikidataId/Q41428
BRA_SAO,wikidataId/Q175
CHN_BEI,wikidataId/Q956
CHN_SHA,wikidataId/Q8686
IDN_JAK,wikidataId/Q3630
IDN_SUR,wikidataId/Q11462
IND_DEL,wikidataId/Q1353
IND_MUM,wikidataId/Q1156
JPN_OSA,wikidataId/Q122723
JPN_TOK,wikidataId/Q1490
MEX_MEC,wikidataId/Q1489
MEX_MON,wikidataId/Q81033
NGA_KAN,wikidataId/Q182984
NGA_LAG,wikidataId/Q815913
PAK_KAR,wikidataId/Q8660
PAK_LAH,wikidataId/Q3308170
RUS_MOS,wikidataId/Q649
RUS_STP,wikidataId/Q656
USA_LA,geoId/0644000
USA_NY,geoId/36
NAC,northamerica
SAS,undata-geo/G00158000
WLD,Earth
CHI,ChannelIslands
AFR,africa
AME,undata-geo/G00134000
49 changes: 44 additions & 5 deletions scripts/world_bank/datasets/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
# Directory and file path for the combined, transformed output CSV.
output = os.path.join(_GCS_OUTPUT_DIR, 'output')
output_file_path = os.path.join(output, 'transformed_data_for_all_final.csv')

# CSV files under _MODULE_DIR that _load_place_mapping() reads:
# places.csv supplies the place-code -> dcid overrides and skip_places.csv
# supplies the place codes to skip.
places_csv = os.path.join(_MODULE_DIR, 'places.csv')
skip_places_csv = os.path.join(_MODULE_DIR, 'skip_places.csv')

# Put the shared util/ directory on sys.path before importing file_util;
# the import below relies on this ordering.
_UTIL_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(_UTIL_DIR, '../../../util/'))
import file_util
Expand All @@ -27,6 +30,31 @@
flags.DEFINE_string("historical_file", "bq-results-20250423.csv",
"historical file name")

# Source place code -> dcid overrides; filled in by _load_place_mapping().
DCID_MAP = {}
# Source place codes (not dcids, despite the name) that are skipped;
# filled in by _load_place_mapping().
IGNORE_DCIDS = set()


def _read_place_rows(path, required_columns):
    """Return the rows of the CSV at `path`, keyed by lower-cased header names.

    Header names are stripped and lower-cased before use because places.csv
    spells its first column 'Country code' while the lookups use
    'country code'; csv.DictReader keys are case-sensitive, so an exact-case
    lookup raises KeyError on the first row.

    Args:
        path: Path of the CSV file to read.
        required_columns: Lower-case column names that must be present.

    Returns:
        A list of row dicts. Empty when the file has no header row, or when
        the file is missing/unreadable/lacks a required column and the
        logger's fatal() call returns control.
    """
    try:
        with open(path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            if reader.fieldnames is None:
                # Completely empty file: nothing to load, not an error.
                return []
            reader.fieldnames = [
                (name or '').strip().lower() for name in reader.fieldnames
            ]
            missing = [
                column for column in required_columns
                if column not in reader.fieldnames
            ]
            if missing:
                logging.fatal(
                    f"Missing column(s) {missing} in CSV {path}; "
                    f"found {reader.fieldnames}")
                return []
            # Materialise inside the try so csv.Error raised while iterating
            # is reported below.
            return list(reader)
    except FileNotFoundError:
        logging.fatal(f"File not found: {path}")
    except csv.Error as e:
        logging.fatal(f"Error reading CSV {path}: {e}")
    # NOTE(review): if `logging` is absl's, fatal() presumably terminates the
    # process and this line is never reached - confirm.
    return []


def _load_place_mapping():
    """Populate DCID_MAP and IGNORE_DCIDS from places.csv and skip_places.csv.

    places_csv must provide 'country code' and 'dcid' columns; each row adds
    a code -> dcid entry to DCID_MAP. skip_places_csv must provide a
    'country code' column; each row adds its code to IGNORE_DCIDS. Column
    names are matched case-insensitively and values are stripped. Rows with
    a blank code (or blank dcid, for places_csv) are ignored.
    """
    for row in _read_place_rows(places_csv, ('country code', 'dcid')):
        # DictReader yields None for fields missing from a short row.
        code = (row['country code'] or '').strip()
        dcid = (row['dcid'] or '').strip()
        if code and dcid:
            DCID_MAP[code] = dcid

    for row in _read_place_rows(skip_places_csv, ('country code',)):
        code = (row['country code'] or '').strip()
        if code:
            IGNORE_DCIDS.add(code)


def transform_worldbank_csv(input_filename,
writer,
Expand Down Expand Up @@ -57,7 +85,6 @@ def transform_worldbank_csv(input_filename,
break
except ValueError:
pass
"""Ignoring ValueError: expecting integer for year column identification."""
except ValueError:
logging.info(
f"Error: Could not find required columns in header of '{input_filename}'."
Expand All @@ -74,8 +101,16 @@ def transform_worldbank_csv(input_filename,
country_code_column_index is not None and year_columns_start_index is not None:
indicator_code = row[indicator_code_column_index].strip(
)
country_code = "country/" + row[
country_code_column_index].strip()
code_from_csv = row[country_code_column_index]
raw_country_code = "country/" + code_from_csv

if code_from_csv in IGNORE_DCIDS:
continue
if code_from_csv in DCID_MAP:
country_code = DCID_MAP[code_from_csv]
else:
country_code = raw_country_code

stat_var = "worldBank/" + indicator_code.replace(
'.', '_')

Expand All @@ -84,7 +119,6 @@ def transform_worldbank_csv(input_filename,
year = header[j]
value = row[j].strip()
if value:
"""Keeping the first occurrence and removing subsequent duplicates. Verified with source and production; the initial value from the source now is matching with the production data(checked for 4-5 samples) ."""
duplicate_key = (indicator_code, stat_var,
MEASUREMENT_METHOD,
country_code, year, unit_value)
Expand Down Expand Up @@ -125,10 +159,14 @@ def get_unit_by_indicator(target_indicator_code):
except FileNotFoundError:
return ""
except Exception as e:
logging.warning(
f"Error while reading unit for indicator {target_indicator_code}: {e}"
)
return ""


def main(_):
_load_place_mapping()
input_files = [
os.path.join(input_directory, f)
for f in os.listdir(input_directory)
Expand All @@ -151,9 +189,10 @@ def main(_):
logging.info(
f"\nSuccessfully processed {len(input_files)} files. Combined output written to '{output_file_path}'"
)
# historical_file = "bq-results-20250423.csv"

file_util.file_copy(f'{FLAGS.gs_path}{FLAGS.historical_file}',
f'{output}/{FLAGS.historical_file}')

expected_output_files = [
FLAGS.historical_file, 'transformed_data_for_all_final.csv'
]
Expand Down
Loading
Loading