|
1 | 1 | import os |
| 2 | +import shutil |
2 | 3 | import urllib.request |
3 | 4 | import zipfile |
4 | 5 | import pandas as pd |
5 | 6 |
|
# WGMS "Fluctuations of Glaciers" release archive and local cache layout.
wgms_zip_file = "DOI-WGMS-FoG-2026-02-10.zip"
data_path = ".data"

# All WGMS files live under this sub-directory of the data cache.
_wgms_dir = f"{data_path}/WGMS"

# Remote location of the archive on the WGMS download server.
wgms_source_data_link = f"https://wgms.ch/downloads/{wgms_zip_file}"
# Local path of the downloaded zip archive.
local_path_wgms = f"{_wgms_dir}/{wgms_zip_file}"
# Directory the archive gets extracted into (zip name without extension).
wgms_folder = f"{_wgms_dir}/{wgms_zip_file.replace('.zip', '')}"
| 14 | + |
| 15 | + |
def _clean_extracted_wgms():
    """Delete the extracted WGMS data directory, if present."""
    if not os.path.isdir(wgms_folder):
        return
    shutil.rmtree(wgms_folder)
13 | 19 |
|
14 | 20 |
|
def check_and_download_wgms():
    """Ensure the WGMS archive is downloaded and extracted locally.

    If the extracted folder already exists, this is a no-op. Otherwise the
    zip archive is downloaded from the WGMS website (unless already cached)
    and extracted into ``wgms_folder``.
    """
    # Derive the cache directory from local_path_wgms instead of
    # re-hardcoding f"{data_path}/WGMS/" — keeps the two from drifting apart.
    os.makedirs(os.path.dirname(local_path_wgms), exist_ok=True)
    if os.path.isdir(wgms_folder):
        return  # already extracted, nothing to do
    if not os.path.isfile(local_path_wgms):
        print("Downloading data from WGMS website")
        urllib.request.urlretrieve(wgms_source_data_link, local_path_wgms)
    print("Unzipping WGMS archive")
    with zipfile.ZipFile(local_path_wgms, "r") as zip_ref:
        zip_ref.extractall(wgms_folder)
24 | 30 |
|
25 | 31 |
|
def load_wgms_data():
    """Load WGMS point mass-balance data, enriched with ``rgi_region``.

    Returns:
        pd.DataFrame: point mass-balance records with an added
        'rgi_region' column derived from each glacier's ``gtng_region``.
    """
    check_and_download_wgms()

    points = pd.read_csv(f"{wgms_folder}/data/mass_balance_point.csv")
    glaciers = pd.read_csv(f"{wgms_folder}/data/glacier.csv")

    # glacier id -> RGI region number: the integer before "_" in
    # gtng_region (e.g. "11_..." -> 11).
    region_by_id = (
        glaciers.set_index("id")["gtng_region"].str.split("_").str[0].astype(int)
    )

    points["rgi_region"] = points["glacier_id"].map(region_by_id)
    return points
| 56 | + |
| 57 | + |
def parse_wgms_format(data_mb):
    """Convert WGMS point-balance data to the MBM data-preparation format.

    Drops WGMS metadata columns, renames the remaining columns to the MBM
    naming convention, and normalises dates to compact ``YYYYMMDD`` strings.

    Args:
        data_mb (pd.DataFrame): dataframe returned by ``load_wgms_data``
            (WGMS "mass_balance_point.csv" records).

    Returns:
        pd.DataFrame: the reformatted dataframe.

    Raises:
        ValueError: if the resulting 'ID' column contains duplicates.
        KeyError: if an expected WGMS column to drop is missing.
    """
    new_df = data_mb.drop(
        columns=[
            "country",
            "glacier_name",
            "original_id",
            "glacier_id",
            "time_system",
            "begin_date_unc",
            "end_date_unc",
            "balance_unc",
            "density",
            "density_unc",
            "method",
            "remarks",
        ]
    )
    new_df = new_df.rename(
        columns={
            "id": "ID",
            "year": "YEAR",
            "balance": "POINT_BALANCE",
            "latitude": "POINT_LAT",
            "longitude": "POINT_LON",
            "elevation": "POINT_ELEVATION",
            "begin_date": "FROM_DATE",
            "end_date": "TO_DATE",
            "balance_code": "PERIOD",
        },
    )
    # Validate with an explicit exception rather than `assert`, which is
    # silently stripped when Python runs with -O.
    if new_df["ID"].nunique() != new_df.shape[0]:
        raise ValueError("It seems that ID are not unique")

    new_df["FROM_DATE"] = pd.to_datetime(new_df["FROM_DATE"]).dt.strftime("%Y%m%d")
    new_df["TO_DATE"] = pd.to_datetime(new_df["TO_DATE"]).dt.strftime("%Y%m%d")

    return new_df
| 103 | + |
| 104 | + |
def filter_dates(df):
    """Drop rows whose begin/end date uncertainty exceeds the threshold.

    Rows where either ``begin_date_unc`` or ``end_date_unc`` is above 5
    (or NaN) are removed.
    """
    max_unc = 5
    ok = (df["begin_date_unc"] <= max_unc) & (df["end_date_unc"] <= max_unc)
    return df[ok]
| 114 | + |
| 115 | + |
def load_processed_wgms(rgi_region=None):
    """Return WGMS point mass-balance data in the MBM format.

    Downloads the WGMS archive if needed, filters out points with large
    date uncertainty, and reformats columns for the MBM preparation step.

    Args:
        rgi_region (int, optional): if given, keep only rows belonging to
            this RGI region.

    Returns:
        pd.DataFrame: processed (and optionally region-filtered) data.
    """
    # load_wgms_data() already calls check_and_download_wgms(), so the
    # previous explicit call here was redundant and has been removed.
    df = load_wgms_data()
    df = filter_dates(df)
    df = parse_wgms_format(df)
    if rgi_region is not None:
        df = df.loc[df.rgi_region == rgi_region]
    return df
0 commit comments