Skip to content

Commit 7528fb9

Browse files
Wgms preprocess (#172)
* improve code efficiency of two functions of class AggregatedDataset(): init() and mapSplitsToDataset() * allows recording the dataframe in parquet format in addition to csv format * add possibility to divide between test and train based on subregion (c-region) as well as the possibility to have randomness and different sampling from sampling to sampling in set_train_test_split(). assign_train_test_indices(self, train_indices, test_indices, test_size) is defined to update dataloader with the values of the selected test/train divisions after 10 samplings based on subregion. * adaptation of dataset.py to choose output format of _get_output_filename() between csv and parquet * Alban's feedback on PR: mapSplitsToDataset() and init() more efficient for large dataset, output format to csv and parquet * Alban's feedback on PR: split on subregion added, modification of _create_group_kfold_splits() to cross-validate on subregion * Add function to plot test and train dataset (SMB versus elevation) * Adaptation of dataloader to answer review of #158 and new plot functions * to answer review of #158 * correct __init__ for mbm plot * preprocess WGMS data to be used by MBM #169 --------- Co-authored-by: Alban Gossard <alban.paul.gossard@gmail.com>
1 parent d593a35 commit 7528fb9

File tree

2 files changed

+72
-4
lines changed

2 files changed

+72
-4
lines changed

massbalancemachine/data_processing/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,8 @@
66
MBSequenceDataset,
77
)
88
import data_processing.utils
9-
from data_processing.wgms import load_wgms_data
9+
from data_processing.wgms import (
10+
check_and_download_wgms,
11+
load_wgms_data,
12+
parse_wgms_format,
13+
)

massbalancemachine/data_processing/wgms.py

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import zipfile
44
import pandas as pd
55

6-
wgms_zip_file = "DOI-WGMS-FoG-2025-02b.zip"
6+
wgms_zip_file = "DOI-WGMS-FoG-2026-02-10.zip"
77
data_path = ".data"
88

99
wgms_source_data_link = f"https://wgms.ch/downloads/{wgms_zip_file}"
@@ -24,7 +24,71 @@ def check_and_download_wgms():
2424

2525

2626
def load_wgms_data():
    """
    Read the WGMS point mass balance table and tag every row with a region number.

    The region number is taken from the glacier table: it is the integer that
    precedes the first "_" in the glacier's 'gtng_region' value, looked up
    through the point table's 'glacier_id' column.

    Returns:
        pd.DataFrame: contents of "mass_balance_point.csv" with an added
        'rgi_region' column.
    """
    # Runs before any file is read; presumably makes the WGMS archive
    # available under wgms_folder (its body is not visible here).
    check_and_download_wgms()

    point_mb_file = f"{wgms_folder}/data/mass_balance_point.csv"
    glacier_file = f"{wgms_folder}/data/glacier.csv"

    data_mb = pd.read_csv(point_mb_file)
    data_glacier = pd.read_csv(glacier_file)

    # Leading numeric token of 'gtng_region' (the part before the first "_").
    region_number = data_glacier["gtng_region"].str.split("_").str[0].astype(int)

    # Lookup table: glacier id -> region number.
    glacier_to_region = data_glacier.assign(rgi_region=region_number).set_index(
        "id"
    )["rgi_region"]

    # Glacier ids missing from the lookup table end up as NaN.
    data_mb["rgi_region"] = data_mb["glacier_id"].map(glacier_to_region)

    return data_mb
50+
51+
52+
def parse_wgms_format(data_mb):
    """
    Convert the WGMS point balance DataFrame to a dataframe ready to be used
    by the MBM data preparation notebook.

    Metadata columns that are not kept are dropped, the remaining WGMS
    columns are renamed to upper-case names, and both period dates are
    reformatted as "YYYYMMDD" strings. The input dataframe is not modified.

    Args:
        data_mb (pd.DataFrame): dataframe returned by load_wgms_data
            ("mass_balance_point.csv" from WGMS).
    Returns:
        pd.DataFrame: new dataframe with columns such as ID, YEAR,
        POINT_BALANCE, POINT_LAT, POINT_LON, POINT_ELEVATION, FROM_DATE and
        TO_DATE, plus any input column that is neither dropped nor renamed.
    Raises:
        KeyError: if one of the columns to drop is missing from data_mb.
    """
    new_df = data_mb.drop(
        columns=[
            "country",
            "glacier_name",
            "original_id",
            "glacier_id",
            "time_system",
            "begin_date_unc",
            "end_date_unc",
            "balance_unc",
            "density",
            "density_unc",
            "method",
            "balance_code",
            "remarks",
        ]
    )
    new_df = new_df.rename(
        columns={
            "id": "ID",
            "year": "YEAR",
            "balance": "POINT_BALANCE",
            "latitude": "POINT_LAT",
            "longitude": "POINT_LON",
            "elevation": "POINT_ELEVATION",
            "begin_date": "FROM_DATE",
            "end_date": "TO_DATE",
        },
    )
    # Each date column is reformatted from its own values; TO_DATE must not
    # be derived from FROM_DATE, otherwise every period collapses to one day.
    for date_column in ("FROM_DATE", "TO_DATE"):
        new_df[date_column] = pd.to_datetime(new_df[date_column]).dt.strftime(
            "%Y%m%d"
        )

    return new_df

0 commit comments

Comments
 (0)