Skip to content

Commit b50a728

Browse files
committed
Merge branch 'dev' into scriptsGeo
2 parents 785d973 + e6889e1 commit b50a728

File tree

3 files changed

+147
-10
lines changed

3 files changed

+147
-10
lines changed

massbalancemachine/data_processing/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66
MBSequenceDataset,
77
)
88
import data_processing.utils
9-
from data_processing.wgms import load_wgms_data
9+
from data_processing.wgms import (
10+
check_and_download_wgms,
11+
load_wgms_data,
12+
parse_wgms_format,
13+
)
1014
from data_processing.Product import Product
1115
from data_processing.product_utils import rgi_id_to_folders
1216
from data_processing.gridded_utils import (
Lines changed: 102 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,123 @@
11
import os
2+
import shutil
23
import urllib.request
34
import zipfile
45
import pandas as pd
56

6-
# Name of the WGMS Fluctuations-of-Glaciers archive to fetch.
wgms_zip_file = "DOI-WGMS-FoG-2026-02-10.zip"
# Local root directory where downloaded datasets are stored.
data_path = ".data"

# Remote URL of the archive on the WGMS download server.
wgms_source_data_link = f"https://wgms.ch/downloads/{wgms_zip_file}"
# Local path of the downloaded zip archive.
local_path_wgms = f"{data_path}/WGMS/{wgms_zip_file}"

# Directory the archive is extracted into (zip name without its extension).
wgms_folder = f"{data_path}/WGMS/{wgms_zip_file.replace('.zip', '')}"
14+
15+
16+
def _clean_extracted_wgms():
    """Delete the extracted WGMS folder if present, forcing a re-extraction."""
    if not os.path.isdir(wgms_folder):
        # Nothing extracted yet; nothing to remove.
        return
    shutil.rmtree(wgms_folder)
1319

1420

1521
def check_and_download_wgms():
    """Ensure the WGMS archive is available and extracted locally.

    Creates the local WGMS data directory if needed, downloads the zip
    archive from the WGMS website unless it is already on disk, and
    extracts it unless the extraction folder already exists.
    """
    os.makedirs(f"{data_path}/WGMS/", exist_ok=True)
    if os.path.isdir(wgms_folder):
        # Archive already extracted; nothing to do.
        return
    if not os.path.isfile(local_path_wgms):
        print("Downloading data from WGMS website")
        urllib.request.urlretrieve(wgms_source_data_link, local_path_wgms)
    print("Unzipping WGMS archive")
    with zipfile.ZipFile(local_path_wgms, "r") as archive:
        archive.extractall(wgms_folder)
2430

2531

2632
def load_wgms_data():
    """Load WGMS point mass-balance data, tagged with its RGI region.

    Downloads/extracts the WGMS archive if needed, then joins the point
    mass-balance table with the glacier table to attach an 'rgi_region'
    column (the numeric prefix of the glacier's 'gtng_region' value).

    Returns:
        pd.DataFrame: mass balance data with added 'rgi_region' column
    """
    check_and_download_wgms()

    mb_df = pd.read_csv(f"{wgms_folder}/data/mass_balance_point.csv")
    glacier_df = pd.read_csv(f"{wgms_folder}/data/glacier.csv")

    # 'gtng_region' looks like "<number>_<name>"; keep the numeric prefix.
    region_codes = glacier_df["gtng_region"].str.split("_").str[0].astype(int)
    # glacier id -> rgi_region lookup.
    glacier_to_region = pd.Series(region_codes.values, index=glacier_df["id"].values)

    mb_df["rgi_region"] = mb_df["glacier_id"].map(glacier_to_region)
    return mb_df
56+
57+
58+
def parse_wgms_format(data_mb):
    """
    Convert the WGMS point-balance DataFrame into the MBM column layout.

    Drops WGMS metadata columns, renames the remaining columns to the
    upper-case names expected by the MBM data-preparation notebook, and
    normalizes the begin/end dates to compact 'YYYYMMDD' strings.

    Args:
        data_mb (pd.DataFrame): dataframe loaded by load_wgms_data
            ("mass_balance_point.csv" from WGMS).
    Returns:
        pd.DataFrame: reformatted copy of the input.
    Raises:
        ValueError: if the resulting 'ID' column contains duplicates.
    """
    new_df = data_mb.drop(
        columns=[
            "country",
            "glacier_name",
            "original_id",
            "glacier_id",
            "time_system",
            "begin_date_unc",
            "end_date_unc",
            "balance_unc",
            "density",
            "density_unc",
            "method",
            "remarks",
        ]
    )
    new_df = new_df.rename(
        columns={
            "id": "ID",
            "year": "YEAR",
            "balance": "POINT_BALANCE",
            "latitude": "POINT_LAT",
            "longitude": "POINT_LON",
            "elevation": "POINT_ELEVATION",
            "begin_date": "FROM_DATE",
            "end_date": "TO_DATE",
            "balance_code": "PERIOD",
        },
    )
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, which would silently disable this integrity check.
    if new_df["ID"].nunique() != new_df.shape[0]:
        raise ValueError("It seems that ID are not unique")

    # Normalize dates to compact 'YYYYMMDD' strings for the MBM pipeline.
    new_df["FROM_DATE"] = pd.to_datetime(new_df["FROM_DATE"]).dt.strftime("%Y%m%d")
    new_df["TO_DATE"] = pd.to_datetime(new_df["TO_DATE"]).dt.strftime("%Y%m%d")

    return new_df
103+
104+
105+
def filter_dates(df, threshold_date_uncertainty=5):
    """
    Remove points whose measurement dates have too large an uncertainty.

    Args:
        df (pd.DataFrame): WGMS point mass-balance data with
            'begin_date_unc' and 'end_date_unc' columns.
        threshold_date_uncertainty (int, optional): maximum allowed
            uncertainty (inclusive) on both dates. Defaults to 5.

    Returns:
        pd.DataFrame: rows where both date uncertainties are within the
            threshold. Rows with NaN uncertainties are dropped as well,
            since NaN fails the comparison.
    """
    mask = (df.end_date_unc <= threshold_date_uncertainty) & (
        df.begin_date_unc <= threshold_date_uncertainty
    )
    return df[mask]
114+
115+
116+
def load_processed_wgms(rgi_region=None):
    """
    Load WGMS point mass-balance data, ready for the MBM pipeline.

    Downloads the WGMS archive if needed, removes points with too-uncertain
    dates, converts the table to the MBM column layout, and optionally
    restricts the result to a single RGI region.

    Args:
        rgi_region (int, optional): RGI first-order region number to keep
            (e.g. 11 for Central Europe). None keeps all regions.

    Returns:
        pd.DataFrame
    """
    # load_wgms_data() already calls check_and_download_wgms(), so a
    # separate download step here would be redundant.
    df = load_wgms_data()
    df = filter_dates(df)
    df = parse_wgms_format(df)
    if rgi_region is not None:
        df = df.loc[df.rgi_region == rgi_region]
    return df
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
import pytest
3+
import tempfile
4+
import pandas as pd
5+
import geopandas as gpd
6+
import massbalancemachine as mbm
7+
8+
9+
@pytest.mark.order1
def test_data_retrieval():
    """Remove any previously extracted WGMS data, then re-download it."""
    wgms = mbm.data_processing.wgms
    # Start from a clean slate so the download/extract path is exercised.
    wgms._clean_extracted_wgms()
    mbm.data_processing.check_and_download_wgms()
13+
14+
15+
@pytest.mark.order2
def test_data_preprocessing_wgms():
    """Check the processed WGMS dataframe has the expected schema and size."""
    df = mbm.data_processing.wgms.load_processed_wgms()
    required = {
        "YEAR",
        "ID",
        "FROM_DATE",
        "TO_DATE",
        "POINT_LAT",
        "POINT_LON",
        "POINT_ELEVATION",
        "POINT_BALANCE",
        "PERIOD",
        "rgi_region",
    }
    missing = required.difference(df.columns)
    assert not missing, (
        f"Not all features are in the dataframe. "
        f"Expected {required} but {missing} are missing."
    )
    assert df.shape == (64143, 10)
    # Restricting to RGI region 11 (Central Europe / Alps) must subset.
    df_alps = mbm.data_processing.wgms.load_processed_wgms(rgi_region=11)
    assert df_alps.shape == (27137, 10)
36+
37+
38+
if __name__ == "__main__":
    # Allow running the tests directly as a script, outside of pytest
    # (the @pytest.mark.order decorators are inert in that case).
    test_data_retrieval()
    test_data_preprocessing_wgms()

0 commit comments

Comments (0)