Skip to content

Commit f1a6509

Browse files
committed
merge
Merge remote-tracking branch 'upstream/main'
2 parents 3bb1f82 + bde0d84 commit f1a6509

File tree

4 files changed

+155
-12
lines changed

4 files changed

+155
-12
lines changed

massbalancemachine/data_processing/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,8 @@
77
MBSequenceDatasetTL,
88
)
99
import data_processing.utils
10-
from data_processing.wgms import load_wgms_data
10+
from data_processing.wgms import (
11+
check_and_download_wgms,
12+
load_wgms_data,
13+
parse_wgms_format,
14+
)

massbalancemachine/data_processing/get_topo_data.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -195,13 +195,19 @@ def _retrieve_topo_features(
195195
"""Find the nearest recorded point with topographical features on the glacier for each stake."""
196196

197197
for gdir, gdir_grid in zip(glacier_directories, gdirs_gridded):
198-
lat = grouped_stakes.get_group(gdir.rgi_id)[["POINT_LAT"]].values.flatten()
199-
lon = grouped_stakes.get_group(gdir.rgi_id)[["POINT_LON"]].values.flatten()
198+
lat = xr.DataArray(
199+
grouped_stakes.get_group(gdir.rgi_id)[["POINT_LAT"]].values.flatten(),
200+
dims="points",
201+
)
202+
lon = xr.DataArray(
203+
grouped_stakes.get_group(gdir.rgi_id)[["POINT_LON"]].values.flatten(),
204+
dims="points",
205+
)
200206

201207
topo_data = (
202208
gdir_grid.sel(x=lon, y=lat, method="nearest")[voi]
203209
.to_dataframe()
204210
.reset_index(drop=True)
205211
)
206212

207-
df.loc[df["RGIId"] == gdir.rgi_id, voi] = topo_data[voi]
213+
df.loc[df["RGIId"] == gdir.rgi_id, voi] = topo_data[voi].values
Lines changed: 101 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,123 @@
11
import os
2+
import shutil
23
import urllib.request
34
import zipfile
45
import pandas as pd
56

6-
wgms_zip_file = "DOI-WGMS-FoG-2025-02b.zip"
7+
# Name of the WGMS Fluctuations-of-Glaciers archive release in use.
wgms_zip_file = "DOI-WGMS-FoG-2026-02-10.zip"
# Local root directory where downloaded data is cached.
data_path = ".data"

# Download URL for the archive on the WGMS website.
wgms_source_data_link = f"https://wgms.ch/downloads/{wgms_zip_file}"
# Local path of the downloaded zip archive.
local_path_wgms = f"{data_path}/WGMS/{wgms_zip_file}"

# Directory the archive is extracted into (zip name without the extension).
wgms_folder = f"{data_path}/WGMS/{wgms_zip_file.replace('.zip', '')}"
14+
15+
16+
def _clean_extracted_wgms():
    """Delete the extracted WGMS data folder if it exists, forcing a re-extract next time."""
    if not os.path.isdir(wgms_folder):
        return
    shutil.rmtree(wgms_folder)
1319

1420

1521
def check_and_download_wgms():
    """Ensure the WGMS archive is downloaded and extracted locally.

    Creates the local WGMS cache directory, downloads the zip archive from
    the WGMS website if neither the extracted folder nor the zip file is
    present, and extracts it into ``wgms_folder``. Idempotent: safe to call
    repeatedly.

    Raises:
        Propagates any download (urllib) or extraction (zipfile) error.
    """
    os.makedirs(f"{data_path}/WGMS/", exist_ok=True)
    if os.path.isdir(wgms_folder):
        # Data already extracted; nothing to do.
        return
    if not os.path.isfile(local_path_wgms):
        print("Downloading data from WGMS website")
        urllib.request.urlretrieve(wgms_source_data_link, local_path_wgms)
    print("Unzipping WGMS archive")
    try:
        with zipfile.ZipFile(local_path_wgms, "r") as zip_ref:
            zip_ref.extractall(wgms_folder)
    except BaseException:
        # A partially extracted folder would make future calls believe the
        # data is present; remove it so the next call retries cleanly.
        if os.path.isdir(wgms_folder):
            shutil.rmtree(wgms_folder)
        raise
2430

2531

2632
def load_wgms_data():
    """
    Load WGMS data and enrich mass balance data with rgi_region.

    Reads the point mass-balance and glacier tables from the extracted WGMS
    archive and attaches each point's RGI region number, parsed from the
    glacier table's ``gtng_region`` column (the integer before the "_").

    Returns:
        pd.DataFrame: mass balance data with added 'rgi_region' column
    """
    check_and_download_wgms()

    wgms_data_dir = f"{wgms_folder}/data"
    data_mb = pd.read_csv(f"{wgms_data_dir}/mass_balance_point.csv")
    data_glacier = pd.read_csv(f"{wgms_data_dir}/glacier.csv")

    # glacier id -> rgi_region lookup (region number precedes the "_").
    region_number = data_glacier["gtng_region"].str.split("_").str[0].astype(int)
    mapping = pd.Series(region_number.values, index=data_glacier["id"])

    data_mb["rgi_region"] = data_mb["glacier_id"].map(mapping)

    return data_mb
56+
57+
58+
def parse_wgms_format(data_mb):
    """
    Converts the WGMS point balance DataFrame to a dataframe ready to be used by MBM Data preparation notebook.

    Args:
        data_mb (pd.DataFrame): dataframe loaded by load_wgms_data "mass_balance_point.csv" from WGMS.

    Returns:
        pd.DataFrame: reduced and renamed copy with FROM_DATE/TO_DATE
        normalized to "YYYYMMDD" strings.

    Raises:
        ValueError: if the "id" column contains duplicate values.
    """
    # Drop columns not used by the MBM pipeline.
    new_df = data_mb.drop(
        columns=[
            "country",
            "glacier_name",
            "original_id",
            "glacier_id",
            "time_system",
            "begin_date_unc",
            "end_date_unc",
            "balance_unc",
            "density",
            "density_unc",
            "method",
            "remarks",
        ]
    )
    new_df = new_df.rename(
        columns={
            "id": "ID",
            "year": "YEAR",
            "balance": "POINT_BALANCE",
            "latitude": "POINT_LAT",
            "longitude": "POINT_LON",
            "elevation": "POINT_ELEVATION",
            "begin_date": "FROM_DATE",
            "end_date": "TO_DATE",
            "balance_code": "PERIOD",
        },
    )
    # Explicit exception instead of `assert`, which is stripped when Python
    # runs with optimizations (-O) and would silently skip the check.
    if new_df.ID.nunique() != new_df.shape[0]:
        raise ValueError("It seems that ID are not unique")

    # Normalize dates to the compact YYYYMMDD string format used downstream.
    new_df["FROM_DATE"] = pd.to_datetime(new_df["FROM_DATE"]).dt.strftime("%Y%m%d")
    new_df["TO_DATE"] = pd.to_datetime(new_df["TO_DATE"]).dt.strftime("%Y%m%d")

    return new_df
103+
104+
105+
def filter_dates(df, threshold_date_uncertainty=5):
    """Remove points for which the dates have a too large uncertainty.

    Args:
        df (pd.DataFrame): WGMS point mass balance data with
            ``begin_date_unc`` and ``end_date_unc`` columns.
        threshold_date_uncertainty (int, optional): maximum allowed date
            uncertainty. Defaults to 5, preserving the previous hard-coded
            behavior.

    Returns:
        pd.DataFrame: rows whose begin and end date uncertainties are both
        within the threshold. Rows with NaN uncertainty are dropped as well,
        since NaN comparisons evaluate to False.
    """
    keep = (df.end_date_unc <= threshold_date_uncertainty) & (
        df.begin_date_unc <= threshold_date_uncertainty
    )
    return df[keep]
114+
115+
116+
def load_processed_wgms(rgi_region=None):
    """Load, filter, and reformat the WGMS point mass balance data.

    Args:
        rgi_region (int, optional): if given, keep only rows whose
            ``rgi_region`` equals this RGI region number (e.g. 11).

    Returns:
        pd.DataFrame: MBM-formatted dataframe (see ``parse_wgms_format``).
    """
    # load_wgms_data() already calls check_and_download_wgms(), so the
    # previous explicit call here was redundant and has been removed.
    df = load_wgms_data()
    df = filter_dates(df)
    df = parse_wgms_format(df)
    if rgi_region is not None:
        df = df.loc[df.rgi_region == rgi_region]
    return df
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
import pytest
3+
import tempfile
4+
import pandas as pd
5+
import geopandas as gpd
6+
import massbalancemachine as mbm
7+
8+
9+
@pytest.mark.order1
def test_data_retrieval():
    """End-to-end check of the WGMS download-and-extract path (requires network)."""
    # Remove any previously extracted folder first so the download/extract
    # logic is actually exercised rather than short-circuited.
    mbm.data_processing.wgms._clean_extracted_wgms()
    mbm.data_processing.check_and_download_wgms()
13+
14+
15+
@pytest.mark.order2
def test_data_preprocessing_wgms():
    """Check columns and row counts of the processed WGMS dataframe."""
    df = mbm.data_processing.wgms.load_processed_wgms()
    # Columns the MBM data-preparation pipeline expects downstream.
    expected_columns = [
        "YEAR",
        "ID",
        "FROM_DATE",
        "TO_DATE",
        "POINT_LAT",
        "POINT_LON",
        "POINT_ELEVATION",
        "POINT_BALANCE",
        "PERIOD",
        "rgi_region",
    ]
    assert set(expected_columns).issubset(
        set(df.columns)
    ), f"Not all features are in the dataframe. Expected {set(expected_columns)} but {set(expected_columns).difference(set(df.columns))} are missing."
    # NOTE(review): the exact shapes below are pinned to the current WGMS
    # archive release (wgms_zip_file); they must be updated whenever the
    # archive version changes — confirm on each data bump.
    assert df.shape == (64143, 10)
    df_alps = mbm.data_processing.wgms.load_processed_wgms(rgi_region=11)
    assert df_alps.shape == (27137, 10)
36+
37+
38+
if __name__ == "__main__":
    # Allow running this test module directly, without pytest, in order.
    test_data_retrieval()
    test_data_preprocessing_wgms()

0 commit comments

Comments
 (0)