Skip to content

Commit e6889e1

Browse files
committed
add WGMS unit tests + fix a few issues + add data filtering
1 parent 7528fb9 commit e6889e1

File tree

2 files changed

+76
-7
lines changed

2 files changed

+76
-7
lines changed

massbalancemachine/data_processing/wgms.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import shutil
23
import urllib.request
34
import zipfile
45
import pandas as pd
@@ -7,18 +8,23 @@
78
data_path = ".data"
89

910
wgms_source_data_link = f"https://wgms.ch/downloads/{wgms_zip_file}"
10-
local_path_wgms = f"{data_path}/{wgms_zip_file}"
11+
local_path_wgms = f"{data_path}/WGMS/{wgms_zip_file}"
1112

12-
wgms_folder = f"{data_path}/{wgms_zip_file.replace('.zip', '')}"
13+
wgms_folder = f"{data_path}/WGMS/{wgms_zip_file.replace('.zip', '')}"
14+
15+
16+
def _clean_extracted_wgms():
    """Delete the extracted WGMS directory (``wgms_folder``) if it exists.

    Does nothing when ``wgms_folder`` is not an existing directory.
    """
    if not os.path.isdir(wgms_folder):
        return
    shutil.rmtree(wgms_folder)
1319

1420

1521
def check_and_download_wgms():
    """Make sure the extracted WGMS data is available under ``wgms_folder``.

    If ``wgms_folder`` already exists as a directory nothing is done.
    Otherwise the archive is downloaded from ``wgms_source_data_link`` to
    ``local_path_wgms`` (unless that file is already present) and extracted
    into ``wgms_folder``.

    Both steps are staged under a temporary name and renamed on success, so
    an interrupted download or extraction never leaves a truncated archive
    or a half-filled folder that a later call would mistake for valid data.

    Raises:
        Whatever ``urllib.request.urlretrieve``, ``zipfile.ZipFile`` or the
        filesystem calls raise; staged leftovers are removed before the
        exception propagates.
    """
    os.makedirs(f"{data_path}/WGMS/", exist_ok=True)
    if os.path.isdir(wgms_folder):
        return

    if not os.path.isfile(local_path_wgms):
        print("Downloading data from WGMS website")
        partial_zip = f"{local_path_wgms}.part"
        try:
            urllib.request.urlretrieve(wgms_source_data_link, partial_zip)
            os.replace(partial_zip, local_path_wgms)
        finally:
            # Only present here if the download or the rename failed.
            if os.path.isfile(partial_zip):
                os.remove(partial_zip)

    print("Unzipping WGMS archive")
    partial_folder = f"{wgms_folder}.partial"
    # Discard leftovers from a previously interrupted extraction.
    shutil.rmtree(partial_folder, ignore_errors=True)
    try:
        with zipfile.ZipFile(local_path_wgms, "r") as zip_ref:
            zip_ref.extractall(partial_folder)
        os.rename(partial_folder, wgms_folder)
    finally:
        # No-op after a successful rename; cleans up after a failure.
        shutil.rmtree(partial_folder, ignore_errors=True)
2430

@@ -72,7 +78,6 @@ def parse_wgms_format(data_mb):
7278
"density",
7379
"density_unc",
7480
"method",
75-
"balance_code",
7681
"remarks",
7782
]
7883
)
@@ -86,9 +91,33 @@ def parse_wgms_format(data_mb):
8691
"elevation": "POINT_ELEVATION",
8792
"begin_date": "FROM_DATE",
8893
"end_date": "TO_DATE",
94+
"balance_code": "PERIOD",
8995
},
9096
)
97+
assert new_df.ID.nunique() == new_df.shape[0], "It seems that ID are not unique"
98+
9199
new_df["FROM_DATE"] = pd.to_datetime(new_df["FROM_DATE"]).dt.strftime("%Y%m%d")
92-
new_df["TO_DATE"] = pd.to_datetime(new_df["FROM_DATE"]).dt.strftime("%Y%m%d")
100+
new_df["TO_DATE"] = pd.to_datetime(new_df["TO_DATE"]).dt.strftime("%Y%m%d")
93101

94102
return new_df
103+
104+
105+
def filter_dates(df, threshold_date_uncertainty=5):
    """Drop measurements whose begin or end date is too uncertain.

    Args:
        df: DataFrame with ``begin_date_unc`` and ``end_date_unc`` columns.
        threshold_date_uncertainty: Largest accepted uncertainty (inclusive),
            expressed in the same unit as the two uncertainty columns.

    Returns:
        A new DataFrame holding only the rows where both uncertainties are
        less than or equal to the threshold. Rows with a missing (NaN)
        uncertainty are dropped as well, because comparisons against NaN
        evaluate to False. The input frame is left unmodified.
    """
    within_tolerance = (df["end_date_unc"] <= threshold_date_uncertainty) & (
        df["begin_date_unc"] <= threshold_date_uncertainty
    )
    # Return an explicit copy so callers can add or modify columns without
    # pandas chained-assignment warnings.
    return df.loc[within_tolerance].copy()
114+
115+
116+
def load_processed_wgms(rgi_region=None):
    """Return the WGMS data after download, date filtering and reformatting.

    The steps run in this order: ``check_and_download_wgms``,
    ``load_wgms_data``, ``filter_dates`` and ``parse_wgms_format``.

    Args:
        rgi_region: When not ``None``, keep only the rows whose
            ``rgi_region`` column equals this value.

    Returns:
        The processed DataFrame, optionally restricted to one region.
    """
    check_and_download_wgms()
    processed = parse_wgms_format(filter_dates(load_wgms_data()))
    if rgi_region is None:
        return processed
    return processed.loc[processed.rgi_region == rgi_region]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
import pytest
3+
import tempfile
4+
import pandas as pd
5+
import geopandas as gpd
6+
import massbalancemachine as mbm
7+
8+
9+
@pytest.mark.order1
def test_data_retrieval():
    """Remove any extracted WGMS folder, then run the download/extract step."""
    data_processing = mbm.data_processing
    data_processing.wgms._clean_extracted_wgms()
    data_processing.check_and_download_wgms()
13+
14+
15+
@pytest.mark.order2
def test_data_preprocessing_wgms():
    """Check the columns and shapes returned by ``load_processed_wgms``.

    Verifies that the expected columns are present, and pins the shape of
    both the full dataset and the subset selected with ``rgi_region=11``.
    """
    load_processed = mbm.data_processing.wgms.load_processed_wgms
    expected_columns = [
        "YEAR",
        "ID",
        "FROM_DATE",
        "TO_DATE",
        "POINT_LAT",
        "POINT_LON",
        "POINT_ELEVATION",
        "POINT_BALANCE",
        "PERIOD",
        "rgi_region",
    ]

    df = load_processed()
    assert set(expected_columns).issubset(
        set(df.columns)
    ), f"Not all features are in the dataframe. Expected {set(expected_columns)} but {set(expected_columns).difference(set(df.columns))} are missing."
    assert df.shape == (64143, 10)

    df_alps = load_processed(rgi_region=11)
    assert df_alps.shape == (27137, 10)
36+
37+
38+
if __name__ == "__main__":
39+
test_data_retrieval()
40+
test_data_preprocessing_wgms()

0 commit comments

Comments
 (0)