Skip to content

Commit ebeebea

Browse files
committed
feat: Add obs4ref datasets
1 parent f466985 commit ebeebea

6 files changed

Lines changed: 122 additions & 37 deletions

File tree

scripts/fetch_test_data.py

Lines changed: 3 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,45 +2,16 @@
22
from pathlib import Path
33
from typing import Annotated
44

5-
import pandas as pd
65
import pooch
76
import typer
87
import xarray as xr
98

10-
from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest
9+
from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest, Obs4REFRequest
1110

1211
OUTPUT_PATH = Path("data")
1312
app = typer.Typer()
1413

1514

16-
def deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
17-
"""
18-
Deduplicate a dataset collection.
19-
20-
Uses the metadata from the first dataset in each group,
21-
but expands the time range to the min/max timespan of the group.
22-
23-
Parameters
24-
----------
25-
datasets
26-
The dataset collection
27-
28-
Returns
29-
-------
30-
pd.DataFrame
31-
The deduplicated dataset collection spanning the times requested
32-
"""
33-
34-
def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
35-
first = group.iloc[0].copy()
36-
first.time_start = group.time_start.min()
37-
first.time_end = group.time_end.max()
38-
39-
return first
40-
41-
return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()
42-
43-
4415
def process_sample_data_request(
4516
request: DataRequest, decimate: bool, output_directory: Path, quiet: bool
4617
) -> None:
@@ -61,7 +32,6 @@ def process_sample_data_request(
6132
Whether to suppress progress messages
6233
"""
6334
datasets = request.fetch_datasets()
64-
datasets = deduplicate_datasets(datasets)
6535

6636
for _, dataset in datasets.iterrows():
6737
for ds_filename in dataset["files"]:
@@ -183,6 +153,8 @@ def process_sample_data_request(
183153
remove_ensembles=False,
184154
time_span=("2002", "2016"),
185155
),
156+
# All unpublished obs4mips datasets
157+
Obs4REFRequest(),
186158
]
187159

188160

src/ref_sample_data/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,6 @@
1010
from .data_request.base import DataRequest
1111
from .data_request.cmip6 import CMIP6Request
1212
from .data_request.obs4mips import Obs4MIPsRequest
13+
from .data_request.obs4ref import Obs4REFRequest
1314

14-
__all__ = ["CMIP6Request", "DataRequest", "Obs4MIPsRequest"]
15+
__all__ = ["CMIP6Request", "DataRequest", "Obs4MIPsRequest", "Obs4REFRequest"]

src/ref_sample_data/data_request/base.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,34 @@ def generate_filename(
3434
...
3535

3636

37+
def _deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
38+
"""
39+
Deduplicate a dataset collection.
40+
41+
Uses the metadata from the first dataset in each group,
42+
but expands the time range to the min/max timespan of the group.
43+
44+
Parameters
45+
----------
46+
datasets
47+
The dataset collection
48+
49+
Returns
50+
-------
51+
pd.DataFrame
52+
The deduplicated dataset collection spanning the times requested
53+
"""
54+
55+
def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
56+
first = group.iloc[0].copy()
57+
first.time_start = group.time_start.min()
58+
first.time_end = group.time_end.max()
59+
60+
return first
61+
62+
return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()
63+
64+
3765
class IntakeESGFDataRequest(DataRequest):
3866
"""
3967
A data request that fetches datasets from ESGF using intake-esgf.
@@ -56,4 +84,4 @@ def fetch_datasets(self) -> pd.DataFrame:
5684
if self.time_span:
5785
merged_df["time_start"] = self.time_span[0]
5886
merged_df["time_end"] = self.time_span[1]
59-
return merged_df
87+
return _deduplicate_datasets(merged_df)

src/ref_sample_data/data_request/cmip6.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,6 @@ def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
9494
----------
9595
dataset
9696
The dataset to downscale
97-
time_span
98-
The time span to extract from a dataset
9997
10098
Returns
10199
-------

src/ref_sample_data/data_request/obs4mips.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
7373
----------
7474
dataset
7575
The dataset to downscale
76-
time_span
77-
The time span to extract from a dataset
7876
7977
Returns
8078
-------
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import pathlib
2+
from pathlib import Path
3+
4+
import pandas as pd
5+
import xarray as xr
6+
from cmip_ref.dataset_registry import build_reference_data_registry
7+
8+
from ref_sample_data.data_request.base import DataRequest
9+
from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear
10+
11+
class Obs4REFRequest(DataRequest):
    """
    Fetch the unpublished Obs4MIPs datasets from the PMP registry

    This includes all files that would be downloaded if you ran:
    ```
    ref datasets fetch-obs4ref-data --output-data ...
    ```
    """

    def fetch_datasets(self) -> pd.DataFrame:
        """
        Fetch the datasets from the source

        Returns
        -------
        pd.DataFrame
            One row per registry entry, with a "key" column (the registry key)
            and a "files" column holding a single-item list containing the
            local path of the fetched file.
        """
        registry = build_reference_data_registry()

        # registry.fetch downloads the file if it isn't cached locally
        # and returns the local path
        datasets = [
            {"key": key, "files": [registry.fetch(key)]}
            for key in registry.registry
        ]
        return pd.DataFrame(datasets)

    def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
        """
        Downscale the dataset to a smaller size.

        Parameters
        ----------
        dataset
            The dataset to downscale

        Returns
        -------
        xr.Dataset
            The downscaled dataset

        Raises
        ------
        ValueError
            If the dataset is neither on a 1d rectilinear (lat/lon) grid
            nor on a 2d curvilinear (i/j) grid.
        """
        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
        has_ij = "i" in dataset.dims and "j" in dataset.dims

        if has_latlon:
            # Raise instead of assert: asserts are stripped under `python -O`,
            # and the unsupported-grid branch below already raises ValueError.
            if len(dataset.lat.dims) != 1 or len(dataset.lon.dims) != 1:
                raise ValueError("Expected 1d lat/lon coordinates for a rectilinear grid")
            result = decimate_rectilinear(dataset)
        elif has_ij:
            # 2d curvilinear grid (generally ocean variables)
            result = decimate_curvilinear(dataset)
        else:
            raise ValueError("Cannot decimate this grid: too many dimensions")

        return result

    def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: pathlib.Path) -> Path:
        """
        Create the output filename for the dataset.

        Parameters
        ----------
        metadata
            Metadata from the file; only the ``key`` attribute is used
        ds
            Loaded dataset (unused)
        ds_filename
            Filename of the dataset (unused)

        Returns
        -------
        Path
            The output filename: the registry key nested under an "obs4REF" directory
        """
        return Path("obs4REF") / metadata.key

0 commit comments

Comments
 (0)