ref-sample-data/src/ref_sample_data/data_request/cmip6.py at 69275b0231105269c890a01f65b486ec7542e1c6 · Climate-REF/ref-sample-data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os.path
import pathlib
from pathlib import Path
from typing import Any

import pandas as pd
import xarray as xr

from ref_sample_data.data_request.base import IntakeESGFDataRequest
from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear


def prefix_to_filename(ds, filename_prefix: str) -> str:
    """
    Build the NetCDF filename for a dataset from a facet-based prefix.

    When the dataset has a ``time`` dimension, the earliest and latest time
    values (formatted as ``YYYYMM``) are appended to the prefix as a
    ``<start>-<end>`` range.

    Parameters
    ----------
    ds
        Dataset the filename is generated for
    filename_prefix
        Leading part of the filename

        This includes the different facets of the dataset

    Returns
    -------
        ``<prefix>_<start>-<end>.nc`` if the dataset has a time dimension,
        otherwise ``<prefix>.nc``
    """
    # Time-invariant datasets carry no time range in their name
    if "time" not in ds.dims:
        return f"{filename_prefix}.nc"

    month_format = "%Y%m"
    first_month = ds.time.min().dt.strftime(month_format).item()
    last_month = ds.time.max().dt.strftime(month_format).item()
    return f"{filename_prefix}_{first_month}-{last_month}.nc"


class CMIP6Request(IntakeESGFDataRequest):
    """
    Represents a CMIP6 dataset request

    These data are fetched from ESGF and decimated according to their grid type
    """

    source_type = "CMIP6"

    # Metadata keys, in order, that form the output directory hierarchy
    # (see `generate_filename`)
    cmip6_path_items = (
        "mip_era",
        "activity_drs",
        "institution_id",
        "source_id",
        "experiment_id",
        "member_id",
        "table_id",
        "variable_id",
        "grid_label",
    )

    # Metadata keys, in order, that are joined with "_" to form the filename prefix
    # (see `generate_filename`)
    cmip6_filename_paths = (
        "variable_id",
        "table_id",
        "source_id",
        "experiment_id",
        "member_id",
        "grid_label",
    )

    def __init__(self, facets: dict[str, Any], remove_ensembles: bool, time_span: tuple[str, str] | None):
        """
        Store the request parameters and sanity-check the path/filename templates.

        Parameters
        ----------
        facets
            Facets describing the request (stored as ``self.facets``)
        remove_ensembles
            Stored as ``self.remove_ensembles``; it is not used within this class
        time_span
            Optional ``(start, end)`` bounds used to subset the time axis in
            `decimate_dataset`. ``None`` disables the time subsetting.

        Raises
        ------
        AssertionError
            If ``cmip6_path_items`` or ``cmip6_filename_paths`` reference a key
            that is not listed in ``avail_facets``
        """
        # NOTE(review): the base class initialiser is not invoked here - confirm that
        # IntakeESGFDataRequest does not require it.
        self.avail_facets = [
            "mip_era",
            "activity_drs",
            "institution_id",
            "source_id",
            "experiment_id",
            "member_id",
            "table_id",
            "variable_id",
            "grid_label",
            "version",
            "data_node",
        ]

        self.facets = facets
        self.remove_ensembles = remove_ensembles
        self.time_span = time_span

        # Every key used to build output paths/filenames must be a known facet.
        # The message is only evaluated when the check fails.
        assert all(key in self.avail_facets for key in self.cmip6_path_items), (
            "cmip6_path_items contains keys that are not in avail_facets: "
            f"{[key for key in self.cmip6_path_items if key not in self.avail_facets]}"
        )
        assert all(key in self.avail_facets for key in self.cmip6_filename_paths), (
            "cmip6_filename_paths contains keys that are not in avail_facets: "
            f"{[key for key in self.cmip6_filename_paths if key not in self.avail_facets]}"
        )

    def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
        """
        Downscale the dataset to a smaller size.

        The spatial decimation depends on the dimensions of the dataset:

        * ``lat`` and ``lon`` dimensions: `decimate_rectilinear`
        * ``i`` and ``j`` dimensions: `decimate_curvilinear`
        * ``time``, ``basin``, ``lev`` and ``lat`` dimensions: left unchanged

        If the dataset has a ``time`` dimension and ``self.time_span`` is set,
        the result is additionally restricted to that time range.

        Parameters
        ----------
        dataset
            The dataset to downscale

        Returns
        -------
        xr.Dataset | None
            The downscaled dataset, or ``None`` if no time steps remain
            after applying ``self.time_span``

        Raises
        ------
        ValueError
            If the dimensions of the dataset match none of the supported layouts
        """
        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
        has_ij = "i" in dataset.dims and "j" in dataset.dims

        # The AMOC variable `msftmz` has these strange dims and we do not want to decimate
        skip_decimate = {"time", "basin", "lev", "lat"}.issubset(dataset.dims)

        if has_latlon:
            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1, (
                "Expected one-dimensional lat and lon coordinates"
            )

            result = decimate_rectilinear(dataset)
        elif has_ij:
            # 2d curvilinear grid (generally ocean variables)
            result = decimate_curvilinear(dataset)
        elif skip_decimate:
            result = dataset
        else:
            raise ValueError("Cannot decimate this grid: too many dimensions")

        if "time" in dataset.dims and self.time_span is not None:
            result = result.sel(time=slice(*self.time_span))
            # Signal an empty selection with None instead of returning an empty dataset
            if result.time.size == 0:
                result = None

        return result

    def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: pathlib.Path) -> Path:
        """
        Create the output filename for the dataset.

        The result has the form
        ``<cmip6_path_items values...>/v<version>/<filename>``
        where ``<filename>`` is the ``cmip6_filename_paths`` values joined by ``_``,
        followed by the time range of the dataset (if it has a time dimension)
        and the ``.nc`` suffix.

        Parameters
        ----------
        metadata
            Metadata of the dataset

            Must provide a value for ``version`` and for every key in
            ``cmip6_path_items`` and ``cmip6_filename_paths``
        ds
            Loaded dataset

        ds_filename:
            Filename of the dataset (Unused)

        Returns
        -------
            The output filename
        """
        output_path = (
            Path(os.path.join(*[metadata[item] for item in self.cmip6_path_items]))
            / f"v{metadata['version']}"
        )
        filename_prefix = "_".join([metadata[item] for item in self.cmip6_filename_paths])

        return output_path / prefix_to_filename(ds, filename_prefix)