diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index a33868da74..aafd4a0e3a 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -36,7 +36,6 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -466,20 +465,6 @@ def valid_time_selection(timerange: str) -> None: _check_timerange_values(date, timerange_list) -def differing_timeranges( - timeranges: set[str], - required_vars: list[Facets], -) -> None: - """Log error if required variables have differing timeranges.""" - if len(timeranges) > 1: - msg = ( - f"Differing timeranges with values {timeranges} " - f"found for required variables {required_vars}. " - "Set `timerange` to a common value." - ) - raise ValueError(msg) - - def _check_literal( settings: dict, *, diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index c749f4aff1..ad9274240e 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -51,7 +51,6 @@ from . import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _derive_needed, _get_input_datasets, _representative_datasets, ) @@ -246,7 +245,7 @@ def _get_default_settings(dataset: Dataset) -> PreprocessorSettings: settings = {} - if _derive_needed(dataset): + if dataset._derivation_necessary(): # noqa: SLF001 (will be replaced soon) settings["derive"] = { "short_name": facets["short_name"], "standard_name": facets["standard_name"], @@ -615,21 +614,26 @@ def _allow_skipping(dataset: Dataset) -> bool: ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: - """Set the 'version' facet based on derivation input datasets.""" - versions = set() - for in_dataset in input_datasets: - in_dataset.set_version() - if version := in_dataset.facets.get("version"): - if isinstance(version, list): - versions.update(version) - else: - versions.add(version) - if versions: - version = versions.pop() if len(versions) == 1 else sorted(versions) - dataset.set_facet("version", version) - for supplementary_ds in dataset.supplementaries: - supplementary_ds.set_version() +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: + """Automatically correct the wrong ensemble for CMIP5 fx variables.""" + if ( + dataset.facets.get("project") == "CMIP5" + and dataset.facets.get("mip") == "fx" + and dataset.facets.get("ensemble") != "r0i0p0" + and not dataset.files + ): + original_ensemble = dataset["ensemble"] + copy = dataset.copy() + copy.facets["ensemble"] = "r0i0p0" + if copy.files: + dataset.facets["ensemble"] = "r0i0p0" + logger.info( + "Corrected wrong 'ensemble' from '%s' to '%s' for %s", + original_ensemble, + dataset["ensemble"], + dataset.summary(shorten=True), + ) + dataset.find_files() def _get_preprocessor_products( @@ -655,6 +659,7 @@ def _get_preprocessor_products( settings = _get_default_settings(dataset) _apply_preprocessor_profile(settings, profile) _update_multi_dataset_settings(dataset.facets, settings) + _fix_cmip5_fx_ensemble(dataset) _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) @@ -666,7 +671,7 @@ def _get_preprocessor_products( else: missing_vars.update(missing) continue - _set_version(dataset, input_datasets) + dataset.set_version() USED_DATASETS.append(dataset) _schedule_for_download(input_datasets) _log_input_files(input_datasets) diff --git 
a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7aab83719b..6f6bd46f23 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -13,7 +13,6 @@ from esmvalcore.esgf.facets import FACETS from esmvalcore.exceptions import RecipeError from esmvalcore.local import LocalFile, _replace_years_with_timerange -from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( PREPROCESSOR_SUPPLEMENTARIES, @@ -188,28 +187,6 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: - """Automatically correct the wrong ensemble for CMIP5 fx variables.""" - if ( - dataset.facets.get("project") == "CMIP5" - and dataset.facets.get("mip") == "fx" - and dataset.facets.get("ensemble") != "r0i0p0" - and not dataset.files - ): - original_ensemble = dataset["ensemble"] - copy = dataset.copy() - copy.facets["ensemble"] = "r0i0p0" - if copy.files: - dataset.facets["ensemble"] = "r0i0p0" - logger.info( - "Corrected wrong 'ensemble' from '%s' to '%s' for %s", - original_ensemble, - dataset["ensemble"], - dataset.summary(shorten=True), - ) - dataset.find_files() - - def _get_supplementary_short_names( facets: Facets, step: str, @@ -428,9 +405,7 @@ def datasets_from_recipe( return datasets -def _dataset_from_files( # noqa: C901 - dataset: Dataset, -) -> list[Dataset]: +def _dataset_from_files(dataset: Dataset) -> list[Dataset]: """Replace facet values of '*' based on available files.""" result: list[Dataset] = [] errors: list[str] = [] @@ -441,53 +416,32 @@ def _dataset_from_files( # noqa: C901 dataset.summary(shorten=True), ) - representative_datasets = _representative_datasets(dataset) - - # For derived variables, representative_datasets might contain more than - # one element - all_datasets: list[list[tuple[dict, Dataset]]] = [] - for representative_dataset in representative_datasets: - all_datasets.append([]) - for expanded_ds in representative_dataset.from_files(): - updated_facets = {} - unexpanded_globs = {} - for key, value in dataset.facets.items(): - if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: - unexpanded_globs[key] = value - - if unexpanded_globs: - msg = _report_unexpanded_globs( - dataset, - expanded_ds, - unexpanded_globs, - ) - errors.append(msg) - continue + for expanded_ds in dataset.from_files(): + updated_facets = {} + unexpanded_globs = {} + for key, value in dataset.facets.items(): + if _isglob(value): + if key in expanded_ds.facets and not _isglob( + expanded_ds[key], + ): + updated_facets[key] = expanded_ds.facets[key] + else: + unexpanded_globs[key] = value + + if unexpanded_globs: + msg = _report_unexpanded_globs( + dataset, + expanded_ds, + unexpanded_globs, + ) + errors.append(msg) + continue - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries + new_ds = dataset.copy() + new_ds.facets.update(updated_facets) + new_ds.supplementaries = expanded_ds.supplementaries - all_datasets[-1].append((updated_facets, new_ds)) - - # If globs have been expanded, only consider those datasets that contain - # all necessary input variables if derivation is necessary - for updated_facets, new_ds in all_datasets[0]: - other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] - if all(updated_facets in facets for 
facets in other_facets): - result.append(new_ds) - else: - logger.debug( - "Not all necessary input variables to derive '%s' are " - "available for dataset %s", - dataset["short_name"], - updated_facets, - ) + result.append(new_ds) if errors: raise RecipeError("\n".join(errors)) @@ -538,59 +492,23 @@ def _report_unexpanded_globs( return msg -def _derive_needed(dataset: Dataset) -> bool: - """Check if dataset needs to be derived from other datasets.""" - if not dataset.facets.get("derive"): - return False - if dataset.facets.get("force_derivation"): - return True - if _isglob(dataset.facets.get("timerange", "")): - # Our file finding routines are not able to handle globs. - dataset = dataset.copy() - dataset.facets.pop("timerange") - - copy = dataset.copy() - copy.supplementaries = [] - return not copy.files - - def _get_input_datasets(dataset: Dataset) -> list[Dataset]: """Determine the input datasets needed for deriving `dataset`.""" - facets = dataset.facets - if not _derive_needed(dataset): - _fix_cmip5_fx_ensemble(dataset) - return [dataset] + if not dataset._derivation_necessary(): # noqa: SLF001 + return dataset.input_datasets - # Configure input datasets needed to derive variable - datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) # type: ignore - # idea: add option to specify facets in list of dicts that is value of - # 'derive' in the recipe and use that instead of get_required? - for input_facets in required_vars: - input_dataset = dataset.copy() - keep = {"alias", "recipe_dataset_index", *dataset.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep - } - input_dataset.facets.update(input_facets) - input_dataset.augment_facets() - _fix_cmip5_fx_ensemble(input_dataset) - if input_facets.get("optional") and not input_dataset.files: + # Skip optional datasets if no data is available + input_datasets: list[Dataset] = [] + for input_dataset in dataset.input_datasets: + if input_dataset.facets.get("optional") and not input_dataset.files: logger.info( "Skipping: no data found for %s which is marked as 'optional'", input_dataset, ) else: - datasets.append(input_dataset) + input_datasets.append(input_dataset) - # Check timeranges of available input data. - timeranges: set[str] = set() - for input_dataset in datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) # type: ignore - check.differing_timeranges(timeranges, required_vars) - - return datasets + return input_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index bc5998d58a..5fc24adb53 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -33,6 +33,7 @@ _get_start_end_date, ) from esmvalcore.preprocessor import preprocess +from esmvalcore.preprocessor._derive import get_required if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Sequence @@ -102,7 +103,7 @@ class Dataset: Attributes ---------- - supplementaries : list[Dataset] + supplementaries: list[Dataset] List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. 
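As a usage sketch of the refactor above: the recipe layer now only drops optional inputs without data, while `Dataset.input_datasets` decides whether derivation inputs are needed at all. A minimal sketch under these assumptions (facet values are illustrative and mirror the `siextent` test added below; a configured session with local OBS6 data is assumed):

from esmvalcore.dataset import Dataset

# Derived variable whose derivation script marks some inputs as optional
# (facet values below are illustrative only).
dataset = Dataset(
    project="OBS6",
    dataset="SAT",
    mip="SImon",
    short_name="siextent",
    tier=2,
    type="sat",
    timerange="1980/2000",
    derive=True,
)

# `input_datasets` returns the datasets required for derivation, or the
# dataset itself in a list if derivation is not necessary. The recipe
# layer then skips optional inputs for which no files were found.
input_datasets = [
    ds
    for ds in dataset.input_datasets
    if not (ds.facets.get("optional") and not ds.files)
]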
@@ -132,6 +133,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[File] | None = None self._file_globs: Sequence[Path] | None = None + self._input_datasets: list[Dataset] = [] for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -188,50 +190,170 @@ def _derivation_necessary(self) -> bool: # are found ds_copy = self.copy() ds_copy.supplementaries = [] + + # Avoid potential errors from missing data during timerange glob + # expansion + if _isglob(ds_copy.facets.get("timerange", "")): + ds_copy.facets.pop("timerange", None) + return not ds_copy.files + def _get_input_datasets(self) -> list[Dataset]: + """Get input datasets.""" + input_datasets: list[Dataset] = [] + required_vars_facets = get_required( + self.facets["short_name"], # type: ignore + self.facets["project"], # type: ignore + ) + + for required_facets in required_vars_facets: + input_dataset = self._copy(derive=False, force_derivation=False) + keep = {"alias", "recipe_dataset_index", *self.minimal_facets} + input_dataset.facets = { + k: v for k, v in input_dataset.facets.items() if k in keep + } + input_dataset.facets.update(required_facets) + input_dataset.augment_facets() + input_datasets.append(input_dataset) + + return input_datasets + + @property + def input_datasets(self) -> list[Dataset]: + """Get input datasets. + + For non-derived variables (i.e., those with facet ``derive=False``), + this will simply return the dataset itself in a list. + + For derived variables (i.e., those with facet ``derive=True``), this + will return the datasets required for derivation if derivation is + necessary, and the dataset itself if derivation is not necessary. + Derivation is necessary if the facet ``force_derivation=True`` is set + or no files for the dataset itself are available. + + See also :func:`esmvalcore.preprocessor.derive` for an example usage. + + """ + if self._input_datasets: + return self._input_datasets + + if not self._derivation_necessary(): + input_datasets = [self] + else: + input_datasets = self._get_input_datasets() + + self._input_datasets = input_datasets + return input_datasets + + @staticmethod def _file_to_dataset( - self, + dataset: Dataset, file: esgf.ESGFFile | local.LocalFile, ) -> Dataset: """Create a dataset from a file with a `facets` attribute.""" facets = dict(file.facets) - if "version" not in self.facets: + if "version" not in dataset.facets: # Remove version facet if no specific version requested facets.pop("version", None) updated_facets = { f: v for f, v in facets.items() - if f in self.facets - and _isglob(self.facets[f]) - and _ismatch(v, self.facets[f]) + if f in dataset.facets + and _isglob(dataset.facets[f]) + and _ismatch(v, dataset.facets[f]) } - dataset = self.copy() - dataset.facets.update(updated_facets) + new_dataset = dataset.copy() + new_dataset.facets.update(updated_facets) # If possible, remove unexpanded facets that can be automatically # populated. 
- unexpanded = {f for f, v in dataset.facets.items() if _isglob(v)} + unexpanded = {f for f, v in new_dataset.facets.items() if _isglob(v)} required_for_augment = {"project", "mip", "short_name", "dataset"} if unexpanded and not unexpanded & required_for_augment: - copy = dataset.copy() + copy = new_dataset.copy() copy.supplementaries = [] for facet in unexpanded: copy.facets.pop(facet) copy.augment_facets() for facet in unexpanded: if facet in copy.facets: - dataset.facets.pop(facet) + new_dataset.facets.pop(facet) + + return new_dataset + + def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 + """Yield datasets based on the available files. + + This function requires that self.facets['mip'] is not a glob pattern. - return dataset + Does take variable derivation into account, i.e., datasets available + through variable derivation are returned. - def _get_available_datasets(self) -> Iterator[Dataset]: + """ + datasets_found = False + + # If no forced derivation is requested, search for datasets based on + # files from self + if not self._is_force_derived(): + for dataset in self._get_available_datasets(self): + datasets_found = True + yield dataset + + # For variables that cannot be derived, we are done here + if not self._is_derived(): + return + + # If forced derivation is requested or no datasets based on files from + # self have been found, search for datasets based on files from input + # datasets + if self._is_force_derived() or not datasets_found: + all_datasets: list[list[tuple[dict, Dataset]]] = [] + for input_dataset in self._get_input_datasets(): + all_datasets.append([]) + for expanded_ds in self._get_available_datasets( + input_dataset, + ): + updated_facets = {} + for key, value in self.facets.items(): + if _isglob(value): + if key in expanded_ds.facets and not _isglob( + expanded_ds[key], + ): + updated_facets[key] = expanded_ds.facets[key] + new_ds = self.copy() + new_ds.facets.update(updated_facets) + new_ds.supplementaries = self.supplementaries + + all_datasets[-1].append((updated_facets, new_ds)) + + # Only consider those datasets that contain all input variables + # necessary for derivation + for updated_facets, new_ds in all_datasets[0]: + other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] + if all(updated_facets in facets for facets in other_facets): + yield new_ds + else: + logger.debug( + "Not all necessary input variables to derive '%s' are " + "available for %s with facets %s", + self["short_name"], + new_ds.summary(shorten=True), + updated_facets, + ) + + def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: """Yield datasets based on the available files. This function requires that self.facets['mip'] is not a glob pattern. + + Does not take variable derivation into account, i.e., datasets + potentially available through variable derivation are ignored. To + consider derived variables properly, use the function + :func:`_get_all_available_datasets`. 
+ """ - dataset_template = self.copy() + dataset_template = dataset.copy() dataset_template.supplementaries = [] if _isglob(dataset_template.facets.get("timerange")): # Remove wildcard `timerange` facet, because data finding cannot @@ -242,31 +364,30 @@ def _get_available_datasets(self) -> Iterator[Dataset]: partially_defined = [] expanded = False for file in dataset_template.files: - dataset = self._file_to_dataset(file) + new_dataset = self._file_to_dataset(dataset, file) # Filter out identical datasets facetset = frozenset( (f, frozenset(v) if isinstance(v, list) else v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() ) if facetset not in seen: seen.add(facetset) if any( _isglob(v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() if f != "timerange" ): - partially_defined.append((dataset, file)) + partially_defined.append((new_dataset, file)) else: - dataset._update_timerange() # noqa: SLF001 - dataset._supplementaries_from_files() # noqa: SLF001 + new_dataset._update_timerange() # noqa: SLF001 expanded = True - yield dataset + yield new_dataset # Only yield datasets with globs if there is no better alternative - for dataset, file in partially_defined: + for new_dataset, file in partially_defined: msg = ( - f"{dataset} with unexpanded wildcards, created from file " + f"{new_dataset} with unexpanded wildcards, created from file " f"{file} with facets {file.facets}. Are the missing facets " "in the path to the file?" if isinstance(file, local.LocalFile) @@ -280,14 +401,13 @@ def _get_available_datasets(self) -> Iterator[Dataset]: "because it still contains wildcards.", msg, ) - yield dataset + yield new_dataset def from_files(self) -> Iterator[Dataset]: """Create datasets based on the available files. The facet values for local files are retrieved from the directory tree where the directories represent the facets values. - Reading facet values from file names is not yet supported. See :ref:`CMOR-DRS` for more information on this kind of file organization. @@ -305,6 +425,10 @@ def from_files(self) -> Iterator[Dataset]: Supplementary datasets will in inherit the facet values from the main dataset for those facets listed in :obj:`INHERITED_FACETS`. + This also works for :ref:`derived variables `. The + input datasets that are necessary for derivation can be accessed via + :attr:`Dataset.input_datasets`. + Examples -------- See :ref:`/notebooks/discovering-data.ipynb` for example use cases. 
@@ -331,7 +455,8 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) - for dataset in dataset_template._get_available_datasets(): # noqa: SLF001 + for dataset in dataset_template._get_all_available_datasets(): # noqa: SLF001 + dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield dataset @@ -606,15 +731,29 @@ def minimal_facets(self) -> Facets: """Return a dictionary with the persistent facets.""" return {k: v for k, v in self.facets.items() if k in self._persist} + @staticmethod + def _get_version(dataset: Dataset) -> str | list[str]: + """Get available version(s) of dataset.""" + versions: set[str] = set() + for file in dataset.files: + if "version" in file.facets: + versions.add(str(file.facets["version"])) + return versions.pop() if len(versions) == 1 else sorted(versions) + def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for file in self.files: - if "version" in file.facets: - versions.add(file.facets["version"]) # type: ignore + for input_dataset in self.input_datasets: + version = self._get_version(input_dataset) + if version: + if isinstance(version, list): + versions.update(version) + else: + versions.add(version) version = versions.pop() if len(versions) == 1 else sorted(versions) if version: self.set_facet("version", version) + for supplementary_ds in self.supplementaries: supplementary_ds.set_version() diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index cbf138e2d7..5c14367dd6 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -2,6 +2,7 @@ import importlib import logging +from collections.abc import Sequence from copy import deepcopy from pathlib import Path @@ -70,7 +71,7 @@ def get_required(short_name: str, project: str) -> list[Facets]: def derive( - cubes: CubeList, + cubes: Sequence[Cube], short_name: str, long_name: str, units: str | Unit, @@ -81,8 +82,7 @@ def derive( Parameters ---------- cubes: - Includes all the needed variables for derivation defined in - :func:`get_required`. + Includes all the needed variables for derivation. short_name: short_name long_name: @@ -96,6 +96,38 @@ def derive( ------- iris.cube.Cube The new derived variable. + + Examples + -------- + Input variables for derivation can be obtained via + :attr:`esmvalcore.dataset.Dataset.input_datasets`. + + For example, to derive the longwave cloud radiative effect (LWCRE) for the + model CESM2, you can use: + + >>> from esmvalcore.dataset import Dataset + >>> from esmvalcore.preprocessor import derive + >>> dataset = Dataset( + ... project="CMIP6", + ... dataset="CESM2", + ... exp="historical", + ... ensemble="r1i1p1f1", + ... grid="gn", + ... timerange="2000/2014", + ... short_name="lwcre", + ... mip="Amon", + ... derive=True, + ... ) + >>> cubes = [d.load() for d in dataset.input_datasets] + >>> cube = derive( + ... cubes, + ... short_name="lwcre", + ... long_name="TOA Longwave Cloud Radiative Effect", + ... units="W m-2", + ... 
) + >>> print(cube.var_name)  # doctest: +SKIP + lwcre + """ if short_name == cubes[0].var_name: return cubes[0] diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index d6c9001ef2..581e8ca249 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, "outputs": [], @@ -89,7 +89,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 778 datasets, showing the first 10:\n" + "Found 727 datasets, showing the first 10:\n" ] }, { @@ -168,20 +168,20 @@ " 'grid': 'gn',\n", " 'institute': 'AWI'},\n", " Dataset:\n", - " {'dataset': 'BCC-CSM2-MR',\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", - " 'institute': 'BCC'},\n", + " 'institute': 'AWI'},\n", " Dataset:\n", " {'dataset': 'BCC-CSM2-MR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", - " 'ensemble': 'r2i1p1f1',\n", + " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", " 'institute': 'BCC'}]" @@ -253,7 +253,7 @@ { "data": { "text/plain": [ - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf-data1.llnl.gov', 'esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf-data04.diasjp.net', 'esgf.nci.org.au', 'esgf3.dkrz.de']]" + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]" ] }, "execution_count": 6, @@ -282,7 +282,7 @@ { "data": { "text/plain": [ - "LocalFile('~/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" + "LocalFile('/home/manuel/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" ] }, "execution_count": 7, @@ -312,6 +312,235 @@ "source": [ "download(dataset.files, CFG[\"download_dir\"])" ] + }, + { + "cell_type": "markdown", + "id": "d3006d90", + "metadata": {}, + "source": [ + "`Dataset.from_files` can also handle derived variables properly:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b75314e3", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_template = Dataset(\n", + " short_name=\"lwcre\",\n", + " mip=\"Amon\",\n", + " project=\"CMIP6\",\n", + " exp=\"historical\",\n", + " dataset=\"*\",\n", + " institute=\"*\",\n", + " ensemble=\"r1i1p1f1\",\n", + " grid=\"gn\",\n", + " derive=True,\n", + " force_derivation=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b87c247f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 36 datasets, showing the first 10:\n" + ] + }, + { + "data": { + "text/plain": [ + "[Dataset:\n", + " {'dataset': 'TaiESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AS-RCEC'},\n", + " Dataset:\n", + 
{'dataset': 'AWI-CM-1-1-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-1-LR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'BCC-CSM2-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'BCC-ESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'CAMS-CSM1-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAMS'},\n", + " Dataset:\n", + " {'dataset': 'CAS-ESM2-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'FGOALS-g3',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'IITM-ESM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CCCR-IITM'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets = list(dataset_template.from_files())\n", + "print(f\"Found {len(datasets)} datasets, showing the first 10:\")\n", + "datasets[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." + ] + }, + { + "cell_type": "markdown", + "id": "f00a886f", + "metadata": {}, + "source": [ + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets may be empty. 
In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c5edfa65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = datasets[0]\n", + "dataset.files" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "97cdf12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rlut\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlut/gn/v20200623/rlut_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]\n", + "rlutcs\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlutcs/gn/v20200623/rlutcs_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de']]\n" + ] + } + ], + "source": [ + "for d in dataset.input_datasets:\n", + " print(d[\"short_name\"])\n", + " print(d.files)" + ] } ], "metadata": { diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index 551603a446..6aec456f80 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -274,27 +274,6 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(caplog): - timeranges = set() - timeranges.add("1950/1951") - timeranges.add("1950/1952") - required_variables = [ - {"short_name": "rsdscs", "timerange": "1950/1951"}, - {"short_name": "rsuscs", "timerange": "1950/1952"}, - ] - with pytest.raises(ValueError) as exc: - check.differing_timeranges(timeranges, required_variables) - expected_log = ( - f"Differing timeranges with values {timeranges} " - "found for required variables " - "[{'short_name': 'rsdscs', 'timerange': '1950/1951'}, " - "{'short_name': 'rsuscs', 'timerange': '1950/1952'}]. " - "Set `timerange` to a common value." 
- ) - - assert expected_log in str(exc.value) - - def test_data_availability_nonexistent(tmp_path): var = { "dataset": "ABC", diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 5bd6ad47dc..8be287d7d6 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -127,7 +127,7 @@ def get_required(short_name, _): ] monkeypatch.setattr( - esmvalcore._recipe.to_datasets, + esmvalcore.dataset, "get_required", get_required, ) @@ -2538,9 +2538,7 @@ def test_representative_dataset_derived_var( expected_facets = { # Already present in variable "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": force_derivation, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2550,6 +2548,9 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + "force_derivation": False, } if force_derivation: expected_datasets = [ @@ -2604,9 +2605,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsdscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2619,6 +2618,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + "force_derivation": False, } rsdscs = Dataset(**rsdscs_facets) rsdscs.session = session @@ -2628,9 +2630,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsuscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2643,6 +2643,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index a3678924e8..4cf7bfc7b4 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -912,28 +912,6 @@ def test_get_default_settings(mocker): } -def test_set_version(mocker): - dataset = Dataset(short_name="tas") - supplementary = Dataset(short_name="areacella") - dataset.supplementaries = [supplementary] - - input_dataset = Dataset(short_name="tas") - file1 = mocker.Mock() - file1.facets = {"version": "v1"} - file2 = mocker.Mock() - file2.facets = {"version": "v2"} - input_dataset.files = [file1, file2] - - file3 = mocker.Mock() - file3.facets = {"version": "v3"} - supplementary.files = [file3] - - _recipe._set_version(dataset, [input_dataset]) - print(dataset) - assert dataset.facets["version"] == ["v1", "v2"] - assert dataset.supplementaries[0].facets["version"] == "v3" - - def test_extract_preprocessor_order(): profile = { "custom_order": True, @@ -1003,3 +981,23 @@ def test_special_name_to_dataset_invalid_special_name_type(): ) with pytest.raises(RecipeError, match=msg): _recipe._special_name_to_dataset(facets, "reference_dataset") + + +def test_fix_cmip5_fx_ensemble(monkeypatch): + def 
find_files(self): + if self.facets["ensemble"] == "r0i0p0": + self._files = ["file1.nc"] + + monkeypatch.setattr(Dataset, "find_files", find_files) + + dataset = Dataset( + dataset="dataset1", + short_name="orog", + mip="fx", + project="CMIP5", + ensemble="r1i1p1", + ) + + _recipe._fix_cmip5_fx_ensemble(dataset) + + assert dataset["ensemble"] == "r0i0p0" diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 20439a1d07..2e560765f1 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -1,3 +1,4 @@ +import logging import textwrap from pathlib import Path @@ -302,6 +303,57 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" +def test_get_input_datasets_derive_optional(caplog, tmp_path, session): + facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "SImon", + "short_name": "siextent", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", + "derive": True, + } + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + sic_file = LocalFile( + input_dir / "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + sic_file.touch() + + dataset = Dataset(**facets) + dataset.files = [] + dataset.session = session + + with caplog.at_level(logging.INFO): + datasets = to_datasets._get_input_datasets(dataset) + + expected = Dataset( + dataset="SAT", + project="OBS6", + mip="SImon", + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + optional="true", + original_short_name="siconca", + standard_name="sea_ice_area_fraction", + tier=2, + timerange="1980/2000", + type="sat", + units="%", + ) + expected.session = session + + assert datasets == [expected] + + logger_infos = [r.message for r in caplog.records if r.levelname == "INFO"] + assert "which is marked as 'optional'" in logger_infos[-1] + + def test_max_years(session): recipe_txt = textwrap.dedent(""" diagnostics: @@ -347,26 +399,6 @@ def from_files(_): to_datasets._dataset_from_files(dataset) -def test_fix_cmip5_fx_ensemble(monkeypatch): - def find_files(self): - if self.facets["ensemble"] == "r0i0p0": - self._files = ["file1.nc"] - - monkeypatch.setattr(Dataset, "find_files", find_files) - - dataset = Dataset( - dataset="dataset1", - short_name="orog", - mip="fx", - project="CMIP5", - ensemble="r1i1p1", - ) - - to_datasets._fix_cmip5_fx_ensemble(dataset) - - assert dataset["ensemble"] == "r0i0p0" - - def test_get_supplementary_short_names(monkeypatch): def _update_cmor_facets(facets): facets["modeling_realm"] = "atmos" diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 68e8ceed05..2aa1b91317 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,3 +1,4 @@ +import logging import textwrap from collections import defaultdict from pathlib import Path @@ -1181,6 +1182,642 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session): assert datasets == [expected] +OBS6_SAT_FACETS = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + +def test_from_files_no_files_glob(session): + dataset = Dataset(**{**OBS6_SAT_FACETS, "type": "*"}, short_name="tas") + datasets = list(dataset.from_files()) + assert datasets == [dataset] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_files_glob(timerange, session): + dataset = Dataset( + 
**{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="lwcre", + derive=True, + ) + datasets = list(dataset.from_files()) + assert datasets == [dataset] + + +@pytest.fixture +def lwcre_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + +@pytest.fixture +def lwcre_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + +@pytest.fixture +def rlut_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlut_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlutcs_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlutcs = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlutcs_1980-2000.nc", + ) + rlutcs.touch() + return rlutcs + + +@pytest.fixture +def pr_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + pr = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_pr_1980-2000.nc", + ) + pr.touch() + return pr + + +def test_from_files_with_derived_no_derivation(lwcre_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ) + expected_input_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_input_dataset.session = session + + assert datasets[0].input_datasets == [expected_input_dataset] + assert expected_input_dataset.files == [lwcre_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_derivation_glob( + timerange, + lwcre_file, + lwcre_file_ground, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + 
datasets = list(dataset.from_files()) + + expected_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + ), + Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True), + ] + for expected_ds in expected_datasets: + expected_ds.add_supplementary(short_name="pr", type="sat") + expected_ds.session = session + + assert datasets == expected_datasets + assert datasets[0].files == [lwcre_file_ground] + assert datasets[0].supplementaries[0].files == [pr_file] + assert datasets[1].files == [lwcre_file] + assert datasets[1].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + for dataset, expected in zip( + datasets, + expected_input_datasets, + strict=True, + ): + assert dataset.input_datasets == [expected] + assert expected_input_datasets[0].files == [lwcre_file_ground] + assert expected_input_datasets[1].files == [lwcre_file] + + +def test_from_files_with_derived(rlut_file, rlutcs_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert datasets[0].input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_glob( + timerange, + rlut_file, + rlut_file_ground, + rlutcs_file, + pr_file, + session, + caplog, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": 
timerange}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + with caplog.at_level(logging.DEBUG): + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert datasets[0].input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] + msg = "Not all necessary input variables to derive 'lwcre' are available" + for log_debug in log_debugs: + if msg in log_debug: + break + else: + pytest.fail(f"No debug message '{msg}'") + + +def test_from_files_with_derived_no_force_derivation( + lwcre_file, + rlut_file, + rlutcs_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ) + expected_input_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_input_dataset.session = session + + assert datasets[0].input_datasets == [expected_input_dataset] + assert expected_input_dataset.files == [lwcre_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 + timerange, + lwcre_file, + lwcre_file_ground, + rlut_file, + rlut_file_ground, + rlutcs_file, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected_datasets = 
[ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + ), + Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True), + ] + for expected_ds in expected_datasets: + expected_ds.add_supplementary(short_name="pr", type="sat") + expected_ds.session = session + + assert datasets == expected_datasets + assert datasets[0].files == [lwcre_file_ground] + assert datasets[0].supplementaries[0].files == [pr_file] + assert datasets[1].files == [lwcre_file] + assert datasets[1].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + for dataset, expected in zip( + datasets, + expected_input_datasets, + strict=True, + ): + assert dataset.input_datasets == [expected] + assert expected_input_datasets[0].files == [lwcre_file_ground] + assert expected_input_datasets[1].files == [lwcre_file] + + +def test_from_files_with_derived_force_derivation( + lwcre_file, + rlut_file, + rlutcs_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert datasets[0].input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 + timerange, + lwcre_file, + lwcre_file_ground, + rlut_file, + rlut_file_ground, + rlutcs_file, 
+ pr_file, + session, + caplog, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + with caplog.at_level(logging.DEBUG): + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert datasets[0].input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] + msg = "Not all necessary input variables to derive 'lwcre' are available" + for log_debug in log_debugs: + if msg in log_debug: + break + else: + pytest.fail(f"No debug message '{msg}'") + + def test_match(): dataset1 = Dataset( short_name="areacella", @@ -1609,7 +2246,7 @@ def test_find_files_non_esgf_projects(mocker, project, monkeypatch): assert tas._file_globs == mock.sentinel.file_globs -def test_set_version(): +def test_set_version_non_derived_var(): dataset = Dataset(short_name="tas") dataset.add_supplementary(short_name="areacella") file_v1 = esmvalcore.local.LocalFile("/path/to/v1/tas.nc") @@ -1625,6 +2262,43 @@ def test_set_version(): assert dataset.supplementaries[0].facets["version"] == "v3" +def test_set_version_derive_var(monkeypatch): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="areacella") + dataset.files = [] + areacella_file = esmvalcore.local.LocalFile("/path/to/areacella.nc") + areacella_file.facets["version"] = "v4" + dataset.supplementaries[0].files = [areacella_file] + + def _get_input_datasets(): + rlut_file = esmvalcore.local.LocalFile("/path/to/rlut.nc") + rlut_file.facets["version"] = "v1" + rlut_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + ) + rlut_dataset.files = [rlut_file] + rlutcs_file_1 = esmvalcore.local.LocalFile("/path/to/rlutcs_1.nc") + rlutcs_file_2 = esmvalcore.local.LocalFile("/path/to/rlutcs_2.nc") + rlutcs_file_1.facets["version"] = "v2" + rlutcs_file_2.facets["version"] = "v3" + rlutcs_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + ) + rlutcs_dataset.files = [rlutcs_file_1, rlutcs_file_2] + return [rlut_dataset, rlutcs_dataset] + + monkeypatch.setattr(dataset, "_get_input_datasets", _get_input_datasets) + + dataset.set_version() + + 
assert dataset.facets["version"] == ["v1", "v2", "v3"] + assert dataset.supplementaries[0].facets["version"] == "v4" + + @pytest.mark.parametrize("timerange", ["*", "185001/*", "*/185112"]) def test_update_timerange_from_esgf(mocker, timerange): esgf_files = [ @@ -2137,16 +2811,6 @@ def test_get_extra_facets_native6(): } -OBS6_SAT_FACETS = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "sat", - "timerange": "1980/2000", -} - - def test_is_derived_no_derivation(): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") assert dataset._is_derived() is False @@ -2196,6 +2860,15 @@ def test_derivation_necessary_no_force_derivation_no_files(): assert dataset._derivation_necessary() is True +def test_derivation_necessary_no_force_derivation_no_files_glob(): + dataset = Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="lwcre", + derive=True, + ) + assert dataset._derivation_necessary() is True + + def test_derivation_necessary_no_force_derivation(tmp_path, session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.session = session @@ -2269,3 +2942,67 @@ def test_add_derived_supplementary_to_derived(): force_derivation=True, ) assert dataset.supplementaries[0] == expected_supplementary + + +def test_input_datasets_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + + expected_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_dataset in expected_datasets: + expected_dataset.session = dataset.session + + assert dataset.input_datasets == expected_datasets + + +def test_input_datasets_no_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + dataset.add_supplementary(short_name="pr") + + assert dataset.input_datasets == [dataset] + + +def test_input_datasets_no_force_derivation(tmp_path, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre_file.touch() + + assert dataset.input_datasets == [dataset] + + +def test_input_datasets_no_derivation_available(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True) + + msg = r"Cannot derive variable 'tas': no derivation script available" + with pytest.raises(NotImplementedError, match=msg): + dataset.input_datasets # noqa: B018
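For reference, the `set_version` semantics pinned down by `test_set_version_derive_var` above, as a compact sketch (facets, file paths, and versions are hypothetical):

from esmvalcore.dataset import Dataset

dataset = Dataset(
    project="OBS6",
    dataset="SAT",
    mip="Amon",
    short_name="lwcre",
    tier=2,
    type="sat",
    timerange="1980/2000",
    derive=True,
)

# set_version() collects the 'version' facet from the files of all input
# datasets (rlut and rlutcs here if lwcre itself must be derived). A single
# common version is stored as a string, several versions as a sorted list,
# e.g. ["v1", "v2", "v3"]; supplementary datasets get their own version.
dataset.set_version()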