diff --git a/.circleci/config.yml b/.circleci/config.yml index 2950d4feed..e2e4907b69 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -66,7 +66,7 @@ commands: - run: name: Install git+ssh environment: - DEBIAN_FRONTEND: noninteractive # needed to install tzdata + DEBIAN_FRONTEND: noninteractive # needed to install tzdata command: apt update && apt install -y git ssh - checkout - check_changes @@ -141,6 +141,7 @@ jobs: . /opt/conda/etc/profile.d/conda.sh mkdir /logs conda activate esmvaltool + pip install intake-esgf 'globus-sdk<4' # TODO: remove before merging pip install --no-deps .[test] > /logs/install.txt 2>&1 pip check - test_and_report: @@ -155,7 +156,7 @@ jobs: name: Install gpg (required by codecov orb) command: apt update && apt install -y gpg - codecov/upload: - files: 'test-reports/coverage.xml' + files: "test-reports/coverage.xml" disable_search: true test_installation_from_source_test_mode: diff --git a/.gitignore b/.gitignore index 7f17eca52c..ee821184de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Autogenerated files _sidebar.rst.inc +jupyter_execute/ # Distribution / packaging .Python diff --git a/doc/api/esmvalcore.esgf.rst b/doc/api/esmvalcore.esgf.rst index c6fac3553b..0420985f50 100644 --- a/doc/api/esmvalcore.esgf.rst +++ b/doc/api/esmvalcore.esgf.rst @@ -1,18 +1,10 @@ Find and download files from ESGF ================================= -This module provides the function :py:func:`esmvalcore.esgf.find_files` -for searching for files on ESGF using the ESMValTool vocabulary. -It returns :py:class:`esmvalcore.esgf.ESGFFile` objects, which have a convenient -:py:meth:`esmvalcore.esgf.ESGFFile.download` method for downloading the files. - -See :ref:`config-esgf` for instructions on configuring this module. - esmvalcore.esgf --------------- -.. autofunction:: esmvalcore.esgf.find_files -.. autofunction:: esmvalcore.esgf.download -.. autoclass:: esmvalcore.esgf.ESGFFile +.. automodule:: esmvalcore.esgf + :noindex: esmvalcore.esgf.facets ---------------------- diff --git a/doc/api/esmvalcore.io.intake_esgf.rst b/doc/api/esmvalcore.io.intake_esgf.rst new file mode 100644 index 0000000000..4fcb6c0bde --- /dev/null +++ b/doc/api/esmvalcore.io.intake_esgf.rst @@ -0,0 +1,5 @@ +esmvalcore.io.intake_esgf +========================= + +.. automodule:: esmvalcore.io.intake_esgf + :no-inherited-members: diff --git a/doc/api/esmvalcore.io.protocol.rst b/doc/api/esmvalcore.io.protocol.rst new file mode 100644 index 0000000000..f785893af9 --- /dev/null +++ b/doc/api/esmvalcore.io.protocol.rst @@ -0,0 +1,5 @@ +esmvalcore.io.protocol +====================== + +.. automodule:: esmvalcore.io.protocol + :no-inherited-members: diff --git a/doc/api/esmvalcore.io.rst b/doc/api/esmvalcore.io.rst new file mode 100644 index 0000000000..5d41a029c0 --- /dev/null +++ b/doc/api/esmvalcore.io.rst @@ -0,0 +1,18 @@ +Access data from any source +=========================== + +ESMValCore supports a modular system for reading data from various data sources. +In the future, this module may be extended with support for writing output data. + +The interface is defined in the :mod:`esmvalcore.io.protocol` module and +the other modules here provide an implementation for a particular data source. + +.. toctree:: + :maxdepth: 1 + + esmvalcore.io.protocol + esmvalcore.io.intake_esgf + +esmvalcore.io +------------- +.. 
automodule:: esmvalcore.io diff --git a/doc/api/esmvalcore.rst b/doc/api/esmvalcore.rst index d160246243..a2833b821e 100644 --- a/doc/api/esmvalcore.rst +++ b/doc/api/esmvalcore.rst @@ -14,6 +14,7 @@ library. This section documents the public API of ESMValCore. esmvalcore.dataset esmvalcore.esgf esmvalcore.exceptions + esmvalcore.io esmvalcore.iris_helpers esmvalcore.local esmvalcore.preprocessor diff --git a/doc/conf.py b/doc/conf.py index 3d5bf9d9d3..8008b73645 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -461,6 +461,7 @@ 'dask': ('https://docs.dask.org/en/stable/', None), 'distributed': ('https://distributed.dask.org/en/stable/', None), 'iris': ('https://scitools-iris.readthedocs.io/en/stable/', None), + 'intake_esgf': ('https://intake-esgf.readthedocs.io/en/stable/', None), 'esmf_regrid': ('https://iris-esmf-regrid.readthedocs.io/en/stable/', None), 'matplotlib': ('https://matplotlib.org/stable/', None), 'ncdata': ('https://ncdata.readthedocs.io/en/stable/', None), diff --git a/doc/configurations b/doc/configurations new file mode 120000 index 0000000000..17a515d17e --- /dev/null +++ b/doc/configurations @@ -0,0 +1 @@ +../esmvalcore/config/configurations \ No newline at end of file diff --git a/environment.yml b/environment.yml index 00a251a4ba..ff4fd48f70 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: - cartopy - cf-units - cftime - - dask >=2025 # github.com/ESMValGroup/ESMValCore/issues/2503 + - dask >=2025 # github.com/ESMValGroup/ESMValCore/issues/2503 - dask-jobqueue - distributed - esgf-pyclient >=0.3.1 @@ -20,13 +20,15 @@ dependencies: - fire - geopy - humanfriendly + - intake-esgf + - globus-sdk <4 # https://github.com/esgf2-us/intake-esgf/issues/150 - intake-esm - - iris >=3.12.2 # https://github.com/SciTools/iris/issues/6417 + - iris >=3.12.2 # https://github.com/SciTools/iris/issues/6417 - iris-esmf-regrid >=0.11.0 - - iris-grib >=0.20.0 # github.com/ESMValGroup/ESMValCore/issues/2535 - - isodate >=0.7.0 # incompatible with very old 0.6.1 + - iris-grib >=0.20.0 # github.com/ESMValGroup/ESMValCore/issues/2535 + - isodate >=0.7.0 # incompatible with very old 0.6.1 - jinja2 - - libnetcdf !=4.9.1 # to avoid hdf5 warnings; only on conda-forge + - libnetcdf !=4.9.1 # to avoid hdf5 warnings; only on conda-forge - nc-time-axis - ncdata - nested-lookup diff --git a/esmvalcore/_main.py b/esmvalcore/_main.py index 7068f5afa2..f2b6f3b03b 100644 --- a/esmvalcore/_main.py +++ b/esmvalcore/_main.py @@ -159,6 +159,97 @@ class Config: files. """ + def __init__(self) -> None: + from rich.console import Console + + self.console = Console() + + def show( + self, + filter: tuple[str] | None = ("extra_facets",), # noqa: A002 + ) -> None: + """Show the current configuration. + + Parameters + ---------- + filter: + Filter this list of keys. By default, the `extra_facets` + key is filtered out, as it can be very large. 
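+
+        Examples
+        --------
+        Hide the (potentially very large) ``extra_facets`` and ``projects``
+        entries when printing the configuration (key names are illustrative;
+        any top-level configuration keys can be filtered)::
+
+            Config().show(filter=("extra_facets", "projects"))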
+ + """ + import yaml + from nested_lookup import nested_delete + from rich.syntax import Syntax + + from esmvalcore.config import CFG + + cfg = dict(CFG) + if filter: + for key in filter: + cfg = nested_delete(cfg, key) + exclude_msg = ( + ", excluding the keys " + ", ".join(f"'{f}'" for f in filter) + if filter + else "" + ) + self.console.print(f"# Current configuration{exclude_msg}:") + self.console.print( + Syntax( + yaml.safe_dump(cfg), + "yaml", + background_color="default", + ), + ) + + def list(self) -> None: + """List all available example configuration files.""" + import importlib.resources + + import esmvalcore.config + + config_dir = ( + importlib.resources.files(esmvalcore.config) / "configurations" + ) + self.console.print("Available configuration files:") + available_files = sorted( + f.name + for f in config_dir.iterdir() + if f.suffix == ".yml" # type: ignore[attr-defined] + ) + self.console.print("\n".join(f"- {f}" for f in available_files)) + + def copy( + self, + source_file: str, + target_file: Path | None = None, + overwrite: bool = False, + ) -> None: + """Copy one of the available example configuration files to the configuration directory.""" + import importlib.resources + + import esmvalcore.config + + target_dir = esmvalcore.config._config_object._get_user_config_dir() # noqa: SLF001 + target_file = target_dir / ( + source_file if target_file is None else target_file + ) + config_dir = ( + importlib.resources.files(esmvalcore.config) / "configurations" + ) + available_files = sorted( + f.name + for f in config_dir.iterdir() + if f.suffix == ".yml" # type: ignore[attr-defined] + ) + if source_file not in available_files: + msg = ( + f"Configuration file {source_file} not found, choose from " + f"{', '.join(available_files)}" + ) + raise FileNotFoundError(msg) + with importlib.resources.as_file(config_dir / source_file) as file: + self._copy_config_file(file, target_file, overwrite=overwrite) + @staticmethod def _copy_config_file( in_file: Path, @@ -184,7 +275,7 @@ def _copy_config_file( logger.info("Creating folder %s", target_folder) target_folder.mkdir(parents=True, exist_ok=True) - logger.info("Copying file %s to path %s.", in_file, out_file) + logger.info("Copying file %s to path %s", in_file, out_file) shutil.copy2(in_file, out_file) logger.info("Copy finished.") diff --git a/esmvalcore/_provenance.py b/esmvalcore/_provenance.py index dc669731e5..a4f3b4c79d 100644 --- a/esmvalcore/_provenance.py +++ b/esmvalcore/_provenance.py @@ -1,33 +1,48 @@ """Provenance module.""" +from __future__ import annotations + import copy import logging import os from functools import total_ordering +from pathlib import Path +from typing import TYPE_CHECKING, Any from netCDF4 import Dataset from PIL import Image from PIL.PngImagePlugin import PngInfo from prov.model import ProvDerivation, ProvDocument -from ._version import __version__ +from esmvalcore._version import __version__ +from esmvalcore.io.protocol import DataElement + +if TYPE_CHECKING: + from collections.abc import Iterable + + import prov.model + + from esmvalcore._task import BaseTask logger = logging.getLogger(__name__) ESMVALTOOL_URI_PREFIX = "https://www.esmvaltool.org/" -def create_namespace(provenance, namespace): +def create_namespace( + provenance: prov.model.ProvBundle, + namespace: str, +) -> None: """Create an esmvaltool namespace.""" provenance.add_namespace(namespace, uri=ESMVALTOOL_URI_PREFIX + namespace) -def get_esmvaltool_provenance(): +def get_esmvaltool_provenance() -> prov.model.ProvActivity: 
"""Create an esmvaltool run activity.""" provenance = ProvDocument() namespace = "software" create_namespace(provenance, namespace) - attributes = {} # TODO: add dependencies with versions here + attributes: dict = {} # TODO: add dependencies with versions here return provenance.activity( namespace + ":esmvaltool==" + __version__, other_attributes=attributes, @@ -37,7 +52,10 @@ def get_esmvaltool_provenance(): ESMVALTOOL_PROVENANCE = get_esmvaltool_provenance() -def attribute_to_authors(entity, authors): +def attribute_to_authors( + entity: prov.model.ProvEntity, + authors: list[dict[str, str]], +) -> None: """Attribute entity to authors.""" namespace = "author" create_namespace(entity.bundle, namespace) @@ -53,7 +71,10 @@ def attribute_to_authors(entity, authors): entity.wasAttributedTo(agent) -def attribute_to_projects(entity, projects): +def attribute_to_projects( + entity: prov.model.ProvEntity, + projects: list[str], +) -> None: """Attribute entity to projects.""" namespace = "project" create_namespace(entity.bundle, namespace) @@ -63,7 +84,10 @@ def attribute_to_projects(entity, projects): entity.wasAttributedTo(agent) -def get_recipe_provenance(documentation, filename): +def get_recipe_provenance( + documentation: dict[str, Any], + filename: Path, +) -> prov.model.ProvEntity: """Create a provenance entity describing a recipe.""" provenance = ProvDocument() @@ -84,7 +108,10 @@ def get_recipe_provenance(documentation, filename): return entity -def get_task_provenance(task, recipe_entity): +def get_task_provenance( + task: BaseTask, + recipe_entity: prov.model.ProvEntity, +) -> prov.model.ProvActivity: """Create a provenance activity describing a task.""" provenance = ProvDocument() create_namespace(provenance, "task") @@ -108,81 +135,102 @@ class TrackedFile: def __init__( self, - filename, - attributes=None, - ancestors=None, - prov_filename=None, + filename: DataElement | Path, + attributes: dict[str, Any] | None = None, + ancestors: Iterable[TrackedFile] | None = None, + prov_filename: str | None = None, ): """Create an instance of a file with provenance tracking. Arguments --------- - filename: str - Path to the file on disk. - attributes: dict + filename: + Path or data element containing the data described by the provenance. + + Attributes + ---------- Dictionary with facets describing the file. If set to None, this will be read from the file when provenance is initialized. - ancestors: :obj:`list` of :obj:`TrackedFile` + ancestors: Ancestor files. - prov_filename: str + prov_filename: The path this file has in the provenance record. This can differ from `filename` if the file was moved before resuming processing. 
""" self._filename = filename if prov_filename is None: - self.prov_filename = filename + self.prov_filename = ( + str(filename) if isinstance(filename, Path) else filename.name + ) else: self.prov_filename = prov_filename + self.attributes = copy.deepcopy(attributes) self.provenance = None self.entity = None self.activity = None - self._ancestors = [] if ancestors is None else ancestors + self._ancestors = [] if ancestors is None else list(ancestors) + + @property + def attributes(self) -> dict[str, Any]: + """Attributes describing the file.""" + if self._attributes is None: + msg = f"Call {self.__class__.__name__}.initialize_provenance before accessing attributes" + raise ValueError(msg) + return self._attributes - def __str__(self): + @attributes.setter + def attributes(self, value: dict[str, Any] | None): + """Set attributes describing the file.""" + self._attributes = value + + def __str__(self) -> str: """Return summary string.""" return f"{self.__class__.__name__}: {self.filename}" - def __repr__(self): + def __repr__(self) -> str: """Return representation string (e.g., used by ``pformat``).""" return f"{self.__class__.__name__}: {self.filename}" - def __eq__(self, other): + def __eq__(self, other) -> bool: """Check if `other` equals `self`.""" return hasattr(other, "filename") and self.filename == other.filename - def __lt__(self, other): + def __lt__(self, other) -> bool: """Check if `other` should be sorted before `self`.""" return hasattr(other, "filename") and self.filename < other.filename - def __hash__(self): + def __hash__(self) -> int: """Return a unique hash for the file.""" return hash(self.filename) - def copy_provenance(self): + def copy_provenance(self) -> TrackedFile: """Create a copy with identical provenance information.""" if self.provenance is None: msg = f"Provenance of {self} not initialized" raise ValueError(msg) - new = TrackedFile(self.filename, self.attributes) + new = TrackedFile(Path(self.filename), self.attributes) new.provenance = copy.deepcopy(self.provenance) new.entity = new.provenance.get_record(self.entity.identifier)[0] new.activity = new.provenance.get_record(self.activity.identifier)[0] return new @property - def filename(self): - """Filename.""" + def filename(self) -> DataElement | Path: + """Name of data described by this provenance document.""" return self._filename @property - def provenance_file(self): - """Filename of provenance.""" - return os.path.splitext(self.filename)[0] + "_provenance.xml" - - def initialize_provenance(self, activity): + def provenance_file(self) -> Path: + """Filename of provenance file.""" + if not isinstance(self.filename, Path): + msg = f"Saving provenance is only supported for pathlib.Path, not {type(self.filename)}" + raise NotImplementedError(msg) + return self.filename.with_name(f"{self.filename.stem}_provenance.xml") + + def initialize_provenance(self, activity: prov.model.ProvActivity) -> None: """Initialize the provenance document. Note: this also copies the ancestor provenance. 
Therefore, changes
@@ -191,30 +239,33 @@ def initialize_provenance(self, activity):
        """
        if self.provenance is not None:
            msg = f"Provenance of {self} already initialized"
-            raise ValueError(
-                msg,
-            )
+            raise ValueError(msg)
        self.provenance = ProvDocument()
        self._initialize_namespaces()
        self._initialize_activity(activity)
        self._initialize_entity()
        self._initialize_ancestors(activity)

-    def _initialize_namespaces(self):
+    def _initialize_namespaces(self) -> None:
        """Initialize the namespaces."""
        for namespace in ("file", "attribute", "preprocessor", "task"):
            create_namespace(self.provenance, namespace)

-    def _initialize_activity(self, activity):
+    def _initialize_activity(self, activity: prov.model.ProvActivity) -> None:
        """Copy the preprocessor task activity."""
        self.activity = activity
-        self.provenance.update(activity.bundle)
+        self.provenance.update(activity.bundle)  # type: ignore[attr-defined]

-    def _initialize_entity(self):
+    def _initialize_entity(self) -> None:
        """Initialize the entity representing the file."""
-        if self.attributes is None:
-            # This happens for ancestor files of preprocessor files as created
-            # in esmvalcore.preprocessor.Processorfile.__init__.
+        if self._attributes is None:
+            if not isinstance(self.filename, DataElement):
+                msg = "Delayed reading of attributes is only supported for `DataElement`s"
+                raise TypeError(msg)
+            # This is used to delay reading the attributes of ancestor files
+            # of preprocessor files as created in
+            # esmvalcore.preprocessor.PreprocessorFile.__init__ until after
+            # the data has been loaded.
            self.attributes = copy.deepcopy(self.filename.attributes)

        attributes = {
@@ -222,38 +273,44 @@ def _initialize_entity(self):
            for k, v in self.attributes.items()
            if k not in ("authors", "projects")
        }
-        self.entity = self.provenance.entity(
-            f"file:{self.filename}",
+        self.entity = self.provenance.entity(  # type: ignore[attr-defined]
+            f"file:{self.prov_filename}",
            attributes,
        )

        attribute_to_authors(self.entity, self.attributes.get("authors", []))
        attribute_to_projects(self.entity, self.attributes.get("projects", []))

-    def _initialize_ancestors(self, activity):
+    def _initialize_ancestors(self, activity: prov.model.ProvActivity) -> None:
        """Register ancestor files for provenance tracking."""
        for ancestor in self._ancestors:
            if ancestor.provenance is None:
-                if os.path.exists(ancestor.provenance_file):
+                if (
+                    isinstance(ancestor.filename, Path)
+                    and ancestor.provenance_file.exists()
+                ):
                    ancestor.restore_provenance()
                else:
                    ancestor.initialize_provenance(activity)
-            self.provenance.update(ancestor.provenance)
+            self.provenance.update(ancestor.provenance)  # type: ignore[attr-defined]
            self.wasderivedfrom(ancestor)

-    def wasderivedfrom(self, other):
+    def wasderivedfrom(
+        self,
+        other: TrackedFile | prov.model.ProvEntity,
+    ) -> None:
        """Let the file know that it was derived from other."""
        if isinstance(other, TrackedFile):
            other_entity = other.entity
        else:
            other_entity = other
-        self.provenance.update(other_entity.bundle)

        if not self.activity:
-            msg = "Activity not initialized."
+ msg = f"Provenance of {self} not initialized" raise ValueError(msg) + self.provenance.update(other_entity.bundle) # type: ignore[attr-defined, union-attr] self.entity.wasDerivedFrom(other_entity, self.activity) - def _select_for_include(self): + def _select_for_include(self) -> dict[str, str]: attributes = { "software": f"Created with ESMValTool v{__version__}", } @@ -262,13 +319,19 @@ def _select_for_include(self): return attributes @staticmethod - def _include_provenance_nc(filename, attributes): + def _include_provenance_nc( + filename: Path, + attributes: dict[str, str], + ) -> None: with Dataset(filename, "a") as dataset: for key, value in attributes.items(): setattr(dataset, key, value) @staticmethod - def _include_provenance_png(filename, attributes): + def _include_provenance_png( + filename: Path, + attributes: dict[str, str], + ) -> None: pnginfo = PngInfo() exif_tags = { "caption": "ImageDescription", @@ -279,8 +342,11 @@ def _include_provenance_png(filename, attributes): with Image.open(filename) as image: image.save(filename, pnginfo=pnginfo) - def _include_provenance(self): + def _include_provenance(self) -> None: """Include provenance information as metadata.""" + if not isinstance(self.filename, Path): + msg = f"Writing attributes is only supported for pathlib.Path, not {type(self.filename)}" + raise NotImplementedError(msg) attributes = self._select_for_include() # Attach provenance to supported file types @@ -289,32 +355,32 @@ def _include_provenance(self): if write: write(self.filename, attributes) - def save_provenance(self): + def save_provenance(self) -> None: """Export provenance information.""" self.provenance = ProvDocument( - records=set(self.provenance.records), - namespaces=self.provenance.namespaces, + records=set(self.provenance.records), # type: ignore[attr-defined] + namespaces=self.provenance.namespaces, # type: ignore[attr-defined] ) self._include_provenance() with open(self.provenance_file, "wb") as file: # Create file with correct permissions before saving. 
- self.provenance.serialize(file, format="xml") + self.provenance.serialize(file, format="xml") # type: ignore[attr-defined] self.activity = None self.entity = None self.provenance = None - def restore_provenance(self): + def restore_provenance(self) -> None: """Import provenance information from a previously saved file.""" self.provenance = ProvDocument.deserialize( self.provenance_file, format="xml", ) entity_uri = f"{ESMVALTOOL_URI_PREFIX}file{self.prov_filename}" - self.entity = self.provenance.get_record(entity_uri)[0] + self.entity = self.provenance.get_record(entity_uri)[0] # type: ignore[attr-defined] # Find the associated activity - for rec in self.provenance.records: + for rec in self.provenance.records: # type: ignore[attr-defined] if isinstance(rec, ProvDerivation): - if rec.args[0] == self.entity.identifier: + if rec.args[0] == self.entity.identifier: # type: ignore[attr-defined] activity_id = rec.args[2] - self.activity = self.provenance.get_record(activity_id)[0] + self.activity = self.provenance.get_record(activity_id)[0] # type: ignore[attr-defined] break diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 70bc46eeb6..51cf777caa 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -16,7 +16,7 @@ import esmvalcore.preprocessor from esmvalcore.exceptions import InputFilesNotFound, RecipeError -from esmvalcore.local import _get_start_end_year, _parse_period +from esmvalcore.local import _parse_period from esmvalcore.preprocessor import TIME_PREPROCESSORS, PreprocessingTask from esmvalcore.preprocessor._multimodel import _get_operator_and_kwargs from esmvalcore.preprocessor._other import _get_var_info @@ -231,7 +231,9 @@ def data_availability(dataset: Dataset, log: bool = True) -> None: msg = f"Missing data for {dataset.summary(True)}" raise InputFilesNotFound(msg) - if "timerange" not in facets: + if "timerange" not in facets or any( + "timerange" not in f.facets for f in input_files + ): return start_date, end_date = _parse_period(facets["timerange"]) @@ -241,8 +243,10 @@ def data_availability(dataset: Dataset, log: bool = True) -> None: available_years: set[int] = set() for file in input_files: - start, end = _get_start_end_year(file) - available_years.update(range(start, end + 1)) + start_date, end_date = file.facets["timerange"].split("/") # type: ignore[union-attr] + start_year = int(start_date[:4]) + end_year = int(end_date[:4]) + available_years.update(range(start_year, end_year + 1)) missing_years = required_years - available_years if missing_years: diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index d6285c9aed..d9777e2582 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -23,6 +23,7 @@ from esmvalcore.dataset import Dataset from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.local import ( + GRIB_FORMATS, _dates_to_timerange, _get_multiproduct_filename, _get_output_file, @@ -38,7 +39,6 @@ PreprocessorFile, ) from esmvalcore.preprocessor._area import _update_shapefile_path -from esmvalcore.preprocessor._io import GRIB_FORMATS from esmvalcore.preprocessor._multimodel import _get_stat_identifier from esmvalcore.preprocessor._regrid import ( _spec_to_latlonvals, @@ -60,6 +60,7 @@ from collections.abc import Iterable, Sequence from esmvalcore.config import Session + from esmvalcore.io.protocol import DataElement from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -328,20 +329,12 @@ def _update_weighting_settings( 
_exclude_dataset(settings, facets, "weighting_landsea_fraction") -def _add_to_download_list(dataset: Dataset) -> None: - """Add the files of `dataset` to `DOWNLOAD_FILES`.""" - for i, file in enumerate(dataset.files): - if isinstance(file, esgf.ESGFFile): - DOWNLOAD_FILES.add(file) - dataset.files[i] = file.local_file(dataset.session["download_dir"]) - - def _schedule_for_download(datasets: Iterable[Dataset]) -> None: """Schedule files for download.""" for dataset in datasets: - _add_to_download_list(dataset) + DOWNLOAD_FILES.update(dataset.files) for supplementary_ds in dataset.supplementaries: - _add_to_download_list(supplementary_ds) + DOWNLOAD_FILES.update(supplementary_ds.files) def _log_input_files(datasets: Iterable[Dataset]) -> None: @@ -367,12 +360,7 @@ def _log_input_files(datasets: Iterable[Dataset]) -> None: def _get_files_str(dataset: Dataset) -> str: """Get nice string representation of all files of a dataset.""" - return "\n".join( - f" {f}" - if f.exists() # type: ignore - else f" {f} (will be downloaded)" - for f in dataset.files - ) + return "\n".join(f" {f}" for f in dataset.files) def _check_input_files(input_datasets: Iterable[Dataset]) -> set[str]: @@ -455,10 +443,7 @@ def _get_common_attributes( # Ensure that attributes start_year and end_year are always available if at # least one of the input datasets defines it - if "timerange" in attributes: - start_year, end_year = _parse_period(attributes["timerange"]) - attributes["start_year"] = int(str(start_year[0:4])) - attributes["end_year"] = int(str(end_year[0:4])) + _set_start_end_year(attributes) return attributes @@ -722,7 +707,7 @@ def _get_preprocessor_products( ) for product in products: - _set_start_end_year(product) + _set_start_end_year(product.attributes) product.check() return products @@ -782,18 +767,18 @@ def _configure_multi_product_preprocessor( for product in multimodel_products | ensemble_products: product.check() - _set_start_end_year(product) + _set_start_end_year(product.attributes) -def _set_start_end_year(product: PreprocessorFile) -> None: +def _set_start_end_year(attributes: dict[str, Any]) -> None: """Set the attributes `start_year` and `end_year`. These attributes are used by many diagnostic scripts in ESMValTool. 
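+    For example (values are illustrative): a ``timerange`` of ``'2000/2014'``
+    results in ``start_year: 2000`` and ``end_year: 2014``.
+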
""" - if "timerange" in product.attributes: - start_year, end_year = _parse_period(product.attributes["timerange"]) - product.attributes["start_year"] = int(str(start_year[0:4])) - product.attributes["end_year"] = int(str(end_year[0:4])) + if "timerange" in attributes: + start_year, end_year = _parse_period(attributes["timerange"]) + attributes["start_year"] = int(str(start_year[0:4])) + attributes["end_year"] = int(str(end_year[0:4])) def _update_preproc_functions( @@ -916,7 +901,7 @@ def __init__( # Clear the global variable containing the set of files to download DOWNLOAD_FILES.clear() USED_DATASETS.clear() - self._download_files: set[esgf.ESGFFile] = set() + self._download_files: set[DataElement] = set() self.session = session self.session["write_ncl_interface"] = self._need_ncl( raw_recipe["diagnostics"], @@ -973,7 +958,7 @@ def _log_recipe_errors(self, exc: RecipeError) -> None: ) @staticmethod - def _need_ncl(raw_diagnostics: Diagnostic) -> bool: + def _need_ncl(raw_diagnostics: dict[str, Diagnostic]) -> bool: if not raw_diagnostics: return False for diagnostic in raw_diagnostics.values(): @@ -996,8 +981,8 @@ def _initialize_provenance(self, raw_documentation: dict[str, Any]): def _initialize_diagnostics( self, - raw_diagnostics: Diagnostic, - ) -> Diagnostic: + raw_diagnostics: dict[str, Diagnostic], + ) -> dict[str, Diagnostic]: """Define diagnostics in recipe.""" logger.debug("Retrieving diagnostics from recipe") check.diagnostics(raw_diagnostics) @@ -1013,7 +998,7 @@ def _initialize_diagnostics( variable_names = tuple(raw_diagnostic.get("variables", {})) diagnostic["scripts"] = self._initialize_scripts( name, - raw_diagnostic.get("scripts"), + raw_diagnostic.get("scripts", {}), variable_names, ) for key in ("themes", "realms"): @@ -1342,8 +1327,10 @@ def run(self) -> None: filled_recipe = self.write_filled_recipe() # Download required data - if self.session["search_esgf"] != "never": - esgf.download(self._download_files, self.session["download_dir"]) + # Add a special case for ESGF files to enable parallel downloads + esgf.download(self._download_files, self.session["download_dir"]) + for file in self._download_files: + file.prepare() self.tasks.run(max_parallel_tasks=self.session["max_parallel_tasks"]) logger.info( diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7aab83719b..5855353cc9 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -5,14 +5,13 @@ import logging from collections.abc import Iterable, Iterator, Sequence from copy import deepcopy -from numbers import Number from typing import TYPE_CHECKING, Any from esmvalcore.cmor.table import _CMOR_KEYS, _update_cmor_facets from esmvalcore.dataset import INHERITED_FACETS, Dataset, _isglob from esmvalcore.esgf.facets import FACETS from esmvalcore.exceptions import RecipeError -from esmvalcore.local import LocalFile, _replace_years_with_timerange +from esmvalcore.local import _replace_years_with_timerange from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( @@ -219,7 +218,7 @@ def _get_supplementary_short_names( var_facets = dict(facets) _update_cmor_facets(var_facets) realms = var_facets.get("modeling_realm", []) - if isinstance(realms, (str, Number, bool)): + if isinstance(realms, (str, int)): realms = [str(realms)] ocean_realms = {"ocean", "seaIce", "ocnBgchem"} is_ocean_variable = any(realm in ocean_realms for realm in realms) @@ 
-511,16 +510,12 @@ def _report_unexpanded_globs( expanded_ds.supplementaries = [] if expanded_ds.files: - if any(isinstance(f, LocalFile) for f in expanded_ds.files): - paths_msg = "paths to the " - else: - paths_msg = "" msg = ( - f"{msg}\nDo the {paths_msg}files:\n" + f"{msg}\nPlease check why the files:\n" + "\n".join( f"{f} with facets: {f.facets}" for f in expanded_ds.files ) - + "\nprovide the missing facet values?" + + "\ndo not provide the missing facet values." ) else: timerange = expanded_ds.facets.get("timerange") diff --git a/esmvalcore/_task.py b/esmvalcore/_task.py index d2c0831ed3..fe77472888 100644 --- a/esmvalcore/_task.py +++ b/esmvalcore/_task.py @@ -351,7 +351,7 @@ def __init__(self, prev_preproc_dir, preproc_dir, name): for prov_filename, attributes in prev_metadata.items(): # Update the filename in case the output directory was moved # since the original run - filename = str(prev_preproc_dir / Path(prov_filename).name) + filename = prev_preproc_dir / Path(prov_filename).name attributes["filename"] = filename product = TrackedFile( filename, @@ -676,7 +676,7 @@ def _run(self, input_files): msg, ) - def _collect_provenance(self): + def _collect_provenance(self) -> None: """Process provenance information provided by the diagnostic script.""" provenance_file = ( Path(self.settings["run_dir"]) / "diagnostic_provenance.yml" @@ -766,7 +766,7 @@ def _collect_provenance(self): TAGS.replace_tags_in_dict(attributes) - product = TrackedFile(filename, attributes, ancestors) + product = TrackedFile(Path(filename), attributes, ancestors) product.initialize_provenance(self.activity) _write_citation_files(product.filename, product.provenance) product.save_provenance() diff --git a/esmvalcore/cmor/_fixes/icon/_base_fixes.py b/esmvalcore/cmor/_fixes/icon/_base_fixes.py index c4c12da334..3e268ec29e 100644 --- a/esmvalcore/cmor/_fixes/icon/_base_fixes.py +++ b/esmvalcore/cmor/_fixes/icon/_base_fixes.py @@ -23,9 +23,10 @@ from iris.cube import Cube, CubeList from iris.mesh import Connectivity, MeshXY +import esmvalcore.local from esmvalcore.cmor._fixes.native_datasets import NativeDatasetFix +from esmvalcore.config._data_sources import _get_data_sources from esmvalcore.iris_helpers import add_leading_dim_to_cube, date2num -from esmvalcore.local import _get_data_sources logger = logging.getLogger(__name__) @@ -322,10 +323,11 @@ def _get_grid_from_cube_attr(self, cube: Cube) -> Cube: def _get_grid_from_rootpath(self, grid_name: str) -> CubeList | None: """Try to get grid from the ICON rootpath.""" glob_patterns: list[Path] = [] - for data_source in _get_data_sources("ICON"): - glob_patterns.extend( - data_source.get_glob_patterns(**self.extra_facets), - ) + for data_source in _get_data_sources(self.session, "ICON"): # type: ignore[arg-type] + if isinstance(data_source, esmvalcore.local.LocalDataSource): + glob_patterns.extend( + data_source.get_glob_patterns(**self.extra_facets), + ) possible_grid_paths = [d.parent / grid_name for d in glob_patterns] for grid_path in possible_grid_paths: if grid_path.is_file(): diff --git a/esmvalcore/cmor/check.py b/esmvalcore/cmor/check.py index cfc803a796..e2d40aa05c 100644 --- a/esmvalcore/cmor/check.py +++ b/esmvalcore/cmor/check.py @@ -14,6 +14,7 @@ import iris.exceptions import iris.util import numpy as np +import yaml from esmvalcore.cmor._utils import ( _get_alternative_generic_lev_coord, @@ -50,6 +51,12 @@ class CheckLevels(IntEnum): """Do not fail for any discrepancy with CMOR standards.""" +yaml.representer.SafeRepresenter.add_representer( + 
CheckLevels,
+    lambda dumper, data: dumper.represent_str(data.name.lower()),
+)
+
+
 class CMORCheckError(Exception):
     """Exception raised when a cube does not pass the CMORCheck."""
 
diff --git a/esmvalcore/config/_config.py b/esmvalcore/config/_config.py
index 6a3670a7ca..121ee2b126 100644
--- a/esmvalcore/config/_config.py
+++ b/esmvalcore/config/_config.py
@@ -94,7 +94,7 @@ def warn_if_old_extra_facets_exist() -> None:
         )
 
 
-def load_config_developer(cfg_file):
+def load_config_developer(cfg_file) -> dict:
     """Read the developer's configuration file."""
     with open(cfg_file, encoding="utf-8") as file:
         cfg = yaml.safe_load(file)
@@ -120,6 +120,7 @@ def load_config_developer(cfg_file):
         CFG[project] = settings
 
     read_cmor_tables(cfg_file)
+    return cfg
 
 
 def get_project_config(project):
diff --git a/esmvalcore/config/_config_object.py b/esmvalcore/config/_config_object.py
index 0030b23adb..175c8115a4 100644
--- a/esmvalcore/config/_config_object.py
+++ b/esmvalcore/config/_config_object.py
@@ -13,6 +13,7 @@
 import yaml
 
 import esmvalcore
+from esmvalcore.config._config import load_config_developer
 from esmvalcore.config._config_validators import (
     _deprecated_options_defaults,
     _deprecators,
@@ -50,9 +51,7 @@ def _get_user_config_dir() -> Path:
                 f"ESMVALTOOL_CONFIG_DIR environment variable: "
                 f"{user_config_dir} is not an existing directory"
             )
-            raise NotADirectoryError(
-                msg,
-            )
+            raise NotADirectoryError(msg)
         return user_config_dir
     return Path.home() / ".config" / "esmvaltool"
@@ -85,10 +84,7 @@ class Config(ValidatedConfig):
     _validate = _validators
     _deprecate = _deprecators
     _deprecated_defaults = _deprecated_options_defaults
-    _warn_if_missing = (
-        ("drs", URL),
-        ("rootpath", URL),
-    )
+    _warn_if_missing = (("projects", URL),)
 
     def __init__(self, *args, **kwargs):
         """Initialize class instance."""
@@ -145,6 +141,10 @@ def _load_user_config(
 
         try:
             new.update(mapping)
+            # Add known projects from config-developer file while we still have it.
+            for project in load_config_developer(new["config_developer_file"]):
+                if project not in new["projects"]:
+                    new["projects"][project] = {}
             new.check_missing()
         except InvalidConfigParameter as exc:
             msg = (
@@ -362,7 +362,10 @@ def load_from_dirs(self, dirs: Iterable[str | Path]) -> None:
         new_config_dict = self._get_config_dict_from_dirs(dirs)
         self.clear()
         self.update(new_config_dict)
-
+        # Add known projects from config-developer file while we still have it.
+        for project in load_config_developer(self["config_developer_file"]):
+            if project not in self["projects"]:
+                self["projects"][project] = {}
         self.check_missing()
 
     def reload(self) -> None:
diff --git a/esmvalcore/config/_config_validators.py b/esmvalcore/config/_config_validators.py
index c61e4ea309..df45da3c93 100644
--- a/esmvalcore/config/_config_validators.py
+++ b/esmvalcore/config/_config_validators.py
@@ -347,6 +347,7 @@ def validate_projects(value: Any) -> Any:
     """Validate projects mapping."""
     mapping = validate_dict(value)
     options_for_project: dict[str, Callable[[Any], Any]] = {
+        "data": validate_dict,  # TODO: try to create data sources here
         "extra_facets": validate_dict,
     }
     for project, project_config in mapping.items():
@@ -490,11 +491,67 @@ def deprecate_extra_facets_dir(
     _handle_deprecation(option, deprecated_version, remove_version, more_info)
 
 
+def deprecate_rootpath(
+    validated_config: ValidatedConfig,
+    value: Any,
+    validated_value: Any,
+) -> None:
+    """Deprecate ``rootpath`` option.
+
+    Parameters
+    ----------
+    validated_config:
+        ``ValidatedConfig`` instance which will be modified in place.
+    value:
+        Raw input value for ``rootpath`` option.
+    validated_value:
+        Validated value for ``rootpath`` option.
+
+    """
+    validated_config  # noqa: B018
+    value  # noqa: B018
+    validated_value  # noqa: B018
+    option = "rootpath"
+    deprecated_version = "2.13.0"
+    remove_version = "2.16.0"
+    more_info = " Please define data sources using the option `projects: data:` instead."
+    _handle_deprecation(option, deprecated_version, remove_version, more_info)
+
+
+def deprecate_drs(
+    validated_config: ValidatedConfig,
+    value: Any,
+    validated_value: Any,
+) -> None:
+    """Deprecate ``drs`` option.
+
+    Parameters
+    ----------
+    validated_config:
+        ``ValidatedConfig`` instance which will be modified in place.
+    value:
+        Raw input value for ``drs`` option.
+    validated_value:
+        Validated value for ``drs`` option.
+
+    """
+    validated_config  # noqa: B018
+    value  # noqa: B018
+    validated_value  # noqa: B018
+    option = "drs"
+    deprecated_version = "2.13.0"
+    remove_version = "2.16.0"
+    more_info = " Please define data sources using the option `projects: data:` instead."
+    _handle_deprecation(option, deprecated_version, remove_version, more_info)
+
+
 # Example usage: see removed files in
 # https://github.com/ESMValGroup/ESMValCore/pull/2213
 _deprecators: dict[str, Callable] = {
     "config_file": deprecate_config_file,  # TODO: remove in v2.14.0
     "extra_facets_dir": deprecate_extra_facets_dir,  # TODO: remove in v2.15.0
+    "drs": deprecate_drs,  # TODO: remove in v2.16.0
+    "rootpath": deprecate_rootpath,  # TODO: remove in v2.16.0
 }
diff --git a/esmvalcore/config/_data_sources.py b/esmvalcore/config/_data_sources.py
new file mode 100644
index 0000000000..7da25beb6b
--- /dev/null
+++ b/esmvalcore/config/_data_sources.py
@@ -0,0 +1,71 @@
+"""Module for configuring data sources."""
+
+import logging
+
+import esmvalcore.esgf
+import esmvalcore.local
+from esmvalcore.config import Session
+from esmvalcore.exceptions import RecipeError
+from esmvalcore.io import load_data_sources
+from esmvalcore.io.protocol import DataSource
+
+logger = logging.getLogger(__name__)
+
+
+def _get_data_sources(
+    session: Session,
+    project: str,
+) -> list[DataSource]:
+    """Get the list of available data sources including legacy configuration.
+
+    Arguments
+    ---------
+    session:
+        The configuration.
+    project:
+        Data sources for this project are returned.
+
+    Returns
+    -------
+    :obj:`list` of :obj:`DataSource`:
+        A list of available data sources.
+
+    Raises
+    ------
+    ValueError:
+        If the project or its settings are not found in the configuration.
+
+    """
+    try:
+        return load_data_sources(session, project)
+    except ValueError:
+        pass
+
+    # Use legacy data sources from config-user.yml and config-developer.yml.
+    data_sources: list[DataSource] = []
+    try:
+        legacy_local_data_sources = esmvalcore.local._get_data_sources(project)  # noqa: SLF001
+    except (RecipeError, KeyError):
+        # The project is not configured in config-developer.yml
+        legacy_local_data_sources = []
+    else:
+        if (
+            session["search_esgf"] != "never"
+            and project in esmvalcore.esgf.facets.FACETS
+        ):
+            data_source = esmvalcore.esgf.ESGFDataSource(
+                name="legacy-esgf",
+                project=project,
+                priority=2,
+                download_dir=session["download_dir"],
+            )
+            data_sources.append(data_source)
+        data_sources.extend(legacy_local_data_sources)
+
+    if not data_sources:
+        msg = (
+            f"No data sources found for project '{project}'. 
" + f"Check your configuration under 'projects: {project}: data'" + ) + raise ValueError(msg) + return data_sources diff --git a/esmvalcore/config/_validated_config.py b/esmvalcore/config/_validated_config.py index 0dfca3b521..624068c411 100644 --- a/esmvalcore/config/_validated_config.py +++ b/esmvalcore/config/_validated_config.py @@ -57,7 +57,7 @@ class ValidatedConfig(MutableMapping): """ # validate values on the way in - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__() self._mapping: dict[str, Any] = {} self.update(*args, **kwargs) diff --git a/esmvalcore/config/config-logging.yml b/esmvalcore/config/config-logging.yml index 6635ca63ec..8d4fb94b17 100644 --- a/esmvalcore/config/config-logging.yml +++ b/esmvalcore/config/config-logging.yml @@ -1,25 +1,28 @@ # Logger configuration --- - version: 1 disable_existing_loggers: false formatters: console: - format: '%(asctime)s UTC [%(process)d] %(levelname)-7s %(message)s' + format: "%(asctime)s UTC [%(process)d] %(levelname)-7s %(message)s" brief: - format: '%(levelname)-7s [%(process)d] %(message)s' + format: "%(levelname)-7s [%(process)d] %(message)s" debug: - format: '%(asctime)s UTC [%(process)d] %(levelname)-7s %(name)s:%(lineno)s %(message)s' + format: "%(asctime)s UTC [%(process)d] %(levelname)-7s %(name)s:%(lineno)s %(message)s" filters: - only_cmor: # only events from CMOR check and generic fixes + only_cmor: # only events from CMOR check and generic fixes (): esmvalcore.config._logging.FilterMultipleNames names: [esmvalcore.cmor.check, esmvalcore.cmor._fixes.fix.genericfix] mode: allow - no_cmor: # no events from CMOR check and generic fixes + no_cmor: # no events from CMOR check and generic fixes (): esmvalcore.config._logging.FilterMultipleNames names: [esmvalcore.cmor.check, esmvalcore.cmor._fixes.fix.genericfix] mode: disallow - no_external_warnings: # no events from external Python warnings + no_intake_esgf: # no events from intake-esgf + (): esmvalcore.config._logging.FilterMultipleNames + names: ["intake-esgf"] + mode: disallow + no_external_warnings: # no events from external Python warnings (): esmvalcore.config._logging.FilterExternalWarnings handlers: console: @@ -27,21 +30,21 @@ handlers: level: INFO formatter: console stream: ext://sys.stdout - filters: [no_cmor, no_external_warnings] + filters: [no_cmor, no_external_warnings, no_intake_esgf] simple_log_file: class: logging.FileHandler level: INFO formatter: brief filename: main_log.txt mode: w - filters: [no_cmor, no_external_warnings] + filters: [no_cmor, no_external_warnings, no_intake_esgf] debug_log_file: class: logging.FileHandler level: DEBUG formatter: debug filename: main_log_debug.txt mode: w - cmor_log: # only contains output from CMOR check and generic fixes + cmor_log: # only contains output from CMOR check and generic fixes class: logging.FileHandler level: INFO formatter: brief diff --git a/esmvalcore/config/configurations/access-data.yml b/esmvalcore/config/configurations/access-data.yml new file mode 100644 index 0000000000..259bc32cd4 --- /dev/null +++ b/esmvalcore/config/configurations/access-data.yml @@ -0,0 +1,13 @@ +projects: + ACCESS: + data: + access-sub-dataset: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{dataset}/{sub_dataset}/{exp}/{modeling_realm}/netCDF" + filename_template: "{sub_dataset}.{freq_attribute}-*.nc" + access-ocean: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: 
"{dataset}/{sub_dataset}/{exp}/{modeling_realm}/netCDF" + filename_template: "ocean_{freq_attribute}.nc-*" diff --git a/esmvalcore/config/configurations/badc-data.yml b/esmvalcore/config/configurations/badc-data.yml new file mode 100644 index 0000000000..0b9558335f --- /dev/null +++ b/esmvalcore/config/configurations/badc-data.yml @@ -0,0 +1,50 @@ +projects: + CMIP6: + data: + badc: + type: "esmvalcore.local.LocalDataSource" + rootpath: /badc/cmip6/data + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + CMIP5: + data: + badc: + type: "esmvalcore.local.LocalDataSource" + rootpath: /badc/cmip5/data + dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + CMIP3: + data: + badc: + type: "esmvalcore.local.LocalDataSource" + rootpath: /badc/cmip3_drs/data + dirname_template: "{project.lower}/output/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{short_name}/{ensemble}/{version}" + filename_template: "{short_name}_*.nc" + CORDEX: + data: + badc: + type: "esmvalcore.local.LocalDataSource" + rootpath: /badc/cordex/data + dirname_template: "{project}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{institute}-{dataset}/{rcm_version}/{mip}/{short_name}/{version}" + filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc" + obs4MIPs: + data: + badc: + type: "esmvalcore.local.LocalDataSource" + rootpath: /gws/nopw/j04/esmeval/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{short_name}_*.nc" + OBS6: + data: + badc: + type: "esmvalcore.local.LocalDataSource" + rootpath: /gws/nopw/j04/esmeval/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" + OBS: + data: + badc: + type: "esmvalcore.local.LocalDataSource" + rootpath: /gws/nopw/j04/esmeval/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" diff --git a/esmvalcore/config/configurations/cesm-data.yml b/esmvalcore/config/configurations/cesm-data.yml new file mode 100644 index 0000000000..95439d6d8c --- /dev/null +++ b/esmvalcore/config/configurations/cesm-data.yml @@ -0,0 +1,14 @@ +projects: + CESM: + data: + run: &cesm + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "" # run directory + filename_template: "{case}.{scomp}.{type}.{string}*nc" + short-term-archive: + <<: *cesm + dirname_template: "{case}/{gcomp}/hist" # short-term archiving + postprocessed: + <<: *cesm + dirname_template: "{case}/{gcomp}/proc/{tdir}/{tperiod}" # postprocessed data diff --git a/esmvalcore/config/configurations/defaults/config-user.yml b/esmvalcore/config/configurations/defaults/config-user.yml index b2f8950a1c..a13aa6e32d 100644 --- a/esmvalcore/config/configurations/defaults/config-user.yml +++ b/esmvalcore/config/configurations/defaults/config-user.yml @@ -4,17 +4,12 @@ # # Note for users: # -------------- -# Site-specific entries for different HPC centers are given at the bottom of -# this file. Comment out/replace as needed. This default version of the file -# can be used in combination with the command line argument -# ``search_esgf=when_missing``. 
If only certain values are allowed for an -# option, these are listed after ``---``. The option in square brackets is the -# default value, i.e., the one that is used if this option is omitted in the -# file. +# If only certain values are allowed for an option, these are listed after +# ``---``. The option in square brackets is the default value, i.e., the one +# that is used if this option is omitted in the file. # ############################################################################### --- - # Destination directory where all output will be written # Includes log files and performance stats. output_dir: ~/esmvaltool_output @@ -82,225 +77,3 @@ config_developer_file: null # A profiler tells you which functions in your code take most time to run. # Only available for Python diagnostics. profile_diagnostic: false - -# Rootpaths to the data from different projects -# This default setting will work if files have been downloaded by ESMValTool -# via ``search_esgf``. Lists are also possible. For site-specific entries and -# more examples, see below. Comment out these when using a site-specific path. -rootpath: - default: ~/climate_data - -# Directory structure for input data --- [default]/ESGF/BADC/DKRZ/ETHZ/etc. -# This default setting will work if files have been downloaded by ESMValTool -# via ``search_esgf``. See ``config-developer.yml`` for definitions. Comment -# out/replace as per needed. -drs: - CMIP3: ESGF - CMIP5: ESGF - CMIP6: ESGF - CORDEX: ESGF - obs4MIPs: ESGF - -# Example rootpaths and directory structure names for different projects. -# For each project, the entry can be a single path, a list of paths, or a -# mapping from paths to directory structure names. -# For single paths and list of paths, the directory structure names can be -# defined under 'drs'. -# If no path is defined for a project, the tool will look in the 'default' -# path. -# If no directory structure name is given, the name 'default' will be used. -# Directory structures corresponding to the names are defined in the file -# config-developer.yml. -# For site-specific entries, see below. -#rootpath: -# CMIP6: -# /path/to/data: DKRZ -# ~/path/to/more/data: ESGF -# CMIP5: -# - ~/cmip5_inputpath1 -# - ~/cmip5_inputpath2 -# CMIP3: ~/cmip6_inputpath -# OBS: ~/obs_inputpath -# OBS6: ~/obs6_inputpath -# obs4MIPs: ~/obs4mips_inputpath -# ana4mips: ~/ana4mips_inputpath -# native6: ~/native6_inputpath -# RAWOBS: ~/rawobs_inputpath -# default: ~/default_inputpath -#drs: -# CMIP3: ESGF -# CMIP5: ESGF -# CORDEX: ESGF -# obs4MIPs: ESGF - -# Directory tree created by automatically downloading from ESGF -# Uncomment the lines below to locate data that has been automatically -# downloaded from ESGF (using ``search_esgf``). -#rootpath: -# CMIP3: ~/climate_data -# CMIP5: ~/climate_data -# CMIP6: ~/climate_data -# CORDEX: ~/climate_data -# obs4MIPs: ~/climate_data -#drs: -# CMIP3: ESGF -# CMIP5: ESGF -# CMIP6: ESGF -# CORDEX: ESGF -# obs4MIPs: ESGF - -# Site-specific entries: JASMIN -# Uncomment the lines below to locate data on JASMIN. 
-#auxiliary_data_dir: /gws/nopw/j04/esmeval/aux_data/AUX -#rootpath: -# CMIP6: /badc/cmip6/data/CMIP6 -# CMIP5: /badc/cmip5/data/cmip5/output1 -# CMIP3: /badc/cmip3_drs/data/cmip3/output -# OBS: /gws/nopw/j04/esmeval/obsdata-v2 -# OBS6: /gws/nopw/j04/esmeval/obsdata-v2 -# obs4MIPs: /gws/nopw/j04/esmeval/obsdata-v2 -# ana4mips: /gws/nopw/j04/esmeval/obsdata-v2 -# CORDEX: /badc/cordex/data/CORDEX/output -#drs: -# CMIP6: BADC -# CMIP5: BADC -# CMIP3: BADC -# CORDEX: BADC -# OBS: default -# OBS6: default -# obs4MIPs: default -# ana4mips: default - -# Site-specific entries: DKRZ-Levante -# For bd0854 members a shared download directory is available -#search_esgf: when_missing -#download_dir: /work/bd0854/DATA/ESMValTool2/download -# Uncomment the lines below to locate data on Levante at DKRZ. -#auxiliary_data_dir: /work/bd0854/DATA/ESMValTool2/AUX -#rootpath: -# CMIP6: -# /work/bd0854/DATA/ESMValTool2/CMIP6_DKRZ: DKRZ -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# CMIP5: -# /work/bd0854/DATA/ESMValTool2/CMIP5_DKRZ: DKRZ -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# CMIP3: -# /work/bd0854/DATA/ESMValTool2/CMIP3: DKRZ -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# CORDEX: -# /work/ik1017/C3SCORDEX/data/c3s-cordex/output: BADC -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# OBS: /work/bd0854/DATA/ESMValTool2/OBS -# OBS6: /work/bd0854/DATA/ESMValTool2/OBS -# obs4MIPs: -# /work/bd0854/DATA/ESMValTool2/OBS: default -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# ana4mips: /work/bd0854/DATA/ESMValTool2/OBS -# native6: -# /work/bd0854/DATA/ESMValTool2/RAWOBS: default -# /pool/data/ERA5: DKRZ-ERA5-GRIB -# RAWOBS: /work/bd0854/DATA/ESMValTool2/RAWOBS -#drs: -# ana4mips: default -# OBS: default -# OBS6: default -# native6: default - -# Site-specific entries: ETHZ -# Uncomment the lines below to locate data at ETHZ. -#rootpath: -# CMIP6: /net/atmos/data/cmip6 -# CMIP5: /net/atmos/data/cmip5 -# CMIP3: /net/atmos/data/cmip3 -# OBS: /net/exo/landclim/PROJECTS/C3S/datadir/obsdir/ -#drs: -# CMIP6: ETHZ -# CMIP5: ETHZ -# CMIP3: ETHZ - -# Site-specific entries: IPSL -# Uncomment the lines below to locate data on Ciclad at IPSL. -#rootpath: -# IPSLCM: / -# CMIP5: /bdd/CMIP5/output -# CMIP6: /bdd/CMIP6 -# CMIP3: /bdd/CMIP3 -# CORDEX: /bdd/CORDEX/output -# obs4MIPs: /bdd/obs4MIPS/obs-CFMIP/observations -# ana4mips: /not_yet -# OBS: /not_yet -# OBS6: /not_yet -# RAWOBS: /not_yet -#drs: -# CMIP6: DKRZ -# CMIP5: DKRZ -# CMIP3: IPSL -# CORDEX: BADC -# obs4MIPs: IPSL -# ana4mips: default -# OBS: not_yet -# OBS6: not_yet - -# Site-specific entries: Met Office - Old VDI -# Uncomment the lines below to locate data at the Met Office. -#rootpath: -# CMIP5: /project/champ/data/cmip5/output1 -# CMIP6: /project/champ/data/CMIP6 -# CORDEX: /project/champ/data/cordex/output -# OBS: /data/users/esmval/ESMValTool/obs -# OBS6: /data/users/esmval/ESMValTool/obs -# obs4MIPs: /data/users/esmval/ESMValTool/obs -# ana4mips: /project/champ/data/ana4MIPs -# native6: /data/users/esmval/ESMValTool/rawobs -# RAWOBS: /data/users/esmval/ESMValTool/rawobs -#drs: -# CMIP5: BADC -# CMIP6: BADC -# CORDEX: BADC -# OBS: default -# OBS6: default -# obs4MIPs: default -# ana4mips: BADC -# native6: default - -# Site-specific entries: Met Office - New VDI -# Uncomment the lines below to locate data at the Met Office. 
-#rootpath: -# CMIP5: /data/users/managecmip/champ/cmip5/output1 -# CMIP6: /data/users/managecmip/champ/CMIP6 -# CORDEX: /data/users/managecmip/champ/cordex/output -# OBS: /data/users/esmval/ESMValTool/obs -# OBS6: /data/users/esmval/ESMValTool/obs -# obs4MIPs: /data/users/esmval/ESMValTool/obs -# ana4mips: /data/users/managecmip/champ/ana4MIPs -# native6: /data/users/esmval/ESMValTool/rawobs -# RAWOBS: /data/users/esmval/ESMValTool/rawobs -#drs: -# CMIP5: BADC -# CMIP6: BADC -# CORDEX: BADC -# OBS: default -# OBS6: default -# obs4MIPs: default -# ana4mips: BADC -# native6: default - -# Site-specific entries: NCI -# Uncomment the lines below to locate data at NCI. -#rootpath: -# CMIP6: [/g/data/oi10/replicas/CMIP6, /g/data/fs38/publications/CMIP6, /g/data/xp65/public/apps/esmvaltool/replicas/CMIP6] -# CMIP5: [/g/data/r87/DRSv3/CMIP5, /g/data/al33/replicas/CMIP5/combined, /g/data/rr3/publications/CMIP5/output1, /g/data/xp65/public/apps/esmvaltool/replicas/cmip5/output1] -# CMIP3: /g/data/r87/DRSv3/CMIP3 -# OBS: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 -# OBS6: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 -# obs4MIPs: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 -# ana4mips: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 -# native6: /g/data/xp65/public/apps/esmvaltool/native6 -# -#drs: -# CMIP6: NCI -# CMIP5: NCI -# CMIP3: NCI -# CORDEX: ESGF -# obs4MIPs: default -# ana4mips: default diff --git a/esmvalcore/config/configurations/dkrz-data.yml b/esmvalcore/config/configurations/dkrz-data.yml new file mode 100644 index 0000000000..bbf418eae7 --- /dev/null +++ b/esmvalcore/config/configurations/dkrz-data.yml @@ -0,0 +1,87 @@ +projects: + CMIP6: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/ik1017/CMIP6/data + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + esgf-cache: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/download + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + CMIP5: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/kd0956/CMIP5/data + dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + esgf-cache: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/download + dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + CMIP3: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/CMIP3 + dirname_template: "{exp}/{modeling_realm}/{frequency}/{short_name}/{dataset}/{ensemble}" + filename_template: "{short_name}_*.nc" + esgf-cache: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/download + dirname_template: "{project.lower}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{ensemble}/{short_name}/{version}" + filename_template: "{short_name}_*.nc" + CORDEX: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: 
+        dirname_template: "{domain}/{institute}/{driver}/{exp}/{ensemble}/{institute}-{dataset}/{rcm_version}/{mip}/{short_name}/{version}"
+        filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc"
+      esgf-cache:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /work/bd0854/DATA/ESMValTool2/download
+        dirname_template: "{project.lower}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{dataset}/{rcm_version}/{frequency}/{short_name}/{version}"
+        filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc"
+  obs4MIPs:
+    data:
+      dkrz:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /work/bd0854/DATA/ESMValTool2/OBS
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{short_name}_*.nc"
+      esgf-cache:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /work/bd0854/DATA/ESMValTool2/download
+        dirname_template: "{project}/{dataset}/{version}"
+        filename_template: "{short_name}_*.nc"
+  native6:
+    data:
+      dkrz:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /work/bd0854/DATA/ESMValTool2/RAWOBS
+        dirname_template: "Tier{tier}/{dataset}/{version}/{frequency}/{short_name}"
+        filename_template: "*.nc"
+      dkrz-era5:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /pool/data/ERA5
+        dirname_template: "{family}/{level}/{type}/{tres}/{grib_id}"
+        filename_template: "{family}{level}{typeid}_{tres}_*_{grib_id}.grb"
+  OBS6:
+    data:
+      dkrz:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /work/bd0854/DATA/ESMValTool2/OBS
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc"
+  OBS:
+    data:
+      dkrz:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /work/bd0854/DATA/ESMValTool2/OBS
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc"
diff --git a/esmvalcore/config/configurations/emac-data.yml b/esmvalcore/config/configurations/emac-data.yml
new file mode 100644
index 0000000000..5875f68cc8
--- /dev/null
+++ b/esmvalcore/config/configurations/emac-data.yml
@@ -0,0 +1,8 @@
+projects:
+  EMAC:
+    data:
+      emac:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{exp}/{channel}"
+        filename_template: "{exp}*{channel}{postproc_flag}.nc"
diff --git a/esmvalcore/config/configurations/esgf-pyclient-data.yml b/esmvalcore/config/configurations/esgf-pyclient-data.yml
new file mode 100644
index 0000000000..96c253f138
--- /dev/null
+++ b/esmvalcore/config/configurations/esgf-pyclient-data.yml
@@ -0,0 +1,17 @@
+# Use a lower priority than for esmvalcore.local.LocalDataSource
+# to avoid searching ESGF with the setting `search_esgf: when_missing`.
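+# Data sources are searched in order of increasing "priority" value, so this
+# source (priority 10) is only queried after the local data sources
+# (priority 1), and with `search_esgf: when_missing` the search stops as soon
+# as a higher-priority source provides all of the requested data.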
+projects:
+  CMIP6: &esgf-pyclient-data
+    data:
+      esgf-pyclient:
+        type: "esmvalcore.esgf.ESGFDataSource"
+        download_dir: ~/climate_data
+        priority: 10
+  CMIP5:
+    <<: *esgf-pyclient-data
+  CMIP3:
+    <<: *esgf-pyclient-data
+  CORDEX:
+    <<: *esgf-pyclient-data
+  obs4MIPs:
+    <<: *esgf-pyclient-data
diff --git a/esmvalcore/config/configurations/ethz-data.yml b/esmvalcore/config/configurations/ethz-data.yml
new file mode 100644
index 0000000000..c2bead9523
--- /dev/null
+++ b/esmvalcore/config/configurations/ethz-data.yml
@@ -0,0 +1,29 @@
+projects:
+  CMIP6:
+    data:
+      ethz:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /net/atmos/data
+        dirname_template: "{project.lower}/{exp}/{mip}/{short_name}/{dataset}/{ensemble}/{grid}/"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc"
+  CMIP5:
+    data:
+      ethz:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /net/atmos/data
+        dirname_template: "{project.lower}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc"
+  CMIP3:
+    data:
+      ethz:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /net/atmos/data
+        dirname_template: "{project.lower}/{exp}/{modeling_realm}/{frequency}/{short_name}/{dataset}/{ensemble}"
+        filename_template: "{short_name}_*.nc"
+  OBS:
+    data:
+      ethz:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /net/exo/landclim/PROJECTS/C3S/datadir/obsdir/
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc"
diff --git a/esmvalcore/config/configurations/icon-data.yml b/esmvalcore/config/configurations/icon-data.yml
new file mode 100644
index 0000000000..f4c5c4799a
--- /dev/null
+++ b/esmvalcore/config/configurations/icon-data.yml
@@ -0,0 +1,14 @@
+projects:
+  ICON:
+    data:
+      icon: &icon
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{exp}"
+        filename_template: "{exp}_{var_type}*.nc"
+      icon-outdata:
+        <<: *icon
+        dirname_template: "{exp}/outdata"
+      icon-output:
+        <<: *icon
+        dirname_template: "{exp}/output"
diff --git a/esmvalcore/config/configurations/intake-esgf-data.yml b/esmvalcore/config/configurations/intake-esgf-data.yml
new file mode 100644
index 0000000000..e4e8f7e045
--- /dev/null
+++ b/esmvalcore/config/configurations/intake-esgf-data.yml
@@ -0,0 +1,76 @@
+projects:
+  CMIP6:
+    data:
+      intake-esgf:
+        type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource"
+        facets:
+          activity: "activity_drs"
+          dataset: "source_id"
+          ensemble: "member_id"
+          exp: "experiment_id"
+          institute: "institution_id"
+          grid: "grid_label"
+          mip: "table_id"
+          project: "project"
+          short_name: "variable_id"
+  CMIP5:
+    data:
+      intake-esgf:
+        type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource"
+        facets:
+          dataset: "model"
+          ensemble: "ensemble"
+          exp: "experiment"
+          frequency: "time_frequency"
+          institute: "institute"
+          mip: "cmor_table"
+          product: "product"
+          project: "project"
+          short_name: "variable"
+        values:
+          dataset:
+            "ACCESS1-0": "ACCESS1.0"
+            "ACCESS1-3": "ACCESS1.3"
+            "bcc-csm1-1": "BCC-CSM1.1"
+            "bcc-csm1-1-m": "BCC-CSM1.1(m)"
+            "CESM1-BGC": "CESM1(BGC)"
+            "CESM1-CAM5": "CESM1(CAM5)"
+            "CESM1-CAM5-1-FV2": "CESM1(CAM5.1,FV2)"
+            "CESM1-FASTCHEM": "CESM1(FASTCHEM)"
+            "CESM1-WACCM": "CESM1(WACCM)"
+            "CSIRO-Mk3-6-0": "CSIRO-Mk3.6.0"
+            "fio-esm": "FIO-ESM"
+            "GFDL-CM2p1": "GFDL-CM2.1"
+            "inmcm4": "INM-CM4"
+            "MRI-AGCM3-2H": "MRI-AGCM3.2H"
+            "MRI-AGCM3-2S": "MRI-AGCM3.2S"
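+        # The "values" mapping above translates ESMValCore facet values
+        # (keys) to the values used by the ESGF search index; matching
+        # search results are translated back to ESMValCore values.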
+  CMIP3:
+    data:
+      intake-esgf:
+        type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource"
+        facets:
+          dataset: "model"
+          ensemble: "ensemble"
+          exp: "experiment"
+          frequency: "time_frequency"
+          project: "project"
+          short_name: "variable"
+  obs4MIPs:
+    data:
+      intake-esgf-v2:
+        type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource"
+        facets:
+          dataset: "source_id"
+          frequency: "frequency"
+          institute: "institution_id"
+          project: "project"
+          short_name: "variable_id"
+      # TODO: Add support for older ODS V1.0 obs4MIPs (CMIP5 style) data to intake-esgf
+      # intake-esgf-v1:
+      #   type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource"
+      #   facets:
+      #     dataset: "source_id"
+      #     frequency: "time_frequency"
+      #     institute: "institute"
+      #     project: "project"
+      #     short_name: "variable"
diff --git a/esmvalcore/config/configurations/ipsl-data.yml b/esmvalcore/config/configurations/ipsl-data.yml
new file mode 100644
index 0000000000..1a84d47606
--- /dev/null
+++ b/esmvalcore/config/configurations/ipsl-data.yml
@@ -0,0 +1,36 @@
+projects:
+  CMIP6:
+    data:
+      ipsl:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /bdd
+        dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc"
+  CMIP5:
+    data:
+      ipsl:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /bdd
+        dirname_template: "{project}/output/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc"
+  CMIP3:
+    data:
+      ipsl:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /bdd
+        dirname_template: "{project}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{ensemble}/{short_name}/{version}/{short_name}"
+        filename_template: "{short_name}_*.nc"
+  CORDEX:
+    data:
+      ipsl:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /bdd
+        dirname_template: "{project}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{institute}-{dataset}/{rcm_version}/{mip}/{short_name}/{version}"
+        filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc"
+  obs4MIPs:
+    data:
+      ipsl:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /bdd
+        dirname_template: "{project}/obs-CFMIP/observations/{realm}/{short_name}/{frequency}/{grid}/{institute}/{dataset}/{version}"
+        filename_template: "{short_name}_*.nc"
diff --git a/esmvalcore/config/configurations/ipslcm-data.yml b/esmvalcore/config/configurations/ipslcm-data.yml
new file mode 100644
index 0000000000..e51344915d
--- /dev/null
+++ b/esmvalcore/config/configurations/ipslcm-data.yml
@@ -0,0 +1,13 @@
+projects:
+  IPSLCM:
+    data:
+      ipslcm-varname:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{root}/{account}/{model}/{status}/{exp}/{simulation}/{dir}/{out}/{freq}"
+        filename_template: "{simulation}_*_{ipsl_varname}.nc"
+      ipslcm-group:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{root}/{account}/{model}/{status}/{exp}/{simulation}/{dir}/{out}/{freq}"
+        filename_template: "{simulation}_*_{group}.nc"
diff --git a/esmvalcore/config/configurations/local-data.yml b/esmvalcore/config/configurations/local-data.yml
new file mode 100644
index 0000000000..81cc931d46
--- /dev/null
+++ b/esmvalcore/config/configurations/local-data.yml
@@ -0,0 +1,57 @@
+projects:
+  CMIP6:
+    data:
+      local:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc"
+  CMIP5:
+    data:
+      local:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc"
+  CMIP3:
+    data:
+      local:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{project.lower}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{ensemble}/{short_name}/{version}"
+        filename_template: "{short_name}_*.nc"
+  CORDEX:
+    data:
+      local:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{project.lower}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{dataset}/{rcm_version}/{frequency}/{short_name}/{version}"
+        filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc"
+  obs4MIPs:
+    data:
+      local:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "{project}/{dataset}/{version}"
+        filename_template: "{short_name}_*.nc"
+  native6:
+    data:
+      local:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "Tier{tier}/{dataset}/{version}/{frequency}/{short_name}"
+        filename_template: "*.nc"
+  OBS6:
+    data:
+      local:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc"
+  OBS:
+    data:
+      local:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: ~/climate_data
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc"
diff --git a/esmvalcore/config/configurations/mo-data.yml b/esmvalcore/config/configurations/mo-data.yml
new file mode 100644
index 0000000000..a8aa420396
--- /dev/null
+++ b/esmvalcore/config/configurations/mo-data.yml
@@ -0,0 +1,62 @@
+projects:
+  CMIP6:
+    data:
+      mo: &cmip6
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /data/users/managecmip/champ
+        dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc"
+      mo-old-vdi:
+        <<: *cmip6
+        priority: 2
+        rootpath: /project/champ/data
+  CMIP5:
+    data:
+      mo: &cmip5
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /data/users/managecmip/champ
+        dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc"
+      mo-old-vdi:
+        <<: *cmip5
+        priority: 2
+        rootpath: /project/champ/data
+  CORDEX:
+    data:
+      mo: &cordex
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /data/users/managecmip/champ
+        dirname_template: "{project.lower}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{institute}-{dataset}/{rcm_version}/{mip}/{short_name}/{version}"
+        filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc"
+      mo-old-vdi:
+        <<: *cordex
+        priority: 2
+        rootpath: /project/champ/data
+  obs4MIPs:
+    data:
+      mo:
+        type: "esmvalcore.local.LocalDataSource"
"esmvalcore.local.LocalDataSource" + rootpath: /data/users/esmval/ESMValTool/obs + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{short_name}_*.nc" + native6: + data: + mo: + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/esmval/ESMValTool/rawobs + dirname_template: "Tier{tier}/{dataset}/{version}/{frequency}/{short_name}" + filename_template: "*.nc" + OBS6: + data: + mo: + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/esmval/ESMValTool/obs + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" + OBS: + data: + mo: + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/esmval/ESMValTool/obs + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" diff --git a/esmvalcore/config/configurations/nci-data.yml b/esmvalcore/config/configurations/nci-data.yml new file mode 100644 index 0000000000..9179abc1a5 --- /dev/null +++ b/esmvalcore/config/configurations/nci-data.yml @@ -0,0 +1,66 @@ +projects: + CMIP6: + data: + oi10: &cmip6 + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/oi10/replicas + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + fs38: + <<: *cmip6 + rootpath: /g/data/fs38/publications + xp65: + <<: *cmip6 + rootpath: /g/data/xp65/public/apps/esmvaltool/replicas + CMIP5: + data: + r87: &cmip5 + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/r87/DRSv3/CMIP5 + dirname_template: "{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + al33: + <<: *cmip5 + rootpath: /g/data/al33/replicas/CMIP5/combined + rr3: &cmip5-default + <<: *cmip5 + rootpath: /g/data/rr3/publications + dirname_template: "{project}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + xp65: + <<: *cmip5-default + rootpath: /g/data/xp65/public/apps/esmvaltool/replicas + CMIP3: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/r87/DRSv3/CMIP3 + dirname_template: "{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{ensemble}/{short_name}/{latestversion}" + filename_template: "{short_name}_*.nc" + obs4MIPs: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{short_name}_*.nc" + native6: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/xp65/public/apps/esmvaltool/native6 + dirname_template: "Tier{tier}/{dataset}/{version}/{frequency}/{short_name}" + filename_template: "*.nc" + OBS6: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" + OBS: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 
index 229ba59bd9..5ec8a40b9b 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -15,7 +15,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
-from esmvalcore import esgf, local
+from esmvalcore import esgf
 from esmvalcore._recipe import check
 from esmvalcore._recipe.from_datasets import datasets_to_recipe
 from esmvalcore.cmor.table import _get_mips, _update_cmor_facets
@@ -26,11 +26,11 @@
     get_institutes,
     load_extra_facets,
 )
+from esmvalcore.config._data_sources import _get_data_sources
 from esmvalcore.exceptions import InputFilesNotFound, RecipeError
 from esmvalcore.local import (
     _dates_to_timerange,
     _get_output_file,
-    _get_start_end_date,
 )
 from esmvalcore.preprocessor import preprocess
 
@@ -39,6 +39,8 @@
     from iris.cube import Cube
 
+    from esmvalcore.io.protocol import DataElement
+    from esmvalcore.preprocessor import PreprocessorItem
     from esmvalcore.typing import Facets, FacetValue
 
 __all__ = [
@@ -49,8 +51,6 @@
 
 logger = logging.getLogger(__name__)
 
-File = esgf.ESGFFile | local.LocalFile
-
 INHERITED_FACETS: list[str] = [
     "dataset",
     "domain",
@@ -130,8 +130,8 @@ def __init__(self, **facets: FacetValue) -> None:
         self._persist: set[str] = set()
         self._session: Session | None = None
-        self._files: Sequence[File] | None = None
-        self._file_globs: Sequence[Path] | None = None
+        self._files: Sequence[DataElement] | None = None
+        self._file_globs: Sequence[str] = []
 
         for key, value in facets.items():
             self.set_facet(key, deepcopy(value), persist=True)
@@ -192,7 +192,7 @@
     def _file_to_dataset(
         self,
-        file: esgf.ESGFFile | local.LocalFile,
+        file: DataElement,
     ) -> Dataset:
         """Create a dataset from a file with a `facets` attribute."""
         facets = dict(file.facets)
@@ -243,6 +243,12 @@
         expanded = False
         for file in dataset_template.files:
             dataset = self._file_to_dataset(file)
+            # Do not use the timerange facet from the file because there may be multiple
+            # files per dataset.
+            dataset.facets.pop("timerange", None)
+            # Restore the original timerange facet if it was specified.
+            if "timerange" in self.facets:
+                dataset.facets["timerange"] = self.facets["timerange"]
 
             # Filter out identical datasets
             facetset = frozenset(
@@ -267,10 +273,8 @@
         for dataset, file in partially_defined:
             msg = (
                 f"{dataset} with unexpanded wildcards, created from file "
-                f"{file} with facets {file.facets}. Are the missing facets "
-                "in the path to the file?"
-                if isinstance(file, local.LocalFile)
-                else "available on ESGF?"
+                f"{file} with facets {file.facets}. Please check why "
+                "the missing facets are not available for the file."
             )
             if expanded:
                 logger.info("Ignoring %s", msg)
@@ -287,7 +291,6 @@ def from_files(self) -> Iterator[Dataset]:
 
         The facet values for local files are retrieved from the directory
         tree where the directories represent the facets values.
-        Reading facet values from file names is not yet supported.
 
         See :ref:`CMOR-DRS` for more information on this kind of file
         organization.
@@ -750,56 +753,43 @@ def find_files(self) -> None:
             supplementary.find_files()
 
     def _find_files(self) -> None:
-        self.files, self._file_globs = local.find_files(
-            debug=True,
-            **self.facets,
-        )
-
-        # If project does not support automatic downloads from ESGF, stop here
-        if self.facets["project"] not in esgf.facets.FACETS:
-            return
-
-        # 'never' mode: never download files from ESGF and stop here
-        if self.session["search_esgf"] == "never":
-            return
-
-        # 'when_missing' mode: if files are available locally, do not check
-        # ESGF
-        if self.session["search_esgf"] == "when_missing":
-            try:
-                check.data_availability(self, log=False)
-            except InputFilesNotFound:
-                pass  # search ESGF for files
-            else:
-                return  # use local files
-
-        # Local files are not available in 'when_missing' mode or 'always' mode
-        # is used: check ESGF
-        local_files = {f.name: f for f in self.files}
-        search_result = esgf.find_files(**self.facets)
-        for file in search_result:
-            if file.name not in local_files:
-                # Use ESGF files that are not available locally.
-                self.files.append(file)
-            else:
-                # Use ESGF files that are newer than the locally available
-                # files.
-                local_file = local_files[file.name]
-                if "version" in local_file.facets:
-                    if file.facets["version"] > local_file.facets["version"]:
-                        idx = self.files.index(local_file)
-                        self.files[idx] = file
+        def version(file: DataElement) -> str:
+            return str(file.facets.get("version", ""))
+
+        self._file_globs = []
+        files: dict[str, DataElement] = {}
+        for data_source in sorted(
+            _get_data_sources(self.session, self.facets["project"]),  # type: ignore[arg-type]
+            key=lambda ds: ds.priority,
+        ):
+            result = data_source.find_data(**self.facets)
+            for file in result:
+                if file.name not in files:
+                    files[file.name] = file
+                if version(files[file.name]) < version(file):
+                    files[file.name] = file
+            self.files = list(files.values())
+            self._file_globs.append(data_source.debug_info)
+            # 'when_missing' mode: if files are available from a higher
+            # priority source, do not search lower priority sources.
+            if self.session["search_esgf"] == "when_missing":
+                try:
+                    check.data_availability(self, log=False)
+                except InputFilesNotFound:
+                    pass  # continue search for data
+                else:
+                    return  # use what has been found so far
 
     @property
-    def files(self) -> list[File]:
+    def files(self) -> list[DataElement]:
         """The files associated with this dataset."""
         if self._files is None:
             self.find_files()
         return self._files  # type: ignore
 
     @files.setter
-    def files(self, value: Sequence[File]) -> None:
-        self._files = value
+    def files(self, value: Sequence[DataElement]) -> None:
+        self._files = list(value)
 
     def load(self) -> Cube:
         """Load dataset.
@@ -897,12 +887,7 @@ def _load(self) -> Cube:
             "short_name": self.facets["short_name"],
         }
 
-        result = [
-            file.local_file(self.session["download_dir"])
-            if isinstance(file, esgf.ESGFFile)
-            else file
-            for file in self.files
-        ]
+        result: Sequence[PreprocessorItem] = self.files
         for step, kwargs in settings.items():
             result = preprocess(
                 result,
@@ -993,25 +978,37 @@ def _update_timerange(self) -> None:
         check.valid_time_selection(timerange)
 
         if "*" in timerange:
+            # Replace wildcards in timerange with "timerange" from DataElements,
+            # but only if all DataElements have the "timerange" facet.
             dataset = self.copy()
             dataset.facets.pop("timerange")
             dataset.supplementaries = []
             check.data_availability(dataset)
-            intervals = [_get_start_end_date(f) for f in dataset.files]
-
-            min_date = min(interval[0] for interval in intervals)
-            max_date = max(interval[1] for interval in intervals)
+            if all("timerange" in f.facets for f in dataset.files):
+                # "timerange" can only be reliably computed when all DataElements
+                # provide it.
+                intervals = [
+                    f.facets["timerange"].split("/")  # type: ignore[union-attr]
+                    for f in dataset.files
+                ]
 
-            if timerange == "*":
-                timerange = f"{min_date}/{max_date}"
-            if "*" in timerange.split("/")[0]:
-                timerange = timerange.replace("*", min_date)
-            if "*" in timerange.split("/")[1]:
-                timerange = timerange.replace("*", max_date)
+                min_date = min(interval[0] for interval in intervals)
+                max_date = max(interval[1] for interval in intervals)
 
-        # Make sure that years are in format YYYY
-        start_date, end_date = timerange.split("/")
-        timerange = _dates_to_timerange(start_date, end_date)
-        check.valid_time_selection(timerange)
+                if timerange == "*":
+                    timerange = f"{min_date}/{max_date}"
+                if "*" in timerange.split("/")[0]:
+                    timerange = timerange.replace("*", min_date)
+                if "*" in timerange.split("/")[1]:
+                    timerange = timerange.replace("*", max_date)
 
-        self.set_facet("timerange", timerange)
+        if "*" in timerange:
+            # Drop the timerange facet if it still contains wildcards.
+            self.facets.pop("timerange")
+        else:
+            # Make sure that years are in format YYYY
+            start_date, end_date = timerange.split("/")
+            timerange = _dates_to_timerange(start_date, end_date)
+            # Update the timerange
+            check.valid_time_selection(timerange)
+            self.set_facet("timerange", timerange)
diff --git a/esmvalcore/esgf/__init__.py b/esmvalcore/esgf/__init__.py
index ca8607f964..2e03b90013 100644
--- a/esmvalcore/esgf/__init__.py
+++ b/esmvalcore/esgf/__init__.py
@@ -1,10 +1,41 @@
-"""Find files on the ESGF and download them."""
+"""Find files on the ESGF and download them.
 
-from ._download import ESGFFile, download
-from ._search import find_files
+This module uses `esgf-pyclient <https://esgf-pyclient.readthedocs.io>`_
+to search for and download files from the Earth System Grid Federation (ESGF).
+esgf-pyclient uses a deprecated API that is scheduled to be taken offline and
+replaced by new APIs based on STAC (ESGF East) and Globus (ESGF West).
+An ESGF node mimicking the deprecated API but built on top of Globus will be
+kept online for some time at https://esgf-node.ornl.gov/esgf-1-5-bridge,
+but users are encouraged to migrate to the new APIs as soon as possible by
+using the :mod:`esmvalcore.io.intake_esgf` module instead.
+
+This module provides the function :py:func:`esmvalcore.esgf.find_files`
+for searching for files on ESGF using the ESMValTool vocabulary.
+It returns :class:`esmvalcore.esgf.ESGFFile` objects, which have a convenient
+:meth:`esmvalcore.esgf.ESGFFile.download` method for downloading the file.
+A :func:`esmvalcore.esgf.download` function for downloading multiple files in
+parallel is also available.
+
+It also provides an :class:`esmvalcore.esgf.ESGFDataSource` that can be
+used to find files on ESGF from the :class:`~esmvalcore.dataset.Dataset`
+or the :ref:`recipe`. To use it, create a file with the following
+:ref:`configuration` in ``~/.config/esmvaltool``:
+
+.. literalinclude:: ../configurations/esgf-pyclient-data.yml
+   :language: yaml
+
+See :ref:`config-esgf` for instructions on additional configuration
+options of this module.
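+
+For example, a minimal usage sketch (the facet values below are only an
+illustration, not a recommendation)::
+
+    from pathlib import Path
+
+    from esmvalcore.esgf import download, find_files
+
+    files = find_files(
+        project="CMIP6",
+        mip="Amon",
+        short_name="tas",
+        dataset="EC-Earth3",
+        exp="historical",
+        ensemble="r1i1p1f1",
+    )
+    download(files, dest_folder=Path.home() / "climate_data", n_jobs=4)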
+""" + +from esmvalcore.esgf._download import ESGFFile, download +from esmvalcore.esgf._search import ESGFDataSource, find_files __all__ = [ "ESGFFile", + "ESGFDataSource", "download", "find_files", ] diff --git a/esmvalcore/esgf/_download.py b/esmvalcore/esgf/_download.py index 9a1ff04fcb..9a0b04c24f 100644 --- a/esmvalcore/esgf/_download.py +++ b/esmvalcore/esgf/_download.py @@ -11,16 +11,26 @@ import random import re import shutil +from collections.abc import Iterable from pathlib import Path from statistics import median from tempfile import NamedTemporaryFile +from typing import Any from urllib.parse import urlparse +import iris.cube import requests import yaml from humanfriendly import format_size, format_timespan - -from esmvalcore.local import LocalFile +from pyesgf.search.results import FileResult + +from esmvalcore.config import CFG +from esmvalcore.io.protocol import DataElement +from esmvalcore.local import ( + LocalFile, + _dates_to_timerange, + _get_start_end_date_from_filename, +) from esmvalcore.typing import Facets from .facets import DATASET_MAP, FACETS @@ -166,7 +176,7 @@ def sort_hosts(urls): @functools.total_ordering -class ESGFFile: +class ESGFFile(DataElement): """File on the ESGF. This is the object returned by :func:`esmvalcore.esgf.find_files`. @@ -185,7 +195,11 @@ class ESGFFile: The URLs where the file can be downloaded. """ - def __init__(self, results): + def __init__( + self, + results: Iterable[FileResult], + dest_folder: Path | None = None, + ) -> None: results = list(results) self.name = str(Path(results[0].filename).with_suffix(".nc")) self.size = results[0].size @@ -196,6 +210,39 @@ def __init__(self, results): for result in results: self.urls.append(result.download_url) self._checksums.append((result.checksum_type, result.checksum)) + self.dest_folder = ( + CFG["download_dir"] if dest_folder is None else dest_folder + ) + self._attributes: dict[str, Any] | None = None + + def prepare(self) -> None: + """Prepare the data for access.""" + self.download(self.dest_folder) + + @property + def attributes(self) -> dict[str, Any]: + """Attributes are key-value pairs describing the data.""" + if self._attributes is None: + msg = ( + "Attributes have not been read yet. Call the `to_iris` method " + "first to read the attributes from the file." 
+            )
+            raise ValueError(msg)
+        return self._attributes
+
+    @attributes.setter
+    def attributes(self, value: dict[str, Any]) -> None:
+        self._attributes = value
+
+    def to_iris(
+        self,
+        ignore_warnings: list[dict[str, Any]] | None = None,
+    ) -> iris.cube.CubeList:
+        """Load the data as Iris cubes."""
+        self.prepare()
+        local_file = self.local_file(self.dest_folder)
+        cubes = local_file.to_iris(ignore_warnings=ignore_warnings)
+        self.attributes = local_file.attributes
+        return cubes
 
     @classmethod
     def _from_results(cls, results, facets):
@@ -275,6 +322,9 @@
                     self.name,
                 )
                 facets[facet] = value
+        start_date, end_date = _get_start_end_date_from_filename(self.name)
+        if start_date and end_date:
+            facets["timerange"] = _dates_to_timerange(start_date, end_date)
         return facets
 
     @staticmethod
@@ -383,16 +433,16 @@
         """Compare `self` to `other`."""
         return (self.dataset, self.name) < (other.dataset, other.name)
 
-    def __hash__(self):
-        """Compute a unique hash value."""
+    def __hash__(self) -> int:
+        """Return a number uniquely representing the data element."""
        return hash((self.dataset, self.name))
 
-    def local_file(self, dest_folder):
+    def local_file(self, dest_folder: Path | None) -> LocalFile:
        """Return the path to the local file after download.
 
         Arguments
         ---------
-        dest_folder: Path
+        dest_folder:
             The destination folder.
 
         Returns
@@ -400,16 +450,17 @@
         LocalFile
             The path where the file will be located after download.
         """
+        dest_folder = self.dest_folder if dest_folder is None else dest_folder
         file = LocalFile(dest_folder, self._get_relative_path())
         file.facets = self.facets
         return file
 
-    def download(self, dest_folder):
+    def download(self, dest_folder: Path | None) -> LocalFile:
         """Download the file.
 
         Arguments
         ---------
-        dest_folder: Path
+        dest_folder:
             The destination folder.
 
         Raises
@@ -424,7 +475,6 @@
         """
         local_file = self.local_file(dest_folder)
         if local_file.exists():
-            logger.debug("Skipping download of existing file %s", local_file)
             return local_file
 
         os.makedirs(local_file.parent, exist_ok=True)
@@ -552,9 +602,6 @@
         and not file.local_file(dest_folder).exists()
     ]
     if not files:
-        logger.debug(
-            "All required data is available locally, not downloading anything.",
-        )
         return
 
     files = sorted(files)
diff --git a/esmvalcore/esgf/_search.py b/esmvalcore/esgf/_search.py
index 911e44cacb..841cef5529 100644
--- a/esmvalcore/esgf/_search.py
+++ b/esmvalcore/esgf/_search.py
@@ -2,18 +2,22 @@
 
 import itertools
 import logging
+import os.path
+from dataclasses import dataclass, field
 from functools import lru_cache
+from pathlib import Path
 
 import pyesgf.search
 import requests.exceptions
 
 from esmvalcore.config._esgf_pyclient import get_esgf_config
+from esmvalcore.io.protocol import DataSource
 from esmvalcore.local import (
-    _get_start_end_date,
     _parse_period,
     _replace_years_with_timerange,
     _truncate_dates,
 )
+from esmvalcore.typing import FacetValue
 
 from ._download import ESGFFile
 from .facets import DATASET_MAP, FACETS
@@ -177,17 +181,16 @@
 
     for file in files:
         start_date, end_date = _parse_period(timerange)
-        try:
-            start, end = _get_start_end_date(file)
-        except ValueError:
-            # If start and end year cannot be read from the filename
-            # just select everything.
-            selection.append(file)
-        else:
+        if "timerange" in file.facets:
+            start, end = file.facets["timerange"].split("/")
             start_date, end = _truncate_dates(start_date, end)
             end_date, start = _truncate_dates(end_date, start)
             if start <= end_date and end >= start_date:
                 selection.append(file)
+        else:
+            # If start and end year cannot be read from the filename just select
+            # everything.
+            selection.append(file)
 
     return selection
@@ -378,3 +381,44 @@
         logger.debug("Selected files:\n%s", "\n".join(str(f) for f in files))
 
     return files
+
+
+@dataclass
+class ESGFDataSource(DataSource):
+    """Data source that uses esgf-pyclient to find files on ESGF."""
+
+    name: str
+    """A name identifying the data source."""
+
+    project: str
+    """The project that the data source provides data for."""
+
+    priority: int
+    """The priority of the data source. Lower values have priority."""
+
+    download_dir: Path
+    """The destination directory where data will be downloaded."""
+
+    debug_info: str = field(init=False, default="")
+    """A string containing debug information when no data is found."""
+
+    def __post_init__(self) -> None:
+        self.download_dir = Path(
+            os.path.expandvars(self.download_dir),
+        ).expanduser()
+
+    def find_data(self, **facets: FacetValue) -> list[ESGFFile]:
+        """Find data.
+
+        Parameters
+        ----------
+        **facets :
+            Find data matching these facets.
+
+        Returns
+        -------
+        :obj:`list` of :obj:`esmvalcore.esgf.ESGFFile`
+            A list of files that have been found on ESGF.
+        """
+        files = find_files(**facets)
+        for file in files:
+            file.dest_folder = self.download_dir
+        return files
diff --git a/esmvalcore/io/__init__.py b/esmvalcore/io/__init__.py
new file mode 100644
index 0000000000..97cd78653b
--- /dev/null
+++ b/esmvalcore/io/__init__.py
@@ -0,0 +1,76 @@
+"""A modular system for reading input data from various sources."""
+
+import importlib
+import logging
+
+from esmvalcore.config import Session
+from esmvalcore.io.protocol import DataSource
+
+logger = logging.getLogger(__name__)
+
+
+def load_data_sources(
+    session: Session,
+    project: str | None = None,
+) -> list[DataSource]:
+    """Get the list of available data sources.
+
+    Arguments
+    ---------
+    session:
+        The configuration.
+    project:
+        If specified, only data sources for this project are returned.
+
+    Returns
+    -------
+    :obj:`list` of :obj:`DataSource`:
+        A list of available data sources.
+
+    Raises
+    ------
+    ValueError:
+        If the project or its settings are not found in the configuration.
+
+    """
+    data_sources: list[DataSource] = []
+    if project is not None and project not in session["projects"]:
+        msg = f"Unknown project '{project}', please configure it under 'projects'."
+        raise ValueError(msg)
+    settings = (
+        session["projects"]
+        if project is None
+        else {project: session["projects"][project]}
+    )
+    for project_, project_settings in settings.items():
+        for name, orig_kwargs in project_settings.get("data", {}).items():
+            kwargs = orig_kwargs.copy()
+            module_name, cls_name = kwargs.pop("type").rsplit(".", 1)
+            module = importlib.import_module(module_name)
+            cls = getattr(module, cls_name)
+            priority = kwargs.pop("priority", 1)
+            data_source = cls(
+                name=name,
+                project=project_,
+                priority=priority,
+                **kwargs,
+            )
+            if not isinstance(data_source, DataSource):
+                msg = (
+                    "Expected a data source of type `esmvalcore.io.protocol.DataSource`, "
+                    f"but your configuration for project '{project_}' contains "
+                    f"'{data_source}' of type '{type(data_source)}'."
+                )
+                raise TypeError(msg)
+            data_sources.append(data_source)
+
+    if not data_sources:
+        if project is None:
+            msg = "No data sources found. Check your configuration under 'projects'"
+        else:
+            msg = (
+                f"No data sources found for project '{project}'. "
+                f"Check your configuration under 'projects: {project}: data'"
+            )
+        raise ValueError(msg)
+    return data_sources
diff --git a/esmvalcore/io/intake_esgf.py b/esmvalcore/io/intake_esgf.py
new file mode 100644
index 0000000000..106dcdf08f
--- /dev/null
+++ b/esmvalcore/io/intake_esgf.py
@@ -0,0 +1,241 @@
+"""Access data using `intake-esgf <https://intake-esgf.readthedocs.io>`_.
+
+This module replaces the :mod:`esmvalcore.esgf` module. Please use this
+module instead of :mod:`esmvalcore.esgf` to access data on ESGF. If you
+encounter any issues using this module, please report them at
+https://github.com/ESMValGroup/ESMValCore/issues.
+
+Run the command ``esmvalcore config copy intake-esgf-data.yml`` to update
+your :ref:`configuration` to use this module. This will
+create a file with the following content in ``~/.config/esmvaltool`` or
+the path specified by the ``ESMVALTOOL_CONFIG_DIR`` environment variable:
+
+.. literalinclude:: ../configurations/intake-esgf-data.yml
+   :language: yaml
+
+"""
+
+import copy
+from dataclasses import dataclass, field
+from typing import Any
+
+import intake_esgf
+import intake_esgf.exceptions
+import iris.cube
+import isodate
+
+from esmvalcore.io.protocol import DataElement, DataSource
+from esmvalcore.iris_helpers import dataset_to_iris
+from esmvalcore.local import _parse_period
+from esmvalcore.typing import Facets, FacetValue
+
+__all__ = [
+    "IntakeESGFDataSource",
+    "IntakeESGFDataset",
+]
+
+
+@dataclass
+class IntakeESGFDataset(DataElement):
+    """A dataset that can be used to load data found using intake-esgf_."""
+
+    name: str
+    """A unique name identifying the data."""
+
+    facets: Facets
+    """Facets are key-value pairs that were used to find this data."""
+
+    catalog: intake_esgf.ESGFCatalog
+    """The intake-esgf catalog describing this data."""
+
+    _attributes: dict[str, Any] | None = field(init=False, default=None)
+
+    def __hash__(self) -> int:
+        """Return a number uniquely representing the data element."""
+        return hash(self.name)
+
+    def prepare(self) -> None:
+        """Prepare the data for access."""
+        self.catalog.to_path_dict()
+
+    @property
+    def attributes(self) -> dict[str, Any]:
+        """Attributes are key-value pairs describing the data."""
+        if self._attributes is None:
+            msg = (
+                "Attributes have not been read yet. Call the `to_iris` method "
+                "first to read the attributes from the file."
+            )
+            raise ValueError(msg)
+        return self._attributes
+
+    @attributes.setter
+    def attributes(self, value: dict[str, Any]) -> None:
+        self._attributes = value
+
+    def to_iris(self, ignore_warnings=None) -> iris.cube.CubeList:
+        """Load the data as Iris cubes.
+
+        Returns
+        -------
+        :
+            The loaded data.
+        """
+        files = self.catalog.to_path_dict(
+            minimal_keys=False,
+            quiet=True,
+        )[self.name]
+        dataset = self.catalog.to_dataset_dict(
+            minimal_keys=False,
+            add_measures=False,
+            quiet=True,
+        )[self.name]
+        # Store the local paths in the attributes for easier debugging.
+        dataset.attrs["source_file"] = ", ".join(str(f) for f in files)
+        # Cache the attributes.
+        self.attributes = copy.deepcopy(dataset.attrs)
+        return dataset_to_iris(dataset, ignore_warnings=ignore_warnings)
+
+
+@dataclass
+class IntakeESGFDataSource(DataSource):
+    """Data source that can be used to find data using intake-esgf."""
+
+    name: str
+    """A name identifying the data source."""
+
+    project: str
+    """The project that the data source provides data for."""
+
+    priority: int
+    """The priority of the data source. Lower values have priority."""
+
+    facets: dict[str, str]
+    """Mapping between the ESMValCore and ESGF facet names."""
+
+    values: dict[str, dict[str, str]] = field(default_factory=dict)
+    """Mapping between the ESMValCore and ESGF facet values."""
+
+    debug_info: str = field(init=False, default="")
+    """A string containing debug information when no data is found."""
+
+    catalog: intake_esgf.ESGFCatalog = field(
+        init=False,
+        default_factory=intake_esgf.ESGFCatalog,
+    )
+    """The intake-esgf catalog used to find data."""
+
+    def __post_init__(self):
+        self.catalog.project = intake_esgf.projects.projects[
+            self.project.lower()
+        ]
+
+    def find_data(self, **facets: FacetValue) -> list[IntakeESGFDataset]:
+        """Find data.
+
+        Parameters
+        ----------
+        **facets :
+            Find data matching these facets.
+
+        Returns
+        -------
+        :
+            A list of data elements that have been found.
+        """
+        # Normalize facets so all values are `list[str]`.
+        our_facets = {
+            facet: [str(values)] if isinstance(values, str | int) else values
+            for facet, values in facets.items()
+        }
+        # Translate "our" facets to ESGF facets and "our" values to ESGF values.
+        esgf_facets = {
+            their_facet: [
+                self.values.get(our_facet, {}).get(v, v)
+                for v in our_facets[our_facet]
+            ]
+            for our_facet, their_facet in self.facets.items()
+            if our_facet in our_facets
+        }
+        if (
+            "timerange" in facets and "*" not in facets["timerange"]  # type: ignore[operator]
+        ):
+            start, end = _parse_period(facets["timerange"])
+            esgf_facets["file_start"] = isodate.date_isoformat(
+                isodate.parse_date(start.split("T")[0]),
+            )
+            esgf_facets["file_end"] = isodate.date_isoformat(
+                isodate.parse_date(end.split("T")[0]),
+            )
+        # Search ESGF.
+        try:
+            self.catalog.search(**esgf_facets, quiet=True)
+        except intake_esgf.exceptions.NoSearchResults:
+            self.debug_info = (
+                "intake_esgf.ESGFCatalog.search("
+                + ", ".join(
+                    [
+                        f"{k}={v if isinstance(v, list) else [v]}"
+                        for k, v in self.catalog.last_search.items()
+                    ],
+                )
+                + ") did not return any results."
+            )
+            return []
+
+        # Return a list of datasets, with one IntakeESGFDataset per dataset_id.
+        result: list[IntakeESGFDataset] = []
+
+        # These are the keys in the dict[str, xarray.Dataset] returned by
+        # `intake_esgf.ESGFCatalog.to_dataset_dict`. Taken from:
+        # https://github.com/esgf2-us/intake-esgf/blob/c34124e54078e70ef271709a6d158edb22bcdb96/intake_esgf/catalog.py#L523-L528
+        self.catalog.df["key"] = self.catalog.df.apply(
+            lambda row: ".".join(
+                [row[f] for f in self.catalog.project.master_id_facets()],
+            ),
+            axis=1,
+        )
+        inverse_values = {
+            our_facet: {
+                their_value: our_value
+                for our_value, their_value in self.values[our_facet].items()
+            }
+            for our_facet in self.values
+        }
+        for _, row in self.catalog.df.iterrows():
+            dataset_id = row["key"]
+            # Subset the catalog to a single dataset.
+            cat = self.catalog.clone()
+            cat.file_start = self.catalog.file_start
+            cat.file_end = self.catalog.file_end
+            cat.df = self.catalog.df[self.catalog.df.key == dataset_id]
+            # Discard all but the latest version. It is unclear whether or how
+            # `intake_esgf.ESGFCatalog.to_dataset_dict` supports multiple versions.
+            cat.df = cat.df[cat.df.version == cat.df.version.max()]
+            cat.project = self.catalog.project
+            if "short_name" in our_facets:
+                cat.last_search[self.facets["short_name"]] = [
+                    self.values.get("short_name", {}).get(v, v)
+                    for v in our_facets["short_name"]
+                ]
+            # Retrieve "our" facets associated with the dataset_id.
+            dataset_facets = {}
+            for our_facet, esgf_facet in self.facets.items():
+                if esgf_facet in row:
+                    esgf_values = row[esgf_facet]
+                    if isinstance(esgf_values, str):
+                        esgf_values = [esgf_values]
+                    our_values = [
+                        inverse_values.get(our_facet, {}).get(v, v)
+                        for v in esgf_values
+                    ]
+                    if len(our_values) == 1:
+                        our_values = our_values[0]
+                    dataset_facets[our_facet] = our_values
+            dataset = IntakeESGFDataset(
+                name=dataset_id,
+                facets=dataset_facets,  # type: ignore[arg-type]
+                catalog=cat,
+            )
+            result.append(dataset)
+        return result
diff --git a/esmvalcore/io/protocol.py b/esmvalcore/io/protocol.py
new file mode 100644
index 0000000000..a2799f7947
--- /dev/null
+++ b/esmvalcore/io/protocol.py
@@ -0,0 +1,120 @@
+"""Protocols for accessing data.
+
+An input data source can be defined in the configuration by using
+:obj:`esmvalcore.config.CFG`
+
+.. code-block:: python
+
+    >>> from esmvalcore.config import CFG
+    >>> CFG["projects"]["example-project"]["data"]["example-source-name"] = {
+    ...     "type": "example_module.ExampleDataSource",
+    ...     "argument1": "value1",
+    ...     "argument2": "value2",
+    ... }
+
+or as a :ref:`YAML configuration file`
+
+.. code-block:: yaml
+
+    projects:
+      example-project:
+        data:
+          example-source-name:
+            type: example_module.ExampleDataSource
+            argument1: value1
+            argument2: value2
+
+where ``example-project`` is a project, e.g. ``CMIP6``, and
+``example-source-name`` is a unique name describing the data source. The data
+source type, called ``example_module.ExampleDataSource`` in the example above,
+needs to implement the :class:`esmvalcore.io.protocol.DataSource` protocol.
+Any remaining key-value pairs in the configuration, ``argument1: value1`` and
+``argument2: value2``, are passed as keyword arguments to the data source when
+it is created.
+
+Deduplication of search results happens based on the
+:attr:`esmvalcore.io.protocol.DataElement.name` attribute and the ``"version"``
+facet in :attr:`esmvalcore.io.protocol.DataElement.facets` of the data elements
+provided by the data sources. If there is a tie, the data element provided by
+the data source with the lowest value of
+:attr:`esmvalcore.io.protocol.DataSource.priority` is chosen.
+"""
+
+from collections.abc import Iterable
+from typing import Any, Protocol, runtime_checkable
+
+import iris.cube
+
+from esmvalcore.typing import FacetValue
+
+
+@runtime_checkable
+class DataElement(Protocol):
+    """A data element represents some data that can be loaded.
+
+    A file is an example of a data element.
+    """
+
+    name: str
+    """A unique name identifying the data."""
+
+    facets: dict[str, FacetValue]
+    """Facets are key-value pairs that can be used for searching the data."""
+
+    attributes: dict[str, Any]
+    """Attributes are key-value pairs describing the data."""
+
+    def __hash__(self) -> int:
+        """Return a number uniquely representing the data element."""
+
+    def prepare(self) -> None:
+        """Prepare the data for access."""
+
+    def to_iris(
+        self,
+        ignore_warnings: list[dict[str, Any]] | None = None,
+    ) -> iris.cube.CubeList:
+        """Load the data as Iris cubes.
+
+        Parameters
+        ----------
+        ignore_warnings:
+            Keyword arguments passed to :func:`warnings.filterwarnings` used to
+            ignore warnings issued by :func:`iris.load_raw`. Each list element
+            corresponds to one call to :func:`warnings.filterwarnings`.
+
+        Returns
+        -------
+        iris.cube.CubeList
+            The loaded data.
+        """
+
+
+@runtime_checkable
+class DataSource(Protocol):
+    """A data source can be used to find data."""
+
+    name: str
+    """A name identifying the data source."""
+
+    project: str
+    """The project that the data source provides data for."""
+
+    priority: int
+    """The priority of the data source. Lower values have priority."""
+
+    debug_info: str
+    """A string containing debug information when no data is found."""
+
+    def find_data(self, **facets: FacetValue) -> Iterable[DataElement]:
+        """Find data.
+
+        Parameters
+        ----------
+        **facets :
+            Find data matching these facets.
+
+        Returns
+        -------
+        :obj:`typing.Iterable` of :obj:`esmvalcore.io.protocol.DataElement`
+            The data elements that have been found.
+        """
diff --git a/esmvalcore/local.py b/esmvalcore/local.py
index 70f30adee2..208c4267cb 100644
--- a/esmvalcore/local.py
+++ b/esmvalcore/local.py
@@ -1,4 +1,48 @@
-"""Find files on the local filesystem."""
+"""Find files on the local filesystem.
+
+Example configuration to find CMIP6 data on a personal computer:
+
+.. code-block:: yaml
+
+    projects:
+      CMIP6:
+        data:
+          local-data:
+            type: "esmvalcore.local.LocalDataSource"
+            rootpath: ~/climate_data
+            dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}"
+            filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc"
+
+The module will find files matching the :func:`glob.glob` pattern formed by
+``rootpath/dirname_template/filename_template``, where the facets defined
+inside the curly braces of the templates are replaced by their values from
+the :class:`~esmvalcore.dataset.Dataset` or the :ref:`recipe`. Note that the
+name of the data source, ``local-data`` in the example above, must be unique
+within each project but can otherwise be chosen freely.
+
+To start using this module, download the complete file for personal computers
+:download:`here <../configurations/local-data.yml>`, copy it to the
+directory ``~/.config/esmvaltool/``, and tailor it for your own system
+if needed.
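+
+The equivalent data source can also be created and queried directly from
+Python; a minimal sketch (the facet values are only an example):
+
+.. code-block:: python
+
+    from esmvalcore.local import LocalDataSource
+
+    data_source = LocalDataSource(
+        name="local-data",
+        project="CMIP6",
+        priority=1,
+        rootpath="~/climate_data",
+        dirname_template="{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}",
+        filename_template="{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc",
+    )
+    files = data_source.find_data(
+        project="CMIP6",
+        activity="CMIP",
+        institute="*",
+        dataset="EC-Earth3",
+        exp="historical",
+        ensemble="r1i1p1f1",
+        mip="Amon",
+        short_name="tas",
+        grid="gr",
+        version="*",
+    )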
+
+Example configuration files for popular HPC systems are also available:
+
+  - :download:`JASMIN at CEDA <../configurations/badc-data.yml>`
+  - :download:`Levante at DKRZ <../configurations/dkrz-data.yml>`
+  - :download:`UK Met Office <../configurations/mo-data.yml>`
+  - :download:`NCI Australia <../configurations/nci-data.yml>`
+  - :download:`IPSL <../configurations/ipsl-data.yml>`
+  - :download:`ETHZ <../configurations/ethz-data.yml>`
+
+Example configuration files for
+:ref:`supported climate models` are also available:
+
+  - :download:`ACCESS <../configurations/access-data.yml>`
+  - :download:`ICON <../configurations/icon-data.yml>`
+  - :download:`IPSLCM <../configurations/ipslcm-data.yml>`
+  - :download:`EMAC <../configurations/emac-data.yml>`
+
+"""
 
 from __future__ import annotations
@@ -6,27 +50,29 @@
 import itertools
 import logging
 import os
+import os.path
 import re
-from dataclasses import dataclass
+import warnings
+from dataclasses import dataclass, field
 from glob import glob
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
+import iris.cube
+import iris.fileformats.cf
 import isodate
 from cf_units import Unit
 from netCDF4 import Dataset, Variable
 
+import esmvalcore.io.protocol
 from esmvalcore.config import CFG
 from esmvalcore.config._config import get_project_config
 from esmvalcore.exceptions import RecipeError
-from esmvalcore.preprocessor._io import _load_from_file
+from esmvalcore.iris_helpers import ignore_warnings_context
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    import iris.cube
-
-    from esmvalcore.esgf import ESGFFile
     from esmvalcore.typing import Facets, FacetValue
 
 logger = logging.getLogger(__name__)
@@ -86,9 +132,9 @@
     return str(variable.name)
 
 
-def _get_start_end_date(
-    file: str | Path | LocalFile | ESGFFile,
-) -> tuple[str, str]:
+def _get_start_end_date_from_filename(
+    file: str | Path,
+) -> tuple[str | None, str | None]:
     """Get the start and end dates as a string from a file name.
 
     Examples of allowed dates: 1980, 198001, 1980-01, 19801231, 1980-12-31,
@@ -117,13 +163,6 @@
     ValueError
         Start or end date cannot be determined.
     """
-    if hasattr(file, "name"):  # noqa: SIM108
-        # Path, LocalFile, ESGFFile
-        stem = Path(file.name).stem
-    else:
-        # str
-        stem = Path(file).stem
-
     start_date = end_date = None
 
     # Build regex
@@ -151,9 +190,34 @@
     start_date, end_date = _get_from_pattern(
         datetime_pattern,
         date_range_pattern,
-        stem,
+        Path(file).stem,
         "datetime",
     )
+    return start_date, end_date
+
+
+def _get_start_end_date(file: str | Path) -> tuple[str, str]:
+    """Get the start and end dates as a string from a file.
+
+    This function first tries to find the dates from the filename and if
+    that fails it will try to read them from the file.
+
+    Parameters
+    ----------
+    file:
+        The file to read the start and end dates from.
+
+    Returns
+    -------
+    tuple[str, str]
+        The start and end date.
+
+    Raises
+    ------
+    ValueError
+        Start or end date cannot be determined.
+    """
+    start_date, end_date = _get_start_end_date_from_filename(file)
 
     # As final resort, try to get the dates from the file contents
     if (
@@ -199,17 +263,6 @@
     return start_date, end_date
 
 
-def _get_start_end_year(
-    file: str | Path | LocalFile | ESGFFile,
-) -> tuple[int, int]:
-    """Get the start and end year as int from a file name.
-
-    See :func:`_get_start_end_date`.
- """ - (start_date, end_date) = _get_start_end_date(file) - return (int(start_date[:4]), int(end_date[:4])) - - def _dates_to_timerange(start_date: int | str, end_date: int | str) -> str: """Convert ``start_date`` and ``end_date`` to ``timerange``. @@ -467,15 +520,33 @@ def _select_drs(input_type: str, project: str, structure: str) -> list[str]: @dataclass(order=True) -class DataSource: - """Class for storing a data source and finding the associated files.""" +class LocalDataSource(esmvalcore.io.protocol.DataSource): + """Data source for finding files on a local filesystem.""" + + name: str + """A name identifying the data source.""" + + project: str + """The project that the data source provides data for.""" + + priority: int + """The priority of the data source. Lower values have priority.""" + + debug_info: str = field(init=False, default="") + """A string containing debug information when no data is found.""" rootpath: Path + """The path where the directories are located.""" + dirname_template: str + """The template for the directory names.""" + filename_template: str + """The template for the file names.""" def __post_init__(self) -> None: """Set further attributes.""" + self.rootpath = Path(os.path.expandvars(self.rootpath)).expanduser() self._regex_pattern = self._templates_to_regex() @property @@ -495,30 +566,59 @@ def get_glob_patterns(self, **facets) -> list[Path]: def find_files(self, **facets) -> list[LocalFile]: """Find files.""" + # TODO: deprecate this method + return self.find_data(**facets) + + def find_data(self, **facets) -> list[LocalFile]: + """Find data locally.""" + facets = dict(facets) + if "original_short_name" in facets: + facets["short_name"] = facets["original_short_name"] + globs = self.get_glob_patterns(**facets) + self.debug_info = "\n".join(str(g) for g in globs) logger.debug("Looking for files matching %s", globs) files: list[LocalFile] = [] for glob_ in globs: for filename in glob(str(glob_)): file = LocalFile(filename) - file.facets.update(self.path2facets(file)) + file.facets.update( + self.path2facets( + file, + add_timerange="timerange" in facets, + ), + ) files.append(file) + + files = _filter_versions_called_latest(files) + + if "version" not in facets: + files = _select_latest_version(files) + files.sort() # sorting makes it easier to see what was found if "timerange" in facets: files = _select_files(files, facets["timerange"]) return files - def path2facets(self, path: Path) -> dict[str, str]: + def path2facets(self, path: Path, add_timerange: bool) -> dict[str, str]: """Extract facets from path.""" facets: dict[str, str] = {} - match = re.search(self.regex_pattern, str(path)) - if match is None: - return facets - for facet, value in match.groupdict().items(): - if value: - facets[facet] = value + + if (match := re.search(self.regex_pattern, str(path))) is not None: + for facet, value in match.groupdict().items(): + if value: + facets[facet] = value + + if add_timerange: + try: + start_date, end_date = _get_start_end_date(path) + except ValueError: + pass + else: + facets["timerange"] = _dates_to_timerange(start_date, end_date) + return facets def _templates_to_regex(self) -> str: @@ -607,11 +707,40 @@ def _templates_to_regex(self) -> str: return pattern +class DataSource(LocalDataSource): + """Data source for finding files on a local filesystem. + + .. deprecated:: 2.13.0 + This class is deprecated and will be removed in version 2.16.0. + Please use 'esmvalcore.local.LocalDataSource' instead. 
+ """ + + def __init__(self, *args, **kwargs) -> None: + msg = ( + "The 'esmvalcore.local.LocalDataSource' class is deprecated and will be " + "removed in version 2.16.0. Please use 'esmvalcore.local.LocalDataSource'" + ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + super().__init__(*args, **kwargs) + + _ROOTPATH_WARNED: set[tuple[str, tuple[str]]] = set() +_LEGACY_DATA_SOURCES_WARNED: set[str] = set() -def _get_data_sources(project: str) -> list[DataSource]: + +def _get_data_sources(project: str) -> list[LocalDataSource]: """Get a list of data sources.""" + if project not in _LEGACY_DATA_SOURCES_WARNED: + logger.warning( + ( + "Using legacy data sources for project '%s' using 'rootpath' " + "and 'drs' settings and the path templates from '%s'" + ), + project, + CFG["config_developer_file"], + ) + _LEGACY_DATA_SOURCES_WARNED.add(project) rootpaths = CFG["rootpath"] for key in (project, "default"): if key in rootpaths: @@ -627,12 +756,19 @@ def _get_data_sources(project: str) -> list[DataSource]: if isinstance(paths, list): structure = CFG["drs"].get(project, "default") paths = dict.fromkeys(paths, structure) - sources: list[DataSource] = [] + sources: list[LocalDataSource] = [] for path, structure in paths.items(): dir_templates = _select_drs("input_dir", project, structure) file_templates = _select_drs("input_file", project, structure) sources.extend( - DataSource(Path(path), d, f) + LocalDataSource( + name="legacy-local", + project=project, + priority=1, + rootpath=Path(path), + dirname_template=d, + filename_template=f, + ) for d in dir_templates for f in file_templates ) @@ -746,6 +882,7 @@ def version(file): return result +# TODO: Deprecate this? def find_files( *, debug: bool = False, @@ -839,18 +976,48 @@ def find_files( return files -class LocalFile(type(Path())): # type: ignore +GRIB_FORMATS = (".grib2", ".grib", ".grb2", ".grb", ".gb2", ".gb") +"""GRIB file extensions.""" + + +def _get_attr_from_field_coord( + ncfield: iris.fileformats.cf.CFVariable, + coord_name: str | None, + attr: str, +) -> Any: + """Get attribute from netCDF field coordinate.""" + if coord_name is not None: + attrs = ncfield.cf_group[coord_name].cf_attrs() + attr_val = [value for (key, value) in attrs if key == attr] + if attr_val: + return attr_val[0] + return None + + +def _restore_lat_lon_units( + cube: iris.cube.Cube, + field: iris.fileformats.cf.CFVariable, + filename: str, # noqa: ARG001 +) -> None: # pylint: disable=unused-argument + """Use this callback to restore the original lat/lon units.""" + # Iris chooses to change longitude and latitude units to degrees + # regardless of value in file, so reinstating file value + for coord in cube.coords(): + if coord.standard_name in ["longitude", "latitude"]: + units = _get_attr_from_field_coord(field, coord.var_name, "units") + if units is not None: + coord.units = units + + +class LocalFile(type(Path()), esmvalcore.io.protocol.DataElement): # type: ignore """File on the local filesystem.""" + def prepare(self) -> None: + """Prepare the data for access.""" + @property def facets(self) -> Facets: - """Facets describing the file. - - Note - ---- - When using :func:`find_files`, facets are read from the directory - structure. Facets stored in filenames are not yet supported. 
- """ + """Facets are key-value pairs that were used to find this data.""" if not hasattr(self, "_facets"): self._facets: Facets = {} return self._facets @@ -861,7 +1028,7 @@ def facets(self, value: Facets) -> None: @property def attributes(self) -> dict[str, Any]: - """Attributes read from the file.""" + """Attributes are key-value pairs describing the data.""" if not hasattr(self, "_attributes"): msg = ( "Attributes have not been read yet. Call the `to_iris` method " @@ -885,7 +1052,22 @@ def to_iris( iris.cube.CubeList The loaded data. """ - cubes = _load_from_file(self, ignore_warnings=ignore_warnings) + file = Path(self) + logger.debug("Loading:\n%s", file) + + with ignore_warnings_context(ignore_warnings): + # GRIB files need to be loaded with iris.load, otherwise we will + # get separate (lat, lon) slices for each time step, pressure + # level, etc. + if file.suffix in GRIB_FORMATS: + cubes = iris.load(file, callback=_restore_lat_lon_units) + else: + cubes = iris.load_raw(file, callback=_restore_lat_lon_units) + logger.debug("Done with loading %s", file) + + for cube in cubes: + cube.attributes.globals["source_file"] = str(file) + # Cache the attributes. self.attributes = copy.deepcopy(dict(cubes[0].attributes.globals)) return cubes diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index ff6f560cac..de4337f948 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -5,7 +5,6 @@ import copy import inspect import logging -from pathlib import Path from pprint import pformat from typing import TYPE_CHECKING, Any, TypeAlias @@ -15,6 +14,7 @@ from esmvalcore._task import BaseTask from esmvalcore.cmor.check import cmor_check_data, cmor_check_metadata from esmvalcore.cmor.fix import fix_data, fix_file, fix_metadata +from esmvalcore.io.protocol import DataElement from esmvalcore.preprocessor._area import ( area_statistics, extract_named_regions, @@ -103,11 +103,12 @@ if TYPE_CHECKING: from collections.abc import Callable, Iterable, Sequence + from pathlib import Path import prov.model from dask.delayed import Delayed - from esmvalcore.dataset import Dataset, File + from esmvalcore.dataset import Dataset logger = logging.getLogger(__name__) @@ -374,7 +375,7 @@ def _run_preproc_function( function: Callable, items: PreprocessorItem | Sequence[PreprocessorItem], kwargs: Any, - input_files: Sequence[File] | None = None, + input_files: Sequence[DataElement] | None = None, ) -> PreprocessorItem | Sequence[PreprocessorItem]: """Run preprocessor function.""" kwargs_str = ",\n".join( @@ -410,7 +411,7 @@ def _run_preproc_function( ) # Make sure that the arguments are indexable - if isinstance(items, (PreprocessorFile, Cube, str, Path)): + if isinstance(items, (PreprocessorFile, Cube, DataElement)): items = [items] if isinstance(items, set): items = list(items) @@ -438,7 +439,7 @@ def _run_preproc_function( def preprocess( items: Sequence[PreprocessorItem], step: str, - input_files: list[File] | None = None, + input_files: list[DataElement] | None = None, output_file: Path | None = None, debug: bool = False, **settings: Any, @@ -478,7 +479,7 @@ def preprocess( items = [] for item in result: - if isinstance(item, (PreprocessorFile, Cube, str, Path)): + if isinstance(item, (PreprocessorFile, Cube, DataElement)): items.append(item) else: items.extend(item) @@ -573,7 +574,7 @@ def apply(self, step: str, debug: bool = False) -> None: self.cubes, step, input_files=self._input_files, - output_file=self.filename, + 
output_file=self.filename, # type: ignore[arg-type] debug=debug, **self.settings[step], ) @@ -646,7 +647,7 @@ def _initialize_entity(self) -> None: settings = { "preprocessor:" + k: str(v) for k, v in self.settings.items() } - self.entity.add_attributes(settings) + self.entity.add_attributes(settings) # type: ignore[attr-defined] def group(self, keys: list) -> str: """Generate group keyword. @@ -671,7 +672,7 @@ def group(self, keys: list) -> str: return "_".join(identifier) -PreprocessorItem: TypeAlias = PreprocessorFile | Cube | str | Path +PreprocessorItem: TypeAlias = PreprocessorFile | Cube | DataElement def _apply_multimodel( diff --git a/esmvalcore/preprocessor/_io.py b/esmvalcore/preprocessor/_io.py index f050c4cfa7..bd0926f31c 100644 --- a/esmvalcore/preprocessor/_io.py +++ b/esmvalcore/preprocessor/_io.py @@ -20,16 +20,14 @@ from esmvalcore._task import write_ncl_settings from esmvalcore.exceptions import ESMValCoreLoadWarning -from esmvalcore.iris_helpers import ( - dataset_to_iris, - ignore_warnings_context, -) +from esmvalcore.io.protocol import DataElement +from esmvalcore.iris_helpers import dataset_to_iris +from esmvalcore.local import LocalFile if TYPE_CHECKING: from collections.abc import Sequence from dask.delayed import Delayed - from iris.fileformats.cf import CFVariable logger = logging.getLogger(__name__) @@ -42,40 +40,16 @@ "reference_dataset", "alternative_dataset", } -GRIB_FORMATS = (".grib2", ".grib", ".grb2", ".grb", ".gb2", ".gb") - - -def _get_attr_from_field_coord( - ncfield: CFVariable, - coord_name: str | None, - attr: str, -) -> Any: - """Get attribute from netCDF field coordinate.""" - if coord_name is not None: - attrs = ncfield.cf_group[coord_name].cf_attrs() - attr_val = [value for (key, value) in attrs if key == attr] - if attr_val: - return attr_val[0] - return None - - -def _restore_lat_lon_units( - cube: Cube, - field: CFVariable, - filename: str, # noqa: ARG001 -) -> None: # pylint: disable=unused-argument - """Use this callback to restore the original lat/lon units.""" - # Iris chooses to change longitude and latitude units to degrees - # regardless of value in file, so reinstating file value - for coord in cube.coords(): - if coord.standard_name in ["longitude", "latitude"]: - units = _get_attr_from_field_coord(field, coord.var_name, "units") - if units is not None: - coord.units = units def load( - file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData, + file: str + | Path + | DataElement + | Cube + | CubeList + | xr.Dataset + | ncdata.NcData, ignore_warnings: list[dict[str, Any]] | None = None, backend_kwargs: dict[str, Any] | None = None, ) -> CubeList: @@ -113,7 +87,7 @@ def load( Invalid type for ``file``. 
""" - if hasattr(file, "to_iris"): + if isinstance(file, DataElement): cubes = file.to_iris(ignore_warnings=ignore_warnings) elif isinstance(file, (str, Path)): extension = ( @@ -122,7 +96,7 @@ def load( else os.path.splitext(file)[1] ) if "zarr" not in extension: - cubes = _load_from_file(file, ignore_warnings=ignore_warnings) + cubes = LocalFile(file).to_iris(ignore_warnings=ignore_warnings) else: cubes = _load_zarr( file, @@ -161,7 +135,7 @@ def load( def _load_zarr( - file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData, + file: str | Path, ignore_warnings: list[dict[str, Any]] | None = None, backend_kwargs: dict[str, Any] | None = None, ) -> CubeList: @@ -222,30 +196,6 @@ def _load_zarr( return dataset_to_iris(zarr_xr, ignore_warnings=ignore_warnings) -def _load_from_file( - file: str | Path, - ignore_warnings: list[dict[str, Any]] | None = None, -) -> CubeList: - """Load data from file.""" - file = Path(file) - logger.debug("Loading:\n%s", file) - - with ignore_warnings_context(ignore_warnings): - # GRIB files need to be loaded with iris.load, otherwise we will - # get separate (lat, lon) slices for each time step, pressure - # level, etc. - if file.suffix in GRIB_FORMATS: - cubes = iris.load(file, callback=_restore_lat_lon_units) - else: - cubes = iris.load_raw(file, callback=_restore_lat_lon_units) - logger.debug("Done with loading %s", file) - - for cube in cubes: - cube.attributes.globals["source_file"] = str(file) - - return cubes - - def save( # noqa: C901 cubes: Sequence[Cube], filename: Path | str, diff --git a/esmvalcore/typing.py b/esmvalcore/typing.py index 7880bdac1b..1e3735d4f2 100644 --- a/esmvalcore/typing.py +++ b/esmvalcore/typing.py @@ -3,19 +3,18 @@ from __future__ import annotations from collections.abc import Iterable, Sequence -from numbers import Number import dask.array as da import numpy as np from iris.cube import Cube -FacetValue = str | Sequence[str] | Number | bool +FacetValue = str | Sequence[str] | int """Type describing a single facet.""" Facets = dict[str, FacetValue] """Type describing a collection of facets.""" -NetCDFAttr = str | Number | Iterable +NetCDFAttr = str | int | float | Iterable """Type describing netCDF attributes. 
`NetCDF attributes diff --git a/pyproject.toml b/pyproject.toml index bd853ba50d..0857deda01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ dependencies = [ "fire", "geopy", "humanfriendly", + "intake-esgf", "intake-esm", "iris-grib>=0.20.0", # github.com/ESMValGroup/ESMValCore/issues/2535 "isodate>=0.7.0", @@ -143,6 +144,7 @@ minversion = "6" markers = [ "installation: Test requires installation of dependencies", "use_sample_data: Run functional tests using real data", + "online: Run tests that require internet access", ] testpaths = ["tests"] xfail_strict = true @@ -220,6 +222,7 @@ ignore = [ "D102", # Missing docstring in public method "D103", # Missing docstring in public function "D104", # Missing docstring in public package + "PT013", # Allow importing fixtures from pytest to avoid repeating 'pytest' many times ] "doc/gensidebar.py" = [ "INP001", # File is part of an implicit namespace package diff --git a/tests/conftest.py b/tests/conftest.py index fc6a39c7b2..ea2dceaeb6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,7 +27,6 @@ def _load_default_config(): "ignore", message="Do not instantiate `Config` objects directly", category=UserWarning, - module="esmvalcore", ) cfg = Config() cfg.load_from_dirs([]) @@ -50,7 +49,6 @@ def ignore_existing_user_config(monkeypatch, cfg_default): @pytest.fixture def session(tmp_path: Path, ignore_existing_user_config, monkeypatch): """Session object with default settings.""" - monkeypatch.setitem(CFG, "rootpath", {"default": {tmp_path: "default"}}) monkeypatch.setitem(CFG, "output_dir", tmp_path / "esmvaltool_output") return CFG.start_session("recipe_test") diff --git a/tests/integration/cmor/_fixes/icon/conftest.py b/tests/integration/cmor/_fixes/icon/conftest.py new file mode 100644 index 0000000000..ee9d6beac7 --- /dev/null +++ b/tests/integration/cmor/_fixes/icon/conftest.py @@ -0,0 +1,36 @@ +"""Fixtures for ICON fixes tests.""" + +import importlib.resources +from pathlib import Path + +import pytest +import yaml + +import esmvalcore.config +from esmvalcore.cmor._fixes.icon._base_fixes import IconFix + + +@pytest.fixture(autouse=True) +def tmp_cache_dir(monkeypatch, tmp_path): + """Use temporary path as cache directory for all tests in this module.""" + monkeypatch.setattr(IconFix, "CACHE_DIR", tmp_path) + + +@pytest.fixture +def session( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + session: esmvalcore.config.Session, +) -> esmvalcore.config.Session: + """Configure ICON data source for all tests in this module.""" + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / "icon-data.yml", + ) as config_file: + cfg = yaml.safe_load(config_file.read_text(encoding="utf-8")) + for data_source in cfg["projects"]["ICON"]["data"]: + cfg["projects"]["ICON"]["data"][data_source]["rootpath"] = tmp_path + session["projects"]["ICON"]["data"] = cfg["projects"]["ICON"]["data"] + session["auxiliary_data_dir"] = tmp_path + return session diff --git a/tests/integration/cmor/_fixes/icon/test_icon.py b/tests/integration/cmor/_fixes/icon/test_icon.py index ce7cd6317a..de6e205f52 100644 --- a/tests/integration/cmor/_fixes/icon/test_icon.py +++ b/tests/integration/cmor/_fixes/icon/test_icon.py @@ -1,6 +1,5 @@ """Test the ICON on-the-fly CMORizer.""" -from copy import deepcopy from datetime import datetime from pathlib import Path from unittest import mock @@ -26,7 +25,6 @@ ) from esmvalcore.cmor.fix import Fix from esmvalcore.cmor.table import CoordinateInfo, get_var_info 
-from esmvalcore.config import CFG from esmvalcore.dataset import Dataset TEST_GRID_FILE_URI = ( @@ -36,12 +34,6 @@ TEST_GRID_FILE_NAME = "icon_grid.nc" -@pytest.fixture(autouse=True) -def tmp_cache_dir(monkeypatch, tmp_path): - """Use temporary path as cache directory for all tests in this module.""" - monkeypatch.setattr(IconFix, "CACHE_DIR", tmp_path) - - @pytest.fixture def cubes_atm_2d(test_data_path): """2D sample cubes.""" @@ -572,9 +564,10 @@ def test_get_areacella_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_areacella_fix(cubes_grid): +@pytest.mark.online +def test_areacella_fix(cubes_grid, session): """Test fix.""" - fix = get_allvars_fix("fx", "areacella") + fix = get_allvars_fix("fx", "areacella", session=session) fix.extra_facets["var_type"] = "fx" fixed_cubes = fix.fix_metadata(cubes_grid) @@ -595,9 +588,10 @@ def test_get_areacello_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_areacello_fix(cubes_grid): +@pytest.mark.online +def test_areacello_fix(cubes_grid, session): """Test fix.""" - fix = get_allvars_fix("Ofx", "areacello") + fix = get_allvars_fix("Ofx", "areacello", session=session) fix.extra_facets["var_type"] = "fx" fixed_cubes = fix.fix_metadata(cubes_grid) @@ -655,9 +649,10 @@ def test_get_lwp_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_lwp_fix(cubes_atm_2d): +@pytest.mark.online +def test_lwp_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("AERmon", "lwp") + fix = get_allvars_fix("AERmon", "lwp", session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -683,9 +678,10 @@ def test_get_rsdt_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_rsdt_fix(cubes_atm_2d): +@pytest.mark.online +def test_rsdt_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "rsdt") + fix = get_allvars_fix("Amon", "rsdt", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -706,9 +702,10 @@ def test_get_rsut_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_rsut_fix(cubes_atm_2d): +@pytest.mark.online +def test_rsut_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "rsut") + fix = get_allvars_fix("Amon", "rsut", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -732,9 +729,10 @@ def test_get_siconc_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_siconc_fix(cubes_atm_2d): +@pytest.mark.online +def test_siconc_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("SImon", "siconc") + fix = get_allvars_fix("SImon", "siconc", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) cube = check_siconc_metadata( @@ -758,9 +756,10 @@ def test_get_siconca_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_siconca_fix(cubes_atm_2d): +@pytest.mark.online +def test_siconca_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("SImon", "siconca") + fix = get_allvars_fix("SImon", "siconca", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) cube = check_siconc_metadata( @@ -787,9 +786,10 @@ def test_get_ta_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_ta_fix(cubes_atm_3d): +@pytest.mark.online +def test_ta_fix(cubes_atm_3d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "ta") + fix = get_allvars_fix("Amon", "ta", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_3d) cube = check_ta_metadata(fixed_cubes) @@ -798,9 +798,10 @@ 
def test_ta_fix(cubes_atm_3d): check_lat_lon(cube) -def test_ta_fix_no_plev_bounds(cubes_atm_3d): +@pytest.mark.online +def test_ta_fix_no_plev_bounds(cubes_atm_3d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "ta") + fix = get_allvars_fix("Amon", "ta", session=session) cubes = CubeList( [ cubes_atm_3d.extract_cube(NameConstraint(var_name="ta")), @@ -824,9 +825,10 @@ def test_get_tas_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_tas_fix(cubes_atm_2d): +@pytest.mark.online +def test_tas_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) cube = check_tas_metadata(fixed_cubes) @@ -835,9 +837,10 @@ def test_tas_fix(cubes_atm_2d): check_heightxm(cube, 2.0) -def test_tas_spatial_index_coord_already_present(cubes_atm_2d): +@pytest.mark.online +def test_tas_spatial_index_coord_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) index_coord = DimCoord(np.arange(8), var_name="ncells") cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) @@ -849,9 +852,10 @@ def test_tas_spatial_index_coord_already_present(cubes_atm_2d): check_lat_lon(cube) -def test_tas_scalar_height2m_already_present(cubes_atm_2d): +@pytest.mark.online +def test_tas_scalar_height2m_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) # Scalar height (with wrong metadata) already present height_coord = AuxCoord(2.0, var_name="h", standard_name="height") @@ -901,9 +905,10 @@ def test_tas_no_mesh(cubes_atm_2d): assert cube.coord_dims(lat) == cube.coord_dims(i_coord) -def test_tas_dim_height2m_already_present(cubes_atm_2d): +@pytest.mark.online +def test_tas_dim_height2m_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) # Dimensional coordinate height (with wrong metadata) already present height_coord = AuxCoord(2.0, var_name="h", standard_name="height") @@ -920,9 +925,10 @@ def test_tas_dim_height2m_already_present(cubes_atm_2d): check_heightxm(cube, 2.0) -def test_tas_no_shift_time(cubes_atm_2d): +@pytest.mark.online +def test_tas_no_shift_time(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets["shift_time"] = False fixed_cubes = fix.fix_metadata(cubes_atm_2d) @@ -944,9 +950,10 @@ def test_tas_no_shift_time(cubes_atm_2d): assert time.attributes == {} -def test_fix_does_not_change_cached_grid(cubes_atm_2d): +@pytest.mark.online +def test_fix_does_not_change_cached_grid(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert not fix._horizontal_grids assert not fix._meshes @@ -975,9 +982,10 @@ def test_get_uas_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_uas_fix(cubes_atm_2d): +@pytest.mark.online +def test_uas_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "uas") + fix = get_allvars_fix("Amon", "uas", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -1001,9 +1009,10 @@ def test_uas_fix(cubes_atm_2d): assert height.bounds is None -def test_uas_scalar_height10m_already_present(cubes_atm_2d): 
+@pytest.mark.online +def test_uas_scalar_height10m_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "uas") + fix = get_allvars_fix("Amon", "uas", session=session) # Scalar height (with wrong metadata) already present height_coord = AuxCoord(10.0, var_name="h", standard_name="height") @@ -1017,9 +1026,10 @@ def test_uas_scalar_height10m_already_present(cubes_atm_2d): check_heightxm(cube, 10.0) -def test_uas_dim_height10m_already_present(cubes_atm_2d): +@pytest.mark.online +def test_uas_dim_height10m_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "uas") + fix = get_allvars_fix("Amon", "uas", session=session) # Dimensional coordinate height (with wrong metadata) already present height_coord = AuxCoord(10.0, var_name="h", standard_name="height") @@ -1108,9 +1118,10 @@ def test_ch4clim_fix(cubes_regular_grid): # Test fix with empty standard_name -def test_empty_standard_name_fix(cubes_atm_2d, monkeypatch): +@pytest.mark.online +def test_empty_standard_name_fix(cubes_atm_2d, monkeypatch, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) # We know that tas has a standard name, but this being native model output # there may be variables with no standard name. The code is designed to # handle this gracefully and here we test it with an artificial, but @@ -1130,7 +1141,8 @@ def test_empty_standard_name_fix(cubes_atm_2d, monkeypatch): # Test automatic addition of missing coordinates -def test_add_time(cubes_atm_2d): +@pytest.mark.online +def test_add_time(cubes_atm_2d, session): """Test fix.""" # Remove time from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) @@ -1139,7 +1151,7 @@ def test_add_time(cubes_atm_2d): tas_cube.remove_coord("time") cubes = CubeList([tas_cube, uas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fixed_cubes = fix.fix_metadata(cubes) cube = check_tas_metadata(fixed_cubes) @@ -1162,13 +1174,14 @@ def test_add_time_fail(): fix._add_time(cube, cubes) -def test_add_latitude(cubes_atm_2d): +@pytest.mark.online +def test_add_latitude(cubes_atm_2d, session): """Test fix.""" # Remove latitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("latitude") cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 fixed_cubes = fix.fix_metadata(cubes) @@ -1180,13 +1193,14 @@ def test_add_latitude(cubes_atm_2d): assert TEST_GRID_FILE_NAME in fix._horizontal_grids -def test_add_longitude(cubes_atm_2d): +@pytest.mark.online +def test_add_longitude(cubes_atm_2d, session): """Test fix.""" # Remove longitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("longitude") cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 fixed_cubes = fix.fix_metadata(cubes) @@ -1198,14 +1212,15 @@ def test_add_longitude(cubes_atm_2d): assert TEST_GRID_FILE_NAME in fix._horizontal_grids -def test_add_latitude_longitude(cubes_atm_2d): +@pytest.mark.online +def test_add_latitude_longitude(cubes_atm_2d, session): """Test fix.""" # Remove latitude and longitude from tas cube to test 
automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("latitude") tas_cube.remove_coord("longitude") cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 fixed_cubes = fix.fix_metadata(cubes) @@ -1259,14 +1274,15 @@ def test_add_coord_from_grid_file_fail_no_url(): fix._add_coord_from_grid_file(Cube(0), "clat") -def test_add_coord_from_grid_fail_no_unnamed_dim(cubes_atm_2d): +@pytest.mark.online +def test_add_coord_from_grid_fail_no_unnamed_dim(cubes_atm_2d, session): """Test fix.""" # Remove latitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("latitude") index_coord = DimCoord(np.arange(8), var_name="ncells") tas_cube.add_dim_coord(index_coord, 1) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) msg = ( "Cannot determine coordinate dimension for coordinate 'clat', " @@ -1276,13 +1292,14 @@ def test_add_coord_from_grid_fail_no_unnamed_dim(cubes_atm_2d): fix._add_coord_from_grid_file(tas_cube, "clat") -def test_add_coord_from_grid_fail_two_unnamed_dims(cubes_atm_2d): +@pytest.mark.online +def test_add_coord_from_grid_fail_two_unnamed_dims(cubes_atm_2d, session): """Test fix.""" # Remove latitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("latitude") tas_cube = iris.util.new_axis(tas_cube) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) msg = ( "Cannot determine coordinate dimension for coordinate 'clat', " @@ -1321,19 +1338,16 @@ def test_get_horizontal_grid_from_attr_cached_in_dict( @mock.patch.object(IconFix, "_get_grid_from_facet", autospec=True) def test_get_horizontal_grid_from_attr_rootpath( mock_get_grid_from_facet, - monkeypatch, tmp_path, + session, ): """Test fix.""" - rootpath = deepcopy(CFG["rootpath"]) - rootpath["ICON"] = str(tmp_path) - monkeypatch.setitem(CFG, "rootpath", rootpath) cube = Cube(0, attributes={"grid_file_uri": "grid.nc"}) grid_cube = Cube(0, var_name="test_grid_cube") (tmp_path / "amip").mkdir(parents=True, exist_ok=True) iris.save(grid_cube, tmp_path / "amip" / "grid.nc") - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fix._horizontal_grids["grid_from_facet.nc"] = mock.sentinel.wrong_grid grid = fix.get_horizontal_grid(cube) @@ -1353,6 +1367,7 @@ def test_get_horizontal_grid_from_attr_cached_in_file( mock_requests, mock_get_grid_from_facet, tmp_path, + session, ): """Test fix.""" cube = Cube( @@ -1361,7 +1376,7 @@ def test_get_horizontal_grid_from_attr_cached_in_file( "grid_file_uri": "https://temporary.url/this/is/the/grid_file.nc", }, ) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 # Save temporary grid file @@ -1380,15 +1395,17 @@ def test_get_horizontal_grid_from_attr_cached_in_file( mock_get_grid_from_facet.assert_not_called() +@pytest.mark.online @mock.patch.object(IconFix, "_get_grid_from_facet", autospec=True) def test_get_horizontal_grid_from_attr_cache_file_too_old( mock_get_grid_from_facet, tmp_path, monkeypatch, + session, ): """Test fix.""" cube = Cube(0, attributes={"grid_file_uri": TEST_GRID_FILE_URI}) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", 
session=session) assert len(fix._horizontal_grids) == 0 # Save temporary grid file @@ -1417,11 +1434,9 @@ def test_get_horizontal_grid_from_attr_cache_file_too_old( def test_get_horizontal_grid_from_facet_cached_in_dict( mock_get_grid_from_cube_attr, tmp_path, + session, ): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path - # Save temporary grid file (this will not be used; however, it is necessary # to not raise a FileNotFoundError) grid_path = "grid.nc" @@ -1451,11 +1466,9 @@ def test_get_horizontal_grid_from_facet( mock_get_grid_from_cube_attr, grid_path, tmp_path, + session, ): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path - # Make sure that grid specified by cube attribute is NOT used cube = Cube(0, attributes={"grid_file_uri": "cached_grid_url.nc"}) @@ -1479,11 +1492,8 @@ def test_get_horizontal_grid_from_facet( mock_get_grid_from_cube_attr.assert_not_called() -def test_get_horizontal_grid_from_facet_fail(tmp_path): +def test_get_horizontal_grid_from_facet_fail(session): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path - cube = Cube(0) fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets["horizontal_grid"] = "/this/does/not/exist.nc" @@ -1742,9 +1752,10 @@ def test_invalid_time_units(cubes_atm_2d): # Test fix with (sub-)hourly data -def test_hourly_data(cubes_atm_2d): +@pytest.mark.online +def test_hourly_data(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets["frequency"] = "1hr" for cube in cubes_atm_2d: cube.coord("time").points = [20041104.5833333] @@ -2093,8 +2104,9 @@ def test_get_previous_timestep(frequency, datetime_in, datetime_out): # Test mesh creation raises warning because bounds do not match vertices +@pytest.mark.online @mock.patch("esmvalcore.cmor._fixes.icon._base_fixes.logger", autospec=True) -def test_get_mesh_fail_invalid_clat_bounds(mock_logger, cubes_atm_2d): +def test_get_mesh_fail_invalid_clat_bounds(mock_logger, cubes_atm_2d, session): """Test fix.""" # Slightly modify latitude bounds from tas cube to make mesh creation fail tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) @@ -2102,7 +2114,7 @@ def test_get_mesh_fail_invalid_clat_bounds(mock_logger, cubes_atm_2d): lat_bnds[0, 0] = 40.0 tas_cube.coord("latitude").bounds = lat_bnds cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fixed_cubes = fix.fix_metadata(cubes) cube = check_tas_metadata(fixed_cubes) @@ -2117,8 +2129,9 @@ def test_get_mesh_fail_invalid_clat_bounds(mock_logger, cubes_atm_2d): ) +@pytest.mark.online @mock.patch("esmvalcore.cmor._fixes.icon._base_fixes.logger", autospec=True) -def test_get_mesh_fail_invalid_clon_bounds(mock_logger, cubes_atm_2d): +def test_get_mesh_fail_invalid_clon_bounds(mock_logger, cubes_atm_2d, session): """Test fix.""" # Slightly modify longitude bounds from tas cube to make mesh creation fail tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) @@ -2126,7 +2139,7 @@ def test_get_mesh_fail_invalid_clon_bounds(mock_logger, cubes_atm_2d): lon_bnds[0, 1] = 40.0 tas_cube.coord("longitude").bounds = lon_bnds cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fixed_cubes = fix.fix_metadata(cubes) cube = 
check_tas_metadata(fixed_cubes) @@ -2189,11 +2202,8 @@ def test_get_mesh_not_cached_from_attr(monkeypatch): fix._create_mesh.assert_called_once_with(cube) -def test_get_mesh_cached_from_facet(monkeypatch, tmp_path): +def test_get_mesh_cached_from_facet(monkeypatch, tmp_path, session): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path - # Save temporary grid file (this will not be used; however, it is necessary # to not raise a FileNotFoundError) grid_path = "grid.nc" @@ -2213,11 +2223,8 @@ def test_get_mesh_cached_from_facet(monkeypatch, tmp_path): fix._create_mesh.assert_not_called() -def test_get_mesh_not_cached_from_facet(monkeypatch, tmp_path): +def test_get_mesh_not_cached_from_facet(monkeypatch, tmp_path, session): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path - # Save temporary grid file (this will not be used; however, it is necessary # to not raise a FileNotFoundError) grid_path = "grid.nc" @@ -2245,10 +2252,8 @@ def test_get_mesh_not_cached_from_facet(monkeypatch, tmp_path): ("b.nc", "Grid file", "{tmp_path}/b.nc"), ], ) -def test_get_path_from_facet(path, description, output, tmp_path): +def test_get_path_from_facet(path, description, output, tmp_path, session): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path path = path.format(tmp_path=tmp_path) fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets["test_path"] = path @@ -2271,10 +2276,8 @@ def test_get_path_from_facet(path, description, output, tmp_path): ("b.nc", "Grid file"), ], ) -def test_get_path_from_facet_fail(path, description, tmp_path): +def test_get_path_from_facet_fail(path, description, tmp_path, session): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path path = path.format(tmp_path=tmp_path) fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets["test_path"] = path @@ -2288,10 +2291,8 @@ def test_get_path_from_facet_fail(path, description, tmp_path): @pytest.mark.parametrize("facet", ["zg_file", "zghalf_file"]) @pytest.mark.parametrize("path", ["{tmp_path}/a.nc", "a.nc"]) -def test_add_additional_cubes(path, facet, tmp_path): +def test_add_additional_cubes(path, facet, tmp_path, session): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path path = path.format(tmp_path=tmp_path) fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets[facet] = path @@ -2310,10 +2311,8 @@ def test_add_additional_cubes(path, facet, tmp_path): @pytest.mark.parametrize("facet", ["zg_file", "zghalf_file"]) @pytest.mark.parametrize("path", ["{tmp_path}/a.nc", "a.nc"]) -def test_add_additional_cubes_fail(path, facet, tmp_path): +def test_add_additional_cubes_fail(path, facet, tmp_path, session): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path path = path.format(tmp_path=tmp_path) fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets[facet] = path diff --git a/tests/integration/cmor/_fixes/icon/test_icon_xpp.py b/tests/integration/cmor/_fixes/icon/test_icon_xpp.py index 42d711dd43..a089eba095 100644 --- a/tests/integration/cmor/_fixes/icon/test_icon_xpp.py +++ b/tests/integration/cmor/_fixes/icon/test_icon_xpp.py @@ -10,7 +10,7 @@ import esmvalcore.cmor._fixes.icon.icon_xpp from esmvalcore.cmor._fixes.fix import GenericFix -from 
esmvalcore.cmor._fixes.icon._base_fixes import AllVarsBase, IconFix +from esmvalcore.cmor._fixes.icon._base_fixes import AllVarsBase from esmvalcore.cmor._fixes.icon.icon_xpp import ( AllVars, Clwvi, @@ -30,12 +30,6 @@ from esmvalcore.dataset import Dataset -@pytest.fixture(autouse=True) -def tmp_cache_dir(monkeypatch, tmp_path): - """Use temporary path as cache directory for all tests in this module.""" - monkeypatch.setattr(IconFix, "CACHE_DIR", tmp_path) - - @pytest.fixture def cubes_atm_2d(test_data_path): """2D sample cubes.""" @@ -732,7 +726,8 @@ def test_get_rlutcs_fix(): assert fix == [Rlutcs(None), AllVars(None), GenericFix(None)] -def test_rlutcs_fix(cubes_atm_3d): +@pytest.mark.online +def test_rlutcs_fix(cubes_atm_3d, session): """Test fix.""" cube = cubes_atm_3d.extract_cube(NameConstraint(var_name="temp")) cube.var_name = "lwflx_up_clr" @@ -740,7 +735,7 @@ def test_rlutcs_fix(cubes_atm_3d): cube.data = np.arange(1 * 47 * 8, dtype=np.float32).reshape(1, 47, 8) cubes = CubeList([cube]) - fixed_cubes = fix_metadata(cubes, "Amon", "rlutcs") + fixed_cubes = fix_metadata(cubes, "Amon", "rlutcs", session=session) assert len(fixed_cubes) == 1 cube = fixed_cubes[0] @@ -770,9 +765,10 @@ def test_get_rsdt_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_rsdt_fix(cubes_atm_2d): +@pytest.mark.online +def test_rsdt_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "rsdt") + fix = get_allvars_fix("Amon", "rsdt", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -793,9 +789,10 @@ def test_get_rsut_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_rsut_fix(cubes_atm_2d): +@pytest.mark.online +def test_rsut_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "rsut") + fix = get_allvars_fix("Amon", "rsut", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -819,7 +816,8 @@ def test_get_rsutcs_fix(): assert fix == [Rsutcs(None), AllVars(None), GenericFix(None)] -def test_rsutcs_fix(cubes_atm_3d): +@pytest.mark.online +def test_rsutcs_fix(cubes_atm_3d, session): """Test fix.""" cube = cubes_atm_3d.extract_cube(NameConstraint(var_name="temp")) cube.var_name = "swflx_up_clr" @@ -827,7 +825,7 @@ def test_rsutcs_fix(cubes_atm_3d): cube.data = np.arange(1 * 47 * 8, dtype=np.float32).reshape(1, 47, 8) cubes = CubeList([cube]) - fixed_cubes = fix_metadata(cubes, "Amon", "rsutcs") + fixed_cubes = fix_metadata(cubes, "Amon", "rsutcs", session=session) assert len(fixed_cubes) == 1 cube = fixed_cubes[0] @@ -923,7 +921,8 @@ def test_get_siconc_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_siconc_fix(cubes_ocean_3d): +@pytest.mark.online +def test_siconc_fix(cubes_ocean_3d, session): """Test fix.""" cubes = CubeList( [cubes_ocean_3d.extract_cube(NameConstraint(var_name="to")).copy()], @@ -936,7 +935,7 @@ def test_siconc_fix(cubes_ocean_3d): cubes[0].remove_coord("depth") cubes[0].add_dim_coord(DimCoord(0.0, var_name="lev"), 1) - fix = get_allvars_fix("SImon", "siconc") + fix = get_allvars_fix("SImon", "siconc", session=session) fixed_cubes = fix.fix_metadata(cubes) cube = check_siconc_metadata( @@ -978,9 +977,10 @@ def test_get_siconca_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_siconca_fix(cubes_atm_2d): +@pytest.mark.online +def test_siconca_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("SImon", "siconca") + fix = get_allvars_fix("SImon", "siconca", session=session) fixed_cubes = 
fix.fix_metadata(cubes_atm_2d) cube = check_siconc_metadata( @@ -1007,9 +1007,10 @@ def test_get_ta_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_ta_fix(cubes_atm_3d): +@pytest.mark.online +def test_ta_fix(cubes_atm_3d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "ta") + fix = get_allvars_fix("Amon", "ta", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_3d) cube = check_ta_metadata(fixed_cubes) @@ -1030,9 +1031,10 @@ def test_get_tas_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_tas_fix(cubes_atm_2d): +@pytest.mark.online +def test_tas_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) cube = check_tas_metadata(fixed_cubes) @@ -1068,9 +1070,10 @@ def test_get_thetao_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_thetao_fix(cubes_ocean_3d): +@pytest.mark.online +def test_thetao_fix(cubes_ocean_3d, session): """Test fix.""" - fix = get_allvars_fix("Omon", "thetao") + fix = get_allvars_fix("Omon", "thetao", session=session) fixed_cubes = fix.fix_metadata(cubes_ocean_3d) @@ -1089,7 +1092,8 @@ def test_thetao_fix(cubes_ocean_3d): assert cube.shape == (1, 47, 8) -def test_thetao_fix_already_bounds(cubes_ocean_3d): +@pytest.mark.online +def test_thetao_fix_already_bounds(cubes_ocean_3d, session): """Test fix.""" cube = cubes_ocean_3d.extract_cube(NameConstraint(var_name="to")) cube.coord("depth").guess_bounds() @@ -1098,7 +1102,7 @@ def test_thetao_fix_already_bounds(cubes_ocean_3d): cube.coord("depth").bounds = bounds cubes = CubeList([cube]) - fix = get_allvars_fix("Omon", "thetao") + fix = get_allvars_fix("Omon", "thetao", session=session) fixed_cubes = fix.fix_metadata(cubes) @@ -1118,12 +1122,13 @@ def test_thetao_fix_already_bounds(cubes_ocean_3d): assert cube.shape == (1, 47, 8) -def test_thetao_fix_no_bounds(cubes_ocean_3d): +@pytest.mark.online +def test_thetao_fix_no_bounds(cubes_ocean_3d, session): """Test fix.""" cube = cubes_ocean_3d.extract_cube(NameConstraint(var_name="to")) cubes = CubeList([cube]) - fix = get_allvars_fix("Omon", "thetao") + fix = get_allvars_fix("Omon", "thetao", session=session) fixed_cubes = fix.fix_metadata(cubes) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 391d2ab258..85b1505866 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,4 +1,5 @@ import os +from collections.abc import Iterator from pathlib import Path import iris @@ -11,6 +12,7 @@ _select_drs, _select_files, ) +from esmvalcore.typing import Facets def create_test_file(filename, tracking_id=None): @@ -27,7 +29,12 @@ def create_test_file(filename, tracking_id=None): iris.save(cube, filename) -def _get_files(root_path, facets, tracking_id): +def _get_files( # noqa: C901,PLR0912 + root_path: Path, + facets: Facets, + tracking_id: Iterator[int], + suffix: str = "nc", +) -> tuple[list[LocalFile], list[Path]]: """Return dummy files. 
Wildcards are only supported for `dataset` and `institute`; in this case @@ -43,8 +50,8 @@ def _get_files(root_path, facets, tracking_id): all_facets = [facets] # Globs without expanded facets - dir_template = _select_drs("input_dir", facets["project"], "default") - file_template = _select_drs("input_file", facets["project"], "default") + dir_template = _select_drs("input_dir", facets["project"], "default") # type: ignore[arg-type] + file_template = _select_drs("input_file", facets["project"], "default") # type: ignore[arg-type] dir_globs = _replace_tags(dir_template, facets) file_globs = _replace_tags(file_template, facets) globs = sorted( @@ -56,49 +63,57 @@ def _get_files(root_path, facets, tracking_id): filenames = [] dir_template = _select_drs( "input_dir", - expanded_facets["project"], + expanded_facets["project"], # type: ignore[arg-type] "default", ) file_template = _select_drs( "input_file", - expanded_facets["project"], + expanded_facets["project"], # type: ignore[arg-type] "default", ) + dir_globs = _replace_tags(dir_template, expanded_facets) file_globs = _replace_tags(file_template, expanded_facets) filename = str( root_path / "input" / dir_globs[0] / Path(file_globs[0]).name, ) + if filename.endswith("nc"): + filename = f"{filename[:-2]}{suffix}" + + if filename.endswith(f"[_.]*{suffix}"): + filename = filename.replace(f"[_.]*{suffix}", f"_*.{suffix}") - if filename.endswith("[_.]*nc"): - filename = filename.replace("[_.]*nc", "_*.nc") - - if filename.endswith("*.nc"): - filename = filename[: -len("*.nc")] + "_" - if facets["frequency"] == "fx": - intervals = [""] - else: - intervals = [ - "1990_1999", - "2000_2009", - "2010_2019", - ] + if facets["frequency"] == "fx": + intervals = [""] + else: + intervals = [ + "1990-1999", + "2000-2009", + "2010-2019", + ] + if filename.endswith(f"*.{suffix}"): + filename = filename[: -len(f"*.{suffix}")] for interval in intervals: - filenames.append(filename + interval + ".nc") + filenames.append(f"{filename}_{interval}.{suffix}") else: filenames.append(filename) - if "timerange" in facets: - filenames = _select_files(filenames, facets["timerange"]) - - for filename in filenames: - create_test_file(filename, next(tracking_id)) + if suffix == "nc": + for filename in filenames: + create_test_file(filename, next(tracking_id)) for filename in filenames: file = LocalFile(filename) - file.facets = expanded_facets + file.facets = dict(expanded_facets) + if facets["frequency"] != "fx": + for interval in intervals: + if interval in filename: + file.facets["timerange"] = interval.replace("-", "/") files.append(file) + if "timerange" in facets: + files = _select_files(files, facets["timerange"]) + return files, globs @@ -108,13 +123,11 @@ def _tracking_ids(i=0): i += 1 -def _get_find_files_func(path: Path, suffix: str = ".nc"): +def _get_find_files_func(path: Path, suffix: str = "nc"): tracking_id = _tracking_ids() - def find_files(*, debug: bool = False, **facets): - files, file_globs = _get_files(path, facets, tracking_id) - files = [f.with_suffix(suffix) for f in files] - file_globs = [g.with_suffix(suffix) for g in file_globs] + def find_files(self, *, debug: bool = False, **facets): + files, file_globs = _get_files(path, facets, tracking_id, suffix) if debug: return files, file_globs return files @@ -125,13 +138,21 @@ def find_files(*, debug: bool = False, **facets): @pytest.fixture def patched_datafinder(tmp_path, monkeypatch): find_files = _get_find_files_func(tmp_path) - monkeypatch.setattr(esmvalcore.local, "find_files", find_files) + 
monkeypatch.setattr( + esmvalcore.local.LocalDataSource, + "find_data", + find_files, + ) @pytest.fixture def patched_datafinder_grib(tmp_path, monkeypatch): - find_files = _get_find_files_func(tmp_path, suffix=".grib") - monkeypatch.setattr(esmvalcore.local, "find_files", find_files) + find_files = _get_find_files_func(tmp_path, suffix="grib") + monkeypatch.setattr( + esmvalcore.local.LocalDataSource, + "find_data", + find_files, + ) @pytest.fixture @@ -147,7 +168,7 @@ def patched_failing_datafinder(tmp_path, monkeypatch): """ tracking_id = _tracking_ids() - def find_files(*, debug: bool = False, **facets): + def find_files(self, *, debug: bool = False, **facets): files, file_globs = _get_files(tmp_path, facets, tracking_id) if facets["frequency"] == "fx": files = [] @@ -159,4 +180,8 @@ def find_files(*, debug: bool = False, **facets): return returned_files, file_globs return returned_files - monkeypatch.setattr(esmvalcore.local, "find_files", find_files) + monkeypatch.setattr( + esmvalcore.local.LocalDataSource, + "find_data", + find_files, + ) diff --git a/tests/integration/esgf/search_results/expected.yml b/tests/integration/esgf/search_results/expected.yml index 24f02b9181..11f3f423e8 100644 --- a/tests/integration/esgf/search_results/expected.yml +++ b/tests/integration/esgf/search_results/expected.yml @@ -20,6 +20,7 @@ Amon_r1i1p1_historical,rcp85_INM-CM4_CMIP5_tas.json: project: CMIP5 modeling_realm: atmos short_name: tas + timerange: "185001/200512" version: v20130207 local_file: cmip5/output1/INM/inmcm4/historical/mon/atmos/Amon/r1i1p1/v20130207/tas_Amon_inmcm4_historical_r1i1p1_185001-200512.nc name: tas_Amon_inmcm4_historical_r1i1p1_185001-200512.nc @@ -50,6 +51,7 @@ Amon_r1i1p1_historical,rcp85_INM-CM4_CMIP5_tas.json: project: CMIP5 modeling_realm: atmos short_name: tas + timerange: "200601/210012" version: v20130207 local_file: cmip5/output1/INM/inmcm4/rcp85/mon/atmos/Amon/r1i1p1/v20130207/tas_Amon_inmcm4_rcp85_r1i1p1_200601-210012.nc name: tas_Amon_inmcm4_rcp85_r1i1p1_200601-210012.nc @@ -81,6 +83,7 @@ Amon_r1i1p1_historical_FIO-ESM_CMIP5_tas.json: project: CMIP5 modeling_realm: atmos short_name: tas + timerange: "185001/200512" version: v20121010 local_file: cmip5/output1/FIO/FIO-ESM/historical/mon/atmos/Amon/r1i1p1/v20121010/tas_Amon_FIO-ESM_historical_r1i1p1_185001-200512.nc name: tas_Amon_FIO-ESM_historical_r1i1p1_185001-200512.nc @@ -108,6 +111,7 @@ Amon_r1i1p1_rcp85_HadGEM2-CC_CMIP5_tas.json: project: CMIP5 modeling_realm: atmos short_name: tas + timerange: "205512/208011" version: v20120531 local_file: cmip5/output1/MOHC/HadGEM2-CC/rcp85/mon/atmos/Amon/r1i1p1/v20120531/tas_Amon_HadGEM2-CC_rcp85_r1i1p1_205512-208011.nc name: tas_Amon_HadGEM2-CC_rcp85_r1i1p1_205512-208011.nc @@ -132,6 +136,7 @@ Amon_r1i1p1_rcp85_HadGEM2-CC_CMIP5_tas.json: project: CMIP5 modeling_realm: atmos short_name: tas + timerange: "208012/209912" version: v20120531 local_file: cmip5/output1/MOHC/HadGEM2-CC/rcp85/mon/atmos/Amon/r1i1p1/v20120531/tas_Amon_HadGEM2-CC_rcp85_r1i1p1_208012-209912.nc name: tas_Amon_HadGEM2-CC_rcp85_r1i1p1_208012-209912.nc @@ -156,6 +161,7 @@ Amon_r1i1p1_rcp85_HadGEM2-CC_CMIP5_tas.json: project: CMIP5 modeling_realm: atmos short_name: tas + timerange: "210001/210012" version: v20120531 local_file: cmip5/output1/MOHC/HadGEM2-CC/rcp85/mon/atmos/Amon/r1i1p1/v20120531/tas_Amon_HadGEM2-CC_rcp85_r1i1p1_210001-210012.nc name: tas_Amon_HadGEM2-CC_rcp85_r1i1p1_210001-210012.nc @@ -180,6 +186,7 @@ EUR-11_MOHC-HadGEM2-ES_r1i1p1_historical_CORDEX_RACMO22E_mon_tas.json: project: CORDEX 
rcm_version: v2 short_name: tas + timerange: "195001/195012" version: v20160620 local_file: cordex/output/EUR-11/KNMI/MOHC-HadGEM2-ES/historical/r1i1p1/RACMO22E/v2/mon/tas/v20160620/tas_EUR-11_MOHC-HadGEM2-ES_historical_r1i1p1_KNMI-RACMO22E_v2_mon_195001-195012.nc name: tas_EUR-11_MOHC-HadGEM2-ES_historical_r1i1p1_KNMI-RACMO22E_v2_mon_195001-195012.nc @@ -202,6 +209,7 @@ EUR-11_MOHC-HadGEM2-ES_r1i1p1_historical_CORDEX_RACMO22E_mon_tas.json: project: CORDEX rcm_version: v2 short_name: tas + timerange: "195101/196012" version: v20160620 local_file: cordex/output/EUR-11/KNMI/MOHC-HadGEM2-ES/historical/r1i1p1/RACMO22E/v2/mon/tas/v20160620/tas_EUR-11_MOHC-HadGEM2-ES_historical_r1i1p1_KNMI-RACMO22E_v2_mon_195101-196012.nc name: tas_EUR-11_MOHC-HadGEM2-ES_historical_r1i1p1_KNMI-RACMO22E_v2_mon_195101-196012.nc @@ -233,6 +241,7 @@ historical_gn_r4i1p1f1_CMIP6_CESM2_Amon_tas.json: mip: Amon project: CMIP6 short_name: tas + timerange: "185001/201412" version: v20190308 local_file: CMIP6/CMIP/NCAR/CESM2/historical/r4i1p1f1/Amon/tas/gn/v20190308/tas_Amon_CESM2_historical_r4i1p1f1_gn_185001-201412.nc name: tas_Amon_CESM2_historical_r4i1p1f1_gn_185001-201412.nc @@ -256,6 +265,7 @@ obs4MIPs_CERES-EBAF_mon_rsutcs.json: project: obs4MIPs modeling_realm: atmos short_name: rsutcs + timerange: "200003/201404" version: v20160610 local_file: obs4MIPs/CERES-EBAF/v20160610/rsutcs_CERES-EBAF_L3B_Ed2-8_200003-201404.nc name: rsutcs_CERES-EBAF_L3B_Ed2-8_200003-201404.nc @@ -273,6 +283,7 @@ obs4MIPs_GPCP-V2.3_pr.json: institute: NASA-GSFC project: obs4MIPs short_name: pr + timerange: "197901/201710" version: v20180519 local_file: obs4MIPs/GPCP-V2.3/v20180519/pr_GPCP-SG_L3_v2.3_197901-201710.nc name: pr_GPCP-SG_L3_v2.3_197901-201710.nc @@ -293,6 +304,7 @@ run1_historical_cccma_cgcm3_1_CMIP3_mon_tas.json: project: CMIP3 modeling_realm: atmos short_name: tas + timerange: "1850/2000" version: v1 local_file: cmip3/CCCma/cccma_cgcm3_1/historical/mon/atmos/run1/tas/v1/tas_a1_20c3m_1_cgcm3.1_t47_1850_2000.nc name: tas_a1_20c3m_1_cgcm3.1_t47_1850_2000.nc diff --git a/tests/integration/esgf/test_search_download.py b/tests/integration/esgf/test_search_download.py index 33680a42b3..685e55c937 100644 --- a/tests/integration/esgf/test_search_download.py +++ b/tests/integration/esgf/test_search_download.py @@ -183,6 +183,7 @@ def test_mock_search(variable, mocker): ] +@pytest.mark.online def test_real_search(): """Test a real search for a single file.""" variable = { diff --git a/tests/integration/preprocessor/_io/test_load.py b/tests/integration/preprocessor/_io/test_load.py index 59fbe09d78..1a9e747f4a 100644 --- a/tests/integration/preprocessor/_io/test_load.py +++ b/tests/integration/preprocessor/_io/test_load.py @@ -13,7 +13,7 @@ from iris.cube import Cube, CubeList from esmvalcore.exceptions import ESMValCoreLoadWarning -from esmvalcore.preprocessor._io import _get_attr_from_field_coord, load +from esmvalcore.preprocessor._io import load from tests import assert_array_equal @@ -141,15 +141,13 @@ def test_callback_fix_lat_units(tmp_path, sample_cube): assert str(sample_cube.coord("latitude").units) == "degrees_north" -def test_get_attr_from_field_coord_none(mocker): - """Test ``_get_attr_from_field_coord``.""" - attr = _get_attr_from_field_coord(mocker.sentinel.ncfield, None, "attr") - assert attr is None - - def test_fail_empty_cubes(mocker): """Test that ValueError is raised when cubes are empty.""" - mocker.patch("iris.load_raw", autospec=True, return_value=CubeList([])) + mocker.patch( + 
"esmvalcore.preprocessor._io.LocalFile.to_iris", + autospec=True, + return_value=CubeList([]), + ) msg = "myfilename does not contain any data" with pytest.raises(ValueError, match=msg): load("myfilename") diff --git a/tests/integration/preprocessor/_io/test_zarr.py b/tests/integration/preprocessor/_io/test_zarr.py index fc5684c967..7899a107a9 100644 --- a/tests/integration/preprocessor/_io/test_zarr.py +++ b/tests/integration/preprocessor/_io/test_zarr.py @@ -48,6 +48,7 @@ def test_load_zarr2_local(input_type): assert "latitude" in coord_names +@pytest.mark.online def test_load_zarr2_remote(): """Test loading a Zarr2 store from a https Object Store.""" zarr_path = ( @@ -88,6 +89,7 @@ def test_load_zarr2_remote(): assert "latitude" in coord_names +@pytest.mark.online def test_load_zarr3_remote(): """Test loading a Zarr3 store from a https Object Store.""" zarr_path = ( @@ -114,6 +116,7 @@ def test_load_zarr3_remote(): assert "latitude" in coord_names +@pytest.mark.online def test_load_zarr3_cmip6_metadata(): """ Test loading a Zarr3 store from a https Object Store. diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index 8c6c7009ce..ab00d9a9f4 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -14,6 +14,7 @@ from esmvalcore._recipe import check from esmvalcore.dataset import Dataset from esmvalcore.exceptions import RecipeError +from esmvalcore.local import LocalFile from esmvalcore.preprocessor import PreprocessorFile @@ -142,7 +143,12 @@ def test_ncl_version_broken(mocker): def test_data_availability_data(mock_logger, input_files, var, error): """Test check for data when data is present.""" dataset = Dataset(**var) - dataset.files = [Path(f) for f in input_files] + files = [] + for filename in input_files: + file = LocalFile(filename) + file.facets["timerange"] = filename.split("_")[-1].replace("-", "/") + files.append(file) + dataset.files = files if error is None: check.data_availability(dataset) mock_logger.error.assert_not_called() @@ -324,9 +330,9 @@ def test_data_availability_nonexistent(tmp_path): def test_reference_for_bias_preproc_empty(): """Test ``reference_for_bias_preproc``.""" products = { - PreprocessorFile(filename=10), - PreprocessorFile(filename=20), - PreprocessorFile(filename=30), + PreprocessorFile(filename=Path("10")), + PreprocessorFile(filename=Path("20")), + PreprocessorFile(filename=Path("30")), } check.reference_for_bias_preproc(products) @@ -334,11 +340,11 @@ def test_reference_for_bias_preproc_empty(): def test_reference_for_bias_preproc_one_ref(): """Test ``reference_for_bias_preproc`` with one reference.""" products = { - PreprocessorFile(filename=90), - PreprocessorFile(filename=10, settings={"bias": {}}), - PreprocessorFile(filename=20, settings={"bias": {}}), + PreprocessorFile(filename=Path("90")), + PreprocessorFile(filename=Path("10"), settings={"bias": {}}), + PreprocessorFile(filename=Path("20"), settings={"bias": {}}), PreprocessorFile( - filename=30, + filename=Path("30"), settings={"bias": {}}, attributes={"reference_for_bias": True}, ), @@ -349,10 +355,10 @@ def test_reference_for_bias_preproc_one_ref(): def test_reference_for_bias_preproc_no_ref(): """Test ``reference_for_bias_preproc`` with no reference.""" products = { - PreprocessorFile(filename=90), - PreprocessorFile(filename=10, settings={"bias": {}}), - PreprocessorFile(filename=20, settings={"bias": {}}), - PreprocessorFile(filename=30, settings={"bias": {}}), + 
PreprocessorFile(filename=Path("90")), + PreprocessorFile(filename=Path("10"), settings={"bias": {}}), + PreprocessorFile(filename=Path("20"), settings={"bias": {}}), + PreprocessorFile(filename=Path("30"), settings={"bias": {}}), } with pytest.raises(RecipeError) as rec_err: check.reference_for_bias_preproc(products) @@ -376,15 +382,15 @@ def test_reference_for_bias_preproc_no_ref(): def test_reference_for_bias_preproc_two_refs(): """Test ``reference_for_bias_preproc`` with two references.""" products = { - PreprocessorFile(filename=90), - PreprocessorFile(filename=10, settings={"bias": {}}), + PreprocessorFile(filename=Path("90")), + PreprocessorFile(filename=Path("10"), settings={"bias": {}}), PreprocessorFile( - filename=20, + filename=Path("20"), attributes={"reference_for_bias": True}, settings={"bias": {}}, ), PreprocessorFile( - filename=30, + filename=Path("30"), attributes={"reference_for_bias": True}, settings={"bias": {}}, ), diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index adeb030ea3..21a9c37bcd 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1,10 +1,13 @@ +import importlib.resources import inspect import os import re from collections import defaultdict +from functools import lru_cache from pathlib import Path from pprint import pformat from textwrap import dedent +from typing import TYPE_CHECKING from unittest.mock import create_autospec import iris @@ -30,6 +33,48 @@ from esmvalcore.preprocessor import DEFAULT_ORDER, PreprocessingTask from tests.integration.test_provenance import check_provenance +if TYPE_CHECKING: + from esmvalcore.typing import Facets + + +@lru_cache +def _load_data_sources( + filename, +) -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load data source configurations.""" + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / filename, + ) as config_file: + return yaml.safe_load(config_file.read_text(encoding="utf-8")) + + +def update_data_sources( + session: Session, + filename: str, + rootpath: Path, +) -> None: + """Update the data sources in `session` using config file `filename`.""" + cfg = _load_data_sources(filename) + projects = cfg["projects"] + for project in projects: + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(rootpath) + session["projects"][project]["data"] = data_sources + + +@pytest.fixture +def session(tmp_path: Path, session: Session) -> Session: + """Session fixture with default data sources.""" + update_data_sources(session, "local-data.yml", tmp_path) + return session + + TAGS_FOR_TESTING = { "authors": { "andela_bouwe": { @@ -692,7 +737,7 @@ def test_default_fx_preprocessor(tmp_path, patched_datafinder, session): "remove_supplementary_variables": {}, "save": { "compress": False, - "filename": product.filename, + "filename": Path(product.filename), "compute": False, }, } @@ -1539,7 +1584,7 @@ def test_diagnostic_task_provenance( # Test that provenance was saved to xml and info embedded in netcdf product = next( iter( - p for p in diagnostic_task.products if p.filename.endswith(".nc") + p for p in diagnostic_task.products if p.filename.suffix == ".nc" ), ) cube = iris.load_cube(product.filename) @@ -2464,10 +2509,15 @@ def test_recipe_run(tmp_path, patched_datafinder, session, mocker): session["search_esgf"] = "when_missing" mocker.patch.object( - esmvalcore._recipe.recipe.esgf, 
+ esmvalcore.esgf, "download", create_autospec=True, ) + mocker.patch.object( + esmvalcore.local.LocalFile, + "prepare", + create_autospec=True, + ) recipe = get_recipe(tmp_path, content, session) @@ -2476,10 +2526,8 @@ def test_recipe_run(tmp_path, patched_datafinder, session, mocker): recipe.write_html_summary = mocker.Mock() recipe.run() - esmvalcore._recipe.recipe.esgf.download.assert_called_once_with( - set(), - session["download_dir"], - ) + esmvalcore.esgf.download.assert_called() + esmvalcore.local.LocalFile.prepare.assert_called() recipe.tasks.run.assert_called_once_with( max_parallel_tasks=session["max_parallel_tasks"], ) @@ -2487,8 +2535,14 @@ def test_recipe_run(tmp_path, patched_datafinder, session, mocker): recipe.write_html_summary.assert_called_once() -def test_representative_dataset_regular_var(patched_datafinder, session): +def test_representative_dataset_regular_var( + tmp_path: Path, + patched_datafinder: None, + session: Session, +): """Test ``_representative_dataset`` with regular variable.""" + update_data_sources(session, "icon-data.yml", tmp_path) + variable = { "dataset": "ICON", "exp": "atm_amip-rad_R2B4_r1i1p1f1", @@ -2505,18 +2559,20 @@ def test_representative_dataset_regular_var(patched_datafinder, session): datasets = _representative_datasets(dataset) assert len(datasets) == 1 filename = datasets[0].files[0] - path = Path(filename) - assert path.name == "atm_amip-rad_R2B4_r1i1p1f1_atm_2d_ml_1990_1999.nc" + assert filename.name == "atm_amip-rad_R2B4_r1i1p1f1_atm_2d_ml_1990-1999.nc" @pytest.mark.parametrize("force_derivation", [True, False]) def test_representative_dataset_derived_var( - patched_datafinder, - session, - force_derivation, + tmp_path: Path, + patched_datafinder: None, + session: Session, + force_derivation: bool, ): """Test ``_representative_dataset`` with derived variable.""" - variable = { + update_data_sources(session, "icon-data.yml", tmp_path) + + variable: Facets = { "dataset": "ICON", "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", @@ -2533,7 +2589,7 @@ def test_representative_dataset_derived_var( dataset.session = session representative_datasets = _representative_datasets(dataset) - expected_facets = { + expected_facets: Facets = { # Already present in variable "dataset": "ICON", "derive": True, diff --git a/tests/unit/config/test_config.py b/tests/unit/config/test_config.py index 7795189b3d..61e1597725 100644 --- a/tests/unit/config/test_config.py +++ b/tests/unit/config/test_config.py @@ -138,6 +138,14 @@ def test_load_default_config(cfg_default, monkeypatch): paths=[str(p) for p in config_dir.glob("extra_facets_*.yml")], env={}, )["projects"] + # Add in projects without extra facets from the config developer file + # until we have transitioned all of its content to the new configuration + # system. 
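+    # Each such project gets an empty entry so that it shows up in the expected "projects" value below.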
+ for project in yaml.safe_load( + default_dev_file.read_text(encoding="utf-8"), + ): + if project not in default_project_settings: + default_project_settings[project] = {} session = cfg_default.start_session("recipe_example") @@ -164,13 +172,6 @@ def test_load_default_config(cfg_default, monkeypatch): }, "diagnostics": None, "download_dir": Path.home() / "climate_data", - "drs": { - "CMIP3": "ESGF", - "CMIP5": "ESGF", - "CMIP6": "ESGF", - "CORDEX": "ESGF", - "obs4MIPs": "ESGF", - }, "exit_on_warning": False, "log_level": "info", "logging": {"log_progress_interval": 0.0}, @@ -183,7 +184,6 @@ def test_load_default_config(cfg_default, monkeypatch): "projects": default_project_settings, "remove_preproc_dir": True, "resume_from": [], - "rootpath": {"default": [Path.home() / "climate_data"]}, "run_diagnostic": True, "search_esgf": "never", "skip_nonexistent": False, diff --git a/tests/unit/config/test_config_object.py b/tests/unit/config/test_config_object.py index e51fa40707..0b0ceae8ae 100644 --- a/tests/unit/config/test_config_object.py +++ b/tests/unit/config/test_config_object.py @@ -525,7 +525,9 @@ def test_load_from_dirs(dirs, output_file_type, rootpath, tmp_path): cfg.load_from_dirs(config_dirs) assert cfg["output_file_type"] == output_file_type - assert cfg["rootpath"] == rootpath + if any(Path(d).exists() for d in config_dirs): + # Legacy setting "rootpath" is not available in default config. + assert cfg["rootpath"] == rootpath assert cfg["search_esgf"] == "never" diff --git a/tests/unit/config/test_data_sources.py b/tests/unit/config/test_data_sources.py new file mode 100644 index 0000000000..50b0e6fd5c --- /dev/null +++ b/tests/unit/config/test_data_sources.py @@ -0,0 +1,18 @@ +import pytest + +import esmvalcore.config._data_sources +from esmvalcore.config import Session + + +def test_load_data_sources_no_project_data_sources_configured( + session: Session, +) -> None: + """Test that loading data sources when no data sources are configured raises.""" + with pytest.raises( + ValueError, + match=r"No data sources found for project 'test'.*", + ): + esmvalcore.config._data_sources._get_data_sources( + session, + project="test", + ) diff --git a/tests/unit/esgf/test_download.py b/tests/unit/esgf/test_download.py index 85b5cbae3e..9b0a2df2e0 100644 --- a/tests/unit/esgf/test_download.py +++ b/tests/unit/esgf/test_download.py @@ -11,6 +11,7 @@ import requests import yaml from pyesgf.search.results import FileResult +from pytest_mock import MockerFixture import esmvalcore.esgf from esmvalcore.esgf import _download @@ -241,6 +242,7 @@ def test_init(): "dataset": "ABC", "project": "CMIP6", "short_name": "tas", + "timerange": "2000/2001", "version": "v1", } txt = f"ESGFFile:CMIP6/ABC/v1/{filename} on hosts ['something.org']" @@ -248,6 +250,62 @@ def test_init(): assert hash(file) == hash(("CMIP6.ABC.v1", filename)) +@pytest.fixture +def esgf_file() -> _download.ESGFFile: + """ESGFFile fixture.""" + json = { + "dataset_id": "CMIP6.dataset.v1|something.org", + "dataset_id_template_": ["%(mip_era)s.%(source_id)s"], + "project": ["CMIP6"], + "size": 12, + "title": "test.nc", + } + return _download.ESGFFile( + [FileResult(json=json, context=None)], + dest_folder=Path("/path/to/climate_data"), + ) + + +def test_prepare(mocker: MockerFixture, esgf_file: _download.ESGFFile) -> None: + """Test `ESGFFile.prepare`.""" + download = mocker.patch.object(_download.ESGFFile, "download") + esgf_file.prepare() + download.assert_called_once_with(esgf_file.dest_folder) + + +def test_attribute_not_set(esgf_file: 
_download.ESGFFile) -> None:
+    """Test accessing `ESGFFile.attributes` before calling to_iris."""
+    with pytest.raises(
+        ValueError,
+        match=r"Attributes have not been read yet. Call the `to_iris` method .*",
+    ):
+        _ = esgf_file.attributes
+
+
+def test_to_iris(mocker: MockerFixture, esgf_file: _download.ESGFFile) -> None:
+    """Test `ESGFFile.to_iris`."""
+    prepare = mocker.patch.object(_download.ESGFFile, "prepare")
+    local_file_to_iris = mocker.patch.object(
+        esmvalcore.esgf._download.LocalFile,
+        "to_iris",
+        return_value=mocker.sentinel.iris_cubes,
+    )
+    mocker.patch.object(
+        esmvalcore.esgf._download.LocalFile,
+        "attributes",
+        new_callable=mocker.PropertyMock,
+        return_value={"attribute": "value"},
+    )
+    cubes = esgf_file.to_iris(mocker.sentinel.ignore_warnings)
+
+    assert cubes == mocker.sentinel.iris_cubes
+    assert esgf_file.attributes == {"attribute": "value"}
+    prepare.assert_called_once()
+    local_file_to_iris.assert_called_once_with(
+        ignore_warnings=mocker.sentinel.ignore_warnings,
+    )
+
+
 def test_from_results():
     """Test ESGFFile._from_results()."""
     facets = {
@@ -478,7 +536,7 @@ def test_single_download(mocker, tmp_path, checksum):
     response.iter_content.assert_called_with(chunk_size=2**20)
 
 
-def test_download_skip_existing(tmp_path, caplog):
+def test_download_skip_existing(tmp_path: Path, mocker: MockerFixture) -> None:
     filename = "test.nc"
     dataset = "dataset"
     dest_folder = tmp_path
@@ -496,12 +554,9 @@ def test_download_skip_existing(tmp_path, caplog):
     local_file = file.local_file(dest_folder)
     local_file.parent.mkdir(parents=True)
     local_file.touch()
-
-    caplog.set_level(logging.DEBUG)
-
+    mock_download = mocker.patch.object(_download.ESGFFile, "_download")
    local_file = file.download(dest_folder)
-
-    assert f"Skipping download of existing file {local_file}" in caplog.text
+    mock_download.assert_not_called()
 
 
 def test_single_download_fail(mocker, tmp_path):
@@ -632,10 +687,8 @@ def test_download_fail(mocker, tmp_path, caplog):
     file.download.assert_called_with(dest_folder)
 
 
-def test_download_noop(caplog):
+def test_download_noop(mocker: MockerFixture) -> None:
     """Test downloading no files."""
-    caplog.set_level("DEBUG")
+    mock_download = mocker.patch.object(_download.ESGFFile, "_download")
     esmvalcore.esgf.download([], dest_folder="/does/not/exist")
-
-    msg = "All required data is available locally, not downloading anything."
- assert msg in caplog.text + mock_download.assert_not_called() diff --git a/tests/unit/esgf/test_search.py b/tests/unit/esgf/test_search.py index 11b582fffb..5949cc5792 100644 --- a/tests/unit/esgf/test_search.py +++ b/tests/unit/esgf/test_search.py @@ -2,13 +2,16 @@ import copy import textwrap +from pathlib import Path import pyesgf.search import pytest import requests.exceptions from pyesgf.search.results import FileResult +from pytest_mock import MockerFixture -from esmvalcore.esgf import ESGFFile, _search, find_files +import esmvalcore.io.protocol +from esmvalcore.esgf import ESGFDataSource, ESGFFile, _search, find_files OUR_FACETS = ( { @@ -433,3 +436,39 @@ def test_search_unknown_project(): ) with pytest.raises(ValueError, match=msg): find_files(project=project, dataset="", short_name="") + + +class TestESGFDataSource: + """Test `esmvalcore.esgf.ESGFDataSource`.""" + + def test_init(self) -> None: + """Test initialization.""" + data_source = ESGFDataSource( + name="esgf-cmip6", + project="CMIP6", + priority=1, + download_dir=Path("/path/to/climate_data"), + ) + assert isinstance(data_source, esmvalcore.io.protocol.DataSource) + + def test_find_data(self, mocker: MockerFixture) -> None: + """Test find_data method.""" + data_source = ESGFDataSource( + name="esgf-cmip6", + project="CMIP6", + priority=1, + download_dir=Path("/path/to/climate_data"), + ) + + mock_result = [mocker.create_autospec(ESGFFile, instance=True)] + mock_find_files = mocker.patch( + "esmvalcore.esgf._search.find_files", + return_value=mock_result, + ) + + facets = {"short_name": "tas", "dataset": "A", "project": "CMIP6"} + result = data_source.find_data(**facets) + + mock_find_files.assert_called_once_with(**facets) + assert result is mock_result + assert result[0].dest_folder == Path("/path/to/climate_data") diff --git a/tests/unit/io/__init__.py b/tests/unit/io/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/io/test_intake_esgf.py b/tests/unit/io/test_intake_esgf.py new file mode 100644 index 0000000000..34effd97f1 --- /dev/null +++ b/tests/unit/io/test_intake_esgf.py @@ -0,0 +1,315 @@ +"""Unit tests for esmvalcore.io.intake_esgf.""" + +import importlib.resources + +import intake_esgf +import iris.cube +import pandas as pd +import pytest +import xarray as xr +import yaml +from pytest import MonkeyPatch +from pytest_mock import MockerFixture + +import esmvalcore.io.intake_esgf +from esmvalcore.config import Session +from esmvalcore.io.intake_esgf import IntakeESGFDataset, IntakeESGFDataSource + + +def test_prepare(mocker: MockerFixture) -> None: + """IntakeESGFDataset.prepare should call the catalog.to_path_dict method.""" + cat = intake_esgf.ESGFCatalog() + to_path_mock = mocker.patch.object(cat, "to_path_dict", autospec=True) + dataset = IntakeESGFDataset(name="id", facets={}, catalog=cat) + + dataset.prepare() + to_path_mock.assert_called_once_with() + + +def test_attributes_raises_before_to_iris() -> None: + """Accessing attributes before to_iris should raise ValueError.""" + cat = intake_esgf.ESGFCatalog() + dataset = IntakeESGFDataset(name="id", facets={}, catalog=cat) + with pytest.raises(ValueError, match="Attributes have not been read yet"): + _ = dataset.attributes + + +def test_to_iris(mocker: MockerFixture) -> None: + """`to_iris` should load the data and cache attributes.""" + cat = intake_esgf.ESGFCatalog() + key = "my.dataset.1" + mocker.patch.object( + cat, + "to_path_dict", + return_value={key: ["/path/to/file.nc"]}, + ) + ds = xr.Dataset(attrs={"attr": 
"value"}) + mocker.patch.object(cat, "to_dataset_dict", return_value={key: ds}) + + cubes = mocker.sentinel.cubes + mocker.patch.object( + esmvalcore.io.intake_esgf, + "dataset_to_iris", + return_value=cubes, + ) + + dataset = IntakeESGFDataset(name=key, facets={}, catalog=cat) + result = dataset.to_iris(ignore_warnings=[{"message": "ignore"}]) + assert result is cubes + + assert dataset.attributes == { + "attr": "value", + "source_file": "/path/to/file.nc", + } + + +@pytest.mark.online +def test_to_iris_online(): + """`to_iris` should load data from a real ESGF catalog.""" + data_source = IntakeESGFDataSource( + name="src", + project="CMIP6", + priority=1, + facets={ + "activity": "activity_drs", + "dataset": "source_id", + "ensemble": "member_id", + "exp": "experiment_id", + "grid": "grid_label", + "institute": "institution_id", + "mip": "table_id", + "project": "project", + "short_name": "variable_id", + }, + values={}, + ) + results = data_source.find_data( + dataset="CanESM5", + ensemble="r1i1p1f1", + exp="historical", + grid="gn", + mip="fx", + project="CMIP6", + short_name="areacella", + ) + assert len(results) == 1 + dataset = results[0] + assert isinstance(dataset, IntakeESGFDataset) + cubes = dataset.to_iris() + assert len(cubes) == 1 + assert isinstance(cubes[0], iris.cube.Cube) + # Check that the "source_file" attributes is present for debugging. + assert "source_file" in dataset.attributes + assert dataset.attributes["source_file"].endswith(".nc") + + +def test_find_data_no_results_sets_debug_info(mocker: MockerFixture) -> None: + """When catalog.search raises NoSearchResults, find_data should return empty list and set debug_info.""" + data_source = IntakeESGFDataSource( + name="src", + project="CMIP6", + priority=1, + facets={"short_name": "variable_id"}, + ) + + cat = intake_esgf.ESGFCatalog() + # Ensure last_search is present so debug_info can be constructed + cat.last_search = {"variable_id": "tas"} + mocker.patch.object( + cat, + "search", + side_effect=intake_esgf.exceptions.NoSearchResults("no results"), + ) + data_source.catalog = cat + + result = data_source.find_data(short_name="tas") + assert result == [] + expected_debug_info = "intake_esgf.ESGFCatalog.search(variable_id=['tas']) did not return any results." 
+    assert data_source.debug_info == expected_debug_info
+
+
+def test_find_data(mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
+    """find_data should convert catalog.df rows into IntakeESGFDataset instances."""
+    cat = intake_esgf.ESGFCatalog()
+    cat.project = intake_esgf.projects.projects["cmip6"]
+    cat.df = pd.DataFrame.from_dict(
+        {
+            "project": ["CMIP6", "CMIP6"],
+            "mip_era": ["CMIP6", "CMIP6"],
+            "activity_drs": ["CMIP", "ScenarioMIP"],
+            "institution_id": ["CCCma", "CCCma"],
+            "source_id": ["CanESM5", "CanESM5"],
+            "experiment_id": ["historical", "ssp585"],
+            "member_id": ["r1i1p1f1", "r1i1p1f1"],
+            "table_id": ["Amon", "Amon"],
+            "variable_id": ["tas", "tas"],
+            "grid_label": ["gn", "gn"],
+            "version": ["20190429", "20190429"],
+            "id": [
+                [
+                    "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn.v20190429|crd-esgf-drc.ec.gc.ca",
+                    "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn.v20190429|eagle.alcf.anl.gov",
+                    "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn.v20190429|esgf-data04.diasjp.net",
+                    "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn.v20190429|esgf-node.ornl.gov",
+                ],
+                [
+                    "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn.v20190429|crd-esgf-drc.ec.gc.ca",
+                    "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn.v20190429|eagle.alcf.anl.gov",
+                    "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn.v20190429|esgf-data04.diasjp.net",
+                    "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn.v20190429|esgf-node.ornl.gov",
+                ],
+            ],
+        },
+    )
+
+    # Patch search to just record last_search.
+    def fake_search(**kwargs):
+        cat.last_search = kwargs
+
+    mocker.patch.object(cat, "search", side_effect=fake_search)
+
+    data_source = IntakeESGFDataSource(
+        name="src",
+        project="CMIP6",
+        priority=1,
+        facets={
+            "activity": "activity_drs",
+            "dataset": "source_id",
+            "ensemble": "member_id",
+            "exp": "experiment_id",
+            "institute": "institution_id",
+            "grid": "grid_label",
+            "mip": "table_id",
+            "project": "project",
+            "short_name": "variable_id",
+        },
+        values={},
+    )
+    data_source.catalog = cat
+
+    # Call find_data - it should use the df we set and return two datasets.
+    results = data_source.find_data(short_name="tas")
+    assert isinstance(results, list)
+    assert len(results) == 2
+
+    dataset = results[0]
+    assert isinstance(dataset, IntakeESGFDataset)
+    assert (
+        dataset.name
+        == "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn"
+    )
+
+    assert dataset.facets == {
+        "activity": "CMIP",
+        "dataset": "CanESM5",
+        "ensemble": "r1i1p1f1",
+        "exp": "historical",
+        "grid": "gn",
+        "institute": "CCCma",
+        "mip": "Amon",
+        "project": "CMIP6",
+        "short_name": "tas",
+    }
+    dataset = results[1]
+    assert (
+        dataset.name
+        == "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn"
+    )
+    assert dataset.facets == {
+        "activity": "ScenarioMIP",
+        "dataset": "CanESM5",
+        "ensemble": "r1i1p1f1",
+        "exp": "ssp585",
+        "grid": "gn",
+        "institute": "CCCma",
+        "mip": "Amon",
+        "project": "CMIP6",
+        "short_name": "tas",
+    }
+
+
+@pytest.fixture
+def data_sources(session: Session) -> list[esmvalcore.io.protocol.DataSource]:
+    """Fixture providing the default list of IntakeESGFDataSource data sources."""
+    with importlib.resources.as_file(
+        importlib.resources.files(esmvalcore.config)
+        / "configurations"
+        / "intake-esgf-data.yml",
+    ) as config_file:
+        cfg = yaml.safe_load(config_file.read_text(encoding="utf-8"))
+    session["projects"] = cfg["projects"]
+    return esmvalcore.io.load_data_sources(session)
+
+
+@pytest.mark.online
+@pytest.mark.parametrize( + ("facets", "expected_names"), + [ + ( + { + "dataset": "CanESM5", + "ensemble": "r1i1p1f1", + "exp": ["historical", "ssp585"], + "grid": "gn", + "mip": "Amon", + "project": "CMIP6", + "short_name": "tas", + "timerange": "1850/2100", + }, + { + "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn", + "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn", + }, + ), + ( + { + "dataset": "ACCESS1-0", + "ensemble": "r1i1p1", + "exp": ["historical", "rcp85"], + "mip": "Amon", + "project": "CMIP5", + "short_name": "tas", + }, + { + "CSIRO-BOM.ACCESS1.0.historical.mon.atmos.Amon.r1i1p1.tas", + "CSIRO-BOM.ACCESS1.0.rcp85.mon.atmos.Amon.r1i1p1.tas", + }, + ), + ( + { + "dataset": "cccma_cgcm3_1", + "ensemble": "run1", + "exp": "historical", + "mip": "A1", + "project": "CMIP3", + "short_name": "tas", + }, + { + "CMIP3.CCCMA.cccma_cgcm3_1.historical.day.atmos.run1.tas", + "CMIP3.CCCMA.cccma_cgcm3_1.historical.mon.atmos.run1.tas", + }, + ), + ( + { + "dataset": "ERA-5", + "project": "obs4MIPs", + "short_name": "tas", + }, + { + "obs4MIPs.ECMWF.ERA-5.mon.tas.gn", + }, + ), + ], +) +def test_find_data_online( + data_sources: list[IntakeESGFDataSource], + facets: dict[str, str | list[str]], + expected_names: list[str], +) -> None: + """Test finding data from a real ESGF catalog.""" + data_source = next( + ds for ds in data_sources if ds.project == facets["project"] + ) + result = data_source.find_data(**facets) + result_names = {ds.name for ds in result} + assert result_names == expected_names diff --git a/tests/unit/io/test_load_data_sources.py b/tests/unit/io/test_load_data_sources.py new file mode 100644 index 0000000000..de1f7bad23 --- /dev/null +++ b/tests/unit/io/test_load_data_sources.py @@ -0,0 +1,82 @@ +"""Tests for :func:`esmvalcore.io.load_data_sources`.""" + +import importlib.resources +from dataclasses import dataclass + +import pytest + +import esmvalcore.config +import esmvalcore.io + + +def test_configurations_valid(cfg_default: esmvalcore.config.Config) -> None: + """Test that the data sources configuration in esmvalcore/config/configurations are valid.""" + configurations = ( + importlib.resources.files(esmvalcore.config) / "configurations" + ) + with importlib.resources.as_file(configurations) as config_dir: + cfg_default.load_from_dirs([config_dir]) + session = cfg_default.start_session("test") + data_sources = esmvalcore.io.load_data_sources(session) + for data_source in data_sources: + assert isinstance(data_source, esmvalcore.io.DataSource) + + +def test_load_data_sources_unknown_project( + session: esmvalcore.config.Session, +) -> None: + """Test that loading data sources for an unknown project raises.""" + with pytest.raises(ValueError, match=r"Unknown project 'unknown'.*"): + esmvalcore.io.load_data_sources(session, project="unknown") + + +def test_load_data_sources_no_data_sources_configured( + session: esmvalcore.config.Session, +) -> None: + """Test that loading data sources when no data sources are configured raises.""" + session["projects"].clear() + with pytest.raises( + ValueError, + match=r"No data sources found. 
Check your configuration under 'projects'",
+    ):
+        esmvalcore.io.load_data_sources(session)
+
+
+def test_load_data_sources_no_project_data_sources_configured(
+    session: esmvalcore.config.Session,
+) -> None:
+    """Test that loading data sources raises when no data sources are configured for a project."""
+    session["projects"]["test"] = {}
+    with pytest.raises(
+        ValueError,
+        match=r"No data sources found for project 'test'.*",
+    ):
+        esmvalcore.io.load_data_sources(session, project="test")
+
+
+@dataclass
+class IncompleteDataSource:
+    """An incomplete data source class for testing."""
+
+    name: str
+    project: str
+    priority: int
+    # Note the missing implementation of DataSource methods.
+
+
+def test_load_data_sources_invalid_data_source_type(
+    session: esmvalcore.config.Session,
+) -> None:
+    """Test that loading data sources with an invalid data source type raises."""
+    session["projects"]["test"] = {
+        "data": {
+            "invalid_source": {
+                "type": "tests.unit.io.test_load_data_sources.IncompleteDataSource",
+            },
+        },
+    }
+    with pytest.raises(
+        TypeError,
+        match=r"Expected a data source of type `esmvalcore.io.protocol.DataSource`.*",
+    ):
+        esmvalcore.io.load_data_sources(session, project="test")
diff --git a/tests/unit/local/test_facets.py b/tests/unit/local/test_facets.py
index 1373b961c6..8aa5123f5f 100644
--- a/tests/unit/local/test_facets.py
+++ b/tests/unit/local/test_facets.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from esmvalcore.local import DataSource, LocalFile
+from esmvalcore.local import LocalDataSource, LocalFile
 
 
 @pytest.mark.parametrize(
@@ -25,6 +25,28 @@
             "facet2": "filename",
         },
     ),
+    (
+        "/climate_data/value1/filename_2000-2001.nc",
+        "/climate_data",
+        "{facet1}",
+        "{facet2}[_.]*nc",
+        {
+            "facet1": "value1",
+            "facet2": "filename",
+            "timerange": "2000/2001",
+        },
+    ),
+    (
+        "/climate_data/value1/filename_20001201-20011231.nc",
+        "/climate_data",
+        "{facet1}",
+        "{facet2}[_.]*nc",
+        {
+            "facet1": "value1",
+            "facet2": "filename",
+            "timerange": "20001201/20011231",
+        },
+    ),
     (
         "/climate_data/value1/xyz/filename.nc",
         "/climate_data",
@@ -125,6 +147,7 @@
         {
             "tier": "3",
             "dataset": "ds",
+            "timerange": "1993/1993",
         },
     ),
     (
@@ -136,6 +159,7 @@
             "tier": "3",
             "dataset": "ds",
             "short_name": "tas",
+            "timerange": "1993/1993",
         },
     ),
     (
@@ -145,6 +169,7 @@
         "{short_name}_*",
         {
             "short_name": "tas",
+            "timerange": "1993/1993",
         },
     ),
     (
@@ -165,6 +190,7 @@
         {
             "short_name": "tas",
             "dataset": "ds",
+            "timerange": "1993/1993",
         },
     ),
     (
@@ -258,14 +284,42 @@ def test_path2facets(
     filename_template,
     facets,
 ):
-    """Test `DataSource.path2facets."""
+    """Test `LocalDataSource.path2facets`."""
     path = Path(path)
     rootpath = Path(rootpath)
-    data_source = DataSource(rootpath, dirname_template, filename_template)
-    result = data_source.path2facets(path)
+    data_source = LocalDataSource(
+        name="test-source",
+        project="test-project",
+        priority=1,
+        rootpath=rootpath,
+        dirname_template=dirname_template,
+        filename_template=filename_template,
+    )
+    add_timerange = "timerange" in facets
+    result = data_source.path2facets(path, add_timerange=add_timerange)
     assert result == facets
 
 
+def test_path2facets_no_timerange():
+    """Test that `LocalDataSource.path2facets` does not add "timerange"
+    if it cannot determine the timerange.
+    """
+ path = Path("/climate_data/value1/filename.nc") + rootpath = Path("/climate_data") + data_source = LocalDataSource( + name="test-source", + project="test-project", + priority=1, + rootpath=rootpath, + dirname_template="{facet1}", + filename_template="{facet2}[_.]*nc", + ) + result = data_source.path2facets(path, add_timerange=True) + assert result == { + "facet1": "value1", + "facet2": "filename", + } + + def test_localfile(): file = LocalFile("/a/b.nc") file.facets = {"a": "A"} diff --git a/tests/unit/local/test_get_data_sources.py b/tests/unit/local/test_get_data_sources.py index cef6d49891..6494f4be66 100644 --- a/tests/unit/local/test_get_data_sources.py +++ b/tests/unit/local/test_get_data_sources.py @@ -4,7 +4,7 @@ from esmvalcore.config import CFG from esmvalcore.config._config_validators import validate_config_developer -from esmvalcore.local import DataSource, _get_data_sources +from esmvalcore.local import LocalDataSource, _get_data_sources @pytest.mark.parametrize( @@ -33,7 +33,7 @@ def test_get_data_sources(monkeypatch, rootpath_drs): monkeypatch.setitem(CFG, "drs", drs) sources = _get_data_sources("CMIP6") source = sources[0] - assert isinstance(source, DataSource) + assert isinstance(source, LocalDataSource) assert source.rootpath == Path("/climate_data") assert "{project}" in source.dirname_template assert "{short_name}" in source.filename_template diff --git a/tests/unit/local/test_time.py b/tests/unit/local/test_time.py index 30d5d1ea97..5548dfe254 100644 --- a/tests/unit/local/test_time.py +++ b/tests/unit/local/test_time.py @@ -12,7 +12,6 @@ LocalFile, _dates_to_timerange, _get_start_end_date, - _get_start_end_year, _replace_years_with_timerange, _truncate_dates, ) @@ -33,104 +32,46 @@ def _get_esgf_file(path): return ESGFFile([result]) -FILENAME_CASES = [ - ["var_whatever_1980-1981", 1980, 1981], - ["var_whatever_1980.nc", 1980, 1980], - ["a.b.x_yz_185001-200512.nc", 1850, 2005], - ["var_whatever_19800101-19811231.nc1", 1980, 1981], - ["var_whatever_19800101.nc", 1980, 1980], - ["1980-1981_var_whatever.nc", 1980, 1981], - ["1980_var_whatever.nc", 1980, 1980], - ["var_control-1980_whatever.nc", 1980, 1980], - ["19800101-19811231_var_whatever.nc", 1980, 1981], - ["19800101_var_whatever.nc", 1980, 1980], - ["var_control-19800101_whatever.nc", 1980, 1980], - ["19800101_var_control-1950_whatever.nc", 1980, 1980], - ["var_control-1950_whatever_19800101.nc", 1980, 1980], - ["CM61-LR-hist-03.1950_18500101_19491231_1M_concbc.nc", 1850, 1949], - [ - "icon-2.6.1_atm_amip_R2B5_r1v1i1p1l1f1_phy_3d_ml_20150101T000000Z.nc", - 2015, - 2015, - ], - ["pr_A1.186101-200012.nc", 1861, 2000], - ["tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc", 1990, 1999], - ["E5sf00_1M_1940_032.grb", 1940, 1940], - ["E5sf00_1D_1998-04_167.grb", 1998, 1998], - ["E5sf00_1H_1986-04-11_167.grb", 1986, 1986], - ["E5sf00_1M_1940-1941_032.grb", 1940, 1941], - ["E5sf00_1D_1998-01_1999-12_167.grb", 1998, 1999], - ["E5sf00_1H_2000-01-01_2001-12-31_167.grb", 2000, 2001], -] - -FILENAME_DATE_CASES = [ - ["var_whatever_1980-1981", "1980", "1981"], - ["var_whatever_1980.nc", "1980", "1980"], - ["a.b.x_yz_185001-200512.nc", "185001", "200512"], - ["var_whatever_19800101-19811231.nc1", "19800101", "19811231"], - ["var_whatever_19800101.nc", "19800101", "19800101"], - ["1980-1981_var_whatever.nc", "1980", "1981"], - ["1980_var_whatever.nc", "1980", "1980"], - ["var_control-1980_whatever.nc", "1980", "1980"], - ["19800101-19811231_var_whatever.nc", "19800101", "19811231"], - ["19800101_var_whatever.nc", "19800101", 
"19800101"], - ["var_control-19800101_whatever.nc", "19800101", "19800101"], - ["19800101_var_control-1950_whatever.nc", "19800101", "19800101"], - ["var_control-1950_whatever_19800101.nc", "19800101", "19800101"], - [ - "CM61-LR-hist-03.1950_18500101_19491231_1M_concbc.nc", - "18500101", - "19491231", - ], +@pytest.mark.parametrize( + "case", [ - "icon-2.6.1_atm_amip_R2B5_r1v1i1p1l1f1_phy_3d_ml_20150101T000000Z.nc", - "20150101T000000Z", - "20150101T000000Z", + ["var_whatever_1980-1981", "1980", "1981"], + ["var_whatever_1980.nc", "1980", "1980"], + ["a.b.x_yz_185001-200512.nc", "185001", "200512"], + ["var_whatever_19800101-19811231.nc1", "19800101", "19811231"], + ["var_whatever_19800101.nc", "19800101", "19800101"], + ["1980-1981_var_whatever.nc", "1980", "1981"], + ["1980_var_whatever.nc", "1980", "1980"], + ["var_control-1980_whatever.nc", "1980", "1980"], + ["19800101-19811231_var_whatever.nc", "19800101", "19811231"], + ["19800101_var_whatever.nc", "19800101", "19800101"], + ["var_control-19800101_whatever.nc", "19800101", "19800101"], + ["19800101_var_control-1950_whatever.nc", "19800101", "19800101"], + ["var_control-1950_whatever_19800101.nc", "19800101", "19800101"], + [ + "CM61-LR-hist-03.1950_18500101_19491231_1M_concbc.nc", + "18500101", + "19491231", + ], + [ + "icon-2.6.1_atm_amip_R2B5_r1v1i1p1l1f1_phy_3d_ml_20150101T000000Z.nc", + "20150101T000000Z", + "20150101T000000Z", + ], + ["pr_A1.186101-200012.nc", "186101", "200012"], + [ + "tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc", + "199001", + "199912", + ], + ["E5sf00_1M_1940_032.grb", "1940", "1940"], + ["E5sf00_1D_1998-04_167.grb", "199804", "199804"], + ["E5sf00_1H_1986-04-11_167.grb", "19860411", "19860411"], + ["E5sf00_1M_1940-1941_032.grb", "1940", "1941"], + ["E5sf00_1D_1998-01_1999-12_167.grb", "199801", "199912"], + ["E5sf00_1H_2000-01-01_2001-12-31_167.grb", "20000101", "20011231"], ], - ["pr_A1.186101-200012.nc", "186101", "200012"], - ["tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc", "199001", "199912"], - ["E5sf00_1M_1940_032.grb", "1940", "1940"], - ["E5sf00_1D_1998-04_167.grb", "199804", "199804"], - ["E5sf00_1H_1986-04-11_167.grb", "19860411", "19860411"], - ["E5sf00_1M_1940-1941_032.grb", "1940", "1941"], - ["E5sf00_1D_1998-01_1999-12_167.grb", "199801", "199912"], - ["E5sf00_1H_2000-01-01_2001-12-31_167.grb", "20000101", "20011231"], -] - - -@pytest.mark.parametrize("case", FILENAME_CASES) -def test_get_start_end_year(case): - """Tests for _get_start_end_year function.""" - filename, case_start, case_end = case - - # If the filename is inconclusive or too difficult we resort to reading the - # file, which fails here because the file is not there. 
- if case_start is None and case_end is None: - with pytest.raises(ValueError): - _get_start_end_year(filename) - with pytest.raises(ValueError): - _get_start_end_year(Path(filename)) - with pytest.raises(ValueError): - _get_start_end_year(LocalFile(filename)) - with pytest.raises(ValueError): - _get_start_end_year(_get_esgf_file(filename)) - - else: - start, end = _get_start_end_year(filename) - assert case_start == start - assert case_end == end - start, end = _get_start_end_year(Path(filename)) - assert case_start == start - assert case_end == end - start, end = _get_start_end_year(LocalFile(filename)) - assert case_start == start - assert case_end == end - start, end = _get_start_end_year(_get_esgf_file(filename)) - assert case_start == start - assert case_end == end - - -@pytest.mark.parametrize("case", FILENAME_DATE_CASES) +) def test_get_start_end_date(case): """Tests for _get_start_end_date function.""" filename, case_start, case_end = case @@ -145,7 +86,7 @@ def test_get_start_end_date(case): with pytest.raises(ValueError): _get_start_end_date(LocalFile(filename)) with pytest.raises(ValueError): - _get_start_end_date(_get_esgf_file(filename)) + _get_start_end_date(_get_esgf_file(filename).name) else: start, end = _get_start_end_date(filename) @@ -157,7 +98,7 @@ def test_get_start_end_date(case): start, end = _get_start_end_date(LocalFile(filename)) assert case_start == start assert case_end == end - start, end = _get_start_end_date(_get_esgf_file(filename)) + start, end = _get_start_end_date(_get_esgf_file(filename).name) assert case_start == start assert case_end == end @@ -173,9 +114,9 @@ def test_read_years_from_cube(tmp_path): ) cube.add_dim_coord(time, 0) iris.save(cube, temp_file) - start, end = _get_start_end_year(temp_file) - assert start == 1990 - assert end == 1991 + start, end = _get_start_end_date(temp_file) + assert int(start[:4]) == 1990 + assert int(end[:4]) == 1991 def test_read_datetime_from_cube(tmp_path): @@ -210,8 +151,6 @@ def test_raises_if_unable_to_deduce_no_time(tmp_path): iris.save(cube, temp_file) with pytest.raises(ValueError): _get_start_end_date(temp_file) - with pytest.raises(ValueError): - _get_start_end_year(temp_file) def test_raises_if_unable_to_deduce_no_time_units(tmp_path): @@ -223,16 +162,12 @@ def test_raises_if_unable_to_deduce_no_time_units(tmp_path): iris.save(cube, temp_file) with pytest.raises(ValueError): _get_start_end_date(temp_file) - with pytest.raises(ValueError): - _get_start_end_year(temp_file) def test_fails_if_no_date_present(): """Test raises if no date is present.""" with pytest.raises(ValueError): _get_start_end_date("var_whatever") - with pytest.raises(ValueError): - _get_start_end_year("var_whatever") def test_get_timerange_from_years(): diff --git a/tests/unit/local/test_to_iris.py b/tests/unit/local/test_to_iris.py index 15a50729ac..44a6a881d3 100644 --- a/tests/unit/local/test_to_iris.py +++ b/tests/unit/local/test_to_iris.py @@ -1,11 +1,14 @@ +from pathlib import Path + import iris.cube import pytest +from pytest_mock import MockerFixture -from esmvalcore.local import LocalFile +from esmvalcore.local import LocalFile, _get_attr_from_field_coord @pytest.fixture -def local_file(tmp_path): +def local_file(tmp_path: Path) -> LocalFile: cube = iris.cube.Cube([0]) cube.attributes.globals["attribute"] = "value" file = tmp_path / "test.nc" @@ -13,21 +16,27 @@ def local_file(tmp_path): return LocalFile(file) -def test_to_iris(local_file): +def test_to_iris(local_file: LocalFile) -> None: cubes = local_file.to_iris() assert 
len(cubes) == 1 -def test_attributes(local_file): +def test_attributes(local_file: LocalFile) -> None: local_file.to_iris() # Load the file to populate attributes attrs = local_file.attributes assert attrs["attribute"] == "value" -def test_attributes_without_loading(local_file): +def test_attributes_without_loading(local_file: LocalFile) -> None: """Test that accessing attributes without loading the file first raises.""" with pytest.raises( ValueError, match=r"Attributes have not been read yet.*", ): local_file.attributes # noqa: B018 + + +def test_get_attr_from_field_coord_none(mocker: MockerFixture) -> None: + """Test ``_get_attr_from_field_coord``.""" + attr = _get_attr_from_field_coord(mocker.sentinel.ncfield, None, "attr") + assert attr is None diff --git a/tests/unit/preprocessor/test_shared.py b/tests/unit/preprocessor/test_shared.py index 773f380794..a1860ecde2 100644 --- a/tests/unit/preprocessor/test_shared.py +++ b/tests/unit/preprocessor/test_shared.py @@ -2,6 +2,7 @@ import inspect import warnings +from pathlib import Path import dask.array as da import iris.analysis @@ -384,17 +385,17 @@ def test_compute_area_weights(lazy): ) -def test_group_products_string_list(): +def test_group_products_string_list() -> None: products = [ PreprocessorFile( - filename="A_B.nc", + filename=Path("A_B.nc"), attributes={ "project": "A", "dataset": "B", }, ), PreprocessorFile( - filename="A_C.nc", + filename=Path("A_C.nc"), attributes={ "project": "A", "dataset": "C", diff --git a/tests/unit/provenance/test_trackedfile.py b/tests/unit/provenance/test_trackedfile.py index 9e22ca461b..5caebc173b 100644 --- a/tests/unit/provenance/test_trackedfile.py +++ b/tests/unit/provenance/test_trackedfile.py @@ -1,21 +1,48 @@ +from dataclasses import dataclass from pathlib import Path +from typing import Any +import iris.cube +import prov.model import pytest from prov.model import ProvDocument from esmvalcore._provenance import ESMVALTOOL_URI_PREFIX, TrackedFile +from esmvalcore.io.protocol import DataElement from esmvalcore.local import LocalFile +def test_set() -> None: + assert { + TrackedFile(Path("file1.nc"), attributes={}), + TrackedFile(Path("file1.nc"), attributes={}), + TrackedFile(Path("file2.nc"), attributes={}), + } == { + TrackedFile(Path("file1.nc"), attributes={}), + TrackedFile(Path("file2.nc"), attributes={}), + } + + +def test_sort() -> None: + file1 = TrackedFile(Path("file1.nc"), attributes={}) + file2 = TrackedFile(Path("file2.nc"), attributes={}) + assert sorted([file2, file1]) == [file1, file2] + + +def test_equals() -> None: + file = TrackedFile(Path("file.nc"), attributes={}) + assert file == TrackedFile(Path("file.nc"), attributes={}) + + @pytest.fixture -def tracked_input_file_nc(): +def tracked_input_file_nc() -> TrackedFile: input_file_nc = LocalFile("/path/to/file.nc") input_file_nc.attributes = {"a": "A"} return TrackedFile(filename=input_file_nc) @pytest.fixture -def tracked_output_file_nc(): +def tracked_output_file_nc() -> TrackedFile: return TrackedFile( filename=Path("/path/to/file.nc"), attributes={"a": "A"}, @@ -23,41 +50,56 @@ def tracked_output_file_nc(): @pytest.fixture -def tracked_input_file_grb(): +def tracked_input_file_grb() -> TrackedFile: input_file_grb = LocalFile("/path/to/file.grb") input_file_grb.attributes = {"a": "A"} return TrackedFile(filename=input_file_grb) -def test_init_input_nc(tracked_input_file_nc): +def test_init_input_nc(tracked_input_file_nc: TrackedFile) -> None: """Test `esmvalcore._provenance.TrackedFile.__init__`.""" assert 
tracked_input_file_nc.filename == LocalFile("/path/to/file.nc") - assert tracked_input_file_nc.attributes is None + with pytest.raises( + ValueError, + match=r"Call TrackedFile.initialize_provenance before accessing attributes", + ): + tracked_input_file_nc.attributes # noqa: B018 -def test_init_output_nc(tracked_output_file_nc): +def test_init_output_nc(tracked_output_file_nc: TrackedFile) -> None: """Test `esmvalcore._provenance.TrackedFile.__init__`.""" assert tracked_output_file_nc.filename == Path("/path/to/file.nc") assert tracked_output_file_nc.attributes == {"a": "A"} -def test_init_grb(tracked_input_file_grb): +def test_init_grb(tracked_input_file_grb: TrackedFile) -> None: """Test `esmvalcore._provenance.TrackedFile.__init__`.""" assert tracked_input_file_grb.filename == LocalFile("/path/to/file.grb") - assert tracked_input_file_grb.attributes is None + with pytest.raises( + ValueError, + match=r"Call TrackedFile.initialize_provenance before accessing attributes", + ): + tracked_input_file_grb.attributes # noqa: B018 + + +@pytest.fixture +def activity() -> prov.model.ProvActivity: + provenance = ProvDocument() + provenance.add_namespace("task", uri=ESMVALTOOL_URI_PREFIX + "task") + return provenance.activity("task:test-task-name") @pytest.mark.parametrize( "fixture_name", ["tracked_input_file_nc", "tracked_output_file_nc"], ) -def test_initialize_provenance_nc(fixture_name, request): +def test_initialize_provenance_nc( + fixture_name: str, + request: pytest.FixtureRequest, + activity: prov.model.ProvActivity, +) -> None: """Test `esmvalcore._provenance.TrackedFile.initialize_provenance`.""" tracked_file_nc = request.getfixturevalue(fixture_name) - provenance = ProvDocument() - provenance.add_namespace("task", uri=ESMVALTOOL_URI_PREFIX + "task") - activity = provenance.activity("task:test-task-name") - tracked_file_nc.initialize_provenance(activity) assert isinstance(tracked_file_nc.provenance, ProvDocument) assert tracked_file_nc.activity == activity @@ -65,33 +107,59 @@ def test_initialize_provenance_nc(fixture_name, request): assert tracked_file_nc.attributes == {"a": "A"} -def test_initialize_provenance_grb(tracked_input_file_grb): +def test_initialize_provenance_grb( + tracked_input_file_grb: TrackedFile, + activity: prov.model.ProvActivity, +) -> None: """Test `esmvalcore._provenance.TrackedFile.initialize_provenance`.""" - provenance = ProvDocument() - provenance.add_namespace("task", uri=ESMVALTOOL_URI_PREFIX + "task") - activity = provenance.activity("task:test-task-name") - tracked_input_file_grb.initialize_provenance(activity) assert isinstance(tracked_input_file_grb.provenance, ProvDocument) assert tracked_input_file_grb.activity == activity assert ( - str(tracked_input_file_grb.entity.identifier) + str(tracked_input_file_grb.entity.identifier) # type: ignore[attr-defined] == "file:/path/to/file.grb" ) assert tracked_input_file_grb.attributes == {"a": "A"} +def test_initialize_provenance_twice_raises( + tracked_output_file_nc: TrackedFile, + activity: prov.model.ProvActivity, +) -> None: + """Test `esmvalcore._provenance.TrackedFile.initialize_provenance` raises if called twice.""" + tracked_output_file_nc.initialize_provenance(activity) + + with pytest.raises( + ValueError, + match=r"Provenance of TrackedFile: /path/to/file.nc already initialized", + ): + tracked_output_file_nc.initialize_provenance(activity) + + +def test_initialize_provenance_no_attributes_raises( + activity: prov.model.ProvActivity, +) -> None: + """Test 
`esmvalcore._provenance.TrackedFile.initialize_provenance` with no attributes."""
+    tracked_file = TrackedFile(filename=Path("/path/to/file.nc"))
+
+    with pytest.raises(
+        TypeError,
+        match=r"Delayed reading of attributes is only supported for `DataElement`s",
+    ):
+        tracked_file.initialize_provenance(activity)
+
+
 @pytest.mark.parametrize(
     "fixture_name",
     ["tracked_input_file_nc", "tracked_output_file_nc"],
 )
-def test_copy_provenance(fixture_name, request):
+def test_copy_provenance(
+    fixture_name: str,
+    request: pytest.FixtureRequest,
+    activity: prov.model.ProvActivity,
+) -> None:
     """Test `esmvalcore._provenance.TrackedFile.copy_provenance`."""
     tracked_file_nc = request.getfixturevalue(fixture_name)
 
-    provenance = ProvDocument()
-    provenance.add_namespace("task", uri=ESMVALTOOL_URI_PREFIX + "task")
-    activity = provenance.activity("task:test-task-name")
-
     tracked_file_nc.initialize_provenance(activity)
 
     copied_file = tracked_file_nc.copy_provenance()
@@ -99,3 +167,83 @@
     assert copied_file.entity == tracked_file_nc.entity
     assert copied_file.provenance == tracked_file_nc.provenance
     assert copied_file.provenance is not tracked_file_nc.provenance
+
+
+def test_copy_provenance_not_initialized() -> None:
+    """Test `esmvalcore._provenance.TrackedFile.copy_provenance` raises if provenance not initialized."""
+    tracked_file = TrackedFile(filename=Path("/path/to/file.nc"))
+
+    with pytest.raises(
+        ValueError,
+        match=r"Provenance of TrackedFile: /path/to/file.nc not initialized",
+    ):
+        tracked_file.copy_provenance()
+
+
+def test_wasderivedfrom_not_initialized() -> None:
+    """Test `esmvalcore._provenance.TrackedFile.wasderivedfrom` raises if provenance not initialized."""
+    tracked_file = TrackedFile(filename=Path("/path/to/file.nc"))
+    other_tracked_file = TrackedFile(filename=Path("/path/to/other_file.nc"))
+
+    with pytest.raises(
+        ValueError,
+        match=r"Provenance of TrackedFile: /path/to/file.nc not initialized",
+    ):
+        tracked_file.wasderivedfrom(other_tracked_file)
+
+
+@dataclass
+class MockDataElement(DataElement):
+    """Mock DataElement for testing purposes."""
+
+    name: str
+    facets: dict[str, Any]
+    attributes: dict[str, Any]
+
+    def prepare(self) -> None:
+        pass
+
+    def __hash__(self) -> int:
+        return hash(self.name)
+
+    def to_iris(
+        self,
+        ignore_warnings: list[dict[str, Any]] | None = None,
+    ) -> iris.cube.CubeList:
+        return iris.cube.CubeList()
+
+
+def test_provenance_file_nonpath_notimplemented() -> None:
+    """Test `esmvalcore._provenance.TrackedFile.provenance_file` with a DataElement."""
+    input_file = MockDataElement(
+        name="/path/to/input_file.nc",
+        facets={},
+        attributes={},
+    )
+    tracked_file = TrackedFile(filename=input_file)
+
+    assert tracked_file.filename == input_file
+    with pytest.raises(
+        NotImplementedError,
+        match=r"Saving provenance is only supported for pathlib.Path.*",
+    ):
+        _ = tracked_file.provenance_file
+
+
+def test_save_provenance_notimplemented(
+    activity: prov.model.ProvActivity,
+) -> None:
+    """Test `esmvalcore._provenance.TrackedFile.save_provenance` with a DataElement."""
+    input_file = MockDataElement(
+        name="/path/to/input_file.nc",
+        facets={},
+        attributes={},
+    )
+    tracked_file = TrackedFile(filename=input_file)
+    tracked_file.initialize_provenance(activity)
+
+    with pytest.raises(
+        NotImplementedError,
+        match=r"Writing attributes is only supported for pathlib.Path.*",
+    ):
+        tracked_file.save_provenance()
diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index
367eff6e17..55771a0b23 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -148,6 +148,7 @@ def create_esgf_search_results(): return [file0, file1] +@pytest.mark.skip(reason="May not be needed anymore.") @pytest.mark.parametrize("local_availability", ["all", "partial", "none"]) def test_schedule_for_download(monkeypatch, tmp_path, local_availability): """Test that `_schedule_for_download` updates DOWNLOAD_FILES.""" diff --git a/tests/unit/task/test_diagnostic_task.py b/tests/unit/task/test_diagnostic_task.py index 15517187bb..cb6047bc11 100644 --- a/tests/unit/task/test_diagnostic_task.py +++ b/tests/unit/task/test_diagnostic_task.py @@ -228,7 +228,7 @@ def test_collect_provenance(mocker, diagnostic_task): diagnostic_task._collect_provenance() tracked_file_class.assert_called_once_with( - "test.png", + Path("test.png"), { "caption": "Some figure", "plot_type": ("tag_value",), diff --git a/tests/unit/task/test_print.py b/tests/unit/task/test_print.py index 0ed1352f68..53aad046d3 100644 --- a/tests/unit/task/test_print.py +++ b/tests/unit/task/test_print.py @@ -2,20 +2,22 @@ import copy import textwrap +from pathlib import Path import pytest from esmvalcore._task import DiagnosticTask from esmvalcore.dataset import Dataset +from esmvalcore.local import LocalFile from esmvalcore.preprocessor import PreprocessingTask, PreprocessorFile @pytest.fixture def preproc_file(): dataset = Dataset(short_name="tas") - dataset.files = ["/path/to/input_file.nc"] + dataset.files = [LocalFile("/path/to/input_file.nc")] return PreprocessorFile( - filename="/output/preproc/file.nc", + filename=Path("/output/preproc/file.nc"), attributes={"short_name": "tas"}, settings={ "extract_levels": {"scheme": "linear", "levels": [95000]}, @@ -52,9 +54,9 @@ def test_repr_preproc_task(preproc_task): PreprocessingTask: diag_1/tas order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: None """) @@ -97,9 +99,9 @@ def test_repr_simple_tree(preproc_task, diagnostic_task): PreprocessingTask: diag_1/tas order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: None """) @@ -141,25 +143,25 @@ def test_repr_full_tree(preproc_task, diagnostic_task): PreprocessingTask: diag_1/tas order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: PreprocessingTask: diag_1/tas_derive_input_1 order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': 
PosixPath('/output/preproc/file.nc')}} ancestors: None PreprocessingTask: diag_1/tas_derive_input_2 order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: None """) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 68e8ceed05..e4f6548e71 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,10 +1,12 @@ +import importlib.resources import textwrap from collections import defaultdict +from functools import lru_cache from pathlib import Path -from unittest import mock import pyesgf import pytest +import yaml import esmvalcore.dataset import esmvalcore.local @@ -13,6 +15,33 @@ from esmvalcore.dataset import Dataset from esmvalcore.esgf import ESGFFile from esmvalcore.exceptions import InputFilesNotFound, RecipeError +from esmvalcore.typing import Facets + + +@lru_cache +def _load_default_data_sources() -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load default data sources for local users.""" + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / "local-data.yml", + ) as config_file: + return yaml.safe_load(config_file.read_text(encoding="utf-8")) + + +@pytest.fixture +def session(tmp_path: Path, session: Session) -> Session: + """Session fixture with default local data sources.""" + projects = _load_default_data_sources()["projects"] + for project in projects: + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(tmp_path) + session["projects"][project]["data"] = data_sources + return session def test_repr(): @@ -873,6 +902,7 @@ def test_from_files_with_globs(monkeypatch, session): "mip": "Amon", "project": "CMIP6", "short_name": "tas", + "timerange": "185001/201412", "version": "v20181126", } file2 = esmvalcore.local.LocalFile( @@ -984,6 +1014,7 @@ def test_from_files_with_globs_and_missing_facets(monkeypatch, session): "mip": "Amon", "project": "CMIP6", "short_name": "tas", + "timerange": "185001/201412", "version": "v20181126", } file2 = esmvalcore.local.LocalFile( @@ -1030,7 +1061,6 @@ def test_from_files_with_globs_and_missing_facets(monkeypatch, session): mip="Amon", project="CMIP6", short_name="tas", - timerange="185001/201412", ) expected.session = session @@ -1065,6 +1095,7 @@ def test_from_files_with_globs_and_automatic_missing(monkeypatch, session): "mip": "Amon", "project": "CMIP6", "short_name": "tas", + "timerange": "185001/201412", "version": "v20181126", } @@ -1250,7 +1281,7 @@ def test_concatenating_historical_and_future_exps(mocker): assert dataset.supplementaries[0].facets["exp"] == "historical" -def test_from_recipe_with_glob(tmp_path, session, mocker): +def test_from_recipe_with_glob(tmp_path: Path, session: Session) -> None: recipe_txt = textwrap.dedent(""" diagnostics: @@ -1267,8 +1298,6 @@ def test_from_recipe_with_glob(tmp_path, session, mocker): recipe = tmp_path / "recipe_test.yml" recipe.write_text(recipe_txt, encoding="utf-8") - session["drs"]["CMIP5"] = "ESGF" - CFG["rootpath"]["CMIP5"] = [tmp_path] filenames = [ "cmip5/output1/CSIRO-QCCCE/CSIRO-Mk3-6-0/rcp85/mon/atmos/Amon/r1i1p1/" "v20120323/tas_Amon_CSIRO-Mk3-6-0_rcp85_r1i1p1_200601-210012.nc", @@ -1280,7 +1309,7 
@@ def test_from_recipe_with_glob(tmp_path, session, mocker): path.parent.mkdir(parents=True, exist_ok=True) path.write_text("") - definitions = [ + definitions: list[Facets] = [ { "diagnostic": "diagnostic1", "variable_group": "tas", @@ -1420,18 +1449,32 @@ def dataset(): mip="Amon", frequency="mon", short_name="tas", - dataset="EC.-Earth3", + dataset="EC-Earth3", exp="historical", ensemble="r1i1p1f1", grid="gr", timerange="1850/1851", - alias="CMIP6_EC-Eeath3_tas", + alias="CMIP6_EC-Earth3_tas", ) dataset.session = { "search_esgf": "when_missing", "download_dir": Path("/download_dir"), - "rootpath": None, - "drs": {}, + "projects": { + "CMIP6": { + "data": { + "local": { + "type": "esmvalcore.local.LocalDataSource", + "rootpath": Path("/local_dir"), + "dirname_template": "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}", + "filename_template": "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc", + }, + "esgf": { + "type": "esmvalcore.esgf.ESGFDataSource", + "download_dir": Path("/download_dir"), + }, + }, + }, + }, } return dataset @@ -1461,14 +1504,14 @@ def test_find_files(mocker, dataset, local_availability): ) mocker.patch.object( - esmvalcore.dataset.local, - "find_files", + esmvalcore.local.LocalDataSource, + "find_data", autospec=True, - return_value=(list(local_files), []), + return_value=list(local_files), ) mocker.patch.object( - esmvalcore.dataset.esgf, - "find_files", + esmvalcore.esgf.ESGFDataSource, + "find_data", autospec=True, return_value=list(esgf_files), ) @@ -1498,14 +1541,14 @@ def test_find_files_wildcard_timerange(mocker, dataset): ) mocker.patch.object( - esmvalcore.dataset.local, - "find_files", + esmvalcore.local.LocalDataSource, + "find_data", autospec=True, - return_value=(local_files, []), + return_value=list(local_files), ) mocker.patch.object( - esmvalcore.dataset.esgf, - "find_files", + esmvalcore.esgf.ESGFDataSource, + "find_data", autospec=True, return_value=list(esgf_files), ) @@ -1535,14 +1578,14 @@ def test_find_files_outdated_local(mocker, dataset): ) mocker.patch.object( - esmvalcore.dataset.local, - "find_files", + esmvalcore.local.LocalDataSource, + "find_data", autospec=True, - return_value=(local_files, []), + return_value=list(local_files), ) mocker.patch.object( - esmvalcore.dataset.esgf, - "find_files", + esmvalcore.esgf.ESGFDataSource, + "find_data", autospec=True, return_value=list(esgf_files), ) @@ -1552,20 +1595,39 @@ def test_find_files_outdated_local(mocker, dataset): @pytest.mark.parametrize( "project", - ["CESM", "EMAC", "ICON", "IPSLCM", "OBS", "OBS6", "ana4mips", "native6"], + ["CESM", "EMAC", "ICON", "IPSLCM", "OBS", "OBS6", "native6"], ) -def test_find_files_non_esgf_projects(mocker, project, monkeypatch): +def test_find_files_non_esgf_projects(mocker, monkeypatch, session, project): """Test that find_files does never download files for non-ESGF projects.""" monkeypatch.setitem(CFG, "search_esgf", "always") + # Add "model" projects that are not part of the default local configuration. 
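+    # Their data sources come from the bundled "<project>-data.yml" file, if one exists.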
+ with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / f"{project.lower()}-data.yml", + ) as config_file: + if config_file.exists(): + cfg = yaml.safe_load(config_file.read_text(encoding="utf-8")) + session["projects"][project]["data"] = cfg["projects"][project][ + "data" + ] + + files = [ + mocker.create_autospec( + esmvalcore.local.LocalFile, + spec_set=True, + instance=True, + ), + ] mock_local_find_files = mocker.patch.object( - esmvalcore.dataset.local, - "find_files", + esmvalcore.local.LocalDataSource, + "find_data", autospec=True, - return_value=(mock.sentinel.files, mock.sentinel.file_globs), + return_value=files, ) mock_esgf_find_files = mocker.patch.object( - esmvalcore.dataset.esgf, - "find_files", + esmvalcore.esgf.ESGFDataSource, + "find_data", autospec=True, ) @@ -1599,14 +1661,14 @@ def test_find_files_non_esgf_projects(mocker, project, monkeypatch): var_type="var_type", version=1, ) + tas.session = session tas.augment_facets() tas.find_files() - mock_local_find_files.assert_called_once() + mock_local_find_files.assert_called() mock_esgf_find_files.assert_not_called() - assert tas.files == mock.sentinel.files - assert tas._file_globs == mock.sentinel.file_globs + assert tas.files == files def test_set_version(): @@ -2137,7 +2199,7 @@ def test_get_extra_facets_native6(): } -OBS6_SAT_FACETS = { +OBS6_SAT_FACETS: Facets = { "project": "OBS6", "dataset": "SAT", "mip": "Amon", @@ -2191,8 +2253,11 @@ def test_derivation_necessary_no_derivation(): assert dataset._derivation_necessary() is False -def test_derivation_necessary_no_force_derivation_no_files(): +def test_derivation_necessary_no_force_derivation_no_files( + session: Session, +) -> None: dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.session = session assert dataset._derivation_necessary() is True diff --git a/tests/unit/test_provenance.py b/tests/unit/test_provenance.py deleted file mode 100644 index b6c20dbc2e..0000000000 --- a/tests/unit/test_provenance.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Test `esmvalcore._provenance`.""" - -from esmvalcore._provenance import TrackedFile - - -def test_set(): - assert { - TrackedFile("file1.nc", attributes={}), - TrackedFile("file1.nc", attributes={}), - TrackedFile("file2.nc", attributes={}), - } == { - TrackedFile("file1.nc", attributes={}), - TrackedFile("file2.nc", attributes={}), - } - - -def test_sort(): - file1 = TrackedFile("file1.nc", attributes={}) - file2 = TrackedFile("file2.nc", attributes={}) - assert sorted([file2, file1]) == [file1, file2] - - -def test_equals(): - file = TrackedFile("file.nc", attributes={}) - assert file == TrackedFile("file.nc", attributes={})