Some progress

bouweandela · bouweandela · commit 5283ecbf2b11 · 2025-08-14T18:27:46.000+02:00
diff --git a/esmvalcore/_provenance.py b/esmvalcore/_provenance.py
@@ -11,8 +11,6 @@
 from PIL.PngImagePlugin import PngInfo
 from prov.model import ProvDerivation, ProvDocument
 
-from esmvalcore.io.protocol import DataElement
-
 from ._version import __version__
 
 logger = logging.getLogger(__name__)
@@ -111,7 +109,7 @@ class TrackedFile:
 
     def __init__(
         self,
-        filename: Path | DataElement,
+        filename,
         attributes=None,
         ancestors=None,
         prov_filename=None,
@@ -120,8 +118,8 @@ def __init__(
 
         Arguments
         ---------
-        filename:
-            Path to the file on disk.
+        filename: :obj:`pathlib.Path` or :obj:`esmvalcore.io.protocol.DataElement`
+            Path or data element containing the data described by the provenance.
         attributes: dict
             Dictionary with facets describing the file. If set to None, this
             will be read from the file when provenance is initialized.
@@ -133,7 +131,9 @@ def __init__(
             processing.
         """
         self._filename = (
-            str(filename) if isinstance(filename, Path) else filename.name
+            str(filename)
+            if isinstance(filename, Path | str)
+            else filename.name
         )
         if prov_filename is None:
             self.prov_filename = self._filename
@@ -178,13 +178,13 @@ def copy_provenance(self):
         return new
 
     @property
-    def filename(self):
-        """Filename."""
+    def filename(self) -> str:
+        """Name of data described by this provenance document."""
         return self._filename
 
     @property
     def provenance_file(self):
-        """Filename of provenance."""
+        """Filename of provenance file."""
         return os.path.splitext(self.filename)[0] + "_provenance.xml"
 
     def initialize_provenance(self, activity):
diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py
@@ -443,10 +443,7 @@ def _get_common_attributes(
 
     # Ensure that attributes start_year and end_year are always available if at
     # least one of the input datasets defines it
-    if "timerange" in attributes:
-        start_year, end_year = _parse_period(attributes["timerange"])
-        attributes["start_year"] = int(str(start_year[0:4]))
-        attributes["end_year"] = int(str(end_year[0:4]))
+    _set_start_end_year(attributes)
 
     return attributes
 
@@ -710,7 +707,7 @@ def _get_preprocessor_products(
     )
 
     for product in products:
-        _set_start_end_year(product)
+        _set_start_end_year(product.attributes)
         product.check()
 
     return products
@@ -770,18 +767,18 @@ def _configure_multi_product_preprocessor(
 
     for product in multimodel_products | ensemble_products:
         product.check()
-        _set_start_end_year(product)
+        _set_start_end_year(product.attributes)
 
 
-def _set_start_end_year(product: PreprocessorFile) -> None:
+def _set_start_end_year(attributes: dict) -> None:
     """Set the attributes `start_year` and `end_year`.
 
     These attributes are used by many diagnostic scripts in ESMValTool.
     """
-    if "timerange" in product.attributes:
-        start_year, end_year = _parse_period(product.attributes["timerange"])
-        product.attributes["start_year"] = int(str(start_year[0:4]))
-        product.attributes["end_year"] = int(str(end_year[0:4]))
+    if "timerange" in attributes:
+        start_year, end_year = _parse_period(attributes["timerange"])
+        attributes["start_year"] = int(str(start_year[0:4]))
+        attributes["end_year"] = int(str(end_year[0:4]))
 
 
 def _update_preproc_functions(
diff --git a/esmvalcore/config/_config.py b/esmvalcore/config/_config.py
@@ -94,7 +94,7 @@ def warn_if_old_extra_facets_exist() -> None:
         )
 
 
-def load_config_developer(cfg_file):
+def load_config_developer(cfg_file) -> dict:
     """Read the developer's configuration file."""
     with open(cfg_file, encoding="utf-8") as file:
         cfg = yaml.safe_load(file)
@@ -118,6 +118,7 @@ def load_config_developer(cfg_file):
         CFG[project] = settings
 
     read_cmor_tables(cfg_file)
+    return cfg
 
 
 def get_project_config(project):
diff --git a/esmvalcore/config/_config_object.py b/esmvalcore/config/_config_object.py
@@ -13,6 +13,7 @@
 import yaml
 
 import esmvalcore
+from esmvalcore.config._config import load_config_developer
 from esmvalcore.config._config_validators import (
     _deprecated_options_defaults,
     _deprecators,
@@ -145,6 +146,10 @@ def _load_user_config(
 
         try:
             new.update(mapping)
+            # Add known projects from config-developer file while we still have it.
+            for project in load_config_developer(new["config_developer_file"]):
+                if project not in new["projects"]:
+                    new["projects"][project] = {}
             new.check_missing()
         except InvalidConfigParameter as exc:
             msg = (
@@ -364,7 +369,10 @@ def load_from_dirs(self, dirs: Iterable[str | Path]) -> None:
         new_config_dict = self._get_config_dict_from_dirs(dirs)
         self.clear()
         self.update(new_config_dict)
-
+        # Add known projects from config-developer file while we still have it.
+        for project in load_config_developer(self["config_developer_file"]):
+            if project not in self["projects"]:
+                self["projects"][project] = {}
         self.check_missing()
 
     def reload(self) -> None:
diff --git a/esmvalcore/config/_data_sources.py b/esmvalcore/config/_data_sources.py
@@ -37,7 +37,7 @@ def _get_data_sources(session: Session) -> list[DataSource]:
                 and project in esmvalcore.esgf.facets.FACETS
             ):
                 data_source = esmvalcore.esgf.ESGFDataSource(
-                    name="legacy",
+                    name="legacy-esgf",
                     project=project,
                     priority=2,
                     download_dir=session["download_dir"],
diff --git a/esmvalcore/esgf/_download.py b/esmvalcore/esgf/_download.py
@@ -22,7 +22,7 @@
 import yaml
 from humanfriendly import format_size, format_timespan
 
-from esmvalcore.config._config import CFG
+from esmvalcore.config import CFG
 from esmvalcore.io.protocol import DataElement
 from esmvalcore.local import LocalFile
 from esmvalcore.typing import Facets
diff --git a/esmvalcore/esgf/_search.py b/esmvalcore/esgf/_search.py
@@ -2,7 +2,7 @@
 
 import itertools
 import logging
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from functools import lru_cache
 from pathlib import Path
 
@@ -398,12 +398,9 @@ class ESGFDataSource(DataSource):
     download_dir: Path
     """The destination directory where data will be downloaded."""
 
-    debug_info: str = ""
+    debug_info: str = field(init=False, default="")
     """A string containing debug information when no data is found."""
 
-    def __post__init__(self):
-        self.debug_info = ""
-
     def find_data(self, **facets: FacetValue) -> list[ESGFFile]:
         """Find data.
 
diff --git a/esmvalcore/io/intake_esgf.py b/esmvalcore/io/intake_esgf.py
@@ -13,9 +13,11 @@
 import intake_esgf
 import intake_esgf.exceptions
 import iris.cube
+import isodate
 
 from esmvalcore.io.protocol import DataElement, DataSource
 from esmvalcore.iris_helpers import dataset_to_iris
+from esmvalcore.local import _parse_period
 from esmvalcore.typing import Facets, FacetValue
 
 __all__ = [
@@ -51,11 +53,17 @@ def to_iris(self, ignore_warnings=None) -> iris.cube.CubeList:
         :
             The loaded data.
         """
+        files = self.catalog.to_path_dict(
+            minimal_keys=False,
+            quiet=True,
+        )[self.name]
         dataset = self.catalog.to_dataset_dict(
             minimal_keys=False,
             add_measures=False,
             quiet=True,
         )[self.name]
+        dataset.attrs["source_file"] = ", ".join(str(f) for f in files)
+
         return dataset_to_iris(dataset, ignore_warnings=ignore_warnings)
 
 
@@ -121,7 +129,17 @@ def find_data(self, **facets: FacetValue) -> list[IntakeESGFDataset]:
             for our_facet, their_facet in self.facets.items()
             if our_facet in our_facets
         }
-        # TODO: filter by timerange
+        if (
+            "timerange" in facets and "*" not in facets["timerange"]  # type: ignore[operator]
+        ):
+            start, end = _parse_period(facets["timerange"])
+            esgf_facets["file_start"] = isodate.date_isoformat(
+                isodate.parse_date(start.split("T")[0]),
+            )
+            esgf_facets["file_end"] = isodate.date_isoformat(
+                isodate.parse_date(end.split("T")[0]),
+            )
+        # Search ESGF.
         try:
             self.catalog.search(**esgf_facets, quiet=True)
         except intake_esgf.exceptions.NoSearchResults:
@@ -156,6 +174,8 @@ def find_data(self, **facets: FacetValue) -> list[IntakeESGFDataset]:
             dataset_id = row["key"]
             # Subset the catalog to a single dataset.
             cat = self.catalog.clone()
+            cat.file_start = self.catalog.file_start
+            cat.file_end = self.catalog.file_end
             cat.df = self.catalog.df[self.catalog.df.key == dataset_id]
             # Discard all but the latest version. It is not clear how/if
             # `intake_esgf.ESGFCatalog.to_dataset_dict` supports multiple versions.
diff --git a/esmvalcore/local.py b/esmvalcore/local.py
@@ -473,17 +473,21 @@ class DataSource(esmvalcore.io.protocol.DataSource):
     priority: int
     """The priority of the data source. Lower values have priority."""
 
-    debug_info: str = field(init=False)
+    debug_info: str = field(init=False, default="")
     """A string containing debug information when no data is found."""
 
     rootpath: Path
+    """The path where the directories are located."""
+
     dirname_template: str
+    """The template for the directory names."""
+
     filename_template: str
+    """The template for the file names."""
 
     def __post_init__(self) -> None:
         """Set further attributes."""
         self._regex_pattern = self._templates_to_regex()
-        self.debug_info = ""
 
     @property
     def regex_pattern(self) -> str:
@@ -502,6 +506,7 @@ def get_glob_patterns(self, **facets) -> list[Path]:
 
     def find_files(self, **facets) -> list[LocalFile]:
         """Find files."""
+        # TODO: deprecate this method
         return self.find_data(**facets)
 
     def find_data(self, **facets) -> list[LocalFile]:
@@ -656,7 +661,7 @@ def _get_data_sources(project: str) -> list[DataSource]:
                 file_templates = _select_drs("input_file", project, structure)
                 sources.extend(
                     DataSource(
-                        name="legacy",
+                        name="legacy-local",
                         project=project,
                         priority=1,
                         rootpath=path,
@@ -776,6 +781,7 @@ def version(file):
     return result
 
 
+# TODO: Deprecate this?
 def find_files(
     *,
     debug: bool = False,
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -27,7 +27,6 @@ def _load_default_config():
             "ignore",
             message="Do not instantiate `Config` objects directly",
             category=UserWarning,
-            module="esmvalcore",
         )
         cfg = Config()
     cfg.load_from_dirs([])
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -113,7 +113,7 @@ def _tracking_ids(i=0):
 def _get_find_files_func(path: Path, suffix: str = ".nc"):
     tracking_id = _tracking_ids()
 
-    def find_files(*, debug: bool = False, **facets):
+    def find_files(self, *, debug: bool = False, **facets):
         files, file_globs = _get_files(path, facets, tracking_id)
         files = [f.with_suffix(suffix) for f in files]
         file_globs = [g.with_suffix(suffix) for g in file_globs]
@@ -127,13 +127,13 @@ def find_files(*, debug: bool = False, **facets):
 @pytest.fixture
 def patched_datafinder(tmp_path, monkeypatch):
     find_files = _get_find_files_func(tmp_path)
-    monkeypatch.setattr(esmvalcore.local, "find_files", find_files)
+    monkeypatch.setattr(esmvalcore.local.DataSource, "find_data", find_files)
 
 
 @pytest.fixture
 def patched_datafinder_grib(tmp_path, monkeypatch):
     find_files = _get_find_files_func(tmp_path, suffix=".grib")
-    monkeypatch.setattr(esmvalcore.local, "find_files", find_files)
+    monkeypatch.setattr(esmvalcore.local.DataSource, "find_data", find_files)
 
 
 @pytest.fixture
diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py
@@ -790,7 +790,7 @@ def test_recipe_iso_timerange(
     filename = (
         f"CMIP6_HadGEM3-GC31-LL_3hr_historical_r2i1p1f1_pr_gn_{output_time}.nc"
     )
-    assert pr_product.filename.name == filename
+    assert Path(pr_product.filename).name == filename
 
     areacella_task = next(
         t for t in recipe.tasks if t.name.endswith("areacella")
@@ -799,7 +799,7 @@ def test_recipe_iso_timerange(
     areacella_product = areacella_task.products.pop()
 
     filename = "CMIP6_HadGEM3-GC31-LL_fx_historical_r2i1p1f1_areacella_gn.nc"
-    assert areacella_product.filename.name == filename
+    assert Path(areacella_product.filename).name == filename
 
 
 @pytest.mark.parametrize(("input_time", "output_time"), TEST_ISO_TIMERANGE)
@@ -839,7 +839,7 @@ def test_recipe_iso_timerange_as_dataset(
     filename = (
         f"CMIP6_HadGEM3-GC31-LL_3hr_historical_r2i1p1f1_pr_gn_{output_time}.nc"
     )
-    assert product.filename.name == filename
+    assert Path(product.filename).name == filename
 
     assert len(product.datasets) == 1
     dataset = product.datasets[0]
diff --git a/tests/unit/provenance/test_trackedfile.py b/tests/unit/provenance/test_trackedfile.py
@@ -1,13 +1,16 @@
+from pathlib import Path
+
 import pytest
 from prov.model import ProvDocument
 
 from esmvalcore._provenance import ESMVALTOOL_URI_PREFIX, TrackedFile
+from esmvalcore.local import LocalFile
 
 
 @pytest.fixture
 def tracked_file_nc():
     return TrackedFile(
-        filename="/path/to/file.nc",
+        filename=LocalFile("/path/to/file.nc"),
         attributes={"a": "A"},
         prov_filename="/original/path/to/file.nc",
     )
@@ -16,7 +19,7 @@ def tracked_file_nc():
 @pytest.fixture
 def tracked_file_grb():
     return TrackedFile(
-        filename="/path/to/file.grb",
+        filename=Path("/path/to/file.grb"),
         prov_filename="/original/path/to/file.grb",
     )
 

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,6 @@ def _load_default_config():`
`27`	`27`	`"ignore",`
`28`	`28`	message="Do not instantiate `Config` objects directly",
`29`	`29`	`category=UserWarning,`
`30`		`- module="esmvalcore",`
`31`	`30`	`)`
`32`	`31`	`cfg = Config()`
`33`	`32`	`cfg.load_from_dirs([])`