Add support for CMIP5

bouweandela · bouweandela · commit e91e3830489f · 2025-07-22T15:10:41.000+02:00
diff --git a/esmvalcore/config/configurations/intake-esgf.yml b/esmvalcore/config/configurations/intake-esgf.yml
@@ -0,0 +1,46 @@
+projects:
+  CMIP6:
+    data:
+      intake-esgf:
+        type: esmvalcore.io.intake_esgf.IntakeESGFDataSource
+        facets:
+          activity: activity_drs
+          dataset: source_id
+          ensemble: member_id
+          exp: experiment_id
+          institute: institution_id
+          grid: grid_label
+          mip: table_id
+          project: "project"
+          short_name: variable_id
+  CMIP5:
+    data:
+      intake-esgf:
+        type: esmvalcore.io.intake_esgf.IntakeESGFDataSource
+        facets:
+          dataset: model
+          ensemble: ensemble
+          exp: experiment
+          frequency: time_frequency
+          institute: institute
+          mip: cmor_table
+          product: product
+          project: "project"
+          short_name: variable
+        values:
+          dataset:
+            "ACCESS1-0": "ACCESS1.0"
+            "ACCESS1-3": "ACCESS1.3"
+            "bcc-csm1-1": "BCC-CSM1.1"
+            "bcc-csm1-1-m": "BCC-CSM1.1(m)"
+            "CESM1-BGC": "CESM1(BGC)"
+            "CESM1-CAM5": "CESM1(CAM5)"
+            "CESM1-CAM5-1-FV2": "CESM1(CAM5.1,FV2)"
+            "CESM1-FASTCHEM": "CESM1(FASTCHEM)"
+            "CESM1-WACCM": "CESM1(WACCM)"
+            "CSIRO-Mk3-6-0": "CSIRO-Mk3.6.0"
+            "fio-esm": "FIO-ESM"
+            "GFDL-CM2p1": "GFDL-CM2.1"
+            "inmcm4": "INM-CM4"
+            "MRI-AGCM3-2H": "MRI-AGCM3.2H"
+            "MRI-AGCM3-2S": "MRI-AGCM3.2S"
diff --git a/esmvalcore/config/data_sources.py b/esmvalcore/config/data_sources.py
@@ -15,7 +15,7 @@ def get_data_sources(session: Session) -> list[DataSource]:
         if "data" not in project_settings:
             logger.info("Using legacy data sources for project '%s'", project)
             # Use legacy data sources from config-user.yml.
-            legacy_local_sources = esmvalcore.local._get_data_sources(project)
+            legacy_local_sources = esmvalcore.local._get_data_sources(project)  # noqa: SLF001
             data_sources.extend(legacy_local_sources)
             if (
                 session["search_esgf"] != "never"
diff --git a/esmvalcore/io/intake_esgf.py b/esmvalcore/io/intake_esgf.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass, field
+from numbers import Number
 
 import intake_esgf.projects
 import iris.cube
@@ -81,11 +82,21 @@ def find_data(self, **facets: FacetValue) -> list[IntakeESGFDataset]:
         :obj:`list` of :obj:`esmvalcore.io.intake_esgf.IntakeESGFDataset`
             A list of data elements that have been found.
         """
-        # Translate "our" facets to ESGF facets
+        # Normalize facets so all values are `list[str]`.
+        facets = {
+            facet: [str(values)]
+            if isinstance(values, str | Number | bool)
+            else values
+            for facet, values in facets.items()
+        }
+        # Translate "our" facets to ESGF facets and "our" values to ESGF values.
         esgf_facets = {
-            self.values.get(k, {}).get(v, v): facets[k]
-            for k, v in self.facets.items()
-            if k in facets and facets[k] != "*"
+            their_facet: [
+                self.values.get(our_facet, {}).get(v, v)
+                for v in facets[our_facet]
+            ]
+            for our_facet, their_facet in self.facets.items()
+            if our_facet in facets
         }
         # TODO: filter by timerange
         try:
@@ -99,32 +110,54 @@ def find_data(self, **facets: FacetValue) -> list[IntakeESGFDataset]:
             )
             return []
 
+        # Return a list of datasets, with one IntakeESGFDataset per dataset_id.
+        result: list[IntakeESGFDataset] = []
+
+        # These are the keys in the dict[str, xarray.Dataset] returned by
+        # `intake_esgf.ESGFCatalog.to_dataset_dict`. Taken from:
+        # https://github.com/esgf2-us/intake-esgf/blob/c34124e54078e70ef271709a6d158edb22bcdb96/intake_esgf/catalog.py#L523-L528
         self.catalog.df["key"] = self.catalog.df.apply(
             lambda row: ".".join(
                 [row[f] for f in self.catalog.project.master_id_facets()],
             ),
             axis=1,
         )
         inverse_values = {
-            facet: {v: k}
-            for facet in self.values
-            for k, v in self.values[facet].items()
+            our_facet: {
+                their_value: our_value
+                for our_value, their_value in self.values[our_facet].items()
+            }
+            for our_facet in self.values
         }
-        datasets = []
         for _, row in self.catalog.df.iterrows():
             dataset_id = row["key"]
             # Subset the catalog to a single dataset.
             cat = self.catalog.clone()
-            cat.project = self.catalog.project
             cat.df = self.catalog.df[self.catalog.df.key == dataset_id]
-            facets = {
-                k: inverse_values.get(k, {}).get(row[v], row[v])
-                for k, v in self.facets.items()
+            # Discard all but the latest version. It is not clear how/if
+            # `intake_esgf.ESGFCatalog.to_dataset_dict` supports multiple versions.
+            cat.df = cat.df[cat.df.version == cat.df.version.max()]
+            cat.project = self.catalog.project
+            if "short_name" in facets:
+                cat.last_search[self.facets["short_name"]] = facets[
+                    "short_name"
+                ]
+            # Retrieve "our" facets associated with the dataset_id.
+            our_values: dict[str, list] = {}
+            for our_facet, their_facet in self.facets.items():
+                if their_facet in row:
+                    # Wrap a plain string so it is not split into characters.
+                    vals = row[their_facet]
+                    vals = [vals] if isinstance(vals, str) else vals
+                    to_ours = inverse_values.get(our_facet, {})
+                    our_values[our_facet] = [to_ours.get(v, v) for v in vals]
+            dataset_facets = {
+                f: v[0] if len(v) == 1 else v for f, v in our_values.items()
             }
             dataset = IntakeESGFDataset(
                 name=dataset_id,
-                facets=facets,
+                facets=dataset_facets,
                 catalog=cat,
             )
-            datasets.append(dataset)
-        return datasets
+            result.append(dataset)
+        return result
diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py
@@ -5,7 +5,6 @@
 import copy
 import inspect
 import logging
-from pathlib import Path
 from pprint import pformat
 from typing import TYPE_CHECKING, Any
 
@@ -97,6 +96,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
+    from pathlib import Path
 
     from dask.delayed import Delayed
 
@@ -399,7 +399,7 @@ def _run_preproc_function(function, items, kwargs, input_files=None):
             )
 
         # Make sure that the arguments are indexable
-        if isinstance(items, (PreprocessorFile, Cube, str, Path)):
+        if isinstance(items, (PreprocessorFile, Cube, DataElement)):
             items = [items]
         if isinstance(items, set):
             items = list(items)