From 4ec6331709b30e3deaea86dc124b654472cb46ba Mon Sep 17 00:00:00 2001
From: Bouwe Andela <b.andela@esciencecenter.nl>
Date: Sun, 16 Mar 2025 20:36:21 +0100
Subject: [PATCH 1/5] Improve support for obs4MIPs

---
 esmvaltool/cmorizers/data/cmorizer.py  |   4 +-
 esmvaltool/cmorizers/data/utilities.py | 399 ++++++++++++++++++++++---
 2 files changed, 360 insertions(+), 43 deletions(-)

diff --git a/esmvaltool/cmorizers/data/cmorizer.py b/esmvaltool/cmorizers/data/cmorizer.py
index b06544ab5f..299ef5bea0 100755
--- a/esmvaltool/cmorizers/data/cmorizer.py
+++ b/esmvaltool/cmorizers/data/cmorizer.py
@@ -21,7 +21,9 @@
 from esmvalcore.config._logging import configure_logging
 
 from esmvaltool import ESMValToolDeprecationWarning
-from esmvaltool.cmorizers.data.utilities import read_cmor_config
+from esmvaltool.cmorizers.data.utilities import (
+    read_cmor_config,
+)
 
 logger = logging.getLogger(__name__)
 datasets_file = os.path.join(os.path.dirname(__file__), "datasets.yml")
diff --git a/esmvaltool/cmorizers/data/utilities.py b/esmvaltool/cmorizers/data/utilities.py
index 4a504cb8c7..de1868f4be 100644
--- a/esmvaltool/cmorizers/data/utilities.py
+++ b/esmvaltool/cmorizers/data/utilities.py
@@ -2,21 +2,29 @@
 
 import datetime
 import gzip
+import json
 import logging
 import os
 import re
 import shutil
+import uuid
+from collections.abc import Callable
 from contextlib import contextmanager
+from functools import lru_cache
 from pathlib import Path
 
+import esmvalcore.cmor
 import iris
 import numpy as np
 import yaml
 from cf_units import Unit
 from dask import array as da
+from esmvalcore.cmor.check import CheckLevels, CMORCheckError, cmor_check
 from esmvalcore.cmor.table import CMOR_TABLES
+from esmvalcore.config import CFG
 from iris.cube import Cube
 
+import esmvaltool
 from esmvaltool import __file__ as esmvaltool_file
 from esmvaltool import __version__ as version
 
@@ -312,12 +320,284 @@ def read_cmor_config(dataset):
     )
     with open(reg_path, encoding="utf-8") as file:
         cfg = yaml.safe_load(file)
-    cfg["cmor_table"] = CMOR_TABLES[cfg["attributes"]["project_id"]]
-    if "comment" not in cfg["attributes"]:
-        cfg["attributes"]["comment"] = ""
+    attributes = cfg["attributes"]
+    if attributes.get("activity_id", "") == "obs4MIPs":
+        # Fill in various attributes automatically.
+        attributes["project_id"] = "obs4MIPs"
+        attributes["tier"] = "1"
+        attributes["source_id"] = dataset
+        cv = load_controlled_vocabulary("obs4MIPs")
+        for key, value in cv["source_id"][dataset].items():
+            attributes[key] = value
+        attributes["institution"] = cv["institution_id"][
+            attributes["institution_id"]
+        ]
+    elif "comment" not in attributes:
+        attributes["comment"] = ""
+
+    cfg["cmor_table"] = CMOR_TABLES[attributes["project_id"]]
+
     return cfg
 
 
+# See https://zenodo.org/records/11500474 for the obs4MIPs specification
+# See https://github.com/PCMDI/obs4MIPs-cmor-tables for the obs4MIPs CMOR tables
+
+DRS_ATTRIBUTE = "^[a-zA-Z0-9-]+$"
+FREE_FORM_ATTRIBUTE = ".*"
+
+
+@lru_cache
+def load_controlled_vocabulary(project: str) -> dict:
+    project_config = yaml.safe_load(
+        CFG["config_developer_file"].read_text(encoding="utf-8")
+    )[project]
+    install_dir = os.path.dirname(os.path.realpath(esmvalcore.cmor.__file__))
+    cmor_type = project_config.get("cmor_type", "CMIP5")
+    default_path = os.path.join(install_dir, "tables", cmor_type.lower())
+    tables_path = project_config.get("cmor_path", default_path)
+    tables_path = os.path.expandvars(os.path.expanduser(tables_path))
+    if not os.path.exists(tables_path):
+        tables_path = os.path.join(install_dir, "tables", tables_path)
+    cv_paths = list(Path(tables_path, "Tables").glob("*_CV.json"))
+    if not cv_paths:
+        return {}
+    cv_path = cv_paths[0]
+    cv = json.loads(cv_path.read_text(encoding="utf-8"))
+    return cv["CV"]
+
+
+def check_with_controlled_vocabulary(
+    project: str,
+    attribute: str,
+    value: str,
+    attributes: dict[str, str],
+) -> bool:
+    cv = load_controlled_vocabulary(project)
+    values = cv[attribute]
+    if attribute in values:
+        values = values[attribute]
+    return value in values
+
+
+def check_institutution(
+    project: str,
+    attribute: str,
+    value: str,
+    attributes: dict[str, str],
+) -> bool:
+    cv = load_controlled_vocabulary(project)
+    institution_id = attributes["institution_id"]
+    return value == cv["institution_id"][institution_id]
+
+
+def create_source_checker(
+    source_attr: str,
+) -> Callable:
+    def checker(
+        project: str, attribute: str, value: str, attributes: dict[str, str]
+    ) -> bool:
+        cv = load_controlled_vocabulary(project)
+        source_id = attributes["source_id"]
+        return value == cv["source_id"][source_id][source_attr]
+
+    return checker
+
+
+def check_source_label(
+    project: str,
+    attribute: str,
+    value: str,
+    attributes: dict[str, str],
+) -> bool:
+    source_id = attributes["source_id"]
+    return source_id.startswith(f"{value}-")
+
+
+def check_variant_label(
+    project: str,
+    attribute: str,
+    value: str,
+    attributes: dict[str, str],
+) -> bool:
+    institution_id = attributes["institution_id"]
+    return (value == institution_id) or re.match(
+        f"^{institution_id}-[a-zA-Z0-9-]+$", value
+    )
+
+
+def check_datetime(
+    project: str,
+    attribute: str,
+    value: str,
+    attributes: dict[str, str],
+) -> bool:
+    try:
+        datetime.datetime.fromisoformat(value)
+    except ValueError as exc:
+        logger.error("Invalid datetime format '%s'", exc)
+        return False
+    return True
+
+
+REQUIRED_GLOBAL_ATTRIBUTES = {
+    "obs4MIPs": {
+        "activity_id": "obs4MIPs",
+        "contact": FREE_FORM_ATTRIBUTE,
+        "creation_date": check_datetime,
+        "dataset_contributor": FREE_FORM_ATTRIBUTE,
+        "data_specs_version": FREE_FORM_ATTRIBUTE,  # TODO: automate, this should be the GH release of obs4MIPs CMOR tables
+        "frequency": check_with_controlled_vocabulary,
+        "grid": FREE_FORM_ATTRIBUTE,
+        "grid_label": check_with_controlled_vocabulary,
+        "institution": check_institutution,
+        "institution_id": check_with_controlled_vocabulary,
+        "license": FREE_FORM_ATTRIBUTE,
+        "nominal_resolution": check_with_controlled_vocabulary,
+        "processing_code_location": FREE_FORM_ATTRIBUTE,  # TODO: automate and add check
+        "product": check_with_controlled_vocabulary,
+        "realm": check_with_controlled_vocabulary,
+        "references": FREE_FORM_ATTRIBUTE,
+        "region": create_source_checker("region"),
+        "source": create_source_checker("source"),
+        "source_id": check_with_controlled_vocabulary,
+        "source_label": check_source_label,
+        "source_type": create_source_checker("source_type"),
+        "source_version_number": create_source_checker(
+            "source_version_number"
+        ),
+        "tracking_id": FREE_FORM_ATTRIBUTE,  # TODO: improve check
+        "variable_id": DRS_ATTRIBUTE,
+        "variant_label": check_variant_label,
+    }
+}
+
+OPTIONAL_GLOBAL_ATTRIBUTES = {
+    "obs4MIPs": {
+        "comment": FREE_FORM_ATTRIBUTE,
+        "external_variables": check_with_controlled_vocabulary,
+        "history": FREE_FORM_ATTRIBUTE,
+        "source_data_notes": FREE_FORM_ATTRIBUTE,
+        # TODO: Maybe we can add the two attributes below based on info from
+        # the automatic download.
+        "source_data_retrieval_date": check_datetime,
+        "source_data_url": FREE_FORM_ATTRIBUTE,
+        "title": FREE_FORM_ATTRIBUTE,
+        "variant_info": FREE_FORM_ATTRIBUTE,
+    }
+}
+
+
+def check_global_attributes(project: str, attributes: dict[str, str]) -> bool:
+    """Check if the required attributes are available for the project."""
+    success = True
+    # Check that required attributes are present.
+    for attr in REQUIRED_GLOBAL_ATTRIBUTES.get(project, {}):
+        if attr not in attributes:
+            logger.error("Missing global attribute '%s'", attr)
+            success = False
+
+    # Check attribute values.
+    attr_definitions = REQUIRED_GLOBAL_ATTRIBUTES.get(
+        project, {}
+    ) | OPTIONAL_GLOBAL_ATTRIBUTES.get(project, {})
+
+    for attr, checker in attr_definitions.items():
+        if attr in attributes:
+            value = attributes[attr]
+            if not isinstance(checker, str):
+                result = checker(project, attr, value, attributes)
+            else:
+                result = re.match(checker, value)
+
+            if not result:
+                logger.error(
+                    "Invalid value '%s' for attribute '%s', expected a value "
+                    "matching '%s'",
+                    value,
+                    attr,
+                    checker,
+                )
+                success = False
+
+    return success
+
+
+def _get_attr_from_field_coord(ncfield, coord_name, attr):
+    if coord_name is not None:
+        attrs = ncfield.cf_group[coord_name].cf_attrs()
+        attr_val = [value for (key, value) in attrs if key == attr]
+        if attr_val:
+            return attr_val[0]
+    return None
+
+
+def _load_callback(raw_cube, field, _):
+    """Use this callback to fix anything Iris tries to break."""
+    for coord in raw_cube.coords():
+        # Iris chooses to change longitude and latitude units to degrees
+        # regardless of value in file, so reinstating file value
+        if coord.standard_name in ["longitude", "latitude"]:
+            units = _get_attr_from_field_coord(field, coord.var_name, "units")
+            if units is not None:
+                coord.units = units
+
+
+def _check_formatting(filename: str, attributes: dict[str, str]) -> None:
+    """Run final cmorization checks."""
+    project = attributes["project_id"]
+    logger.info("Checking compliance with '%s' project standards", project)
+    cube = iris.load_cube(filename, callback=_load_callback)
+
+    attribute_success = check_global_attributes(
+        project, cube.attributes.globals
+    )
+
+    try:
+        cmor_check(
+            cube=cube,
+            cmor_table=project,
+            mip=attributes["mip"],
+            short_name=cube.var_name,
+            frequency=cube.attributes.globals.get("frequency"),
+            check_level=CheckLevels.STRICT,
+        )
+    except CMORCheckError as exc:
+        logger.error("%s", exc)
+        cmor_check_success = False
+    else:
+        cmor_check_success = True
+
+    success = attribute_success and cmor_check_success
+    msg = (
+        f"Data in file {filename} is {'' if success else 'not '}"
+        f"compliant with '{project}' project standards"
+    )
+    if success:
+        logger.info(msg)
+    else:
+        raise ValueError(msg)
+    # TODO: add concatenate test
+    # TODO: add time coverage test
+
+
+FILENAME_TEMPLATE = {
+    "obs4MIPs": "{variable_id}_{frequency}_{source_id}_{variant_label}_{grid_label}",
+    "OBS6": "{project_id}_{dataset_id}_{modeling_realm}_{version}_{mip}_{variable_id}",
+    "OBS": "{project_id}_{dataset_id}_{modeling_realm}_{version}_{mip}_{variable_id}",
+}
+
+
+def get_output_filename(attrs: dict[str, str], time_range: str | None) -> str:
+    """Get the output filename."""
+    project = attrs["project_id"]
+    filename = FILENAME_TEMPLATE[project].format(**attrs)
+    if time_range is not None:
+        filename = f"{filename}_{time_range}"
+    filename = f"{filename}.nc"
+    return filename
+
+
 def save_variable(cube, var, outdir, attrs, **kwargs):
     """Saver function.
 
@@ -341,15 +621,29 @@ def save_variable(cube, var, outdir, attrs, **kwargs):
     **kwargs: kwargs
         Keyword arguments to be passed to `iris.save`
     """
+    if var != cube.var_name:
+        msg = (
+            f"Attempted to save cube with var_name '{cube.var_name}' as "
+            f"variable '{var}'"
+        )
+        raise ValueError(msg)
+
+    # Set global attributes.
+    attrs["variable_id"] = cube.var_name
+    set_global_atts(cube, attrs)
+
+    # Ensure correct dtypes.
     fix_dtype(cube)
-    # CMOR standard
+
+    # Determine the output filename.
     try:
         time = cube.coord("time")
     except iris.exceptions.CoordinateNotFoundError:
         time_suffix = None
     else:
         if (
-            len(time.points) == 1 and "mon" not in cube.attributes.get("mip")
+            len(time.points) == 1
+            and "mon" not in cube.attributes.get("mip", "")
         ) or cube.attributes.get("frequency") == "yr":
             year = str(time.cell(0).point.year)
             time_suffix = "-".join([year + "01", year + "12"])
@@ -362,23 +656,18 @@ def save_variable(cube, var, outdir, attrs, **kwargs):
             )
             time_suffix = "-".join([date1, date2])
 
-    name_elements = [
-        attrs["project_id"],
-        attrs["dataset_id"],
-        attrs["modeling_realm"],
-        attrs["version"],
-        attrs["mip"],
-        var,
-    ]
-    if time_suffix:
-        name_elements.append(time_suffix)
-    file_name = "_".join(name_elements) + ".nc"
+    file_name = get_output_filename(attrs, time_suffix)
     file_path = os.path.join(outdir, file_name)
     logger.info("Saving: %s", file_path)
+
+    # Save the cube.
     status = "lazy" if cube.has_lazy_data() else "realized"
     logger.info("Cube has %s data [lazy is preferred]", status)
     iris.save(cube, file_path, fill_value=1e20, **kwargs)
 
+    # Check that the cube complies with the CMOR tables for the project.
+    _check_formatting(file_path, attrs)
+
 
 def extract_doi_value(tags):
     """Extract doi(s) from a bibtex entry."""
@@ -409,43 +698,69 @@ def extract_doi_value(tags):
     return ", ".join(reference_doi)
 
 
+def _get_processing_code_location() -> str:
+    # TODO: make sure current working dir is not dirty and replace version
+    # by commit that is available online
+    version = ".".join(esmvaltool.__version__.split(".", 3)[:3])
+    return f"https://github.com/ESMValGroup/ESMValTool/tree/{version}"
+
+
 def set_global_atts(cube, attrs):
     """Complete the cmorized file with global metadata."""
     logger.debug("Setting global metadata...")
     attrs = dict(attrs)
     cube.attributes.clear()
-    timestamp = datetime.datetime.utcnow()
-    timestamp_format = "%Y-%m-%d %H:%M:%S"
+    timestamp = datetime.datetime.now(datetime.timezone.utc)
+    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
     now_time = timestamp.strftime(timestamp_format)
 
     # Necessary attributes
-    try:
+    if attrs["project_id"] == "obs4MIPs":
         glob_dict = {
-            "title": (
-                f"{attrs.pop('dataset_id')} data reformatted for "
-                f"ESMValTool v{version}"
-            ),
-            "version": attrs.pop("version"),
-            "tier": str(attrs.pop("tier")),
-            "source": attrs.pop("source"),
-            "reference": extract_doi_value(attrs.pop("reference")),
-            "comment": attrs.pop("comment"),
-            "user": os.environ.get("USER", "unknown user"),
-            "host": os.environ.get("HOSTNAME", "unknown host"),
-            "history": f"Created on {now_time}",
-            "project_id": attrs.pop("project_id"),
+            "creation_date": now_time,
+            "tracking_id": f"hdl:21.14102/{uuid.uuid4()}",
+            "processing_code_location": _get_processing_code_location(),
+            "variable_id": cube.var_name,
         }
-    except KeyError as original_error:
-        msg = (
-            "All CMORized datasets need the global attributes "
-            "'dataset_id', 'version', 'tier', 'source', 'reference', "
-            "'comment' and 'project_id' "
-            "specified in the configuration file"
-        )
-        raise KeyError(msg) from original_error
+        required_keys = set(REQUIRED_GLOBAL_ATTRIBUTES["obs4MIPs"])
+        for key in required_keys | set(OPTIONAL_GLOBAL_ATTRIBUTES["obs4MIPs"]):
+            if key in attrs:
+                glob_dict[key] = attrs[key]
+        missing = required_keys - set(glob_dict)
+        if missing:
+            msg = (
+                "The following required keys are missing from the "
+                f"configuration file: {', '.join(sorted(missing))}"
+            )
+            raise KeyError(msg)
+    else:
+        try:
+            glob_dict = {
+                "title": (
+                    f"{attrs.pop('dataset_id')} data reformatted for "
+                    f"ESMValTool v{version}"
+                ),
+                "version": attrs.pop("version"),
+                "tier": str(attrs.pop("tier")),
+                "source": attrs.pop("source"),
+                "reference": extract_doi_value(attrs.pop("reference")),
+                "comment": attrs.pop("comment"),
+                "user": os.environ.get("USER", "unknown user"),
+                "host": os.environ.get("HOSTNAME", "unknown host"),
+                "history": f"Created on {now_time}",
+                "project_id": attrs.pop("project_id"),
+            }
+        except KeyError as original_error:
+            msg = (
+                "All CMORized datasets need the global attributes "
+                "'dataset_id', 'version', 'tier', 'source', 'reference', "
+                "'comment' and 'project_id' "
+                "specified in the configuration file"
+            )
+            raise KeyError(msg) from original_error
+        # Additional attributes
+        glob_dict.update(attrs)
 
-    # Additional attributes
-    glob_dict.update(attrs)
     cube.attributes.globals = glob_dict
 
 

From 1a5b6f15044c7f05c6439ce29cad1a3ae5bdf63c Mon Sep 17 00:00:00 2001
From: Bouwe Andela <b.andela@esciencecenter.nl>
Date: Thu, 3 Apr 2025 14:14:54 +0200
Subject: [PATCH 2/5] Various improvements

---
 esmvaltool/cmorizers/data/cmorizer.py  |   4 +-
 esmvaltool/cmorizers/data/utilities.py | 521 +++++++++++++++++--------
 2 files changed, 361 insertions(+), 164 deletions(-)

diff --git a/esmvaltool/cmorizers/data/cmorizer.py b/esmvaltool/cmorizers/data/cmorizer.py
index 299ef5bea0..b06544ab5f 100755
--- a/esmvaltool/cmorizers/data/cmorizer.py
+++ b/esmvaltool/cmorizers/data/cmorizer.py
@@ -21,9 +21,7 @@
 from esmvalcore.config._logging import configure_logging
 
 from esmvaltool import ESMValToolDeprecationWarning
-from esmvaltool.cmorizers.data.utilities import (
-    read_cmor_config,
-)
+from esmvaltool.cmorizers.data.utilities import read_cmor_config
 
 logger = logging.getLogger(__name__)
 datasets_file = os.path.join(os.path.dirname(__file__), "datasets.yml")
diff --git a/esmvaltool/cmorizers/data/utilities.py b/esmvaltool/cmorizers/data/utilities.py
index de1868f4be..41cefebbd1 100644
--- a/esmvaltool/cmorizers/data/utilities.py
+++ b/esmvaltool/cmorizers/data/utilities.py
@@ -8,8 +8,10 @@
 import re
 import shutil
 import uuid
-from collections.abc import Callable
+from abc import abstractmethod
+from collections.abc import Mapping
 from contextlib import contextmanager
+from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
 
@@ -323,15 +325,33 @@ def read_cmor_config(dataset):
     attributes = cfg["attributes"]
     if attributes.get("activity_id", "") == "obs4MIPs":
         # Fill in various attributes automatically.
+        timestamp = datetime.datetime.now(datetime.timezone.utc)
+        timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
+        now_time = timestamp.strftime(timestamp_format)
         attributes["project_id"] = "obs4MIPs"
         attributes["tier"] = "1"
         attributes["source_id"] = dataset
+        source_id_info = load_obs4mips_source_id_info()[dataset]
+        for key in ["institution_id", "source_label"]:
+            attributes[key] = re.sub(
+                "[^a-zA-Z0-9]+", "-", source_id_info[key]
+            ).strip("-")
         cv = load_controlled_vocabulary("obs4MIPs")
         for key, value in cv["source_id"][dataset].items():
             attributes[key] = value
         attributes["institution"] = cv["institution_id"][
             attributes["institution_id"]
         ]
+        if "references" not in attributes:
+            attributes["references"] = attributes["doi"]
+        if "creation_date" not in attributes:
+            attributes["creation_date"] = now_time
+        attributes["data_specs_version"] = "2.5"
+        attributes["processing_code_location"] = (
+            _get_processing_code_location()
+        )
+        if "version" not in attributes:
+            attributes["version"] = timestamp.strftime("v%Y%m%d")
     elif "comment" not in attributes:
         attributes["comment"] = ""
 
@@ -343,12 +363,8 @@ def read_cmor_config(dataset):
 # See https://zenodo.org/records/11500474 for the obs4MIPs specification
 # See https://github.com/PCMDI/obs4MIPs-cmor-tables for the obs4MIPs CMOR tables
 
-DRS_ATTRIBUTE = "^[a-zA-Z0-9-]+$"
-FREE_FORM_ATTRIBUTE = ".*"
-
 
-@lru_cache
-def load_controlled_vocabulary(project: str) -> dict:
+def find_cmor_tables_path(project) -> Path:
     project_config = yaml.safe_load(
         CFG["config_developer_file"].read_text(encoding="utf-8")
     )[project]
@@ -359,7 +375,13 @@ def load_controlled_vocabulary(project: str) -> dict:
     tables_path = os.path.expandvars(os.path.expanduser(tables_path))
     if not os.path.exists(tables_path):
         tables_path = os.path.join(install_dir, "tables", tables_path)
-    cv_paths = list(Path(tables_path, "Tables").glob("*_CV.json"))
+    return Path(tables_path)
+
+
+@lru_cache
+def load_controlled_vocabulary(project: str) -> dict:
+    tables_path = find_cmor_tables_path(project)
+    cv_paths = list((tables_path / "Tables").glob("*_CV.json"))
     if not cv_paths:
         return {}
     cv_path = cv_paths[0]
@@ -367,160 +389,306 @@ def load_controlled_vocabulary(project: str) -> dict:
     return cv["CV"]
 
 
-def check_with_controlled_vocabulary(
-    project: str,
-    attribute: str,
-    value: str,
-    attributes: dict[str, str],
-) -> bool:
-    cv = load_controlled_vocabulary(project)
-    values = cv[attribute]
-    if attribute in values:
-        values = values[attribute]
-    return value in values
+@lru_cache
+def load_obs4mips_source_id_info() -> dict[str, dict]:
+    table_path = find_cmor_tables_path("obs4MIPs") / "obs4MIPs_source_id.json"
+    table = json.loads(table_path.read_text(encoding="utf-8"))
+    return table["source_id"]
 
 
-def check_institutution(
-    project: str,
-    attribute: str,
-    value: str,
-    attributes: dict[str, str],
-) -> bool:
-    cv = load_controlled_vocabulary(project)
-    institution_id = attributes["institution_id"]
-    return value == cv["institution_id"][institution_id]
+class ValidationError(Exception):
+    pass
 
 
-def create_source_checker(
-    source_attr: str,
-) -> Callable:
-    def checker(
-        project: str, attribute: str, value: str, attributes: dict[str, str]
-    ) -> bool:
-        cv = load_controlled_vocabulary(project)
-        source_id = attributes["source_id"]
-        return value == cv["source_id"][source_id][source_attr]
+@dataclass
+class BaseAttributeValidator:
+    """Validator for global attributes."""
 
-    return checker
+    name: str
+    """The name of the attribute."""
+    required: bool
+    """Whether the attribute is required or not."""
 
+    def validate(self, attributes: Mapping[str, str]) -> None:
+        """Validate attributes."""
+        if self.name in attributes:
+            self.validate_values(attributes)
+        elif self.required:
+            msg = f"Required attribute '{self.name}' missing."
+            raise ValidationError(msg)
 
-def check_source_label(
-    project: str,
-    attribute: str,
-    value: str,
-    attributes: dict[str, str],
-) -> bool:
-    source_id = attributes["source_id"]
-    return source_id.startswith(f"{value}-")
+    @abstractmethod
+    def validate_values(self, attributes: Mapping[str, str]) -> None:
+        """Validate attribute values."""
 
 
-def check_variant_label(
-    project: str,
-    attribute: str,
-    value: str,
-    attributes: dict[str, str],
-) -> bool:
-    institution_id = attributes["institution_id"]
-    return (value == institution_id) or re.match(
-        f"^{institution_id}-[a-zA-Z0-9-]+$", value
-    )
+@dataclass
+class CVAttributeValidator(BaseAttributeValidator):
+    values: set[str]
+
+    def validate_values(self, attributes: Mapping[str, str]) -> None:
+        value = attributes[self.name]
+        if value not in self.values:
+            msg = (
+                f"Encountered an invalid value '{value}' for attribute "
+                f"'{self.name}'. Choose from: {','.join(sorted(self.values))}"
+            )
+            raise ValidationError(msg)
 
 
-def check_datetime(
-    project: str,
-    attribute: str,
-    value: str,
-    attributes: dict[str, str],
-) -> bool:
-    try:
-        datetime.datetime.fromisoformat(value)
-    except ValueError as exc:
-        logger.error("Invalid datetime format '%s'", exc)
-        return False
-    return True
-
-
-REQUIRED_GLOBAL_ATTRIBUTES = {
-    "obs4MIPs": {
-        "activity_id": "obs4MIPs",
-        "contact": FREE_FORM_ATTRIBUTE,
-        "creation_date": check_datetime,
-        "dataset_contributor": FREE_FORM_ATTRIBUTE,
-        "data_specs_version": FREE_FORM_ATTRIBUTE,  # TODO: automate, this should be the GH release of obs4MIPs CMOR tables
-        "frequency": check_with_controlled_vocabulary,
-        "grid": FREE_FORM_ATTRIBUTE,
-        "grid_label": check_with_controlled_vocabulary,
-        "institution": check_institutution,
-        "institution_id": check_with_controlled_vocabulary,
-        "license": FREE_FORM_ATTRIBUTE,
-        "nominal_resolution": check_with_controlled_vocabulary,
-        "processing_code_location": FREE_FORM_ATTRIBUTE,  # TODO: automate and add check
-        "product": check_with_controlled_vocabulary,
-        "realm": check_with_controlled_vocabulary,
-        "references": FREE_FORM_ATTRIBUTE,
-        "region": create_source_checker("region"),
-        "source": create_source_checker("source"),
-        "source_id": check_with_controlled_vocabulary,
-        "source_label": check_source_label,
-        "source_type": create_source_checker("source_type"),
-        "source_version_number": create_source_checker(
-            "source_version_number"
-        ),
-        "tracking_id": FREE_FORM_ATTRIBUTE,  # TODO: improve check
-        "variable_id": DRS_ATTRIBUTE,
-        "variant_label": check_variant_label,
+@dataclass
+class CVRelatedAttributeValidator(BaseAttributeValidator):
+    # source: CVAttributeValidator
+    source_name: str
+    values: dict[str, str]
+
+    def validate_values(self, attributes: Mapping[str, str]) -> None:
+        # self.source.validate(attributes)
+        source_value = attributes[self.source_name]
+        value = attributes[self.name]
+        if value != self.values[source_value]:
+            msg = (
+                f"Encountered an invalid value '{value}' for attribute "
+                f"{self.name}. It should be: {self.values[source_value]}"
+            )
+            raise ValidationError(msg)
+
+
+def load_cv_validators(project: str) -> list[BaseAttributeValidator]:
+    if project in ("OBS", "OBS6"):
+        # There is no controlled vocabulary for ESMValTool internal projects OBS6 and OBS.
+        return []
+
+    if project != "obs4MIPs":
+        msg = f"Reading the controlled vocabulary for project {project} is not (yet) supported."
+        raise NotImplementedError(msg)
+
+    cv = load_controlled_vocabulary(project)
+    validators: list[BaseAttributeValidator] = []
+    required_attributes = {
+        v.name for v in GLOBAL_ATTRIBUTE_VALIDATORS[project] if v.required
     }
-}
+    ignore = {"required_global_attributes", "license"}
+    for key, values in cv.items():
+        if key in ignore:
+            continue
+        if key in cv[key]:
+            # Some entries are nested.
+            values = cv[key][key]
+        if isinstance(values, list | dict):
+            validators.append(
+                CVAttributeValidator(
+                    key,
+                    values=set(values),
+                    required=key in required_attributes,
+                )
+            )
+
+    validators.append(
+        CVRelatedAttributeValidator(
+            "institution",
+            required=True,
+            source_name="institution_id",
+            values=cv["institution_id"],
+        )
+    )
+
+    # Create validators for attributes determined by the "source_id".
+    related_values: dict[str, dict[str, str]] = {}
+    for source_id, source_values in cv["source_id"].items():
+        for name, value in source_values.items():
+            if name not in related_values:
+                related_values[name] = {}
+            related_values[name][source_id] = value
+    for name, values in related_values.items():
+        validators.append(
+            CVRelatedAttributeValidator(
+                name,
+                required=True,
+                source_name="source_id",
+                values=values,
+            )
+        )
+
+    # from rich.pretty import pprint
+
+    # pprint(validators)
+    return validators
+
+
+@dataclass
+class DateTimeAttributeValidator(BaseAttributeValidator):
+    def validate_values(self, attributes: Mapping[str, str]) -> None:
+        value = attributes[self.name]
+        format = "%Y-%m-%dT%H:%M:%SZ"
+        try:
+            datetime.datetime.strptime(value, format)
+        except ValueError as exc:
+            msg = f"Invalid datetime encountered for attribute '{self.name}', message: {exc}"
+            raise ValidationError(msg) from None
+
+
+@dataclass
+class RegexAttributeValidator(BaseAttributeValidator):
+    pattern: str
+
+    def validate_values(self, attributes: Mapping[str, str]) -> None:
+        # if any(f"{{{a}}}" in self.pattern for a in attributes):
+        pattern = self.pattern.format(**attributes)
+        # else:
+        #     pattern = self.pattern
+        value = attributes[self.name]
+        if not re.match(pattern, value):
+            msg = (
+                f"Invalid attribute value '{value}' encountered for attribute "
+                f"'{self.name}'. It should match '{pattern}'"
+            )
+            raise ValidationError(msg)
+
+
+PATH_ATTRIBUTE = "^[a-zA-Z0-9-]+$"  # Used in file or directory names.
+PATH_ATTRIBUTE_WITH_SPACES = (
+    "^[a-zA-Z0-9- ]+$"  # Used in file or directory names after space removal.
+)
+DRS_ATTRIBUTE = "^[a-zA-Z0-9-_]+$"  # Data Reference Syntax (DRS) components.
+FREE_FORM_ATTRIBUTE = ".+"
 
-OPTIONAL_GLOBAL_ATTRIBUTES = {
-    "obs4MIPs": {
-        "comment": FREE_FORM_ATTRIBUTE,
-        "external_variables": check_with_controlled_vocabulary,
-        "history": FREE_FORM_ATTRIBUTE,
-        "source_data_notes": FREE_FORM_ATTRIBUTE,
+
+GLOBAL_ATTRIBUTE_VALIDATORS: dict[str, list[BaseAttributeValidator]] = {
+    "obs4MIPs": [
+        # Required attributes
+        RegexAttributeValidator(
+            "activity_id", required=True, pattern="^obs4MIPs$"
+        ),
+        RegexAttributeValidator(
+            "contact", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        DateTimeAttributeValidator("creation_date", required=True),
+        RegexAttributeValidator(
+            "dataset_contributor", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "data_specs_version", required=True, pattern=r"^2\.5$"
+        ),
+        # "doi" is not a required attribute according to the obs4MIPs spec,
+        # but it is for CMIP7 data so we add it for consistency.
+        RegexAttributeValidator("doi", required=True, pattern=r"^10\.[0-9]+"),
+        RegexAttributeValidator(
+            "frequency", required=True, pattern=PATH_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "grid", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "grid_label", required=True, pattern=PATH_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "institution", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "institution_id", required=True, pattern=PATH_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "license", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "nominal_resolution",
+            required=True,
+            pattern=PATH_ATTRIBUTE_WITH_SPACES,
+        ),
+        RegexAttributeValidator(
+            "processing_code_location",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
+        ),
+        RegexAttributeValidator(
+            "product", required=True, pattern=DRS_ATTRIBUTE
+        ),
+        RegexAttributeValidator("realm", required=True, pattern=DRS_ATTRIBUTE),
+        RegexAttributeValidator(
+            "references", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "region", required=True, pattern=DRS_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "source", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "source_id", required=True, pattern=PATH_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "source_id", required=True, pattern="^{source_label}-.+$"
+        ),
+        RegexAttributeValidator(
+            "source_label", required=True, pattern=DRS_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "source_type", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "source_version_number", required=True, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "tracking_id",
+            required=True,
+            pattern="^hdl:21.14102/[0-9a-f]{{8}}(-[0-9a-f]{{4}}){{3}}-[0-9a-f]{{12}}$",
+        ),
+        RegexAttributeValidator(
+            "variable_id", required=True, pattern=PATH_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "variant_label", required=True, pattern=PATH_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "variant_label", required=True, pattern="^{institution_id}(-.+)?$"
+        ),
+        # Optional attributes
+        RegexAttributeValidator(
+            "comment", required=False, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "external_variables", required=False, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "history", required=False, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "source_data_notes", required=False, pattern=FREE_FORM_ATTRIBUTE
+        ),
         # TODO: Maybe we can add the two attributes below based on info from
         # the automatic download.
-        "source_data_retrieval_date": check_datetime,
-        "source_data_url": FREE_FORM_ATTRIBUTE,
-        "title": FREE_FORM_ATTRIBUTE,
-        "variant_info": FREE_FORM_ATTRIBUTE,
-    }
+        DateTimeAttributeValidator(
+            "source_data_retrieval_date", required=False
+        ),
+        RegexAttributeValidator(
+            "source_data_url", required=False, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "title", required=False, pattern=FREE_FORM_ATTRIBUTE
+        ),
+        RegexAttributeValidator(
+            "variant_info", required=False, pattern=FREE_FORM_ATTRIBUTE
+        ),
+    ],
 }
 
 
-def check_global_attributes(project: str, attributes: dict[str, str]) -> bool:
-    """Check if the required attributes are available for the project."""
-    success = True
-    # Check that required attributes are present.
-    for attr in REQUIRED_GLOBAL_ATTRIBUTES.get(project, {}):
-        if attr not in attributes:
-            logger.error("Missing global attribute '%s'", attr)
-            success = False
-
-    # Check attribute values.
-    attr_definitions = REQUIRED_GLOBAL_ATTRIBUTES.get(
-        project, {}
-    ) | OPTIONAL_GLOBAL_ATTRIBUTES.get(project, {})
-
-    for attr, checker in attr_definitions.items():
-        if attr in attributes:
-            value = attributes[attr]
-            if not isinstance(checker, str):
-                result = checker(project, attr, value, attributes)
-            else:
-                result = re.match(checker, value)
-
-            if not result:
-                logger.error(
-                    "Invalid value '%s' for attribute '%s', expected a value "
-                    "matching '%s'",
-                    value,
-                    attr,
-                    checker,
-                )
-                success = False
-
-    return success
+def validate_global_attributes(
+    project: str, attributes: dict[str, str]
+) -> bool:
+    validators = GLOBAL_ATTRIBUTE_VALIDATORS.get(
+        project, []
+    ) + load_cv_validators(project)
+    messages = set()
+    for validator in validators:
+        try:
+            validator.validate(attributes)
+        except ValidationError as exc:
+            messages.add(str(exc))
+    if messages:
+        logger.error("%s", "\n".join(sorted(messages)))
+    return not (messages)
 
 
 def _get_attr_from_field_coord(ncfield, coord_name, attr):
@@ -543,13 +711,13 @@ def _load_callback(raw_cube, field, _):
                 coord.units = units
 
 
-def _check_formatting(filename: str, attributes: dict[str, str]) -> None:
+def _check_formatting(filename: Path, attributes: dict[str, str]) -> None:
     """Run final cmorization checks."""
     project = attributes["project_id"]
     logger.info("Checking compliance with '%s' project standards", project)
     cube = iris.load_cube(filename, callback=_load_callback)
 
-    attribute_success = check_global_attributes(
+    attribute_success = validate_global_attributes(
         project, cube.attributes.globals
     )
 
@@ -587,34 +755,57 @@ def _check_formatting(filename: str, attributes: dict[str, str]) -> None:
     "OBS": "{project_id}_{dataset_id}_{modeling_realm}_{version}_{mip}_{variable_id}",
 }
 
+DIRECTORY_TEMPLATE = {
+    "obs4MIPs": "{activity_id}/{institution_id}/{source_id}/{frequency}/{variable_id}/{nominal_resolution}/{version}",
+}
+
 
-def get_output_filename(attrs: dict[str, str], time_range: str | None) -> str:
+def get_output_filename(
+    outdir: str,
+    attrs: dict[str, str],
+    time_range: str | None,
+) -> Path:
     """Get the output filename."""
     project = attrs["project_id"]
+    if project in DIRECTORY_TEMPLATE:
+        dirname = DIRECTORY_TEMPLATE[project].format(
+            **{k: v.replace(" ", "") for k, v in attrs.items()}
+        )
+        # Ignore the TierX/dataset subdirectory set in the cmorizer.py script
+        # if the project defines its own directory structure.
+        out_path = Path(outdir).parent.parent / dirname
+    else:
+        out_path = Path(outdir)
     filename = FILENAME_TEMPLATE[project].format(**attrs)
     if time_range is not None:
         filename = f"{filename}_{time_range}"
     filename = f"{filename}.nc"
-    return filename
+    return out_path / filename
 
 
-def save_variable(cube, var, outdir, attrs, **kwargs):
+def save_variable(
+    cube: iris.cube.Cube,
+    var: str,
+    outdir: str,
+    attrs: dict[str, str],
+    **kwargs,
+) -> None:
     """Saver function.
 
     Saves iris cubes (data variables) in CMOR-standard named files.
 
     Parameters
     ----------
-    cube: iris.cube.Cube
+    cube:
         data cube to be saved.
 
-    var: str
+    var:
         Variable short_name e.g. ts or tas.
 
-    outdir: str
+    outdir:
         root directory where the file will be saved.
 
-    attrs: dict
+    attrs:
         dictionary holding cube metadata attributes like
         project_id, version etc.
 
@@ -656,14 +847,14 @@ def save_variable(cube, var, outdir, attrs, **kwargs):
             )
             time_suffix = "-".join([date1, date2])
 
-    file_name = get_output_filename(attrs, time_suffix)
-    file_path = os.path.join(outdir, file_name)
+    file_path = get_output_filename(outdir, attrs, time_suffix)
     logger.info("Saving: %s", file_path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
 
     # Save the cube.
     status = "lazy" if cube.has_lazy_data() else "realized"
     logger.info("Cube has %s data [lazy is preferred]", status)
-    iris.save(cube, file_path, fill_value=1e20, **kwargs)
+    iris.save(cube, file_path, fill_value=1e20, compute=False, **kwargs)
 
     # Check that the cube complies with the CMOR tables for the project.
     _check_formatting(file_path, attrs)
@@ -699,8 +890,9 @@ def extract_doi_value(tags):
 
 
 def _get_processing_code_location() -> str:
-    # TODO: make sure current working dir is not dirty and replace version
-    # by commit that is available online
+    # Ideas for improvement:
+    # - make sure current working dir is not dirty
+    # - replace version by commit that is available online
     version = ".".join(esmvaltool.__version__.split(".", 3)[:3])
     return f"https://github.com/ESMValGroup/ESMValTool/tree/{version}"
 
@@ -717,13 +909,20 @@ def set_global_atts(cube, attrs):
     # Necessary attributes
     if attrs["project_id"] == "obs4MIPs":
         glob_dict = {
-            "creation_date": now_time,
             "tracking_id": f"hdl:21.14102/{uuid.uuid4()}",
-            "processing_code_location": _get_processing_code_location(),
             "variable_id": cube.var_name,
         }
-        required_keys = set(REQUIRED_GLOBAL_ATTRIBUTES["obs4MIPs"])
-        for key in required_keys | set(OPTIONAL_GLOBAL_ATTRIBUTES["obs4MIPs"]):
+        required_keys = {
+            v.name
+            for v in GLOBAL_ATTRIBUTE_VALIDATORS["obs4MIPs"]
+            if v.required
+        }
+        optional_keys = {
+            v.name
+            for v in GLOBAL_ATTRIBUTE_VALIDATORS["obs4MIPs"]
+            if not v.required
+        }
+        for key in required_keys | optional_keys:
             if key in attrs:
                 glob_dict[key] = attrs[key]
         missing = required_keys - set(glob_dict)

From 1102247e074d425c325c8fddcb6a26d43e405baf Mon Sep 17 00:00:00 2001
From: Bouwe Andela <b.andela@esciencecenter.nl>
Date: Thu, 3 Apr 2025 15:59:10 +0200
Subject: [PATCH 3/5] Use CMOR check level from configuration for OBS and OBS6

---
 esmvaltool/cmorizers/data/utilities.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/esmvaltool/cmorizers/data/utilities.py b/esmvaltool/cmorizers/data/utilities.py
index 41cefebbd1..be7de94c8b 100644
--- a/esmvaltool/cmorizers/data/utilities.py
+++ b/esmvaltool/cmorizers/data/utilities.py
@@ -721,6 +721,13 @@ def _check_formatting(filename: Path, attributes: dict[str, str]) -> None:
         project, cube.attributes.globals
     )
 
+    if project in ("OBS", "OBS6"):
+        # Use the configured check_level for older CMORizers to avoid breaking
+        # them.
+        check_level = CFG["check_level"]
+    else:
+        # Use strict checks for obs4MIPs
+        check_level = CheckLevels.STRICT
     try:
         cmor_check(
             cube=cube,
@@ -728,7 +735,7 @@ def _check_formatting(filename: Path, attributes: dict[str, str]) -> None:
             mip=attributes["mip"],
             short_name=cube.var_name,
             frequency=cube.attributes.globals.get("frequency"),
-            check_level=CheckLevels.STRICT,
+            check_level=check_level,
         )
     except CMORCheckError as exc:
         logger.error("%s", exc)
@@ -784,7 +791,7 @@ def get_output_filename(
 
 
 def save_variable(
-    cube: iris.cube.Cube,
+    cube: Cube,
     var: str,
     outdir: str,
     attrs: dict[str, str],

From 1a24aa03484be0dd62a7c5e6fd650ab506047fce Mon Sep 17 00:00:00 2001
From: Bouwe Andela <b.andela@esciencecenter.nl>
Date: Thu, 3 Apr 2025 16:10:44 +0200
Subject: [PATCH 4/5] Ignore CMORization errors in MERRA2

---
 tests/unit/cmorizers/obs/test_merra2.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/tests/unit/cmorizers/obs/test_merra2.py b/tests/unit/cmorizers/obs/test_merra2.py
index 3288df5b11..6de319c00b 100644
--- a/tests/unit/cmorizers/obs/test_merra2.py
+++ b/tests/unit/cmorizers/obs/test_merra2.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 from cf_units import Unit
+from esmvalcore.cmor.check import CheckLevels
+from esmvalcore.config import CFG
 
 from esmvaltool.cmorizers.data.formatters.datasets.merra2 import (
     _extract_variable,
@@ -205,9 +207,14 @@ def test_load_cube_pairwise_vars_wrong_oper(tmp_path):
     print(exc)
 
 
-def test_extract_variable(tmp_path):
+def test_extract_variable(tmp_path, monkeypatch):
     """Test variable extraction."""
     # call is _extract_variable(in_files, var, cfg, out_dir)
+
+    # It looks like CMORization is not done to a good enough quality to pass
+    # the CMOR checks, so relax them until this is fixed.
+    monkeypatch.setitem(CFG, "check_level", CheckLevels.IGNORE)
+
     path_cubes = tmp_path / "cubes.nc"
     cube_1 = _create_sample_cube()
     cube_1.var_name = "SWTDN"
@@ -236,8 +243,12 @@ def test_extract_variable(tmp_path):
     assert cmorized_cube.attributes["raw"] == "SWTDN"
 
 
-def test_extract_variable_pairs(tmp_path):
+def test_extract_variable_pairs(tmp_path, monkeypatch):
     """Test variable extraction."""
+    # It looks like CMORization is not done to a good enough quality to pass
+    # the CMOR checks, so relax them until this is fixed.
+    monkeypatch.setitem(CFG, "check_level", CheckLevels.IGNORE)
+
     path_cubes = tmp_path / "cubes.nc"
     cube_1 = _create_sample_cube()
     cube_1.var_name = "SWTDN"
@@ -282,8 +293,12 @@ def test_extract_variable_pairs(tmp_path):
         assert attr in cmorized_cube.attributes
 
 
-def test_vertical_levels(tmp_path):
+def test_vertical_levels(tmp_path, monkeypatch):
     """Test cases for cmorization with vertical levels."""
+    # It looks like CMORization is not done to a good enough quality to pass
+    # the CMOR checks, so relax them until this is fixed.
+    monkeypatch.setitem(CFG, "check_level", CheckLevels.IGNORE)
+
     path_cubes = tmp_path / "cubes.nc"
     cube_1 = _create_sample_cube()
     cube_1.var_name = "V"

From a04c06c54afc884497f629109b464ed68b113895 Mon Sep 17 00:00:00 2001
From: Bouwe Andela <b.andela@esciencecenter.nl>
Date: Thu, 3 Apr 2025 17:27:30 +0200
Subject: [PATCH 5/5] Code quality improvements

---
 esmvaltool/cmorizers/data/utilities.py | 222 ++++++++++++++++---------
 1 file changed, 148 insertions(+), 74 deletions(-)

diff --git a/esmvaltool/cmorizers/data/utilities.py b/esmvaltool/cmorizers/data/utilities.py
index be7de94c8b..1a48650542 100644
--- a/esmvaltool/cmorizers/data/utilities.py
+++ b/esmvaltool/cmorizers/data/utilities.py
@@ -315,7 +315,7 @@ def flip_dim_coord(cube, coord_name):
     cube.data = da.flip(cube.core_data(), axis=coord_idx)
 
 
-def read_cmor_config(dataset):
+def read_cmor_config(dataset: str) -> dict:
     """Read the associated dataset-specific config file."""
     reg_path = os.path.join(
         os.path.dirname(__file__), "cmor_config", dataset + ".yml"
@@ -336,10 +336,10 @@ def read_cmor_config(dataset):
             attributes[key] = re.sub(
                 "[^a-zA-Z0-9]+", "-", source_id_info[key]
             ).strip("-")
-        cv = load_controlled_vocabulary("obs4MIPs")
-        for key, value in cv["source_id"][dataset].items():
+        vocabulary = load_controlled_vocabulary("obs4MIPs")
+        for key, value in vocabulary["source_id"][dataset].items():
             attributes[key] = value
-        attributes["institution"] = cv["institution_id"][
+        attributes["institution"] = vocabulary["institution_id"][
             attributes["institution_id"]
         ]
         if "references" not in attributes:
@@ -364,7 +364,10 @@ def read_cmor_config(dataset):
 # See https://github.com/PCMDI/obs4MIPs-cmor-tables for the obs4MIPs CMOR tables
 
 
-def find_cmor_tables_path(project) -> Path:
+def find_cmor_tables_path(project: str) -> Path:
+    """Find the path to the CMOR tables."""
+    # Code copied from
+    # https://github.com/ESMValGroup/ESMValCore/blob/main/esmvalcore/cmor/table.py
     project_config = yaml.safe_load(
         CFG["config_developer_file"].read_text(encoding="utf-8")
     )[project]
@@ -380,24 +383,26 @@ def find_cmor_tables_path(project) -> Path:
 
 @lru_cache
 def load_controlled_vocabulary(project: str) -> dict:
+    """Load the controlled vocabulary."""
     tables_path = find_cmor_tables_path(project)
     cv_paths = list((tables_path / "Tables").glob("*_CV.json"))
     if not cv_paths:
         return {}
     cv_path = cv_paths[0]
-    cv = json.loads(cv_path.read_text(encoding="utf-8"))
-    return cv["CV"]
+    vocabulary = json.loads(cv_path.read_text(encoding="utf-8"))
+    return vocabulary["CV"]
 
 
 @lru_cache
 def load_obs4mips_source_id_info() -> dict[str, dict]:
+    """Load additional information from the obs4MIPs source_id table."""
     table_path = find_cmor_tables_path("obs4MIPs") / "obs4MIPs_source_id.json"
     table = json.loads(table_path.read_text(encoding="utf-8"))
     return table["source_id"]
 
 
-class ValidationError(Exception):
-    pass
+class AttributeValidationError(Exception):
+    """There was an error in a global NetCDF attribute."""
 
 
 @dataclass
@@ -415,7 +420,7 @@ def validate(self, attributes: Mapping[str, str]) -> None:
             self.validate_values(attributes)
         elif self.required:
             msg = f"Required attribute '{self.name}' missing."
-            raise ValidationError(msg)
+            raise AttributeValidationError(msg)
 
     @abstractmethod
     def validate_values(self, attributes: Mapping[str, str]) -> None:
@@ -424,26 +429,30 @@ def validate_values(self, attributes: Mapping[str, str]) -> None:
 
 @dataclass
 class CVAttributeValidator(BaseAttributeValidator):
+    """Validator for attributes defined by the controlled vocabulary."""
+
     values: set[str]
 
     def validate_values(self, attributes: Mapping[str, str]) -> None:
+        """Validate attribute values."""
         value = attributes[self.name]
         if value not in self.values:
             msg = (
                 f"Encountered an invalid value '{value}' for attribute "
                 f"'{self.name}'. Choose from: {','.join(sorted(self.values))}"
             )
-            raise ValidationError(msg)
+            raise AttributeValidationError(msg)
 
 
 @dataclass
 class CVRelatedAttributeValidator(BaseAttributeValidator):
-    # source: CVAttributeValidator
+    """Validator for attributes defined by the controlled vocabulary."""
+
     source_name: str
     values: dict[str, str]
 
     def validate_values(self, attributes: Mapping[str, str]) -> None:
-        # self.source.validate(attributes)
+        """Validate attribute values."""
         source_value = attributes[self.source_name]
         value = attributes[self.name]
         if value != self.values[source_value]:
@@ -451,10 +460,11 @@ def validate_values(self, attributes: Mapping[str, str]) -> None:
                 f"Encountered an invalid value '{value}' for attribute "
                 f"{self.name}. It should be: {self.values[source_value]}"
             )
-            raise ValidationError(msg)
+            raise AttributeValidationError(msg)
 
 
 def load_cv_validators(project: str) -> list[BaseAttributeValidator]:
+    """Load validators representing the controlled vocabulary."""
     if project in ("OBS", "OBS6"):
         # There is no controlled vocabulary for ESMValTool internal projects OBS6 and OBS.
         return []
@@ -463,39 +473,38 @@ def load_cv_validators(project: str) -> list[BaseAttributeValidator]:
         msg = f"Reading the controlled vocabulary for project {project} is not (yet) supported."
         raise NotImplementedError(msg)
 
-    cv = load_controlled_vocabulary(project)
+    vocabulary = load_controlled_vocabulary(project)
     validators: list[BaseAttributeValidator] = []
     required_attributes = {
         v.name for v in GLOBAL_ATTRIBUTE_VALIDATORS[project] if v.required
     }
     ignore = {"required_global_attributes", "license"}
-    for key, values in cv.items():
+    for key, values in vocabulary.items():
         if key in ignore:
             continue
-        if key in cv[key]:
+        if key in vocabulary[key]:
             # Some entries are nested.
-            values = cv[key][key]
-        if isinstance(values, list | dict):
-            validators.append(
-                CVAttributeValidator(
-                    key,
-                    values=set(values),
-                    required=key in required_attributes,
-                )
+            values = vocabulary[key][key]
+        validators.append(
+            CVAttributeValidator(
+                key,
+                values={values} if isinstance(values, str) else set(values),
+                required=key in required_attributes,
             )
+        )
 
     validators.append(
         CVRelatedAttributeValidator(
             "institution",
             required=True,
             source_name="institution_id",
-            values=cv["institution_id"],
+            values=vocabulary["institution_id"],
         )
     )
 
     # Create validators for attributes determined by the "source_id".
     related_values: dict[str, dict[str, str]] = {}
-    for source_id, source_values in cv["source_id"].items():
+    for source_id, source_values in vocabulary["source_id"].items():
         for name, value in source_values.items():
             if name not in related_values:
                 related_values[name] = {}
@@ -510,40 +519,40 @@ def load_cv_validators(project: str) -> list[BaseAttributeValidator]:
             )
         )
 
-    # from rich.pretty import pprint
-
-    # pprint(validators)
     return validators
 
 
 @dataclass
 class DateTimeAttributeValidator(BaseAttributeValidator):
+    """Validator for datetime attributes."""
+
     def validate_values(self, attributes: Mapping[str, str]) -> None:
+        """Validate attribute values."""
         value = attributes[self.name]
-        format = "%Y-%m-%dT%H:%M:%SZ"
+        datetime_format = "%Y-%m-%dT%H:%M:%SZ"  # Enforce ISO 8601 with UTC.
         try:
-            datetime.datetime.strptime(value, format)
+            datetime.datetime.strptime(value, datetime_format)
         except ValueError as exc:
             msg = f"Invalid datetime encountered for attribute '{self.name}', message: {exc}"
-            raise ValidationError(msg) from None
+            raise AttributeValidationError(msg) from None
 
 
 @dataclass
 class RegexAttributeValidator(BaseAttributeValidator):
+    """Validator for attributes based on regular expressions."""
+
     pattern: str
 
     def validate_values(self, attributes: Mapping[str, str]) -> None:
-        # if any(f"{{{a}}}" in self.pattern for a in attributes):
+        """Validate attribute values."""
         pattern = self.pattern.format(**attributes)
-        # else:
-        #     pattern = self.pattern
         value = attributes[self.name]
         if not re.match(pattern, value):
             msg = (
                 f"Invalid attribute value '{value}' encountered for attribute "
                 f"'{self.name}'. It should match '{pattern}'"
             )
-            raise ValidationError(msg)
+            raise AttributeValidationError(msg)
 
 
 PATH_ATTRIBUTE = "^[a-zA-Z0-9-]+$"  # Used in file or directory names.
@@ -558,32 +567,51 @@ def validate_values(self, attributes: Mapping[str, str]) -> None:
     "obs4MIPs": [
         # Required attributes
         RegexAttributeValidator(
-            "activity_id", required=True, pattern="^obs4MIPs$"
+            "activity_id",
+            required=True,
+            pattern="^obs4MIPs$",
         ),
         RegexAttributeValidator(
-            "contact", required=True, pattern=FREE_FORM_ATTRIBUTE
+            "contact",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
+        ),
+        DateTimeAttributeValidator(
+            "creation_date",
+            required=True,
         ),
-        DateTimeAttributeValidator("creation_date", required=True),
         RegexAttributeValidator(
-            "dataset_contributor", required=True, pattern=FREE_FORM_ATTRIBUTE
+            "dataset_contributor",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "data_specs_version", required=True, pattern=r"^2\.5$"
+            "data_specs_version",
+            required=True,
+            pattern=r"^2\.5$",
         ),
         # "doi" is not a required attribute according to the obs4MIPs spec,
         # but it is for CMIP7 data so we add it for consistency.
         RegexAttributeValidator("doi", required=True, pattern=r"^10\.[0-9]+"),
         RegexAttributeValidator(
-            "frequency", required=True, pattern=PATH_ATTRIBUTE
+            "frequency",
+            required=True,
+            pattern=PATH_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "grid", required=True, pattern=FREE_FORM_ATTRIBUTE
+            "grid",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "grid_label", required=True, pattern=PATH_ATTRIBUTE
+            "grid_label",
+            required=True,
+            pattern=PATH_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "institution", required=True, pattern=FREE_FORM_ATTRIBUTE
+            "institution",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
             "institution_id", required=True, pattern=PATH_ATTRIBUTE
@@ -606,28 +634,44 @@ def validate_values(self, attributes: Mapping[str, str]) -> None:
         ),
         RegexAttributeValidator("realm", required=True, pattern=DRS_ATTRIBUTE),
         RegexAttributeValidator(
-            "references", required=True, pattern=FREE_FORM_ATTRIBUTE
+            "references",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "region", required=True, pattern=DRS_ATTRIBUTE
+            "region",
+            required=True,
+            pattern=DRS_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "source", required=True, pattern=FREE_FORM_ATTRIBUTE
+            "source",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "source_id", required=True, pattern=PATH_ATTRIBUTE
+            "source_id",
+            required=True,
+            pattern=PATH_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "source_id", required=True, pattern="^{source_label}-.+$"
+            "source_id",
+            required=True,
+            pattern="^{source_label}-.+$",
         ),
         RegexAttributeValidator(
-            "source_label", required=True, pattern=DRS_ATTRIBUTE
+            "source_label",
+            required=True,
+            pattern=DRS_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "source_type", required=True, pattern=FREE_FORM_ATTRIBUTE
+            "source_type",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "source_version_number", required=True, pattern=FREE_FORM_ATTRIBUTE
+            "source_version_number",
+            required=True,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
             "tracking_id",
@@ -635,48 +679,71 @@ def validate_values(self, attributes: Mapping[str, str]) -> None:
             pattern="^hdl:21.14102/[0-9a-f]{{8}}(-[0-9a-f]{{4}}){{3}}-[0-9a-f]{{12}}$",
         ),
         RegexAttributeValidator(
-            "variable_id", required=True, pattern=PATH_ATTRIBUTE
+            "variable_id",
+            required=True,
+            pattern=PATH_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "variant_label", required=True, pattern=PATH_ATTRIBUTE
+            "variant_label",
+            required=True,
+            pattern=PATH_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "variant_label", required=True, pattern="^{institution_id}(-.+)?$"
+            "variant_label",
+            required=True,
+            pattern="^{institution_id}(-.+)?$",
         ),
         # Optional attributes
         RegexAttributeValidator(
-            "comment", required=False, pattern=FREE_FORM_ATTRIBUTE
+            "comment",
+            required=False,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "external_variables", required=False, pattern=FREE_FORM_ATTRIBUTE
+            "external_variables",
+            required=False,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "history", required=False, pattern=FREE_FORM_ATTRIBUTE
+            "history",
+            required=False,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "source_data_notes", required=False, pattern=FREE_FORM_ATTRIBUTE
+            "source_data_notes",
+            required=False,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         # TODO: Maybe we can add the two attributes below based on info from
         # the automatic download.
         DateTimeAttributeValidator(
-            "source_data_retrieval_date", required=False
+            "source_data_retrieval_date",
+            required=False,
         ),
         RegexAttributeValidator(
-            "source_data_url", required=False, pattern=FREE_FORM_ATTRIBUTE
+            "source_data_url",
+            required=False,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "title", required=False, pattern=FREE_FORM_ATTRIBUTE
+            "title",
+            required=False,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
         RegexAttributeValidator(
-            "variant_info", required=False, pattern=FREE_FORM_ATTRIBUTE
+            "variant_info",
+            required=False,
+            pattern=FREE_FORM_ATTRIBUTE,
         ),
     ],
 }
 
 
 def validate_global_attributes(
-    project: str, attributes: dict[str, str]
+    project: str,
+    attributes: dict[str, str],
 ) -> bool:
+    """Validate the global NetCDF attributes."""
     validators = GLOBAL_ATTRIBUTE_VALIDATORS.get(
         project, []
     ) + load_cv_validators(project)
@@ -684,11 +751,15 @@ def validate_global_attributes(
     for validator in validators:
         try:
             validator.validate(attributes)
-        except ValidationError as exc:
+        except AttributeValidationError as exc:
             messages.add(str(exc))
     if messages:
         logger.error("%s", "\n".join(sorted(messages)))
-    return not (messages)
+    return not messages
+
+
+# Code of the two functions below copied from
+# https://github.com/ESMValGroup/ESMValCore/blob/0a1292b0e3b181bb913242da7dc2798b50e7a892/esmvalcore/preprocessor/_io.py#L45-L66
 
 
 def _get_attr_from_field_coord(ncfield, coord_name, attr):
@@ -827,7 +898,6 @@ def save_variable(
         raise ValueError(msg)
 
     # Set global attributes.
-    attrs["variable_id"] = cube.var_name
     set_global_atts(cube, attrs)
 
     # Ensure correct dtypes.
@@ -854,6 +924,7 @@ def save_variable(
             )
             time_suffix = "-".join([date1, date2])
 
+    attrs["variable_id"] = cube.var_name
     file_path = get_output_filename(outdir, attrs, time_suffix)
     logger.info("Saving: %s", file_path)
     file_path.parent.mkdir(parents=True, exist_ok=True)
@@ -897,14 +968,17 @@ def extract_doi_value(tags):
 
 
 def _get_processing_code_location() -> str:
+    """Get a link to code used to CMORize the data."""
     # Ideas for improvement:
-    # - make sure current working dir is not dirty
-    # - replace version by commit that is available online
-    version = ".".join(esmvaltool.__version__.split(".", 3)[:3])
-    return f"https://github.com/ESMValGroup/ESMValTool/tree/{version}"
+    # - make sure current code dir is not dirty
+    # - replace version by commit that is available online (though this
+    #   guarantees nothing as it may still get garbage collected if it
+    #   becomes disconnected from existing branches/tags).
+    code_version = ".".join(esmvaltool.__version__.split(".", 3)[:3])
+    return f"https://github.com/ESMValGroup/ESMValTool/tree/{code_version}"
 
 
-def set_global_atts(cube, attrs):
+def set_global_atts(cube: Cube, attrs: dict[str, str]) -> None:
     """Complete the cmorized file with global metadata."""
     logger.debug("Setting global metadata...")
     attrs = dict(attrs)