diff --git a/changelog/219.feature.md b/changelog/219.feature.md new file mode 100644 index 000000000..13431a288 --- /dev/null +++ b/changelog/219.feature.md @@ -0,0 +1,3 @@ +Add `ref datasets fetch-obs4ref-data` CLI command to fetch datasets that are in the process of being published to obs4MIPs and are appropriately licensed for use within the REF. +The CLI command fetches the datasets and writes them to a local directory. +These datasets can then be ingested into the REF as obs4MIPs datasets. diff --git a/conftest.py b/conftest.py index 6dd6f7112..09e95a90d 100644 --- a/conftest.py +++ b/conftest.py @@ -93,16 +93,22 @@ def invoke_cli(): # stdout == output from commands runner = CliRunner(mix_stderr=False) - def _invoke_cli(args: list[str], expected_exit_code: int = 0) -> Result: + def _invoke_cli(args: list[str], expected_exit_code: int = 0, always_log: bool = False) -> Result: result = runner.invoke( app=cli.app, args=args, ) - if result.exit_code != expected_exit_code: + if always_log or result.exit_code != expected_exit_code: + print("## Command: ", " ".join(args)) + print("Exit code: ", result.exit_code) + print("Command stdout") print(result.stdout) + print("Command stderr") print(result.stderr) + print("## Command end") + if result.exit_code != expected_exit_code: if result.exception: raise result.exception raise ValueError(f"Expected exit code {expected_exit_code}, got {result.exit_code}") diff --git a/docs/getting_started.md b/docs/getting_started.md index 8e4a7e314..e68e96222 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -40,7 +40,7 @@ $ uv run ref This provides the ability to: -* **Ingest** new input datasetes +* **Ingest** new input datasets * **Solve** for the unique metrics executions that are required * **Execute** the metrics either locally or remotely diff --git a/packages/ref-core/src/cmip_ref_core/dataset_registry/__init__.py b/packages/ref-core/src/cmip_ref_core/dataset_registry/__init__.py new file mode 100644 index 000000000..562a0cac9 --- /dev/null +++ b/packages/ref-core/src/cmip_ref_core/dataset_registry/__init__.py @@ -0,0 +1,94 @@ +""" +Data registries for non-published reference data + +These data are placeholders until these data have been added to obs4MIPs. +The AR7 FT REF requires that reference datasets are openly licensed before it is included +in any published data catalogs. +""" + +import importlib.resources +import os +import pathlib +import shutil + +import pooch +from loguru import logger + +DATA_VERSION = "v3.9" +""" +Default version identifier for the datasets + +Changing this will bust any existing caches. +""" + + +def build_reference_data_registry( + version: str = DATA_VERSION, +) -> pooch.Pooch: + """ + Build a pooch registry of reference data associated with PMP that isn't currently in obs4MIPs. + + Currently we only have reference datasets published from PMP, + but this may change in the future. + + Parameters + ---------- + version : str + The version of the data. + + Changing the version will invalidate the cache and force a re-download of the data. + + Returns + ------- + pooch.Pooch + The pooch registry. + """ + registry = pooch.create( + path=pooch.os_cache("pmp"), + base_url="https://pcmdiweb.llnl.gov/pss/pmpdata/", + version=version, + env="REF_METRICS_PMP_DATA_DIR", + ) + registry.load_registry( + importlib.resources.open_binary("cmip_ref_core.dataset_registry", "pmp_reference.txt") + ) + return registry + + +def fetch_all_files(registry: pooch.Pooch, output_dir: pathlib.Path, symlink: bool = False) -> None: + """ + Fetch all files associated with a pooch registry and write them to an output directory. + + Pooch fetches, caches and validates the downloaded files. + Subsequent calls to this function will not refetch any previously downloaded files. + + Parameters + ---------- + registry + Pooch directory containing a set of files that should be fetched. + output_dir + The root directory to write the files to. + + The directory will be created if it doesn't exist, + and matching files will be overwritten. + symlink + If True, symlink all files to this directory. + Otherwise, perform a copy. + """ + output_dir.mkdir(parents=True, exist_ok=True) + + for key in registry.registry.keys(): + fetch_file = registry.fetch(key) + + linked_file = output_dir / key + linked_file.parent.mkdir(parents=True, exist_ok=True) + if not linked_file.exists(): # pragma: no cover + if symlink: + logger.info(f"Linking {key} to {linked_file}") + + os.symlink(fetch_file, linked_file) + else: + logger.info(f"Copying {key} to {linked_file}") + shutil.copy(fetch_file, linked_file) + else: + logger.info(f"File {linked_file} already exists. Skipping.") diff --git a/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/reference.txt b/packages/ref-core/src/cmip_ref_core/dataset_registry/pmp_reference.txt similarity index 50% rename from packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/reference.txt rename to packages/ref-core/src/cmip_ref_core/dataset_registry/pmp_reference.txt index 44542a615..8609d8d5d 100644 --- a/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/reference.txt +++ b/packages/ref-core/src/cmip_ref_core/dataset_registry/pmp_reference.txt @@ -1 +1,2 @@ obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc md5:99c8691e0f615dc4d79b4fb5e926cc76 +obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc md5:570ce90b3afd1d0b31690ae5dbe32d31 diff --git a/packages/ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py b/packages/ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py new file mode 100644 index 000000000..4033d26f7 --- /dev/null +++ b/packages/ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py @@ -0,0 +1,24 @@ +import pytest + +from cmip_ref_core.dataset_registry import build_reference_data_registry, fetch_all_files + + +@pytest.mark.parametrize("symlink", [True, False]) +def test_fetch_all_files(mocker, tmp_path, symlink): + downloaded_file = tmp_path / "out.txt" + downloaded_file.write_text("foo") + + registry = build_reference_data_registry() + registry.fetch = mocker.MagicMock(return_value=downloaded_file) + + fetch_all_files(registry, tmp_path, symlink=symlink) + assert registry.fetch.call_count == 2 + + expected_file = ( + tmp_path + / "obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc" # noqa: E501 + ) + + assert expected_file.exists() + assert expected_file.is_symlink() == symlink + assert expected_file.read_text() == "foo" diff --git a/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/__init__.py b/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/__init__.py deleted file mode 100644 index 4db56a358..000000000 --- a/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/__init__.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Data registries for PMP reference data and parameters. -""" - -import importlib.resources - -import pooch - -PMP_VERSION = "v3.9" -_DATASETS = { - "HadISST-1-1": "obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc", # noqa -} -"""Map of dataset names to reference registry paths.""" - - -def build_reference_data_registry(version: str) -> pooch.Pooch: - """ - Build a pooch registry of reference data associated with PMP that isn't currently in obs4MIPs. - - Parameters - ---------- - version : str - The version of the data. - - Changing the version will invalidate the cache and force a re-download of the data. - - Returns - ------- - pooch.Pooch - The pooch registry. - """ - registry = pooch.create( - path=pooch.os_cache("pmp"), - base_url="https://pcmdiweb.llnl.gov/pss/pmpdata/", - version=version, - env="REF_METRICS_PMP_DATA_DIR", - ) - registry.load_registry(importlib.resources.open_binary("cmip_ref_metrics_pmp.registry", "reference.txt")) - return registry - - -_REFERENCE_REGISTRY = build_reference_data_registry(version=PMP_VERSION) - - -def fetch_reference_data(dataset_id: str, registry: pooch.Pooch = _REFERENCE_REGISTRY) -> str: - """ - Fetch the reference data associated with the dataset ID. - - Parameters - ---------- - dataset_id : str - The dataset ID. - registry - The registry to use to fetch the reference data. - - If none is provided, the default registry is used. - - Returns - ------- - The path to the reference data. - """ - return registry.fetch(_DATASETS[dataset_id]) diff --git a/packages/ref/src/cmip_ref/cli/_logging.py b/packages/ref/src/cmip_ref/cli/_logging.py index 91dbb8ce4..f8dd7603a 100644 --- a/packages/ref/src/cmip_ref/cli/_logging.py +++ b/packages/ref/src/cmip_ref/cli/_logging.py @@ -1,6 +1,7 @@ import inspect import logging +import pooch from loguru import logger @@ -28,7 +29,10 @@ def capture_logging() -> None: Note that this replaces the root logger, so any other handlers attached to it will be removed. """ - # logger.debug("Capturing logging from the standard library") + # Pooch adds a handler to its own logger which circumvents the REF logger + pooch.get_logger().handlers.clear() + pooch.get_logger().addHandler(_InterceptHandler()) + logging.basicConfig(handlers=[_InterceptHandler()], level=0, force=True) diff --git a/packages/ref/src/cmip_ref/cli/datasets.py b/packages/ref/src/cmip_ref/cli/datasets.py index 2c63881d0..cb4c021d7 100644 --- a/packages/ref/src/cmip_ref/cli/datasets.py +++ b/packages/ref/src/cmip_ref/cli/datasets.py @@ -4,6 +4,7 @@ import errno import os +import shutil from collections.abc import Iterable from pathlib import Path from typing import Annotated @@ -17,6 +18,7 @@ from cmip_ref.models import Dataset from cmip_ref.solver import solve_metrics from cmip_ref.testing import SAMPLE_DATA_VERSION, fetch_sample_data +from cmip_ref_core.dataset_registry import build_reference_data_registry, fetch_all_files from cmip_ref_core.datasets import SourceDatasetType app = typer.Typer(help=__doc__) @@ -149,21 +151,44 @@ def ingest( # noqa: PLR0913 @app.command(name="fetch-sample-data") def _fetch_sample_data( - version: str = SAMPLE_DATA_VERSION, force_cleanup: bool = False, symlink: bool = False + version: Annotated[ + str, + "The version tag of the sample data to fetch. " + "Defaults to the current version of data expected by the test suite", + ] = SAMPLE_DATA_VERSION, + force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False, + symlink: Annotated[ + bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy") + ] = False, ) -> None: """ Fetch the sample data for the given version. - Parameters - ---------- - version - The version tag of the sample data to fetch. - - Defaults to the current version of data expected by the test suite - force_cleanup - If True, remove any existing files. - symlink : bool - If True, symlink in the data otherwise copy the files. + These data will be written into the test data directory. + This operation may fail if the test data directory does not exist, + as is the case for non-source-based installations. """ logger.info(f"Fetching data for version {version}") fetch_sample_data(version=version, force_cleanup=force_cleanup, symlink=symlink) + + +@app.command(name="fetch-obs4ref-data") +def fetch_obs4ref_data( + output_directory: Annotated[Path, typer.Option(help="Output directory where files will be saved")], + force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False, + symlink: Annotated[ + bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy") + ] = False, +) -> None: + """ + Fetch non-published Obs4MIPs data that is used by the REF + + These datasets have been verified to have open licenses + and are in the process of being added to Obs4MIPs. + """ + if force_cleanup and output_directory.exists(): + logger.warning(f"Removing existing directory {output_directory}") + shutil.rmtree(output_directory) + + data_registry = build_reference_data_registry() + fetch_all_files(data_registry, output_directory, symlink=symlink) diff --git a/packages/ref/src/cmip_ref/testing.py b/packages/ref/src/cmip_ref/testing.py index d2f6750f8..d7df2a787 100644 --- a/packages/ref/src/cmip_ref/testing.py +++ b/packages/ref/src/cmip_ref/testing.py @@ -3,13 +3,14 @@ """ import importlib.resources -import os import shutil from pathlib import Path import pooch from loguru import logger +from cmip_ref_core.dataset_registry import fetch_all_files + def _determine_test_directory() -> Path | None: expected = Path(__file__).parents[4] / "tests" / "test-data" @@ -68,7 +69,7 @@ def fetch_sample_data( logger.warning("Test data directory not found, skipping sample data fetch") return - sample_registry = _build_sample_data_registry(version) + sample_data_registry = _build_sample_data_registry(version) output_dir = TEST_DATA_DIR / "sample-data" version_file = output_dir / "version.txt" @@ -83,21 +84,7 @@ def fetch_sample_data( logger.warning("Removing existing sample data") shutil.rmtree(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - for key in sample_registry.registry.keys(): - fetch_file = sample_registry.fetch(key) - - linked_file = output_dir / key - linked_file.parent.mkdir(parents=True, exist_ok=True) - if not linked_file.exists(): # pragma: no cover - if symlink: - logger.info(f"Linking {key} to {linked_file}") - - os.symlink(fetch_file, linked_file) - else: - logger.info(f"Copying {key} to {linked_file}") - shutil.copy(fetch_file, linked_file) + fetch_all_files(sample_data_registry, output_dir, symlink) # Write out the current sample data version to the copying as complete with open(output_dir / "version.txt", "w") as fh: diff --git a/packages/ref/tests/unit/cli/test_datasets.py b/packages/ref/tests/unit/cli/test_datasets.py index 93acf0e5d..ad77834a4 100644 --- a/packages/ref/tests/unit/cli/test_datasets.py +++ b/packages/ref/tests/unit/cli/test_datasets.py @@ -1,5 +1,7 @@ from pathlib import Path +import pytest + from cmip_ref.datasets.cmip6 import CMIP6DatasetAdapter from cmip_ref.models import Dataset from cmip_ref.models.dataset import CMIP6Dataset, CMIP6File @@ -167,3 +169,44 @@ def test_fetch(self, mocker, invoke_cli): ) mock_fetch.assert_called_once_with(version="v0.1.0", force_cleanup=True, symlink=True) + + +@pytest.fixture(scope="function") +def mock_obs4ref(mocker): + mock_build_registry = mocker.patch("cmip_ref.cli.datasets.build_reference_data_registry") + mock_fetch = mocker.patch("cmip_ref.cli.datasets.fetch_all_files") + + return mock_build_registry, mock_fetch + + +class TestFetchObs4REFData: + def test_fetch_defaults(self, mock_obs4ref, invoke_cli, tmp_path): + mock_build_registry, mock_fetch = mock_obs4ref + + invoke_cli(["datasets", "fetch-obs4ref-data", "--output-directory", str(tmp_path)]) + + mock_fetch.assert_called_once_with(mock_build_registry(), tmp_path, symlink=False) + + def test_fetch_symlink(self, mock_obs4ref, invoke_cli, tmp_path): + mock_build_registry, mock_fetch = mock_obs4ref + invoke_cli(["datasets", "fetch-obs4ref-data", "--output-directory", str(tmp_path), "--symlink"]) + + mock_fetch.assert_called_once_with(mock_build_registry(), tmp_path, symlink=True) + + def test_fetch_force_cleanup(self, mock_obs4ref, invoke_cli, tmp_path): + assert tmp_path.exists() + + invoke_cli(["datasets", "fetch-obs4ref-data", "--output-directory", str(tmp_path), "--force-cleanup"]) + + assert not tmp_path.exists() + + def test_fetch_force_cleanup_missing(self, mock_obs4ref, invoke_cli, tmp_path): + invoke_cli( + [ + "datasets", + "fetch-obs4ref-data", + "--output-directory", + str(tmp_path / "missing"), + "--force-cleanup", + ] + ) diff --git a/stubs/pooch/__init__.pyi b/stubs/pooch/__init__.pyi index 00b78c1a1..463738937 100644 --- a/stubs/pooch/__init__.pyi +++ b/stubs/pooch/__init__.pyi @@ -1,6 +1,9 @@ +import logging from pathlib import Path from typing import IO, Any +def get_logger() -> logging.Logger: ... + class Pooch: registry: dict[str, dict[str, str]] diff --git a/tests/integration/test_ar7_ft.py b/tests/integration/test_ar7_ft.py index 695927652..b06a0301d 100644 --- a/tests/integration/test_ar7_ft.py +++ b/tests/integration/test_ar7_ft.py @@ -66,7 +66,8 @@ def test_solve_ar7_ft( # Solve # This will also create conda environments for the metric providers - invoke_cli(["--verbose", "solve", "--timeout", f"{60 * 60}"]) + # We always log the std out and stderr from the command as it is useful for debugging + invoke_cli(["--verbose", "solve", "--timeout", f"{60 * 60}"], always_log=True) execution_groups = db.session.query(MetricExecutionGroup).all() df = create_execution_dataframe(execution_groups) diff --git a/tests/test-data/.gitignore b/tests/test-data/.gitignore index 25caaabef..9fca7341a 100644 --- a/tests/test-data/.gitignore +++ b/tests/test-data/.gitignore @@ -1,2 +1,5 @@ # Regenerated using `make fetch-test-data` sample-data + +# Local location for obs4REF datasets +obs4ref