From 10cbc0afd49ee9d7172203b333729f95c5e8fddb Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Thu, 27 Mar 2025 14:35:58 -0600 Subject: [PATCH 01/11] feat: Enable fetching the data used by PMP that are not currently in Obs4MIPs. --- .../cmip_ref_metrics_pmp/registry/__init__.py | 62 -------------- packages/ref/src/cmip_ref/cli/datasets.py | 47 ++++++++--- .../ref/src/cmip_ref/registry/__init__.py | 83 +++++++++++++++++++ .../src/cmip_ref/registry/pmp_reference.txt} | 0 packages/ref/src/cmip_ref/testing.py | 21 +---- 5 files changed, 123 insertions(+), 90 deletions(-) delete mode 100644 packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/__init__.py create mode 100644 packages/ref/src/cmip_ref/registry/__init__.py rename packages/{ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/reference.txt => ref/src/cmip_ref/registry/pmp_reference.txt} (100%) diff --git a/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/__init__.py b/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/__init__.py deleted file mode 100644 index 4db56a358..000000000 --- a/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/__init__.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Data registries for PMP reference data and parameters. -""" - -import importlib.resources - -import pooch - -PMP_VERSION = "v3.9" -_DATASETS = { - "HadISST-1-1": "obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc", # noqa -} -"""Map of dataset names to reference registry paths.""" - - -def build_reference_data_registry(version: str) -> pooch.Pooch: - """ - Build a pooch registry of reference data associated with PMP that isn't currently in obs4MIPs. - - Parameters - ---------- - version : str - The version of the data. - - Changing the version will invalidate the cache and force a re-download of the data. - - Returns - ------- - pooch.Pooch - The pooch registry. - """ - registry = pooch.create( - path=pooch.os_cache("pmp"), - base_url="https://pcmdiweb.llnl.gov/pss/pmpdata/", - version=version, - env="REF_METRICS_PMP_DATA_DIR", - ) - registry.load_registry(importlib.resources.open_binary("cmip_ref_metrics_pmp.registry", "reference.txt")) - return registry - - -_REFERENCE_REGISTRY = build_reference_data_registry(version=PMP_VERSION) - - -def fetch_reference_data(dataset_id: str, registry: pooch.Pooch = _REFERENCE_REGISTRY) -> str: - """ - Fetch the reference data associated with the dataset ID. - - Parameters - ---------- - dataset_id : str - The dataset ID. - registry - The registry to use to fetch the reference data. - - If none is provided, the default registry is used. - - Returns - ------- - The path to the reference data. - """ - return registry.fetch(_DATASETS[dataset_id]) diff --git a/packages/ref/src/cmip_ref/cli/datasets.py b/packages/ref/src/cmip_ref/cli/datasets.py index 2c63881d0..2cbc315a9 100644 --- a/packages/ref/src/cmip_ref/cli/datasets.py +++ b/packages/ref/src/cmip_ref/cli/datasets.py @@ -4,6 +4,7 @@ import errno import os +import shutil from collections.abc import Iterable from pathlib import Path from typing import Annotated @@ -15,6 +16,7 @@ from cmip_ref.cli._utils import pretty_print_df from cmip_ref.datasets import get_dataset_adapter from cmip_ref.models import Dataset +from cmip_ref.registry import build_reference_data_registry, fetch_all_files from cmip_ref.solver import solve_metrics from cmip_ref.testing import SAMPLE_DATA_VERSION, fetch_sample_data from cmip_ref_core.datasets import SourceDatasetType @@ -149,21 +151,44 @@ def ingest( # noqa: PLR0913 @app.command(name="fetch-sample-data") def _fetch_sample_data( - version: str = SAMPLE_DATA_VERSION, force_cleanup: bool = False, symlink: bool = False + version: Annotated[ + str, + "The version tag of the sample data to fetch. " + "Defaults to the current version of data expected by the test suite", + ] = SAMPLE_DATA_VERSION, + force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False, + symlink: Annotated[ + bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy") + ] = False, ) -> None: """ Fetch the sample data for the given version. - Parameters - ---------- - version - The version tag of the sample data to fetch. - - Defaults to the current version of data expected by the test suite - force_cleanup - If True, remove any existing files. - symlink : bool - If True, symlink in the data otherwise copy the files. + These data will be written into the test data directory. + This operation may fail if the test data directory does not exist, + as is the case for non-source-based installations. """ logger.info(f"Fetching data for version {version}") fetch_sample_data(version=version, force_cleanup=force_cleanup, symlink=symlink) + + +@app.command(name="fetch-obs4ref-data") +def fetch_obs4ref_data( + output_directory: Annotated[Path, typer.Option(help="Output directory where files will be saved")], + force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False, + symlink: Annotated[ + bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy") + ] = False, +) -> None: + """ + Fetch non-published Obs4MIPs data that is used by the REF + + These datasets have been verified to have open licenses + and are in the process of being added to Obs4MIPs. + """ + if force_cleanup and output_directory.exists(): + logger.warning(f"Removing existing directory {output_directory}") + shutil.rmtree(output_directory) + + data_registry = build_reference_data_registry() + fetch_all_files(data_registry, output_directory, symlink=symlink) diff --git a/packages/ref/src/cmip_ref/registry/__init__.py b/packages/ref/src/cmip_ref/registry/__init__.py new file mode 100644 index 000000000..f0b0433ce --- /dev/null +++ b/packages/ref/src/cmip_ref/registry/__init__.py @@ -0,0 +1,83 @@ +""" +Data registries for PMP reference data + +These data are placeholders until these data have been added to obs4MIPs. +""" + +import importlib.resources +import os +import pathlib +import shutil + +import pooch +from loguru import logger + +DATA_VERSION = "v3.9" +""" +Default version identifier for the datasets + +Changing this will bust any existing caches. +""" + + +def build_reference_data_registry(version: str = DATA_VERSION) -> pooch.Pooch: + """ + Build a pooch registry of reference data associated with PMP that isn't currently in obs4MIPs. + + Parameters + ---------- + version : str + The version of the data. + + Changing the version will invalidate the cache and force a re-download of the data. + + Returns + ------- + pooch.Pooch + The pooch registry. + """ + registry = pooch.create( + path=pooch.os_cache("pmp"), + base_url="https://pcmdiweb.llnl.gov/pss/pmpdata/", + version=version, + env="REF_METRICS_PMP_DATA_DIR", + ) + registry.load_registry(importlib.resources.open_binary("cmip_ref.registry", "reference.txt")) + return registry + + +def fetch_all_files(registry: pooch.Pooch, output_dir: pathlib.Path, symlink: bool = False): + """ + Fetch all files associated with a pooch registry and write them to an output directory. + + Pooch fetches, caches and validates the downloaded files. + Subsequent calls to this function will not refetch any previously downloaded files. + + Parameters + ---------- + registry + Pooch directory containing a set of files that should be fetched. + output_dir + The root directory to write the files to. + + The directory will be created if it doesn't exist, + and matching files will be overwritten. + symlink + If True, symlink all files to this directory. + Otherwise, perform a copy. + """ + output_dir.mkdir(parents=True, exist_ok=True) + + for key in registry.registry.keys(): + fetch_file = registry.fetch(key) + + linked_file = output_dir / key + linked_file.parent.mkdir(parents=True, exist_ok=True) + if not linked_file.exists(): # pragma: no cover + if symlink: + logger.info(f"Linking {key} to {linked_file}") + + os.symlink(fetch_file, linked_file) + else: + logger.info(f"Copying {key} to {linked_file}") + shutil.copy(fetch_file, linked_file) diff --git a/packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/reference.txt b/packages/ref/src/cmip_ref/registry/pmp_reference.txt similarity index 100% rename from packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/reference.txt rename to packages/ref/src/cmip_ref/registry/pmp_reference.txt diff --git a/packages/ref/src/cmip_ref/testing.py b/packages/ref/src/cmip_ref/testing.py index d2f6750f8..477638b3b 100644 --- a/packages/ref/src/cmip_ref/testing.py +++ b/packages/ref/src/cmip_ref/testing.py @@ -3,13 +3,14 @@ """ import importlib.resources -import os import shutil from pathlib import Path import pooch from loguru import logger +from cmip_ref.registry import fetch_all_files + def _determine_test_directory() -> Path | None: expected = Path(__file__).parents[4] / "tests" / "test-data" @@ -68,7 +69,7 @@ def fetch_sample_data( logger.warning("Test data directory not found, skipping sample data fetch") return - sample_registry = _build_sample_data_registry(version) + sample_data_registry = _build_sample_data_registry(version) output_dir = TEST_DATA_DIR / "sample-data" version_file = output_dir / "version.txt" @@ -83,21 +84,7 @@ def fetch_sample_data( logger.warning("Removing existing sample data") shutil.rmtree(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - for key in sample_registry.registry.keys(): - fetch_file = sample_registry.fetch(key) - - linked_file = output_dir / key - linked_file.parent.mkdir(parents=True, exist_ok=True) - if not linked_file.exists(): # pragma: no cover - if symlink: - logger.info(f"Linking {key} to {linked_file}") - - os.symlink(fetch_file, linked_file) - else: - logger.info(f"Copying {key} to {linked_file}") - shutil.copy(fetch_file, linked_file) + fetch_all_files(sample_data_registry, output_dir, symlink) # Write out the current sample data version to the copying as complete with open(output_dir / "version.txt", "w") as fh: From e198d81db1bc7dc7ecbb25209f77a2d9b3916a29 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 08:55:20 -0600 Subject: [PATCH 02/11] docs: Fix typo --- docs/getting_started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 6f318e447..c41019d28 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -37,7 +37,7 @@ $ uv run ref This provides the ability to: -* **Ingest** new input datasetes +* **Ingest** new input datasets * **Solve** for the unique metrics executions that are required * **Execute** the metrics either locally or remotely From 197dff12a1ada4f0f584be979c6aab8840c24cb2 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 17:59:48 -0600 Subject: [PATCH 03/11] chore: Correctly capture pooch logs --- packages/ref/src/cmip_ref/cli/_logging.py | 6 +++++- packages/ref/src/cmip_ref/cli/datasets.py | 2 +- .../{registry => dataset_registry}/__init__.py | 15 ++++++++++++--- .../pmp_reference.txt | 1 + packages/ref/src/cmip_ref/testing.py | 2 +- tests/test-data/.gitignore | 3 +++ 6 files changed, 23 insertions(+), 6 deletions(-) rename packages/ref/src/cmip_ref/{registry => dataset_registry}/__init__.py (82%) rename packages/ref/src/cmip_ref/{registry => dataset_registry}/pmp_reference.txt (50%) diff --git a/packages/ref/src/cmip_ref/cli/_logging.py b/packages/ref/src/cmip_ref/cli/_logging.py index 91dbb8ce4..f8dd7603a 100644 --- a/packages/ref/src/cmip_ref/cli/_logging.py +++ b/packages/ref/src/cmip_ref/cli/_logging.py @@ -1,6 +1,7 @@ import inspect import logging +import pooch from loguru import logger @@ -28,7 +29,10 @@ def capture_logging() -> None: Note that this replaces the root logger, so any other handlers attached to it will be removed. """ - # logger.debug("Capturing logging from the standard library") + # Pooch adds a handler to its own logger which circumvents the REF logger + pooch.get_logger().handlers.clear() + pooch.get_logger().addHandler(_InterceptHandler()) + logging.basicConfig(handlers=[_InterceptHandler()], level=0, force=True) diff --git a/packages/ref/src/cmip_ref/cli/datasets.py b/packages/ref/src/cmip_ref/cli/datasets.py index 2cbc315a9..516219247 100644 --- a/packages/ref/src/cmip_ref/cli/datasets.py +++ b/packages/ref/src/cmip_ref/cli/datasets.py @@ -14,9 +14,9 @@ from rich.console import Console from cmip_ref.cli._utils import pretty_print_df +from cmip_ref.dataset_registry import build_reference_data_registry, fetch_all_files from cmip_ref.datasets import get_dataset_adapter from cmip_ref.models import Dataset -from cmip_ref.registry import build_reference_data_registry, fetch_all_files from cmip_ref.solver import solve_metrics from cmip_ref.testing import SAMPLE_DATA_VERSION, fetch_sample_data from cmip_ref_core.datasets import SourceDatasetType diff --git a/packages/ref/src/cmip_ref/registry/__init__.py b/packages/ref/src/cmip_ref/dataset_registry/__init__.py similarity index 82% rename from packages/ref/src/cmip_ref/registry/__init__.py rename to packages/ref/src/cmip_ref/dataset_registry/__init__.py index f0b0433ce..ae40ae691 100644 --- a/packages/ref/src/cmip_ref/registry/__init__.py +++ b/packages/ref/src/cmip_ref/dataset_registry/__init__.py @@ -1,7 +1,9 @@ """ -Data registries for PMP reference data +Data registries for non-published reference data These data are placeholders until these data have been added to obs4MIPs. +The AR7 FT REF requires that reference datasets are openly licensed before it is included +in any published data catalogs. """ import importlib.resources @@ -20,10 +22,15 @@ """ -def build_reference_data_registry(version: str = DATA_VERSION) -> pooch.Pooch: +def build_reference_data_registry( + version: str = DATA_VERSION, +) -> pooch.Pooch: """ Build a pooch registry of reference data associated with PMP that isn't currently in obs4MIPs. + Currently we only have reference datasets published from PMP, + but this may change in the future. + Parameters ---------- version : str @@ -42,7 +49,7 @@ def build_reference_data_registry(version: str = DATA_VERSION) -> pooch.Pooch: version=version, env="REF_METRICS_PMP_DATA_DIR", ) - registry.load_registry(importlib.resources.open_binary("cmip_ref.registry", "reference.txt")) + registry.load_registry(importlib.resources.open_binary("cmip_ref.dataset_registry", "pmp_reference.txt")) return registry @@ -81,3 +88,5 @@ def fetch_all_files(registry: pooch.Pooch, output_dir: pathlib.Path, symlink: bo else: logger.info(f"Copying {key} to {linked_file}") shutil.copy(fetch_file, linked_file) + else: + logger.info(f"File {linked_file} already exists. Skipping.") diff --git a/packages/ref/src/cmip_ref/registry/pmp_reference.txt b/packages/ref/src/cmip_ref/dataset_registry/pmp_reference.txt similarity index 50% rename from packages/ref/src/cmip_ref/registry/pmp_reference.txt rename to packages/ref/src/cmip_ref/dataset_registry/pmp_reference.txt index 44542a615..8609d8d5d 100644 --- a/packages/ref/src/cmip_ref/registry/pmp_reference.txt +++ b/packages/ref/src/cmip_ref/dataset_registry/pmp_reference.txt @@ -1 +1,2 @@ obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc md5:99c8691e0f615dc4d79b4fb5e926cc76 +obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc md5:570ce90b3afd1d0b31690ae5dbe32d31 diff --git a/packages/ref/src/cmip_ref/testing.py b/packages/ref/src/cmip_ref/testing.py index 477638b3b..482f30906 100644 --- a/packages/ref/src/cmip_ref/testing.py +++ b/packages/ref/src/cmip_ref/testing.py @@ -9,7 +9,7 @@ import pooch from loguru import logger -from cmip_ref.registry import fetch_all_files +from cmip_ref.dataset_registry import fetch_all_files def _determine_test_directory() -> Path | None: diff --git a/tests/test-data/.gitignore b/tests/test-data/.gitignore index 25caaabef..9fca7341a 100644 --- a/tests/test-data/.gitignore +++ b/tests/test-data/.gitignore @@ -1,2 +1,5 @@ # Regenerated using `make fetch-test-data` sample-data + +# Local location for obs4REF datasets +obs4ref From 0f51be29c6d0ae7aea0cf90d2c126d9154ebc0c2 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 18:24:57 -0600 Subject: [PATCH 04/11] chore: Add fetching the obs4ref data to the cli --- .github/workflows/ci-integration.yaml | 1 + packages/ref/src/cmip_ref/dataset_registry/__init__.py | 2 +- stubs/pooch/__init__.pyi | 3 +++ tests/integration/test_ar7_ft.py | 4 +++- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-integration.yaml b/.github/workflows/ci-integration.yaml index 6a9aa2da9..f7731afec 100644 --- a/.github/workflows/ci-integration.yaml +++ b/.github/workflows/ci-integration.yaml @@ -56,6 +56,7 @@ jobs: run: | make fetch-test-data uv run python scripts/fetch-ilamb-data.py ilamb.txt + uv run ref --verbose datasets fetch-obs4ref-data --output-directory tests/test-data/obs4ref make test-integration-slow # Upload the scratch and results directories as artifacts - name: Upload scratch artifacts diff --git a/packages/ref/src/cmip_ref/dataset_registry/__init__.py b/packages/ref/src/cmip_ref/dataset_registry/__init__.py index ae40ae691..e75368f49 100644 --- a/packages/ref/src/cmip_ref/dataset_registry/__init__.py +++ b/packages/ref/src/cmip_ref/dataset_registry/__init__.py @@ -53,7 +53,7 @@ def build_reference_data_registry( return registry -def fetch_all_files(registry: pooch.Pooch, output_dir: pathlib.Path, symlink: bool = False): +def fetch_all_files(registry: pooch.Pooch, output_dir: pathlib.Path, symlink: bool = False) -> None: """ Fetch all files associated with a pooch registry and write them to an output directory. diff --git a/stubs/pooch/__init__.pyi b/stubs/pooch/__init__.pyi index 00b78c1a1..463738937 100644 --- a/stubs/pooch/__init__.pyi +++ b/stubs/pooch/__init__.pyi @@ -1,6 +1,9 @@ +import logging from pathlib import Path from typing import IO, Any +def get_logger() -> logging.Logger: ... + class Pooch: registry: dict[str, dict[str, str]] diff --git a/tests/integration/test_ar7_ft.py b/tests/integration/test_ar7_ft.py index 695927652..e39a8a93c 100644 --- a/tests/integration/test_ar7_ft.py +++ b/tests/integration/test_ar7_ft.py @@ -63,6 +63,8 @@ def test_solve_ar7_ft( # Ingest the sample data invoke_cli(["datasets", "ingest", "--source-type", "cmip6", str(sample_data_dir / "CMIP6")]) invoke_cli(["datasets", "ingest", "--source-type", "obs4mips", str(sample_data_dir / "obs4MIPs")]) + # TODO: Replace with sample data once the obs4ref data has been processed + invoke_cli(["datasets", "ingest", "--source-type", "obs4mips", str(sample_data_dir / ".." / "obs4ref")]) # Solve # This will also create conda environments for the metric providers @@ -74,7 +76,7 @@ def test_solve_ar7_ft( # Check that all 3 metric providers have been used # TODO: Update once the PMP metrics are solving - assert set(df["provider"].unique()) == {"esmvaltool", "ilamb"} + assert set(df["provider"].unique()) == {"esmvaltool", "ilamb", "pmp"} # Check that all metrics have been successful assert df["successful"].all(), df[["metric", "successful"]] From 7ea9c966fc44b91e4b0e3d8b31f6f2c3f1445677 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 19:02:03 -0600 Subject: [PATCH 05/11] fix: Always log integrationt test output --- conftest.py | 9 +++++++-- tests/integration/test_ar7_ft.py | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 6dd6f7112..cc13121b2 100644 --- a/conftest.py +++ b/conftest.py @@ -93,16 +93,21 @@ def invoke_cli(): # stdout == output from commands runner = CliRunner(mix_stderr=False) - def _invoke_cli(args: list[str], expected_exit_code: int = 0) -> Result: + def _invoke_cli(args: list[str], expected_exit_code: int = 0, always_log: bool = False) -> Result: result = runner.invoke( app=cli.app, args=args, ) - if result.exit_code != expected_exit_code: + if always_log or result.exit_code != expected_exit_code: + print("Command: ", " ".join(args)) + print("Exit code: ", result.exit_code) + print("Command stdout") print(result.stdout) + print("Command stderr") print(result.stderr) + if result.exit_code != expected_exit_code: if result.exception: raise result.exception raise ValueError(f"Expected exit code {expected_exit_code}, got {result.exit_code}") diff --git a/tests/integration/test_ar7_ft.py b/tests/integration/test_ar7_ft.py index e39a8a93c..3ca5603ab 100644 --- a/tests/integration/test_ar7_ft.py +++ b/tests/integration/test_ar7_ft.py @@ -68,7 +68,8 @@ def test_solve_ar7_ft( # Solve # This will also create conda environments for the metric providers - invoke_cli(["--verbose", "solve", "--timeout", f"{60 * 60}"]) + # We always log the std out and stderr from the command as it is useful for debugging + invoke_cli(["--verbose", "solve", "--timeout", f"{60 * 60}"], always_log=True) execution_groups = db.session.query(MetricExecutionGroup).all() df = create_execution_dataframe(execution_groups) From 378802bc84dc954f951547dbadf513569e54e921 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 19:07:50 -0600 Subject: [PATCH 06/11] chore: Book end the log messages to make them easier to find --- conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index cc13121b2..09e95a90d 100644 --- a/conftest.py +++ b/conftest.py @@ -100,12 +100,13 @@ def _invoke_cli(args: list[str], expected_exit_code: int = 0, always_log: bool = ) if always_log or result.exit_code != expected_exit_code: - print("Command: ", " ".join(args)) + print("## Command: ", " ".join(args)) print("Exit code: ", result.exit_code) print("Command stdout") print(result.stdout) print("Command stderr") print(result.stderr) + print("## Command end") if result.exit_code != expected_exit_code: if result.exception: From 51ece19925dfae6918d77cbb90ac8c6bc37e6ada Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 21:14:30 -0600 Subject: [PATCH 07/11] chore: Undo changes to the integration test until the sample data are available --- tests/integration/test_ar7_ft.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration/test_ar7_ft.py b/tests/integration/test_ar7_ft.py index 3ca5603ab..b06a0301d 100644 --- a/tests/integration/test_ar7_ft.py +++ b/tests/integration/test_ar7_ft.py @@ -63,8 +63,6 @@ def test_solve_ar7_ft( # Ingest the sample data invoke_cli(["datasets", "ingest", "--source-type", "cmip6", str(sample_data_dir / "CMIP6")]) invoke_cli(["datasets", "ingest", "--source-type", "obs4mips", str(sample_data_dir / "obs4MIPs")]) - # TODO: Replace with sample data once the obs4ref data has been processed - invoke_cli(["datasets", "ingest", "--source-type", "obs4mips", str(sample_data_dir / ".." / "obs4ref")]) # Solve # This will also create conda environments for the metric providers @@ -77,7 +75,7 @@ def test_solve_ar7_ft( # Check that all 3 metric providers have been used # TODO: Update once the PMP metrics are solving - assert set(df["provider"].unique()) == {"esmvaltool", "ilamb", "pmp"} + assert set(df["provider"].unique()) == {"esmvaltool", "ilamb"} # Check that all metrics have been successful assert df["successful"].all(), df[["metric", "successful"]] From 8c63ae4d742e5ad038b47514493acd3860d4d081 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 21:19:10 -0600 Subject: [PATCH 08/11] docs: Changelog --- changelog/219.feature.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changelog/219.feature.md diff --git a/changelog/219.feature.md b/changelog/219.feature.md new file mode 100644 index 000000000..13431a288 --- /dev/null +++ b/changelog/219.feature.md @@ -0,0 +1,3 @@ +Add `ref datasets fetch-obs4ref-data` CLI command to fetch datasets that are in the process of being published to obs4MIPs and are appropriately licensed for use within the REF. +The CLI command fetches the datasets and writes them to a local directory. +These datasets can then be ingested into the REF as obs4MIPs datasets. From 44e49ad278375b642812244c0a183cb358e0d602 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 21:20:40 -0600 Subject: [PATCH 09/11] chore: Undo fetching obs4REF data --- .github/workflows/ci-integration.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci-integration.yaml b/.github/workflows/ci-integration.yaml index f7731afec..6a9aa2da9 100644 --- a/.github/workflows/ci-integration.yaml +++ b/.github/workflows/ci-integration.yaml @@ -56,7 +56,6 @@ jobs: run: | make fetch-test-data uv run python scripts/fetch-ilamb-data.py ilamb.txt - uv run ref --verbose datasets fetch-obs4ref-data --output-directory tests/test-data/obs4ref make test-integration-slow # Upload the scratch and results directories as artifacts - name: Upload scratch artifacts From 2fab1192b1298c4388263cbcc71be714011e02b3 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Fri, 28 Mar 2025 21:41:55 -0600 Subject: [PATCH 10/11] test: Add tests --- packages/ref/tests/unit/cli/test_datasets.py | 43 ++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/packages/ref/tests/unit/cli/test_datasets.py b/packages/ref/tests/unit/cli/test_datasets.py index 93acf0e5d..ad77834a4 100644 --- a/packages/ref/tests/unit/cli/test_datasets.py +++ b/packages/ref/tests/unit/cli/test_datasets.py @@ -1,5 +1,7 @@ from pathlib import Path +import pytest + from cmip_ref.datasets.cmip6 import CMIP6DatasetAdapter from cmip_ref.models import Dataset from cmip_ref.models.dataset import CMIP6Dataset, CMIP6File @@ -167,3 +169,44 @@ def test_fetch(self, mocker, invoke_cli): ) mock_fetch.assert_called_once_with(version="v0.1.0", force_cleanup=True, symlink=True) + + +@pytest.fixture(scope="function") +def mock_obs4ref(mocker): + mock_build_registry = mocker.patch("cmip_ref.cli.datasets.build_reference_data_registry") + mock_fetch = mocker.patch("cmip_ref.cli.datasets.fetch_all_files") + + return mock_build_registry, mock_fetch + + +class TestFetchObs4REFData: + def test_fetch_defaults(self, mock_obs4ref, invoke_cli, tmp_path): + mock_build_registry, mock_fetch = mock_obs4ref + + invoke_cli(["datasets", "fetch-obs4ref-data", "--output-directory", str(tmp_path)]) + + mock_fetch.assert_called_once_with(mock_build_registry(), tmp_path, symlink=False) + + def test_fetch_symlink(self, mock_obs4ref, invoke_cli, tmp_path): + mock_build_registry, mock_fetch = mock_obs4ref + invoke_cli(["datasets", "fetch-obs4ref-data", "--output-directory", str(tmp_path), "--symlink"]) + + mock_fetch.assert_called_once_with(mock_build_registry(), tmp_path, symlink=True) + + def test_fetch_force_cleanup(self, mock_obs4ref, invoke_cli, tmp_path): + assert tmp_path.exists() + + invoke_cli(["datasets", "fetch-obs4ref-data", "--output-directory", str(tmp_path), "--force-cleanup"]) + + assert not tmp_path.exists() + + def test_fetch_force_cleanup_missing(self, mock_obs4ref, invoke_cli, tmp_path): + invoke_cli( + [ + "datasets", + "fetch-obs4ref-data", + "--output-directory", + str(tmp_path / "missing"), + "--force-cleanup", + ] + ) From 39560e4c2e3a8ded7e612afd61852df698a5f470 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Wed, 2 Apr 2025 10:10:29 -0600 Subject: [PATCH 11/11] chore: Move to core library --- .../dataset_registry/__init__.py | 4 +++- .../dataset_registry/pmp_reference.txt | 0 .../test_dataset_registry.py | 24 +++++++++++++++++++ packages/ref/src/cmip_ref/cli/datasets.py | 2 +- packages/ref/src/cmip_ref/testing.py | 2 +- 5 files changed, 29 insertions(+), 3 deletions(-) rename packages/{ref/src/cmip_ref => ref-core/src/cmip_ref_core}/dataset_registry/__init__.py (95%) rename packages/{ref/src/cmip_ref => ref-core/src/cmip_ref_core}/dataset_registry/pmp_reference.txt (100%) create mode 100644 packages/ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py diff --git a/packages/ref/src/cmip_ref/dataset_registry/__init__.py b/packages/ref-core/src/cmip_ref_core/dataset_registry/__init__.py similarity index 95% rename from packages/ref/src/cmip_ref/dataset_registry/__init__.py rename to packages/ref-core/src/cmip_ref_core/dataset_registry/__init__.py index e75368f49..562a0cac9 100644 --- a/packages/ref/src/cmip_ref/dataset_registry/__init__.py +++ b/packages/ref-core/src/cmip_ref_core/dataset_registry/__init__.py @@ -49,7 +49,9 @@ def build_reference_data_registry( version=version, env="REF_METRICS_PMP_DATA_DIR", ) - registry.load_registry(importlib.resources.open_binary("cmip_ref.dataset_registry", "pmp_reference.txt")) + registry.load_registry( + importlib.resources.open_binary("cmip_ref_core.dataset_registry", "pmp_reference.txt") + ) return registry diff --git a/packages/ref/src/cmip_ref/dataset_registry/pmp_reference.txt b/packages/ref-core/src/cmip_ref_core/dataset_registry/pmp_reference.txt similarity index 100% rename from packages/ref/src/cmip_ref/dataset_registry/pmp_reference.txt rename to packages/ref-core/src/cmip_ref_core/dataset_registry/pmp_reference.txt diff --git a/packages/ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py b/packages/ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py new file mode 100644 index 000000000..4033d26f7 --- /dev/null +++ b/packages/ref-core/tests/unit/test_dataset_registry/test_dataset_registry.py @@ -0,0 +1,24 @@ +import pytest + +from cmip_ref_core.dataset_registry import build_reference_data_registry, fetch_all_files + + +@pytest.mark.parametrize("symlink", [True, False]) +def test_fetch_all_files(mocker, tmp_path, symlink): + downloaded_file = tmp_path / "out.txt" + downloaded_file.write_text("foo") + + registry = build_reference_data_registry() + registry.fetch = mocker.MagicMock(return_value=downloaded_file) + + fetch_all_files(registry, tmp_path, symlink=symlink) + assert registry.fetch.call_count == 2 + + expected_file = ( + tmp_path + / "obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc" # noqa: E501 + ) + + assert expected_file.exists() + assert expected_file.is_symlink() == symlink + assert expected_file.read_text() == "foo" diff --git a/packages/ref/src/cmip_ref/cli/datasets.py b/packages/ref/src/cmip_ref/cli/datasets.py index 516219247..cb4c021d7 100644 --- a/packages/ref/src/cmip_ref/cli/datasets.py +++ b/packages/ref/src/cmip_ref/cli/datasets.py @@ -14,11 +14,11 @@ from rich.console import Console from cmip_ref.cli._utils import pretty_print_df -from cmip_ref.dataset_registry import build_reference_data_registry, fetch_all_files from cmip_ref.datasets import get_dataset_adapter from cmip_ref.models import Dataset from cmip_ref.solver import solve_metrics from cmip_ref.testing import SAMPLE_DATA_VERSION, fetch_sample_data +from cmip_ref_core.dataset_registry import build_reference_data_registry, fetch_all_files from cmip_ref_core.datasets import SourceDatasetType app = typer.Typer(help=__doc__) diff --git a/packages/ref/src/cmip_ref/testing.py b/packages/ref/src/cmip_ref/testing.py index 482f30906..d7df2a787 100644 --- a/packages/ref/src/cmip_ref/testing.py +++ b/packages/ref/src/cmip_ref/testing.py @@ -9,7 +9,7 @@ import pooch from loguru import logger -from cmip_ref.dataset_registry import fetch_all_files +from cmip_ref_core.dataset_registry import fetch_all_files def _determine_test_directory() -> Path | None: