Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/219.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add `ref datasets fetch-obs4ref-data` CLI command to fetch datasets that are in the process of being published to obs4MIPs and are appropriately licensed for use within the REF.
The CLI command fetches the datasets and writes them to a local directory.
These datasets can then be ingested into the REF as obs4MIPs datasets.
10 changes: 8 additions & 2 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,16 +93,22 @@ def invoke_cli():
# stdout == output from commands
runner = CliRunner(mix_stderr=False)

def _invoke_cli(args: list[str], expected_exit_code: int = 0) -> Result:
def _invoke_cli(args: list[str], expected_exit_code: int = 0, always_log: bool = False) -> Result:
result = runner.invoke(
app=cli.app,
args=args,
)

if result.exit_code != expected_exit_code:
if always_log or result.exit_code != expected_exit_code:
print("## Command: ", " ".join(args))
print("Exit code: ", result.exit_code)
print("Command stdout")
print(result.stdout)
print("Command stderr")
print(result.stderr)
print("## Command end")

if result.exit_code != expected_exit_code:
if result.exception:
raise result.exception
raise ValueError(f"Expected exit code {expected_exit_code}, got {result.exit_code}")
Expand Down
2 changes: 1 addition & 1 deletion docs/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ $ uv run ref

This provides the ability to:

* **Ingest** new input datasetes
* **Ingest** new input datasets
* **Solve** for the unique metrics executions that are required
* **Execute** the metrics either locally or remotely

Expand Down
94 changes: 94 additions & 0 deletions packages/ref-core/src/cmip_ref_core/dataset_registry/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Data registries for non-published reference data

These data are placeholders until the datasets have been added to obs4MIPs.
The AR7 FT REF requires that reference datasets are openly licensed before they are included
in any published data catalogs.
"""

import importlib.resources
import os
import pathlib
import shutil

import pooch
from loguru import logger

DATA_VERSION = "v3.9"
"""
Default version identifier for the datasets

Changing this will bust any existing caches.
"""


def build_reference_data_registry(
    version: str = DATA_VERSION,
) -> pooch.Pooch:
    """
    Build a pooch registry of reference data associated with PMP that isn't currently in obs4MIPs.

    Currently we only have reference datasets published from PMP,
    but this may change in the future.

    Parameters
    ----------
    version : str
        The version of the data.

        Changing the version will invalidate the cache and force a re-download of the data.

    Returns
    -------
    pooch.Pooch
        The pooch registry.
    """
    registry = pooch.create(
        path=pooch.os_cache("pmp"),
        base_url="https://pcmdiweb.llnl.gov/pss/pmpdata/",
        version=version,
        env="REF_METRICS_PMP_DATA_DIR",
    )
    # importlib.resources.open_binary is deprecated since Python 3.11 (removed in 3.13),
    # and its handle was previously never closed.
    # Use the files() API with a context manager instead.
    registry_file = importlib.resources.files("cmip_ref_core.dataset_registry") / "pmp_reference.txt"
    with registry_file.open("rb") as file_handle:
        registry.load_registry(file_handle)
    return registry


def fetch_all_files(registry: pooch.Pooch, output_dir: pathlib.Path, symlink: bool = False) -> None:
    """
    Fetch all files associated with a pooch registry and write them to an output directory.

    Pooch fetches, caches and validates the downloaded files.
    Subsequent calls to this function will not refetch any previously downloaded files.

    Parameters
    ----------
    registry
        Pooch registry containing a set of files that should be fetched.
    output_dir
        The root directory to write the files to.

        The directory will be created if it doesn't exist.
        Files that already exist in the output directory are left untouched
        and a message is logged; they are not overwritten.
    symlink
        If True, symlink all files to this directory.
        Otherwise, perform a copy.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Iterating the registry mapping yields the registry keys directly
    for key in registry.registry:
        # fetch() downloads (or reuses the cache) and returns the local path
        fetch_file = registry.fetch(key)

        linked_file = output_dir / key
        linked_file.parent.mkdir(parents=True, exist_ok=True)
        if not linked_file.exists():  # pragma: no cover
            if symlink:
                logger.info(f"Linking {key} to {linked_file}")

                os.symlink(fetch_file, linked_file)
            else:
                logger.info(f"Copying {key} to {linked_file}")
                shutil.copy(fetch_file, linked_file)
        else:
            # Existing files are deliberately skipped rather than overwritten
            logger.info(f"File {linked_file} already exists. Skipping.")
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc md5:99c8691e0f615dc4d79b4fb5e926cc76
obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc md5:570ce90b3afd1d0b31690ae5dbe32d31
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pytest

from cmip_ref_core.dataset_registry import build_reference_data_registry, fetch_all_files


@pytest.mark.parametrize("symlink", [True, False])
def test_fetch_all_files(mocker, tmp_path, symlink):
    # Stand-in for a pooch-downloaded file; fetch() is mocked to return it
    source_file = tmp_path / "out.txt"
    source_file.write_text("foo")

    registry = build_reference_data_registry()
    registry.fetch = mocker.MagicMock(return_value=source_file)

    fetch_all_files(registry, tmp_path, symlink=symlink)

    # One fetch per registry entry
    assert registry.fetch.call_count == 2

    target = (
        tmp_path
        / "obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc"  # noqa: E501
    )

    assert target.exists()
    # Symlink mode links instead of copying
    assert target.is_symlink() == symlink
    assert target.read_text() == "foo"

This file was deleted.

6 changes: 5 additions & 1 deletion packages/ref/src/cmip_ref/cli/_logging.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import inspect
import logging

import pooch
from loguru import logger


Expand Down Expand Up @@ -28,7 +29,10 @@ def capture_logging() -> None:

Note that this replaces the root logger, so any other handlers attached to it will be removed.
"""
# logger.debug("Capturing logging from the standard library")
# Pooch adds a handler to its own logger which circumvents the REF logger
pooch.get_logger().handlers.clear()
pooch.get_logger().addHandler(_InterceptHandler())

logging.basicConfig(handlers=[_InterceptHandler()], level=0, force=True)


Expand Down
47 changes: 36 additions & 11 deletions packages/ref/src/cmip_ref/cli/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import errno
import os
import shutil
from collections.abc import Iterable
from pathlib import Path
from typing import Annotated
Expand All @@ -17,6 +18,7 @@
from cmip_ref.models import Dataset
from cmip_ref.solver import solve_metrics
from cmip_ref.testing import SAMPLE_DATA_VERSION, fetch_sample_data
from cmip_ref_core.dataset_registry import build_reference_data_registry, fetch_all_files
from cmip_ref_core.datasets import SourceDatasetType

app = typer.Typer(help=__doc__)
Expand Down Expand Up @@ -149,21 +151,44 @@ def ingest( # noqa: PLR0913

@app.command(name="fetch-sample-data")
def _fetch_sample_data(
version: str = SAMPLE_DATA_VERSION, force_cleanup: bool = False, symlink: bool = False
version: Annotated[
str,
"The version tag of the sample data to fetch. "
"Defaults to the current version of data expected by the test suite",
] = SAMPLE_DATA_VERSION,
force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False,
symlink: Annotated[
bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy")
] = False,
) -> None:
"""
Fetch the sample data for the given version.

Parameters
----------
version
The version tag of the sample data to fetch.

Defaults to the current version of data expected by the test suite
force_cleanup
If True, remove any existing files.
symlink : bool
If True, symlink in the data otherwise copy the files.
These data will be written into the test data directory.
This operation may fail if the test data directory does not exist,
as is the case for non-source-based installations.
"""
logger.info(f"Fetching data for version {version}")
fetch_sample_data(version=version, force_cleanup=force_cleanup, symlink=symlink)


@app.command(name="fetch-obs4ref-data")
def fetch_obs4ref_data(
    output_directory: Annotated[Path, typer.Option(help="Output directory where files will be saved")],
    force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False,
    symlink: Annotated[
        bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy")
    ] = False,
) -> None:
    """
    Fetch non-published Obs4MIPs data that is used by the REF

    These datasets have been verified to have open licenses
    and are in the process of being added to Obs4MIPs.
    """
    # Optionally start from a clean slate before fetching
    if force_cleanup and output_directory.exists():
        logger.warning(f"Removing existing directory {output_directory}")
        shutil.rmtree(output_directory)

    # Download (or reuse cached) reference data and place it in the output directory
    fetch_all_files(build_reference_data_registry(), output_directory, symlink=symlink)
21 changes: 4 additions & 17 deletions packages/ref/src/cmip_ref/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
"""

import importlib.resources
import os
import shutil
from pathlib import Path

import pooch
from loguru import logger

from cmip_ref_core.dataset_registry import fetch_all_files


def _determine_test_directory() -> Path | None:
expected = Path(__file__).parents[4] / "tests" / "test-data"
Expand Down Expand Up @@ -68,7 +69,7 @@ def fetch_sample_data(
logger.warning("Test data directory not found, skipping sample data fetch")
return

sample_registry = _build_sample_data_registry(version)
sample_data_registry = _build_sample_data_registry(version)

output_dir = TEST_DATA_DIR / "sample-data"
version_file = output_dir / "version.txt"
Expand All @@ -83,21 +84,7 @@ def fetch_sample_data(
logger.warning("Removing existing sample data")
shutil.rmtree(output_dir)

output_dir.mkdir(parents=True, exist_ok=True)

for key in sample_registry.registry.keys():
fetch_file = sample_registry.fetch(key)

linked_file = output_dir / key
linked_file.parent.mkdir(parents=True, exist_ok=True)
if not linked_file.exists(): # pragma: no cover
if symlink:
logger.info(f"Linking {key} to {linked_file}")

os.symlink(fetch_file, linked_file)
else:
logger.info(f"Copying {key} to {linked_file}")
shutil.copy(fetch_file, linked_file)
fetch_all_files(sample_data_registry, output_dir, symlink)

# Write out the current sample data version to the copying as complete
with open(output_dir / "version.txt", "w") as fh:
Expand Down
Loading