Skip to content

Commit a6821a3

Browse files
authored
Merge pull request #219 from Climate-REF/fetch-pmp
Enable fetching of "obs4ref" data from pmp
2 parents 940db4e + 39560e4 commit a6821a3

14 files changed

Lines changed: 227 additions & 95 deletions

File tree

changelog/219.feature.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add `ref datasets fetch-obs4ref-data` CLI command to fetch datasets that are in the process of being published to obs4MIPs and are appropriately licensed for use within the REF.
2+
The CLI command fetches the datasets and writes them to a local directory.
3+
These datasets can then be ingested into the REF as obs4MIPs datasets.

conftest.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,22 @@ def invoke_cli():
9898
# stdout == output from commands
9999
runner = CliRunner(mix_stderr=False)
100100

101-
def _invoke_cli(args: list[str], expected_exit_code: int = 0, always_log: bool = False) -> Result:
    """
    Run the CLI with ``args`` and verify the resulting exit code.

    When ``always_log`` is set, the command transcript is printed even when the
    command exits with the expected code.
    """
    res = runner.invoke(
        app=cli.app,
        args=args,
    )

    exit_code_mismatch = res.exit_code != expected_exit_code

    # Dump the full transcript when requested or when the command misbehaved,
    # so failing test runs always include the command output for debugging.
    if always_log or exit_code_mismatch:
        print("## Command: ", " ".join(args))
        print("Exit code: ", res.exit_code)
        print("Command stdout")
        print(res.stdout)
        print("Command stderr")
        print(res.stderr)
        print("## Command end")

    if exit_code_mismatch:
        # Re-raise the captured exception when there is one; otherwise surface
        # the unexpected exit code directly.
        if res.exception:
            raise res.exception
        raise ValueError(f"Expected exit code {expected_exit_code}, got {res.exit_code}")

docs/getting_started.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ $ uv run ref
4040

4141
This provides the ability to:
4242

43-
* **Ingest** new input datasetes
43+
* **Ingest** new input datasets
4444
* **Solve** for the unique metrics executions that are required
4545
* **Execute** the metrics either locally or remotely
4646

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""
2+
Data registries for non-published reference data
3+
4+
These data are placeholders until they have been added to obs4MIPs.
5+
The AR7 FT REF requires that reference datasets are openly licensed before they are included
6+
in any published data catalogs.
7+
"""
8+
9+
import importlib.resources
10+
import os
11+
import pathlib
12+
import shutil
13+
14+
import pooch
15+
from loguru import logger
16+
17+
DATA_VERSION = "v3.9"
18+
"""
19+
Default version identifier for the datasets
20+
21+
Changing this will bust any existing caches.
22+
"""
23+
24+
25+
def build_reference_data_registry(
    version: str = DATA_VERSION,
) -> pooch.Pooch:
    """
    Build a pooch registry of reference data associated with PMP that isn't currently in obs4MIPs.

    Currently we only have reference datasets published from PMP,
    but this may change in the future.

    Parameters
    ----------
    version : str
        The version of the data.

        Changing the version will invalidate the cache and force a re-download of the data.

    Returns
    -------
    pooch.Pooch
        The pooch registry.
    """
    registry = pooch.create(
        path=pooch.os_cache("pmp"),
        base_url="https://pcmdiweb.llnl.gov/pss/pmpdata/",
        version=version,
        # Allows users to override the cache location
        env="REF_METRICS_PMP_DATA_DIR",
    )

    # `importlib.resources.open_binary` is deprecated since Python 3.11 (removed in 3.13);
    # `files()` is the supported replacement.  The context manager also ensures the
    # registry file handle is closed after loading, which the previous code never did.
    registry_file = importlib.resources.files("cmip_ref_core.dataset_registry") / "pmp_reference.txt"
    with registry_file.open("rb") as file_handle:
        registry.load_registry(file_handle)

    return registry
56+
57+
58+
def fetch_all_files(registry: pooch.Pooch, output_dir: pathlib.Path, symlink: bool = False) -> None:
    """
    Fetch all files associated with a pooch registry and write them to an output directory.

    Pooch fetches, caches and validates the downloaded files.
    Subsequent calls to this function will not refetch any previously downloaded files.

    Parameters
    ----------
    registry
        Pooch registry containing a set of files that should be fetched.
    output_dir
        The root directory to write the files to.

        The directory will be created if it doesn't exist.
        Files that already exist in the output directory are left untouched
        (they are skipped, not overwritten).
    symlink
        If True, symlink all files to this directory.
        Otherwise, perform a copy.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Iterate the registry mapping directly; `.keys()` is redundant
    for key in registry.registry:
        # Fetch (or reuse from cache) and validate the file; returns the local path
        fetched_file = registry.fetch(key)

        linked_file = output_dir / key
        linked_file.parent.mkdir(parents=True, exist_ok=True)
        if not linked_file.exists():  # pragma: no cover
            if symlink:
                logger.info(f"Linking {key} to {linked_file}")

                linked_file.symlink_to(fetched_file)
            else:
                logger.info(f"Copying {key} to {linked_file}")
                shutil.copy(fetched_file, linked_file)
        else:
            logger.info(f"File {linked_file} already exists. Skipping.")

packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/reference.txt renamed to packages/ref-core/src/cmip_ref_core/dataset_registry/pmp_reference.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc md5:99c8691e0f615dc4d79b4fb5e926cc76
2+
obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc md5:570ce90b3afd1d0b31690ae5dbe32d31
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import pytest
2+
3+
from cmip_ref_core.dataset_registry import build_reference_data_registry, fetch_all_files
4+
5+
6+
@pytest.mark.parametrize("symlink", [True, False])
def test_fetch_all_files(mocker, tmp_path, symlink):
    """fetch_all_files materialises every registry entry under the output directory."""
    # Stand-in for the downloaded-and-cached file that pooch would return
    source_file = tmp_path / "out.txt"
    source_file.write_text("foo")

    registry = build_reference_data_registry()
    registry.fetch = mocker.MagicMock(return_value=source_file)

    fetch_all_files(registry, tmp_path, symlink=symlink)
    # One fetch per entry in the registry file
    assert registry.fetch.call_count == 2

    target_file = tmp_path / (
        "obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/"
        "ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc"
    )

    assert target_file.exists()
    assert target_file.is_symlink() == symlink
    assert target_file.read_text() == "foo"

packages/ref-metrics-pmp/src/cmip_ref_metrics_pmp/registry/__init__.py

Lines changed: 0 additions & 62 deletions
This file was deleted.

packages/ref/src/cmip_ref/cli/_logging.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import inspect
22
import logging
33

4+
import pooch
45
from loguru import logger
56

67

@@ -28,7 +29,10 @@ def capture_logging() -> None:
2829
2930
Note that this replaces the root logger, so any other handlers attached to it will be removed.
3031
"""
31-
# logger.debug("Capturing logging from the standard library")
32+
# Pooch adds a handler to its own logger which circumvents the REF logger
33+
pooch.get_logger().handlers.clear()
34+
pooch.get_logger().addHandler(_InterceptHandler())
35+
3236
logging.basicConfig(handlers=[_InterceptHandler()], level=0, force=True)
3337

3438

packages/ref/src/cmip_ref/cli/datasets.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import errno
66
import os
7+
import shutil
78
from collections.abc import Iterable
89
from pathlib import Path
910
from typing import Annotated
@@ -17,6 +18,7 @@
1718
from cmip_ref.models import Dataset
1819
from cmip_ref.solver import solve_metrics
1920
from cmip_ref.testing import SAMPLE_DATA_VERSION, fetch_sample_data
21+
from cmip_ref_core.dataset_registry import build_reference_data_registry, fetch_all_files
2022
from cmip_ref_core.datasets import SourceDatasetType
2123

2224
app = typer.Typer(help=__doc__)
@@ -149,21 +151,44 @@ def ingest( # noqa: PLR0913
149151

150152
@app.command(name="fetch-sample-data")
def _fetch_sample_data(
    # A bare string inside Annotated is silently ignored by Typer, so the previous
    # help text never showed up in `--help`.  typer.Argument(help=...) makes it
    # effective while keeping `version` a positional CLI argument with a default.
    version: Annotated[
        str,
        typer.Argument(
            help="The version tag of the sample data to fetch. "
            "Defaults to the current version of data expected by the test suite"
        ),
    ] = SAMPLE_DATA_VERSION,
    force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False,
    symlink: Annotated[
        bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy")
    ] = False,
) -> None:
    """
    Fetch the sample data for the given version.

    These data will be written into the test data directory.
    This operation may fail if the test data directory does not exist,
    as is the case for non-source-based installations.
    """
    logger.info(f"Fetching data for version {version}")
    fetch_sample_data(version=version, force_cleanup=force_cleanup, symlink=symlink)
173+
174+
175+
@app.command(name="fetch-obs4ref-data")
def fetch_obs4ref_data(
    output_directory: Annotated[Path, typer.Option(help="Output directory where files will be saved")],
    force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False,
    symlink: Annotated[
        bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy")
    ] = False,
) -> None:
    """
    Fetch non-published Obs4MIPs data that is used by the REF

    These datasets have been verified to have open licenses
    and are in the process of being added to Obs4MIPs.
    """
    # Start from a clean slate when requested; pooch's own cache still
    # prevents the files from being re-downloaded afterwards.
    if force_cleanup and output_directory.exists():
        logger.warning(f"Removing existing directory {output_directory}")
        shutil.rmtree(output_directory)

    registry = build_reference_data_registry()
    fetch_all_files(registry, output_directory, symlink=symlink)

packages/ref/src/cmip_ref/testing.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
"""
44

55
import importlib.resources
6-
import os
76
import shutil
87
from pathlib import Path
98

109
import pooch
1110
from loguru import logger
1211

12+
from cmip_ref_core.dataset_registry import fetch_all_files
13+
1314

1415
def _determine_test_directory() -> Path | None:
1516
expected = Path(__file__).parents[4] / "tests" / "test-data"
@@ -68,7 +69,7 @@ def fetch_sample_data(
6869
logger.warning("Test data directory not found, skipping sample data fetch")
6970
return
7071

71-
sample_registry = _build_sample_data_registry(version)
72+
sample_data_registry = _build_sample_data_registry(version)
7273

7374
output_dir = TEST_DATA_DIR / "sample-data"
7475
version_file = output_dir / "version.txt"
@@ -83,21 +84,7 @@ def fetch_sample_data(
8384
logger.warning("Removing existing sample data")
8485
shutil.rmtree(output_dir)
8586

86-
output_dir.mkdir(parents=True, exist_ok=True)
87-
88-
for key in sample_registry.registry.keys():
89-
fetch_file = sample_registry.fetch(key)
90-
91-
linked_file = output_dir / key
92-
linked_file.parent.mkdir(parents=True, exist_ok=True)
93-
if not linked_file.exists(): # pragma: no cover
94-
if symlink:
95-
logger.info(f"Linking {key} to {linked_file}")
96-
97-
os.symlink(fetch_file, linked_file)
98-
else:
99-
logger.info(f"Copying {key} to {linked_file}")
100-
shutil.copy(fetch_file, linked_file)
87+
fetch_all_files(sample_data_registry, output_dir, symlink)
10188

10289
# Write out the current sample data version to the copying as complete
10390
with open(output_dir / "version.txt", "w") as fh:

0 commit comments

Comments
 (0)