|
| 1 | +""" |
| 2 | +Data registries for non-published reference data |
| 3 | +
|
| 4 | +These data are placeholders until they have been added to obs4MIPs. |
| 5 | +The AR7 FT REF requires that reference datasets are openly licensed before they are included |
| 6 | +in any published data catalogs. |
| 7 | +""" |
| 8 | + |
| 9 | +import importlib.resources |
| 10 | +import os |
| 11 | +import pathlib |
| 12 | +import shutil |
| 13 | + |
| 14 | +import pooch |
| 15 | +from loguru import logger |
| 16 | + |
| 17 | + |
def fetch_all_files(registry: pooch.Pooch, output_dir: pathlib.Path | None, symlink: bool = False) -> None:
    """
    Fetch all files associated with a pooch registry and write them to an output directory.

    Pooch fetches, caches and validates the downloaded files.
    Subsequent calls to this function will not refetch any previously downloaded files.

    Parameters
    ----------
    registry
        Pooch registry containing the set of files that should be fetched.
    output_dir
        The root directory to write the files to.

        The directory will be created if it doesn't exist.
        Files that already exist in the directory are left untouched,
        but a dangling symlink (one whose target no longer exists)
        is removed and recreated.

        If no directory is provided, the files will be fetched from the remote server,
        but not copied anywhere.
    symlink
        If True, symlink all files to this directory.
        Otherwise, perform a copy.
    """
    if output_dir is not None:
        output_dir.mkdir(parents=True, exist_ok=True)

    for key in registry.registry:
        fetched_file = registry.fetch(key)

        if output_dir is None:
            # Just warm the cache and move onto the next file
            continue

        linked_file = output_dir / key
        linked_file.parent.mkdir(parents=True, exist_ok=True)

        # `Path.exists` follows symlinks, so a link whose target has vanished
        # reports False and `os.symlink` would then fail with FileExistsError.
        # Drop the stale link so it can be recreated below.
        if linked_file.is_symlink() and not linked_file.exists():
            logger.info(f"Removing dangling link {linked_file}")
            linked_file.unlink()

        if linked_file.exists():
            logger.info(f"File {linked_file} already exists. Skipping.")
            continue

        if symlink:
            logger.info(f"Linking {key} to {linked_file}")
            os.symlink(fetched_file, linked_file)
        else:
            logger.info(f"Copying {key} to {linked_file}")
            shutil.copy(fetched_file, linked_file)
| 63 | + |
| 64 | + |
class DatasetRegistryManager:
    """
    A collection of reference dataset registries

    The REF requires additional reference datasets
    in addition to obs4MIPs data which can be downloaded via ESGF.
    Each provider may have different sets of reference data that are needed.
    These provider-specific datasets are either not yet available in obs4MIPs,
    or are post-processed from obs4MIPs.

    A dataset registry consists of a file that contains a list of files and checksums,
    in combination with a base URL that is used to fetch the files.
    [Pooch](https://www.fatiando.org/pooch/latest/) is used
    to manage the caching, downloading and validation of the files.

    All datasets that are registered here are expected to be openly licensed and freely available.
    """

    def __init__(self) -> None:
        # Registries keyed by the name they were registered under
        self._registries: dict[str, pooch.Pooch] = {}

    def __getitem__(self, item: str) -> pooch.Pooch:
        """
        Get a registry by name

        Raises
        ------
        KeyError
            If no registry has been registered under ``item``.
        """
        return self._registries[item]

    def keys(self) -> list[str]:
        """
        Get the list of registry names
        """
        return list(self._registries)

    def register(  # noqa: PLR0913
        self,
        name: str,
        base_url: str,
        package: str,
        resource: str,
        cache_name: str | None = None,
        version: str | None = None,
    ) -> None:
        """
        Register a new dataset registry

        This will create a new Pooch registry and add it to the list of registries.
        This is typically used by a provider to register a new collection of datasets at runtime.
        Registering under a name that is already in use replaces the previous registry.

        Parameters
        ----------
        name
            Name of the registry

            This is used to identify the registry
        base_url
            Common URL prefix for the files
        package
            Name of the package containing the registry resource.
        resource
            Name of the resource in the package that contains a list of files and checksums.

            This must be formatted in a way that is expected by pooch.
        cache_name
            Name to use to generate the cache directory.

            This defaults to ``"ref"`` if not provided.
        version
            The version of the data.

            Changing the version will invalidate the cache and force a re-download of the data.
        """
        if cache_name is None:
            cache_name = "ref"

        registry = pooch.create(
            path=pooch.os_cache(cache_name),
            base_url=base_url,
            version=version,
            env="REF_METRICS_DATA_DIR",
        )

        # `as_file` guarantees a real filesystem path for the duration of the block,
        # even when the package is not unpacked on disk (e.g. imported from a zip).
        registry_resource = importlib.resources.files(package) / resource
        with importlib.resources.as_file(registry_resource) as registry_file:
            registry.load_registry(str(registry_file))

        self._registries[name] = registry
| 147 | + |
| 148 | + |
# Module-level manager instance, created empty when this module is imported.
dataset_registry_manager = DatasetRegistryManager()
0 commit comments