|
| 1 | +import pathlib |
| 2 | +from pathlib import Path |
| 3 | + |
| 4 | +import pandas as pd |
| 5 | +import xarray as xr |
| 6 | +from cmip_ref.dataset_registry import build_reference_data_registry |
| 7 | + |
| 8 | +from ref_sample_data.data_request.base import DataRequest |
| 9 | +from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear |
| 10 | + |
| 11 | + |
class Obs4REFRequest(DataRequest):
    """
    Fetch the unpublished Obs4MIPs datasets from the PMP registry

    This includes all files that would be downloaded if you ran:
    ```
    ref datasets fetch-obs4ref-data --output-data ...
    ```
    """

    def fetch_datasets(self) -> pd.DataFrame:
        """
        Fetch the datasets from the source

        Every key listed in the reference data registry is fetched in turn.

        Returns
        -------
        pd.DataFrame
            One row per registry key. The ``key`` column holds the registry key
            and the ``files`` column holds a single-element list containing the
            value returned by ``registry.fetch`` for that key.
        """
        registry = build_reference_data_registry()

        # Iterating the registry mapping yields its keys; each fetch result is
        # wrapped in a list so that ``files`` is always list-valued.
        return pd.DataFrame([{"key": key, "files": [registry.fetch(key)]} for key in registry.registry])

    def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
        """
        Downscale the dataset to a smaller size.

        Datasets with ``lat`` and ``lon`` dimensions are treated as rectilinear
        grids; otherwise datasets with ``i`` and ``j`` dimensions are treated as
        curvilinear grids.

        Parameters
        ----------
        dataset
            The dataset to downscale

        Returns
        -------
        xr.Dataset
            The downscaled dataset

        Raises
        ------
        ValueError
            If the dataset has ``lat``/``lon`` dimensions but ``lat`` or ``lon``
            is not one-dimensional, or if the dataset has neither ``lat``/``lon``
            nor ``i``/``j`` dimensions.
        """
        dims = dataset.dims

        if "lat" in dims and "lon" in dims:
            # Raise explicitly rather than ``assert`` so that the check still
            # runs when Python is started with ``-O``.
            if len(dataset.lat.dims) != 1 or len(dataset.lon.dims) != 1:
                raise ValueError("Cannot decimate this grid: lat and lon must each be one-dimensional")
            return decimate_rectilinear(dataset)

        if "i" in dims and "j" in dims:
            # 2d curvilinear grid (generally ocean variables)
            return decimate_curvilinear(dataset)

        raise ValueError("Cannot decimate this grid: too many dimensions")

    def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: pathlib.Path) -> Path:
        """
        Create the output filename for the dataset.

        Parameters
        ----------
        metadata
            Metadata from the file; only its ``key`` attribute is read
        ds
            Loaded dataset (Unused)
        ds_filename
            Filename of the dataset (Unused)

        Returns
        -------
        Path
            The output filename: ``metadata.key`` placed under an ``obs4REF``
            directory
        """
        return Path("obs4REF") / metadata.key
0 commit comments