Merge pull request #24 from Climate-REF/add-pmp

lewisjared · web-flow · commit 503f22a5feaa · 2025-03-28T21:12:42.000-06:00
Add datasets that are not yet published on obs4MIPs
diff --git a/.github/actions/regenerate/action.yml b/.github/actions/regenerate/action.yml
@@ -5,7 +5,6 @@ runs:
   steps:
     - uses: ./.github/actions/setup
       with:
-        python-version: 3.12
         cache-esgf: true
 
     - name: Verify registry
diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
@@ -12,13 +12,11 @@ runs:
     - name: Install pixi
       uses: prefix-dev/setup-pixi@v0.8.3
       with:
-        pixi-version: "latest"
+        pixi-version: "v0.40.2"
         cache: true
-    - name: Install the project
-      shell: bash
-      run: |
-        # Only installs if the lock file is up-to-date with the manifest
-        pixi install --locked
+        # Frozen is needed as the ref git dependency was not playing nice with a fully locked environment
+        frozen: true
+        log-level: "v"
     - name: Cache downloaded ESGF data
       uses: actions/cache@v4
       if: ${{ inputs.cache-esgf == 'true' }}
diff --git a/changelog/24.feature.md b/changelog/24.feature.md
@@ -0,0 +1 @@
+Add datasets from PMP that are not yet published on obs4MIPs
diff --git a/data/obs4REF/obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc b/data/obs4REF/obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc
diff --git a/data/obs4REF/obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc b/data/obs4REF/obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc
diff --git a/pixi.lock b/pixi.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,9 @@ xesmf = ">=0.8.7,<0.9"
 [tool.pixi.pypi-dependencies]
 # Add any dependencies that aren't available on conda-forge here
 ref_sample_data = { path = ".", editable = true }
+# TODO: Pin a release
+# This rev includes the PMP reference data
+cmip-ref = { git = "https://github.com/Climate-REF/climate-ref", subdirectory = "packages/ref", rev = "7ea9c966fc44b91e4b0e3d8b31f6f2c3f1445677" }
 
 [tool.pixi.feature.dev.dependencies]
 ruff = "*"
diff --git a/registry.txt b/registry.txt
@@ -51,3 +51,5 @@ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/
 obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taNobs_AIRS-2-1_gn_200209-201609.nc 3489895fc6cdd936ae64fa64fa221474e50f6b6bf347458c82d9a61f945f2d9d
 obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taStderr_AIRS-2-1_gn_200209-201609.nc 81e12ba5c6b058ace93737a3b69b317d2beb17e07fd6aa9f709b3e528ebfb4a2
 obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/ta_AIRS-2-1_gn_200209-201609.nc a72d7172cd0c9df9eb0199082b196655490e5628fbb6a61ed1e7f8f83c610c0b
+obs4REF/obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc 4f9a9270d001fc30488b49cdafe28e77db88e78e981ab580f0fae209f849a2da
+obs4REF/obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc 357e8915cc2ad30af1dd02cbecfb55f3083c13f54a11912e2f28396ccc84bd9c
diff --git a/scripts/fetch_test_data.py b/scripts/fetch_test_data.py
@@ -2,45 +2,16 @@
 from pathlib import Path
 from typing import Annotated
 
-import pandas as pd
 import pooch
 import typer
 import xarray as xr
 
-from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest
+from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest, Obs4REFRequest
 
 OUTPUT_PATH = Path("data")
 app = typer.Typer()
 
 
-def deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
-    """
-    Deduplicate a dataset collection.
-
-    Uses the metadata from the first dataset in each group,
-    but expands the time range to the min/max timespan of the group.
-
-    Parameters
-    ----------
-    datasets
-        The dataset collection
-
-    Returns
-    -------
-    pd.DataFrame
-        The deduplicated dataset collection spanning the times requested
-    """
-
-    def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
-        first = group.iloc[0].copy()
-        first.time_start = group.time_start.min()
-        first.time_end = group.time_end.max()
-
-        return first
-
-    return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()
-
-
 def process_sample_data_request(
     request: DataRequest, decimate: bool, output_directory: Path, quiet: bool
 ) -> None:
@@ -61,7 +32,6 @@ def process_sample_data_request(
         Whether to suppress progress messages
     """
     datasets = request.fetch_datasets()
-    datasets = deduplicate_datasets(datasets)
 
     for _, dataset in datasets.iterrows():
         for ds_filename in dataset["files"]:
@@ -183,6 +153,8 @@ def process_sample_data_request(
         remove_ensembles=False,
         time_span=("2002", "2016"),
     ),
+    # All unpublished obs4MIPs datasets
+    Obs4REFRequest(),
 ]
 
 
diff --git a/src/ref_sample_data/__init__.py b/src/ref_sample_data/__init__.py
@@ -10,5 +10,6 @@
 from .data_request.base import DataRequest
 from .data_request.cmip6 import CMIP6Request
 from .data_request.obs4mips import Obs4MIPsRequest
+from .data_request.obs4ref import Obs4REFRequest
 
-__all__ = ["CMIP6Request", "DataRequest", "Obs4MIPsRequest"]
+__all__ = ["CMIP6Request", "DataRequest", "Obs4MIPsRequest", "Obs4REFRequest"]
diff --git a/src/ref_sample_data/data_request/base.py b/src/ref_sample_data/data_request/base.py
@@ -34,6 +34,34 @@ def generate_filename(
         ...
 
 
+def _deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
+    """
+    Deduplicate a dataset collection.
+
+    Uses the metadata from the first dataset in each group,
+    but expands the time range to the min/max timespan of the group.
+
+    Parameters
+    ----------
+    datasets
+        The dataset collection
+
+    Returns
+    -------
+    pd.DataFrame
+        The deduplicated dataset collection spanning the times requested
+    """
+
+    def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
+        first = group.iloc[0].copy()
+        first.time_start = group.time_start.min()
+        first.time_end = group.time_end.max()
+
+        return first
+
+    return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()
+
+
 class IntakeESGFDataRequest(DataRequest):
     """
     A data request that fetches datasets from ESGF using intake-esgf.
@@ -56,4 +84,4 @@ def fetch_datasets(self) -> pd.DataFrame:
         if self.time_span:
             merged_df["time_start"] = self.time_span[0]
             merged_df["time_end"] = self.time_span[1]
-        return merged_df
+        return _deduplicate_datasets(merged_df)
diff --git a/src/ref_sample_data/data_request/cmip6.py b/src/ref_sample_data/data_request/cmip6.py
@@ -94,8 +94,6 @@ def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
         ----------
         dataset
             The dataset to downscale
-        time_span
-            The time span to extract from a dataset
 
         Returns
         -------
diff --git a/src/ref_sample_data/data_request/obs4mips.py b/src/ref_sample_data/data_request/obs4mips.py
@@ -73,8 +73,6 @@ def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
         ----------
         dataset
             The dataset to downscale
-        time_span
-            The time span to extract from a dataset
 
         Returns
         -------
diff --git a/src/ref_sample_data/data_request/obs4ref.py b/src/ref_sample_data/data_request/obs4ref.py
@@ -0,0 +1,88 @@
+import pathlib
+from pathlib import Path
+
+import pandas as pd
+import xarray as xr
+from cmip_ref.dataset_registry import build_reference_data_registry
+
+from ref_sample_data.data_request.base import DataRequest
+from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear
+
+
+class Obs4REFRequest(DataRequest):
+    """
+    Fetch the unpublished Obs4MIPs datasets from the PMP registry
+
+    This includes all files that would be downloaded if you ran:
+    ```
+    ref datasets fetch-obs4ref-data --output-data ...
+    ```
+    """
+
+    def fetch_datasets(self) -> pd.DataFrame:
+        """
+        Fetch the datasets from the source
+
+        Returns a dataframe of the metadata and paths to the fetched datasets.
+        """
+        registry = build_reference_data_registry()
+
+        datasets = []
+        for key in registry.registry.keys():
+            dataset_path = registry.fetch(key)
+            datasets.append(
+                {
+                    "key": key,
+                    "files": [dataset_path],
+                }
+            )
+        return pd.DataFrame(datasets)
+
+    def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
+        """
+        Downscale the dataset to a smaller size.
+
+        Parameters
+        ----------
+        dataset
+            The dataset to downscale
+
+        Returns
+        -------
+        xr.Dataset
+            The downscaled dataset
+        """
+        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
+        has_ij = "i" in dataset.dims and "j" in dataset.dims
+
+        if has_latlon:
+            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
+
+            result = decimate_rectilinear(dataset)
+        elif has_ij:
+            # 2d curvilinear grid (generally ocean variables)
+            result = decimate_curvilinear(dataset)
+        else:
+            raise ValueError("Cannot decimate this grid: too many dimensions")
+
+        return result
+
+    def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: pathlib.Path) -> Path:
+        """
+        Create the output filename for the dataset.
+
+        Parameters
+        ----------
+        metadata
+            Metadata from the file
+        ds
+            Loaded dataset
+
+        ds_filename
+            Filename of the dataset (Unused)
+
+        Returns
+        -------
+            The output filename
+        """
+        return Path("obs4REF") / metadata.key

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Add datasets from PMP that are not yet published on obs4MIPs`