From 9cabff8d5cf22fa0c9ff9643e1f18e19c04788a7 Mon Sep 17 00:00:00 2001
From: Sean Quinlan <1011062+sbquinlan@users.noreply.github.com>
Date: Wed, 25 Oct 2023 22:12:23 -0700
Subject: [PATCH] MUR SST Zarr Recipe

---
 recipes/mursst/meta.yaml        |  20 +++++++
 recipes/mursst/recipe.py        | 103 ++++++++++++++++++++++++++++++++
 recipes/mursst/requirements.txt |   3 +
 3 files changed, 126 insertions(+)
 create mode 100644 recipes/mursst/meta.yaml
 create mode 100644 recipes/mursst/recipe.py
 create mode 100644 recipes/mursst/requirements.txt

diff --git a/recipes/mursst/meta.yaml b/recipes/mursst/meta.yaml
new file mode 100644
index 0000000000..3ebd101603
--- /dev/null
+++ b/recipes/mursst/meta.yaml
@@ -0,0 +1,20 @@
+title: 'GHRSST Level 4 MUR Global Foundation Sea Surface Temperature Analysis (v4.1)'
+description: 'A Group for High Resolution Sea Surface Temperature (GHRSST) Level 4 sea surface temperature analysis produced as a retrospective dataset (four day latency) and near-real-time dataset (one day latency) at the JPL Physical Oceanography DAAC using wavelets as basis functions in an optimal interpolation approach on a global 0.01 degree grid'
+pangeo_forge_version: '0.9.2'
+recipes:
+  - id: MUR-JPL-L4-GLOB-v4.1
+    object: 'recipe:recipe'
+provenance:
+  providers:
+    - name: 'NASA JPL PO.DAAC'
+      description: 'Physical Oceanography Distributed Active Archive Center'
+      roles:
+        - producer
+        - licensor
+      url: https://podaac.jpl.nasa.gov/dataset/MUR-JPL-L4-GLOB-v4.1
+  license: 'Open Data'
+maintainers:
+  - name: 'Development Seed'
+    github: developmentseed
+bakery:
+  id: 'pangeo-ldeo-nsf-earthcube'
diff --git a/recipes/mursst/recipe.py b/recipes/mursst/recipe.py
new file mode 100644
index 0000000000..41c9872aba
--- /dev/null
+++ b/recipes/mursst/recipe.py
@@ -0,0 +1,103 @@
+import base64
+import json
+import os
+
+import apache_beam as beam
+import requests
+from cmr import GranuleQuery
+
+from pangeo_forge_recipes.patterns import pattern_from_file_sequence
+from pangeo_forge_recipes.transforms import (
+    Indexed,
+    OpenURLWithFSSpec,
+    OpenWithXarray,
+    StoreToZarr,
+    T,
+)
+
+HTTP_REL = 'http://esipfed.org/ns/fedsearch/1.1/data#'
+S3_REL = 'http://esipfed.org/ns/fedsearch/1.1/s3#'
+AUTH_HEADERS = {'headers': {'Authorization': f"Bearer {os.environ['EARTHDATA_TOKEN']}"}}
+CREDENTIALS_API = 'https://archive.podaac.earthdata.nasa.gov/s3credentials'
+
+
+def earthdata_auth(username, password):
+    login_resp = requests.get(CREDENTIALS_API, allow_redirects=False)
+    login_resp.raise_for_status()
+
+    encoded_auth = base64.b64encode(f'{username}:{password}'.encode('ascii'))
+    auth_redirect = requests.post(
+        login_resp.headers['location'],
+        data={'credentials': encoded_auth},
+        headers={'Origin': CREDENTIALS_API},
+        allow_redirects=False,
+    )
+    auth_redirect.raise_for_status()
+
+    final = requests.get(auth_redirect.headers['location'], allow_redirects=False)
+
+    results = requests.get(CREDENTIALS_API, cookies={'accessToken': final.cookies['accessToken']})
+    results.raise_for_status()
+
+    creds = json.loads(results.content)
+    return {
+        'aws_access_key_id': creds['accessKeyId'],
+        'aws_secret_access_key': creds['secretAccessKey'],
+        'aws_session_token': creds['sessionToken'],
+    }
+
+
+def filter_data_links(links, rel):
+    return filter(lambda link: link['rel'] == rel and link['href'].endswith('.nc'), links)
+
+
+def gen_data_links(rel):
+    granules = GranuleQuery().short_name('MUR-JPL-L4-GLOB-v4.1').downloadable(True).get_all()
+    for granule in granules:
+        s3_links = filter_data_links(granule['links'], rel)
+        first = next(s3_links, None)
+        # throw if CMR does not have exactly one S3 link for an item
+        if not first or next(s3_links, None) is not None:
+            raise ValueError(f"Expected 1 link of type {rel} on {granule['title']}")
+        yield first['href']
+
+
+class Preprocess(beam.PTransform):
+    """Filters variables to only be the non-optional L4 variables."""
+
+    @staticmethod
+    def _preproc(item: Indexed[T]) -> Indexed[T]:
+        SELECTED_VARS = {'analysed_sst', 'analysis_error', 'mask', 'sea_ice_fraction'}
+        index, ds = item
+        return index, ds.drop([k for k in ds.data_vars.keys() if k not in SELECTED_VARS])
+
+    def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
+        return pcoll | beam.Map(self._preproc)
+
+
+# use HTTP_REL if S3 access is not possible. S3_REL is faster.
+selected_rel = S3_REL
+pattern = pattern_from_file_sequence(
+    list(gen_data_links(selected_rel)),
+    'time',
+)
+open_kwargs = (
+    AUTH_HEADERS
+    if selected_rel == HTTP_REL
+    else {
+        'client_kwargs': earthdata_auth(
+            os.environ['EARTHDATA_USERNAME'], os.environ['EARTHDATA_PASSWORD']
+        )
+    }
+)
+recipe = (
+    beam.Create(pattern.items())
+    | OpenURLWithFSSpec(open_kwargs=open_kwargs)
+    | OpenWithXarray(file_type=pattern.file_type)
+    | Preprocess()
+    | StoreToZarr(
+        store_name='mursst.zarr',
+        combine_dims=pattern.combine_dim_keys,
+        target_chunks={'time': 1, 'lat': 1800, 'lon': 3600},
+    )
+)
diff --git a/recipes/mursst/requirements.txt b/recipes/mursst/requirements.txt
new file mode 100644
index 0000000000..3805faeb55
--- /dev/null
+++ b/recipes/mursst/requirements.txt
@@ -0,0 +1,3 @@
+s3fs==2023.10.0
+boto3==1.28.71
+python-cmr==0.9.0