From 5b3a12f44767614842b90a2a03716f0216a2b1ba Mon Sep 17 00:00:00 2001 From: thodson Date: Fri, 19 Jul 2024 12:08:50 -0500 Subject: [PATCH 1/4] Add rechunking example --- examples/rechunking/Dockerfile_virtualizarr | 59 +++++++++++++++ examples/rechunking/README.md | 15 ++++ examples/rechunking/cubed-rechunk.py | 81 +++++++++++++++++++++ examples/rechunking/requirements.txt | 9 +++ 4 files changed, 164 insertions(+) create mode 100644 examples/rechunking/Dockerfile_virtualizarr create mode 100644 examples/rechunking/README.md create mode 100644 examples/rechunking/cubed-rechunk.py create mode 100644 examples/rechunking/requirements.txt diff --git a/examples/rechunking/Dockerfile_virtualizarr b/examples/rechunking/Dockerfile_virtualizarr new file mode 100644 index 00000000..59f16193 --- /dev/null +++ b/examples/rechunking/Dockerfile_virtualizarr @@ -0,0 +1,59 @@ +# Python 3.11 +FROM python:3.11-slim-buster + +COPY requirements.txt requirements.txt + +RUN apt-get update \ + # Install aws-lambda-cpp build dependencies + && apt-get install -y \ + g++ \ + make \ + cmake \ + unzip \ + # cleanup package lists, they are not used anymore in this image + && rm -rf /var/lib/apt/lists/* \ + && apt-cache search linux-headers-generic + +ARG FUNCTION_DIR="/function" + +# Copy function code +RUN mkdir -p ${FUNCTION_DIR} + +# Update pip +# NB botocore/boto3 are pinned due to https://github.com/boto/boto3/issues/3648 +# using versions from https://github.com/aio-libs/aiobotocore/blob/72b8dd5d7d4ef2f1a49a0ae0c37b47e5280e2070/setup.py +# due to s3fs dependency +RUN pip install --upgrade --ignore-installed pip wheel six setuptools \ + && pip install --upgrade --no-cache-dir --ignore-installed \ + awslambdaric \ + botocore==1.29.76 \ + boto3==1.26.76 \ + redis \ + httplib2 \ + requests \ + numpy \ + scipy \ + pandas \ + pika \ + kafka-python \ + cloudpickle \ + ps-mem \ + tblib + +# Set working directory to function root directory +WORKDIR ${FUNCTION_DIR} + +# Add Lithops +COPY 
lithops_lambda.zip ${FUNCTION_DIR} +RUN unzip lithops_lambda.zip \ + && rm lithops_lambda.zip \ + && mkdir handler \ + && touch handler/__init__.py \ + && mv entry_point.py handler/ + +# Put your dependencies here, using RUN pip install... or RUN apt install... + +RUN pip install --no-cache-dir -r requirements.txt + +ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] +CMD [ "handler.entry_point.lambda_handler" ] diff --git a/examples/rechunking/README.md b/examples/rechunking/README.md new file mode 100644 index 00000000..9789dc2e --- /dev/null +++ b/examples/rechunking/README.md @@ -0,0 +1,15 @@ +# Example rechunking workflow + +1. Set up a Python environment +```bash +conda create --name virtualizarr-rechunk -y python=3.11 +conda activate virtualizarr-rechunk +pip install -r requirements.txt +``` + +1. Set up cubed executor by following https://github.com/cubed-dev/cubed/blob/main/examples/lithops/aws/README.md + +1. Build a runtime image for Cubed +```bash +lithops runtime build -b aws_lambda -f Dockerfile_aws_lambda virtualizarr-runtime +``` diff --git a/examples/rechunking/cubed-rechunk.py b/examples/rechunking/cubed-rechunk.py new file mode 100644 index 00000000..eeb51f7b --- /dev/null +++ b/examples/rechunking/cubed-rechunk.py @@ -0,0 +1,81 @@ +# Example rechunking flow using VirtualiZarr and cubed. 
+ +import fsspec +import lithops +import xarray as xr + +from virtualizarr import open_virtual_dataset +from cubed.primitive.rechunk import rechunk + +# example workflow based on Pythia's kerchunk cookbook: +# https://projectpythia.org/kerchunk-cookbook/notebooks/foundations/02_kerchunk_multi_file.html +fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True) +files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*") +file_pattern = sorted(["s3://" + f for f in files_paths]) + +# truncate file_pattern while debugging +file_pattern = file_pattern[:4] + +print(f"{len(file_pattern)} file paths were retrieved.") + + +def map_reference(fil): + """ Map + """ + vds = open_virtual_dataset(fil, + indexes={}, + loadable_variables=['Time'], + cftime_variables=['Time'], + ) + return vds + + +def reduce_reference(results): + """ + """ + combined_vds = xr.combine_nested( + results, + concat_dim=['Time'], + coords='minimal', + compat='override', + ) + # possibly write parquet to s3 here + return combined_vds + + +fexec = lithops.FunctionExecutor() # config=lambda_config + +futures = fexec.map_reduce( + map_reference, + file_pattern, + reduce_reference, + spawn_reducer=100 + ) + +ds = futures.get_result() + +ds.virtualize.to_kerchunk('combined.json', format='json') + +# in notebooks, open_dataset must be caching the json, such that changes +# to the json are not propogated until the kernel is restarted +combined_ds = xr.open_dataset('combined.json', engine="kerchunk") +combined_ds['Time'].attrs = {} # to_zarr complains about attrs + + +source_chunks = {'Time': 1, 'south_north': 250, 'west_east': 320} +target_chunks = {'Time': 5, 'south_north': 25, 'west_east': 32} + +combined_chunked = combined_ds.chunk( + chunks=source_chunks, +) + +# rechunk requires shape attr, so can't pass full Dataset +rechunk( + combined_chunked['TMAX'], + target_chunks=target_chunks, + source_array_name='virtual', + int_array_name='temp', + allowed_mem=2000, + 
reserved_mem=1000, + target_store="test.zarr", +) diff --git a/examples/rechunking/requirements.txt b/examples/rechunking/requirements.txt new file mode 100644 index 00000000..58a73097 --- /dev/null +++ b/examples/rechunking/requirements.txt @@ -0,0 +1,9 @@ +boto +cftime +lithops +cubed +h5py +kerchunk +s3fs +virtualizarr +xarray \ No newline at end of file From 4fbe5c77eb9c4ac460b0a2bcc49e4ec4cf1ece1f Mon Sep 17 00:00:00 2001 From: thodson Date: Sat, 20 Jul 2024 13:16:39 -0500 Subject: [PATCH 2/4] Fix Dockerfile --- examples/rechunking/Dockerfile_virtualizarr | 2 +- examples/rechunking/README.md | 7 +++++++ examples/rechunking/requirements.txt | 5 +++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/rechunking/Dockerfile_virtualizarr b/examples/rechunking/Dockerfile_virtualizarr index 59f16193..d1793c6a 100644 --- a/examples/rechunking/Dockerfile_virtualizarr +++ b/examples/rechunking/Dockerfile_virtualizarr @@ -1,7 +1,6 @@ # Python 3.11 FROM python:3.11-slim-buster -COPY requirements.txt requirements.txt RUN apt-get update \ # Install aws-lambda-cpp build dependencies @@ -53,6 +52,7 @@ RUN unzip lithops_lambda.zip \ # Put your dependencies here, using RUN pip install... or RUN apt install... +COPY requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] diff --git a/examples/rechunking/README.md b/examples/rechunking/README.md index 9789dc2e..0f115d8a 100644 --- a/examples/rechunking/README.md +++ b/examples/rechunking/README.md @@ -11,5 +11,12 @@ pip install -r requirements.txt 1. 
Build a runtime image for Cubed
 ```bash
+export LITHOPS_CONFIG_FILE=$(pwd)/config.aws
 lithops runtime build -b aws_lambda -f Dockerfile_aws_lambda virtualizarr-runtime
 ```
+
+## Cleaning up
+To rebuild the Lithops image, delete the existing one by running
+```bash
+lithops runtime delete -b aws_lambda -d virtualizarr-runtime
+```
diff --git a/examples/rechunking/requirements.txt b/examples/rechunking/requirements.txt
index 58a73097..dfd39b4b 100644
--- a/examples/rechunking/requirements.txt
+++ b/examples/rechunking/requirements.txt
@@ -1,9 +1,10 @@
 boto
 cftime
-lithops
 cubed
+cubed-xarray
 h5py
 kerchunk
+lithops
 s3fs
 virtualizarr
-xarray
\ No newline at end of file
+xarray

From 0beadcb20651fefd3935c101eca56f5d62906868 Mon Sep 17 00:00:00 2001
From: thodson
Date: Wed, 24 Jul 2024 15:09:32 -0500
Subject: [PATCH 3/4] Complete rechunk workflow

---
 examples/rechunking/README.md        | 10 +++++-
 examples/rechunking/cubed-rechunk.py | 56 ++++++++++++++--------------
 examples/rechunking/cubed.yaml       |  7 ++++
 3 files changed, 43 insertions(+), 30 deletions(-)
 create mode 100644 examples/rechunking/cubed.yaml

diff --git a/examples/rechunking/README.md b/examples/rechunking/README.md
index 0f115d8a..44b65ff6 100644
--- a/examples/rechunking/README.md
+++ b/examples/rechunking/README.md
@@ -8,11 +8,19 @@ pip install -r requirements.txt
 ```
 
 1. Set up cubed executor by following https://github.com/cubed-dev/cubed/blob/main/examples/lithops/aws/README.md
+```bash
+export CUBED_CONFIG=$(pwd)
+```
 
 1. Build a runtime image for Cubed
 ```bash
 export LITHOPS_CONFIG_FILE=$(pwd)/config.aws
-lithops runtime build -b aws_lambda -f Dockerfile_aws_lambda virtualizarr-runtime
+lithops runtime build -b aws_lambda -f Dockerfile_virtualizarr virtualizarr-runtime
+```
+
+1. 
Run the script +```bash +python cubed-rechunk.py ``` ## Cleaning up diff --git a/examples/rechunking/cubed-rechunk.py b/examples/rechunking/cubed-rechunk.py index eeb51f7b..24dea54c 100644 --- a/examples/rechunking/cubed-rechunk.py +++ b/examples/rechunking/cubed-rechunk.py @@ -1,14 +1,14 @@ -# Example rechunking flow using VirtualiZarr and cubed. +# Example rechunking flow using VirtualiZarr and cubed based on Pythia's +# kerchunk cookbook: https://projectpythia.org/kerchunk-cookbook +# The example proves concept, but should be optimized before production use. +# Please, suggest improvements. import fsspec import lithops import xarray as xr from virtualizarr import open_virtual_dataset -from cubed.primitive.rechunk import rechunk -# example workflow based on Pythia's kerchunk cookbook: -# https://projectpythia.org/kerchunk-cookbook/notebooks/foundations/02_kerchunk_multi_file.html fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True) files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*") file_pattern = sorted(["s3://" + f for f in files_paths]) @@ -19,8 +19,8 @@ print(f"{len(file_pattern)} file paths were retrieved.") -def map_reference(fil): - """ Map +def map_references(fil): + """ Open a virtual datasets from a list of file paths. """ vds = open_virtual_dataset(fil, indexes={}, @@ -30,8 +30,9 @@ def map_reference(fil): return vds -def reduce_reference(results): - """ +def reduce_references(results): + """ Combine virtual datasets into a single dataset. 
+ """ combined_vds = xr.combine_nested( results, @@ -46,9 +47,9 @@ def reduce_reference(results): fexec = lithops.FunctionExecutor() # config=lambda_config futures = fexec.map_reduce( - map_reference, + map_references, file_pattern, - reduce_reference, + reduce_references, spawn_reducer=100 ) @@ -56,26 +57,23 @@ def reduce_reference(results): ds.virtualize.to_kerchunk('combined.json', format='json') -# in notebooks, open_dataset must be caching the json, such that changes -# to the json are not propogated until the kernel is restarted -combined_ds = xr.open_dataset('combined.json', engine="kerchunk") -combined_ds['Time'].attrs = {} # to_zarr complains about attrs - +# In jupyter, open_dataset seems to cache the json, such that +# changes aren't propogated until the kernel is restarted. +combined_ds = xr.open_dataset('combined.json', + engine="kerchunk", + chunks={}, + chunked_array_type='cubed', + ) -source_chunks = {'Time': 1, 'south_north': 250, 'west_east': 320} -target_chunks = {'Time': 5, 'south_north': 25, 'west_east': 32} +combined_ds['Time'].attrs = {} # to_zarr complains about attrs -combined_chunked = combined_ds.chunk( - chunks=source_chunks, +rechunked_ds = combined_ds.chunk( + chunks={'Time': 5, 'south_north': 25, 'west_east': 32} ) -# rechunk requires shape attr, so can't pass full Dataset -rechunk( - combined_chunked['TMAX'], - target_chunks=target_chunks, - source_array_name='virtual', - int_array_name='temp', - allowed_mem=2000, - reserved_mem=1000, - target_store="test.zarr", -) +rechunked_ds.to_zarr('rechunked.zarr', + mode='w', + encoding={}, # TODO + consolidated=True, + safe_chunks=False, + ) diff --git a/examples/rechunking/cubed.yaml b/examples/rechunking/cubed.yaml new file mode 100644 index 00000000..b4d2173c --- /dev/null +++ b/examples/rechunking/cubed.yaml @@ -0,0 +1,7 @@ +spec: + work_dir: "s3://cubed-$USER-temp" + allowed_mem: "2GB" + executor_name: "lithops" + executor_options: + runtime: "virtualizarr-runtime" + runtime_memory: 2000 
From 851f6d86231985cf53190ac2b41d4531d9bd0d4a Mon Sep 17 00:00:00 2001
From: thodson
Date: Wed, 24 Jul 2024 15:37:29 -0500
Subject: [PATCH 4/4] Edit doc

---
 examples/rechunking/cubed-rechunk.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/rechunking/cubed-rechunk.py b/examples/rechunking/cubed-rechunk.py
index 24dea54c..ab803c8d 100644
--- a/examples/rechunking/cubed-rechunk.py
+++ b/examples/rechunking/cubed-rechunk.py
@@ -1,7 +1,7 @@
-# Example rechunking flow using VirtualiZarr and cubed based on Pythia's
-# kerchunk cookbook: https://projectpythia.org/kerchunk-cookbook
-# The example proves concept, but should be optimized before production use.
-# Please, suggest improvements.
+# Example rechunking flow using VirtualiZarr, cubed, and xarray-cubed.
+# Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook
+# This script proves the concept but requires further optimization for
+# production. Please suggest improvements.
 
 import fsspec
 import lithops