From 5b3a12f44767614842b90a2a03716f0216a2b1ba Mon Sep 17 00:00:00 2001 From: thodson Date: Fri, 19 Jul 2024 12:08:50 -0500 Subject: [PATCH 1/4] Add rechunking example --- examples/rechunking/Dockerfile_virtualizarr | 59 +++++++++++++++ examples/rechunking/README.md | 15 ++++ examples/rechunking/cubed-rechunk.py | 81 +++++++++++++++++++++ examples/rechunking/requirements.txt | 9 +++ 4 files changed, 164 insertions(+) create mode 100644 examples/rechunking/Dockerfile_virtualizarr create mode 100644 examples/rechunking/README.md create mode 100644 examples/rechunking/cubed-rechunk.py create mode 100644 examples/rechunking/requirements.txt diff --git a/examples/rechunking/Dockerfile_virtualizarr b/examples/rechunking/Dockerfile_virtualizarr new file mode 100644 index 00000000..59f16193 --- /dev/null +++ b/examples/rechunking/Dockerfile_virtualizarr @@ -0,0 +1,59 @@ +# Python 3.11 +FROM python:3.11-slim-buster + +COPY requirements.txt requirements.txt + +RUN apt-get update \ + # Install aws-lambda-cpp build dependencies + && apt-get install -y \ + g++ \ + make \ + cmake \ + unzip \ + # cleanup package lists, they are not used anymore in this image + && rm -rf /var/lib/apt/lists/* \ + && apt-cache search linux-headers-generic + +ARG FUNCTION_DIR="/function" + +# Copy function code +RUN mkdir -p ${FUNCTION_DIR} + +# Update pip +# NB botocore/boto3 are pinned due to https://github.com/boto/boto3/issues/3648 +# using versions from https://github.com/aio-libs/aiobotocore/blob/72b8dd5d7d4ef2f1a49a0ae0c37b47e5280e2070/setup.py +# due to s3fs dependency +RUN pip install --upgrade --ignore-installed pip wheel six setuptools \ + && pip install --upgrade --no-cache-dir --ignore-installed \ + awslambdaric \ + botocore==1.29.76 \ + boto3==1.26.76 \ + redis \ + httplib2 \ + requests \ + numpy \ + scipy \ + pandas \ + pika \ + kafka-python \ + cloudpickle \ + ps-mem \ + tblib + +# Set working directory to function root directory +WORKDIR ${FUNCTION_DIR} + +# Add Lithops +COPY 
lithops_lambda.zip ${FUNCTION_DIR} +RUN unzip lithops_lambda.zip \ + && rm lithops_lambda.zip \ + && mkdir handler \ + && touch handler/__init__.py \ + && mv entry_point.py handler/ + +# Put your dependencies here, using RUN pip install... or RUN apt install... + +RUN pip install --no-cache-dir -r requirements.txt + +ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] +CMD [ "handler.entry_point.lambda_handler" ] diff --git a/examples/rechunking/README.md b/examples/rechunking/README.md new file mode 100644 index 00000000..9789dc2e --- /dev/null +++ b/examples/rechunking/README.md @@ -0,0 +1,15 @@ +# Example rechunking workflow + +1. Set up a Python environment +```bash +conda create --name virtualizarr-rechunk -y python=3.11 +conda activate virtualizarr-rechunk +pip install -r requirements.txt +``` + +1. Set up cubed executor by following https://github.com/cubed-dev/cubed/blob/main/examples/lithops/aws/README.md + +1. Build a runtime image for Cubed +```bash +lithops runtime build -b aws_lambda -f Dockerfile_aws_lambda virtualizarr-runtime +``` diff --git a/examples/rechunking/cubed-rechunk.py b/examples/rechunking/cubed-rechunk.py new file mode 100644 index 00000000..eeb51f7b --- /dev/null +++ b/examples/rechunking/cubed-rechunk.py @@ -0,0 +1,81 @@ +# Example rechunking flow using VirtualiZarr and cubed. 
+ +import fsspec +import lithops +import xarray as xr + +from virtualizarr import open_virtual_dataset +from cubed.primitive.rechunk import rechunk + +# example workflow based on Pythia's kerchunk cookbook: +# https://projectpythia.org/kerchunk-cookbook/notebooks/foundations/02_kerchunk_multi_file.html +fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True) +files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*") +file_pattern = sorted(["s3://" + f for f in files_paths]) + +# truncate file_pattern while debugging +file_pattern = file_pattern[:4] + +print(f"{len(file_pattern)} file paths were retrieved.") + + +def map_reference(fil): + """ Map + """ + vds = open_virtual_dataset(fil, + indexes={}, + loadable_variables=['Time'], + cftime_variables=['Time'], + ) + return vds + + +def reduce_reference(results): + """ + """ + combined_vds = xr.combine_nested( + results, + concat_dim=['Time'], + coords='minimal', + compat='override', + ) + # possibly write parquet to s3 here + return combined_vds + + +fexec = lithops.FunctionExecutor() # config=lambda_config + +futures = fexec.map_reduce( + map_reference, + file_pattern, + reduce_reference, + spawn_reducer=100 + ) + +ds = futures.get_result() + +ds.virtualize.to_kerchunk('combined.json', format='json') + +# in notebooks, open_dataset must be caching the json, such that changes +# to the json are not propogated until the kernel is restarted +combined_ds = xr.open_dataset('combined.json', engine="kerchunk") +combined_ds['Time'].attrs = {} # to_zarr complains about attrs + + +source_chunks = {'Time': 1, 'south_north': 250, 'west_east': 320} +target_chunks = {'Time': 5, 'south_north': 25, 'west_east': 32} + +combined_chunked = combined_ds.chunk( + chunks=source_chunks, +) + +# rechunk requires shape attr, so can't pass full Dataset +rechunk( + combined_chunked['TMAX'], + target_chunks=target_chunks, + source_array_name='virtual', + int_array_name='temp', + allowed_mem=2000, + 
reserved_mem=1000, + target_store="test.zarr", +) diff --git a/examples/rechunking/requirements.txt b/examples/rechunking/requirements.txt new file mode 100644 index 00000000..58a73097 --- /dev/null +++ b/examples/rechunking/requirements.txt @@ -0,0 +1,9 @@ +boto +cftime +lithops +cubed +h5py +kerchunk +s3fs +virtualizarr +xarray \ No newline at end of file From 4fbe5c77eb9c4ac460b0a2bcc49e4ec4cf1ece1f Mon Sep 17 00:00:00 2001 From: thodson Date: Sat, 20 Jul 2024 13:16:39 -0500 Subject: [PATCH 2/4] Fix Dockerfile --- examples/rechunking/Dockerfile_virtualizarr | 2 +- examples/rechunking/README.md | 7 +++++++ examples/rechunking/requirements.txt | 5 +++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/rechunking/Dockerfile_virtualizarr b/examples/rechunking/Dockerfile_virtualizarr index 59f16193..d1793c6a 100644 --- a/examples/rechunking/Dockerfile_virtualizarr +++ b/examples/rechunking/Dockerfile_virtualizarr @@ -1,7 +1,6 @@ # Python 3.11 FROM python:3.11-slim-buster -COPY requirements.txt requirements.txt RUN apt-get update \ # Install aws-lambda-cpp build dependencies @@ -53,6 +52,7 @@ RUN unzip lithops_lambda.zip \ # Put your dependencies here, using RUN pip install... or RUN apt install... +COPY requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] diff --git a/examples/rechunking/README.md b/examples/rechunking/README.md index 9789dc2e..0f115d8a 100644 --- a/examples/rechunking/README.md +++ b/examples/rechunking/README.md @@ -11,5 +11,12 @@ pip install -r requirements.txt 1. 
Build a runtime image for Cubed
 ```bash
+export LITHOPS_CONFIG_FILE=$(pwd)/config.aws
 lithops runtime build -b aws_lambda -f Dockerfile_aws_lambda virtualizarr-runtime
 ```
+
+## Cleaning up
+To rebuild the Lithops image, delete the existing one by running
+```bash
+lithops runtime delete -b aws_lambda -d virtualizarr-runtime
+```
diff --git a/examples/rechunking/requirements.txt b/examples/rechunking/requirements.txt
index 58a73097..dfd39b4b 100644
--- a/examples/rechunking/requirements.txt
+++ b/examples/rechunking/requirements.txt
@@ -1,9 +1,10 @@
 boto
 cftime
-lithops
 cubed
+cubed-xarray
 h5py
 kerchunk
+lithops
 s3fs
 virtualizarr
-xarray
\ No newline at end of file
+xarray

From 0beadcb20651fefd3935c101eca56f5d62906868 Mon Sep 17 00:00:00 2001
From: thodson
Date: Wed, 24 Jul 2024 15:09:32 -0500
Subject: [PATCH 3/4] Complete rechunk workflow

---
 examples/rechunking/README.md        | 10 +++++-
 examples/rechunking/cubed-rechunk.py | 56 ++++++++++++++--------------
 examples/rechunking/cubed.yaml       |  7 ++++
 3 files changed, 43 insertions(+), 30 deletions(-)
 create mode 100644 examples/rechunking/cubed.yaml

diff --git a/examples/rechunking/README.md b/examples/rechunking/README.md
index 0f115d8a..44b65ff6 100644
--- a/examples/rechunking/README.md
+++ b/examples/rechunking/README.md
@@ -8,11 +8,19 @@ pip install -r requirements.txt
 ```
 
 1. Set up cubed executor by following https://github.com/cubed-dev/cubed/blob/main/examples/lithops/aws/README.md
+```bash
+export CUBED_CONFIG=$(pwd)
+```
 
 1. Build a runtime image for Cubed
 ```bash
 export LITHOPS_CONFIG_FILE=$(pwd)/config.aws
-lithops runtime build -b aws_lambda -f Dockerfile_aws_lambda virtualizarr-runtime
+lithops runtime build -b aws_lambda -f Dockerfile_virtualizarr virtualizarr-runtime
+```
+
+1. 
Run the script +```bash +python cubed-rechunk.py ``` ## Cleaning up diff --git a/examples/rechunking/cubed-rechunk.py b/examples/rechunking/cubed-rechunk.py index eeb51f7b..24dea54c 100644 --- a/examples/rechunking/cubed-rechunk.py +++ b/examples/rechunking/cubed-rechunk.py @@ -1,14 +1,14 @@ -# Example rechunking flow using VirtualiZarr and cubed. +# Example rechunking flow using VirtualiZarr and cubed based on Pythia's +# kerchunk cookbook: https://projectpythia.org/kerchunk-cookbook +# The example proves concept, but should be optimized before production use. +# Please, suggest improvements. import fsspec import lithops import xarray as xr from virtualizarr import open_virtual_dataset -from cubed.primitive.rechunk import rechunk -# example workflow based on Pythia's kerchunk cookbook: -# https://projectpythia.org/kerchunk-cookbook/notebooks/foundations/02_kerchunk_multi_file.html fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True) files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*") file_pattern = sorted(["s3://" + f for f in files_paths]) @@ -19,8 +19,8 @@ print(f"{len(file_pattern)} file paths were retrieved.") -def map_reference(fil): - """ Map +def map_references(fil): + """ Open a virtual datasets from a list of file paths. """ vds = open_virtual_dataset(fil, indexes={}, @@ -30,8 +30,9 @@ def map_reference(fil): return vds -def reduce_reference(results): - """ +def reduce_references(results): + """ Combine virtual datasets into a single dataset. 
+ """ combined_vds = xr.combine_nested( results, @@ -46,9 +47,9 @@ def reduce_reference(results): fexec = lithops.FunctionExecutor() # config=lambda_config futures = fexec.map_reduce( - map_reference, + map_references, file_pattern, - reduce_reference, + reduce_references, spawn_reducer=100 ) @@ -56,26 +57,23 @@ def reduce_reference(results): ds.virtualize.to_kerchunk('combined.json', format='json') -# in notebooks, open_dataset must be caching the json, such that changes -# to the json are not propogated until the kernel is restarted -combined_ds = xr.open_dataset('combined.json', engine="kerchunk") -combined_ds['Time'].attrs = {} # to_zarr complains about attrs - +# In jupyter, open_dataset seems to cache the json, such that +# changes aren't propogated until the kernel is restarted. +combined_ds = xr.open_dataset('combined.json', + engine="kerchunk", + chunks={}, + chunked_array_type='cubed', + ) -source_chunks = {'Time': 1, 'south_north': 250, 'west_east': 320} -target_chunks = {'Time': 5, 'south_north': 25, 'west_east': 32} +combined_ds['Time'].attrs = {} # to_zarr complains about attrs -combined_chunked = combined_ds.chunk( - chunks=source_chunks, +rechunked_ds = combined_ds.chunk( + chunks={'Time': 5, 'south_north': 25, 'west_east': 32} ) -# rechunk requires shape attr, so can't pass full Dataset -rechunk( - combined_chunked['TMAX'], - target_chunks=target_chunks, - source_array_name='virtual', - int_array_name='temp', - allowed_mem=2000, - reserved_mem=1000, - target_store="test.zarr", -) +rechunked_ds.to_zarr('rechunked.zarr', + mode='w', + encoding={}, # TODO + consolidated=True, + safe_chunks=False, + ) diff --git a/examples/rechunking/cubed.yaml b/examples/rechunking/cubed.yaml new file mode 100644 index 00000000..b4d2173c --- /dev/null +++ b/examples/rechunking/cubed.yaml @@ -0,0 +1,7 @@ +spec: + work_dir: "s3://cubed-$USER-temp" + allowed_mem: "2GB" + executor_name: "lithops" + executor_options: + runtime: "virtualizarr-runtime" + runtime_memory: 2000 
From 851f6d86231985cf53190ac2b41d4531d9bd0d4a Mon Sep 17 00:00:00 2001
From: thodson
Date: Wed, 24 Jul 2024 15:37:29 -0500
Subject: [PATCH 4/4] Edit doc

---
 examples/rechunking/cubed-rechunk.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/rechunking/cubed-rechunk.py b/examples/rechunking/cubed-rechunk.py
index 24dea54c..ab803c8d 100644
--- a/examples/rechunking/cubed-rechunk.py
+++ b/examples/rechunking/cubed-rechunk.py
@@ -1,7 +1,7 @@
-# Example rechunking flow using VirtualiZarr and cubed based on Pythia's
-# kerchunk cookbook: https://projectpythia.org/kerchunk-cookbook
-# The example proves concept, but should be optimized before production use.
-# Please, suggest improvements.
+# Example rechunking flow using VirtualiZarr, cubed, and xarray-cubed.
+# Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook
+# This script proves the concept but requires further optimization for
+# production. Please suggest improvements.
 
 import fsspec
 import lithops