From d3de58074a77aeb1896f842ea4ed69514543a05d Mon Sep 17 00:00:00 2001
From: norlandrhagen
Date: Wed, 27 Aug 2025 15:00:45 -0600
Subject: [PATCH 1/4] update coiled and append examples to V2 syntax

---
 .gitignore                                    |    2 +-
 docs/examples.md                              |    8 +-
 examples/V1/append/noaa-cdr-sst.ipynb         | 2478 -----------------
 examples/V2/append/noaa-cdr-sst.ipynb         |  359 +++
 examples/{V1 => V2}/coiled/terraclimate.ipynb |  101 +-
 5 files changed, 428 insertions(+), 2520 deletions(-)
 delete mode 100644 examples/V1/append/noaa-cdr-sst.ipynb
 create mode 100644 examples/V2/append/noaa-cdr-sst.ipynb
 rename examples/{V1 => V2}/coiled/terraclimate.ipynb (71%)

diff --git a/.gitignore b/.gitignore
index a6e6ad270..7dd5d2e04 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,7 +161,7 @@ cython_debug/
 virtualizarr/_version.py
 docs/generated/
 docs/jupyter_execute/
-examples/
+*.DS_Store
 
 # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode
 # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode

diff --git a/docs/examples.md b/docs/examples.md
index 00859487b..aeffeba20 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -7,7 +7,7 @@ The following examples demonstrate the use of VirtualiZarr to create virtual
 datasets of various kinds:
 
-1. [Appending new daily NOAA SST data to Icechunk](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/append/noaa-cdr-sst.ipynb)
-2. [Parallel reference generation using Coiled Functions](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/coiled/terraclimate.ipynb)
-3. [Serverless parallel reference generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/virtualizarr-with-lithops)
-4. [MUR SST Virtual and Zarr Icechunk Store Generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/mursst-icechunk-with-lithops)
+1. [Appending new daily NOAA SST data to Icechunk](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/V2/append/noaa-cdr-sst.ipynb)
+2. [Parallel reference generation using Coiled Functions](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/V2/coiled/terraclimate.ipynb)
+3. [Serverless parallel reference generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/V1/virtualizarr-with-lithops)
+4. [MUR SST Virtual and Zarr Icechunk Store Generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/V1/mursst-icechunk-with-lithops)

diff --git a/examples/V1/append/noaa-cdr-sst.ipynb b/examples/V1/append/noaa-cdr-sst.ipynb
deleted file mode 100644
index 9ba0aa534..000000000
--- a/examples/V1/append/noaa-cdr-sst.ipynb
+++ /dev/null
@@ -1,2478 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "120903d6-ea52-4a1e-83d2-4d434ad2cb98",
-   "metadata": {},
-   "source": [
-    "# Appending to an Icechunk Store with Virtual References\n",
-    "## Note: This example uses a pre-2.0 release of VirtualiZarr\n",
-    "This notebook demonstrates how to append to an icechunk store.\n",
-    "\n",
-    "Please ensure the correct dependencies are installed before starting."
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "d09bbff3-4e96-4490-b837-14b78b64df35", - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install 'virtualizarr['icechunk','hdf']' ipykernel" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3055eff4-9e22-4a95-a7fd-96933f381183", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: icechunk\n", - "Version: 0.1.2\n", - "Summary: Icechunk Python\n", - "Home-page: https://github.com/earth-mover/icechunk\n", - "Author: Earthmover PBC\n", - "Author-email: Earthmover \n", - "License: Apache-2.0\n", - "Location: /opt/homebrew/envs/virtualizarr-tests/lib/python3.12/site-packages\n", - "Requires: zarr\n", - "Required-by: \n" - ] - } - ], - "source": [ - "!pip show icechunk" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "2f69f0bb-316b-452c-b1ba-4d7ef4afcf67", - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "import fsspec\n", - "import icechunk\n", - "import xarray as xr\n", - "\n", - "from virtualizarr import open_virtual_dataset\n", - "\n", - "warnings.filterwarnings(\"ignore\", category=UserWarning)" - ] - }, - { - "cell_type": "markdown", - "id": "0df547e4-456d-44c1-b190-606f0b9e056e", - "metadata": {}, - "source": [ - "# Before you start\n", - "\n", - "Identify the dataset you will be using and create a list of files to generate a virtual icechunk datastore with." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "1532c33b-804f-49fa-9fa9-0eb42ea87e26", - "metadata": {}, - "outputs": [], - "source": [ - "fs = fsspec.filesystem(\"s3\", anon=True)\n", - "\n", - "oisst_files = fs.glob(\n", - " \"s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.*.nc\"\n", - ")\n", - "\n", - "oisst_files = sorted([\"s3://\" + f for f in oisst_files])" - ] - }, - { - "cell_type": "markdown", - "id": "73ceb93b-b0ac-48b2-928a-84da0d2019ac", - "metadata": {}, - "source": [ - "## Create virtual datasets with VirtualiZarr's `open_virtual_dataset`" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "06bbec92-3974-4859-8bda-353afc7800b9", - "metadata": {}, - "outputs": [], - "source": [ - "so = dict(anon=True, default_fill_cache=False, default_cache_type=\"none\")\n", - "\n", - "virtual_datasets = [\n", - " open_virtual_dataset(url, indexes={}, reader_options={\"storage_options\": so})\n", - " for url in oisst_files[0:2]\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "77fb94c8-870f-4c9e-8421-ac9c17402122", - "metadata": {}, - "outputs": [], - "source": [ - "virtual_ds = xr.concat(\n", - " virtual_datasets,\n", - " dim=\"time\",\n", - " coords=\"minimal\",\n", - " compat=\"override\",\n", - " combine_attrs=\"override\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c025f35d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset> Size: 66MB\n",
-       "Dimensions:  (time: 2, zlev: 1, lat: 720, lon: 1440)\n",
-       "Coordinates:\n",
-       "    time     (time) float32 8B ManifestArray<shape=(2,), dtype=float32, chunk...\n",
-       "    zlev     (zlev) float32 4B ManifestArray<shape=(1,), dtype=float32, chunk...\n",
-       "    lat      (lat) float32 3kB ManifestArray<shape=(720,), dtype=float32, chu...\n",
-       "    lon      (lon) float32 6kB ManifestArray<shape=(1440,), dtype=float32, ch...\n",
-       "Data variables:\n",
-       "    sst      (time, zlev, lat, lon) float64 17MB ManifestArray<shape=(2, 1, 7...\n",
-       "    anom     (time, zlev, lat, lon) float64 17MB ManifestArray<shape=(2, 1, 7...\n",
-       "    err      (time, zlev, lat, lon) float64 17MB ManifestArray<shape=(2, 1, 7...\n",
-       "    ice      (time, zlev, lat, lon) float64 17MB ManifestArray<shape=(2, 1, 7...\n",
-       "Attributes: (12/37)\n",
-       "    Conventions:                CF-1.6, ACDD-1.3\n",
-       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...\n",
-       "    references:                 Reynolds, et al.(2007) Daily High-Resolution-...\n",
-       "    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n",
-       "    id:                         oisst-avhrr-v02r01.20240801.nc\n",
-       "    naming_authority:           gov.noaa.ncei\n",
-       "    ...                         ...\n",
-       "    time_coverage_start:        2024-08-01T00:00:00Z\n",
-       "    time_coverage_end:          2024-08-01T23:59:59Z\n",
-       "    metadata_link:              https://doi.org/10.25921/RE9P-PT57\n",
-       "    ncei_template_version:      NCEI_NetCDF_Grid_Template_v2.0\n",
-       "    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...\n",
-       "    sensor:                     Thermometer, AVHRR
" - ], - "text/plain": [ - " Size: 66MB\n", - "Dimensions: (time: 2, zlev: 1, lat: 720, lon: 1440)\n", - "Coordinates:\n", - " time (time) float32 8B ManifestArray\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset> Size: 66MB\n",
-       "Dimensions:  (lon: 1440, time: 2, zlev: 1, lat: 720)\n",
-       "Coordinates:\n",
-       "  * lon      (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n",
-       "  * time     (time) datetime64[ns] 16B 2024-08-01T12:00:00 2024-08-02T12:00:00\n",
-       "  * lat      (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n",
-       "  * zlev     (zlev) float32 4B 0.0\n",
-       "Data variables:\n",
-       "    sst      (time, zlev, lat, lon) float64 17MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
-       "    ice      (time, zlev, lat, lon) float64 17MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
-       "    anom     (time, zlev, lat, lon) float64 17MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
-       "    err      (time, zlev, lat, lon) float64 17MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
-       "Attributes: (12/37)\n",
-       "    Conventions:                CF-1.6, ACDD-1.3\n",
-       "    cdm_data_type:              Grid\n",
-       "    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...\n",
-       "    creator_email:              oisst-help@noaa.gov\n",
-       "    creator_url:                https://www.ncei.noaa.gov/\n",
-       "    date_created:               2024-08-16T09:12:00Z\n",
-       "    ...                         ...\n",
-       "    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n",
-       "    standard_name_vocabulary:   CF Standard Name Table (v40, 25 January 2017)\n",
-       "    summary:                    NOAAs 1/4-degree Daily Optimum Interpolation ...\n",
-       "    time_coverage_end:          2024-08-01T23:59:59Z\n",
-       "    time_coverage_start:        2024-08-01T00:00:00Z\n",
-       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...
" - ], - "text/plain": [ - " Size: 66MB\n", - "Dimensions: (lon: 1440, time: 2, zlev: 1, lat: 720)\n", - "Coordinates:\n", - " * lon (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n", - " * time (time) datetime64[ns] 16B 2024-08-01T12:00:00 2024-08-02T12:00:00\n", - " * lat (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n", - " * zlev (zlev) float32 4B 0.0\n", - "Data variables:\n", - " sst (time, zlev, lat, lon) float64 17MB dask.array\n", - " ice (time, zlev, lat, lon) float64 17MB dask.array\n", - " anom (time, zlev, lat, lon) float64 17MB dask.array\n", - " err (time, zlev, lat, lon) float64 17MB dask.array\n", - "Attributes: (12/37)\n", - " Conventions: CF-1.6, ACDD-1.3\n", - " cdm_data_type: Grid\n", - " comment: Data was converted from NetCDF-3 to NetCDF-4 ...\n", - " creator_email: oisst-help@noaa.gov\n", - " creator_url: https://www.ncei.noaa.gov/\n", - " date_created: 2024-08-16T09:12:00Z\n", - " ... ...\n", - " source: ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n", - " standard_name_vocabulary: CF Standard Name Table (v40, 25 January 2017)\n", - " summary: NOAAs 1/4-degree Daily Optimum Interpolation ...\n", - " time_coverage_end: 2024-08-01T23:59:59Z\n", - " time_coverage_start: 2024-08-01T00:00:00Z\n", - " title: NOAA/NCEI 1/4 Degree Daily Optimum Interpolat..." - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = xr.open_zarr(session.store, consolidated=False, zarr_format=3)\n", - "ds" - ] - }, - { - "cell_type": "markdown", - "id": "23dd5a13-9c0e-4132-9073-474c0af65920", - "metadata": {}, - "source": [ - "# Append\n", - "\n", - "That was all nothing new! Basically a repeat of what is in the [icechunk docs](https://icechunk.io/icechunk-python/virtual/). Here we follow the same steps to create a virtual dataset, but we add an `append_dim` argument to the `to_icechunk` function." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "190c25f9-e000-4b17-83eb-cf551141dfea", - "metadata": {}, - "outputs": [], - "source": [ - "virtual_datasets_a = [\n", - " open_virtual_dataset(\n", - " url, indexes={}, reader_options={\"storage_options\": {\"anon\": True}}\n", - " )\n", - " for url in oisst_files[2:4]\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "af330082-207a-4f08-aefe-fc15aa8b2eb3", - "metadata": {}, - "outputs": [], - "source": [ - "virtual_ds_a = xr.concat(\n", - " virtual_datasets_a,\n", - " dim=\"time\",\n", - " coords=\"minimal\",\n", - " compat=\"override\",\n", - " combine_attrs=\"override\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "b6137cc1-b996-4e60-8c12-01eb19930da6", - "metadata": {}, - "outputs": [], - "source": [ - "append_session = repo.writable_session(\"main\")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a465be46-bb81-4e36-b1b6-67c3b8e4b9ec", - "metadata": {}, - "outputs": [], - "source": [ - "virtual_ds_a.virtualize.to_icechunk(append_session.store, append_dim=\"time\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "e9908d2f-664b-4256-b9d4-842df2e512c3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'3MEW3ECB74ZYANAZZHT0'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "append_session.commit(\"wrote 2 more days of data\")" - ] - }, - { - "cell_type": "markdown", - "id": "e1384e99-c284-4942-a49b-7799802728b0", - "metadata": {}, - "source": [ - "# Check that it worked!" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "477094aa-2094-46e2-ae78-516fc2a51690", - "metadata": {}, - "outputs": [], - "source": [ - "read_session = repo.readonly_session(branch=\"main\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "47a53027-dbae-48aa-85d2-dcbc04e01e61", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset> Size: 133MB\n",
-       "Dimensions:  (zlev: 1, time: 4, lat: 720, lon: 1440)\n",
-       "Coordinates:\n",
-       "  * zlev     (zlev) float32 4B 0.0\n",
-       "  * lat      (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n",
-       "  * time     (time) datetime64[ns] 32B 2024-08-01T12:00:00 ... 2024-08-04T12:...\n",
-       "  * lon      (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n",
-       "Data variables:\n",
-       "    ice      (time, zlev, lat, lon) float64 33MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
-       "    err      (time, zlev, lat, lon) float64 33MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
-       "    sst      (time, zlev, lat, lon) float64 33MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
-       "    anom     (time, zlev, lat, lon) float64 33MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
-       "Attributes: (12/37)\n",
-       "    Conventions:                CF-1.6, ACDD-1.3\n",
-       "    cdm_data_type:              Grid\n",
-       "    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...\n",
-       "    creator_email:              oisst-help@noaa.gov\n",
-       "    creator_url:                https://www.ncei.noaa.gov/\n",
-       "    date_created:               2024-08-18T09:12:00Z\n",
-       "    ...                         ...\n",
-       "    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n",
-       "    standard_name_vocabulary:   CF Standard Name Table (v40, 25 January 2017)\n",
-       "    summary:                    NOAAs 1/4-degree Daily Optimum Interpolation ...\n",
-       "    time_coverage_end:          2024-08-03T23:59:59Z\n",
-       "    time_coverage_start:        2024-08-03T00:00:00Z\n",
-       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...
" - ], - "text/plain": [ - " Size: 133MB\n", - "Dimensions: (zlev: 1, time: 4, lat: 720, lon: 1440)\n", - "Coordinates:\n", - " * zlev (zlev) float32 4B 0.0\n", - " * lat (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n", - " * time (time) datetime64[ns] 32B 2024-08-01T12:00:00 ... 2024-08-04T12:...\n", - " * lon (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n", - "Data variables:\n", - " ice (time, zlev, lat, lon) float64 33MB dask.array\n", - " err (time, zlev, lat, lon) float64 33MB dask.array\n", - " sst (time, zlev, lat, lon) float64 33MB dask.array\n", - " anom (time, zlev, lat, lon) float64 33MB dask.array\n", - "Attributes: (12/37)\n", - " Conventions: CF-1.6, ACDD-1.3\n", - " cdm_data_type: Grid\n", - " comment: Data was converted from NetCDF-3 to NetCDF-4 ...\n", - " creator_email: oisst-help@noaa.gov\n", - " creator_url: https://www.ncei.noaa.gov/\n", - " date_created: 2024-08-18T09:12:00Z\n", - " ... ...\n", - " source: ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n", - " standard_name_vocabulary: CF Standard Name Table (v40, 25 January 2017)\n", - " summary: NOAAs 1/4-degree Daily Optimum Interpolation ...\n", - " time_coverage_end: 2024-08-03T23:59:59Z\n", - " time_coverage_start: 2024-08-03T00:00:00Z\n", - " title: NOAA/NCEI 1/4 Degree Daily Optimum Interpolat..." - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = xr.open_zarr(read_session.store, consolidated=False, zarr_format=3)\n", - "ds" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/V2/append/noaa-cdr-sst.ipynb b/examples/V2/append/noaa-cdr-sst.ipynb new file mode 100644 index 000000000..2fe5fad67 --- /dev/null +++ b/examples/V2/append/noaa-cdr-sst.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "120903d6-ea52-4a1e-83d2-4d434ad2cb98", + "metadata": {}, + "source": [ + "# Appending to an Icechunk Store with Virtual References\n", + "This notebook demonstrates how to append to an icechunk store.\n", + "\n", + "Please ensure the correct dependencies are installed before starting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d09bbff3-4e96-4490-b837-14b78b64df35", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install 'virtualizarr['icechunk','hdf']' ipykernel s3fs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f69f0bb-316b-452c-b1ba-4d7ef4afcf67", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "import fsspec\n", + "import icechunk\n", + "import xarray as xr\n", + "from obstore.store import from_url\n", + "\n", + "from virtualizarr import open_virtual_dataset\n", + "from virtualizarr.parsers import HDFParser\n", + "from virtualizarr.registry import ObjectStoreRegistry\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)" + ] + }, + { + "cell_type": "markdown", + "id": "0df547e4-456d-44c1-b190-606f0b9e056e", + "metadata": {}, + "source": [ + "# Before you start\n", + "\n", + "Identify the dataset you will be using and create a list of files to generate a virtual icechunk datastore with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1532c33b-804f-49fa-9fa9-0eb42ea87e26", + "metadata": {}, + "outputs": [], + "source": [ + "fs = fsspec.filesystem(\"s3\", anon=True)\n", + "\n", + "oisst_files = fs.glob(\n", + " \"s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.*.nc\"\n", + ")\n", + "\n", + "oisst_files = sorted([\"s3://\" + f for f in oisst_files])" + ] + }, + { + "cell_type": "markdown", + "id": "5f235fc4", + "metadata": {}, + "source": [ + "### Define our Virtualizarr `Parser` and `ObjectStoreRegistry`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a343e5b1", + "metadata": {}, + "outputs": [], + "source": [ + "bucket = \"s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds\"\n", + "store = from_url(bucket, region=\"us-east-1\", skip_signature=True)\n", + "registry = ObjectStoreRegistry({bucket: store})\n", + "parser = HDFParser()" + ] + }, + { + "cell_type": "markdown", + "id": "73ceb93b-b0ac-48b2-928a-84da0d2019ac", + "metadata": {}, + "source": [ + "## Create virtual datasets with VirtualiZarr's `open_virtual_dataset`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06bbec92-3974-4859-8bda-353afc7800b9", + "metadata": {}, + "outputs": [], + "source": [ + "virtual_datasets = [\n", + " open_virtual_dataset(\n", + " url=url,\n", + " parser=parser,\n", + " registry=registry,\n", + " loadable_variables=[\"time\", \"lat\", \"lon\", \"zlev\"],\n", + " )\n", + " for url in oisst_files[0:2]\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77fb94c8-870f-4c9e-8421-ac9c17402122", + "metadata": {}, + "outputs": [], + "source": [ + "virtual_ds = xr.concat(\n", + " virtual_datasets,\n", + " dim=\"time\",\n", + " coords=\"minimal\",\n", + " compat=\"override\",\n", + " combine_attrs=\"override\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c025f35d", + "metadata": {}, + "outputs": [], + "source": [ + "virtual_ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abefd6fa-386a-4e07-a7c8-219d3730eeeb", + "metadata": {}, + "outputs": [], + "source": [ + "# Clean up the store if running this notebook multiple times.\n", + "#!rm -rf ./noaa-cdr-icechunk/" + ] + }, + { + "cell_type": "markdown", + "id": "05f41a0b-6292-419d-a9d3-d8ddf8c0c15b", + "metadata": {}, + "source": [ + "## Initialize the Icechunk Store" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "79a4228a-0e17-4b07-9144-f24fe06db832", + "metadata": {}, + "outputs": [], + "source": [ + "storage = icechunk.local_filesystem_storage(\"./noaa-cdr-icechunk\")\n", + "\n", + "config = icechunk.RepositoryConfig.default()\n", + "\n", + "\n", + "config.set_virtual_chunk_container(\n", + " icechunk.VirtualChunkContainer(\n", + " url_prefix=\"s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/\",\n", + " store=icechunk.s3_store(region=\"us-east-1\", anonymous=True),\n", + " ),\n", + ")\n", + "\n", + "repo = icechunk.Repository.create(storage, config)\n", + "\n", + "session = repo.writable_session(\"main\")" + ] + }, + { + "cell_type": "markdown", + "id": "749193c1-38b9-4400-a08f-f0a675d30f06", + "metadata": {}, + "source": [ + "## Write the virtual datasets to the icechunk store and commit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9387e1ff-46c1-45fd-9796-0457538209a7", + "metadata": {}, + "outputs": [], + "source": [ + "virtual_ds.virtualize.to_icechunk(session.store)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53a74fb9-006b-4d2b-9157-7090af6c9e09", + "metadata": {}, + "outputs": [], + "source": [ + "session.commit(\"first 2 days of 202408 data\")" + ] + }, + { + "cell_type": "markdown", + "id": "8becd176-1c7d-4c74-a3f1-1b9f55b445a2", + "metadata": {}, + "source": [ + "## Check your work!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6271bd1-bc0b-4901-9901-91aabe508cf7", + "metadata": {}, + "outputs": [], + "source": [ + "ds = xr.open_zarr(session.store, consolidated=False, zarr_format=3)\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "23dd5a13-9c0e-4132-9073-474c0af65920", + "metadata": {}, + "source": [ + "# Append\n", + "\n", + "That was all nothing new! Basically a repeat of what is in the [icechunk docs](https://icechunk.io/icechunk-python/virtual/). Here we follow the same steps to create a virtual dataset, but we add an `append_dim` argument to the `to_icechunk` function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "190c25f9-e000-4b17-83eb-cf551141dfea", + "metadata": {}, + "outputs": [], + "source": [ + "virtual_datasets_a = [\n", + " open_virtual_dataset(\n", + " url=url,\n", + " parser=parser,\n", + " registry=registry,\n", + " loadable_variables=[\"time\", \"lat\", \"lon\", \"zlev\"],\n", + " )\n", + " for url in oisst_files[2:4]\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af330082-207a-4f08-aefe-fc15aa8b2eb3", + "metadata": {}, + "outputs": [], + "source": [ + "virtual_ds_a = xr.concat(\n", + " virtual_datasets_a,\n", + " dim=\"time\",\n", + " coords=\"minimal\",\n", + " compat=\"override\",\n", + " combine_attrs=\"override\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6137cc1-b996-4e60-8c12-01eb19930da6", + "metadata": {}, + "outputs": [], + "source": [ + "append_session = repo.writable_session(\"main\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a465be46-bb81-4e36-b1b6-67c3b8e4b9ec", + "metadata": {}, + "outputs": [], + "source": [ + "virtual_ds_a.virtualize.to_icechunk(append_session.store, append_dim=\"time\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9908d2f-664b-4256-b9d4-842df2e512c3", + "metadata": {}, + "outputs": [], + "source": [ + "append_session.commit(\"wrote 2 more days of data\")" + ] + }, + { + "cell_type": "markdown", + "id": "e1384e99-c284-4942-a49b-7799802728b0", + "metadata": {}, + "source": [ + "# Check that it worked!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "477094aa-2094-46e2-ae78-516fc2a51690", + "metadata": {}, + "outputs": [], + "source": [ + "read_session = repo.readonly_session(branch=\"main\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47a53027-dbae-48aa-85d2-dcbc04e01e61", + "metadata": {}, + "outputs": [], + "source": [ + "ds = xr.open_zarr(read_session.store, consolidated=False, zarr_format=3)\n", + "ds" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/V1/coiled/terraclimate.ipynb b/examples/V2/coiled/terraclimate.ipynb similarity index 71% rename from examples/V1/coiled/terraclimate.ipynb rename to examples/V2/coiled/terraclimate.ipynb index 13c1ce301..c0fa4e662 100644 --- a/examples/V1/coiled/terraclimate.ipynb +++ b/examples/V2/coiled/terraclimate.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Virtualizarr and Coiled - Building a virtual dataset of Terraclimate\n", - "## Note: This example uses a pre-2.0 release of VirtualiZarr\n", + "# Virtualizarr and Coiled - Building a Virtual Zarr store of the Terraclimate dataset\n", "\n", "\n", "This notebook is an example of using Virtualizarr together with the Python distributed processing framework [Coiled](https://www.coiled.io/) to generate references using [serverless functions](https://docs.coiled.io/user_guide/functions.html). 
\n",
-    "- **Note:** running this notebook requires a coiled account.\n"
+    "\n",
+    "**Note:** running this notebook requires a coiled account.\n"
   ]
  },
  {
@@ -24,20 +24,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Parallelizing `virtualizarr` reference generation with coiled serverless functions\n",
-    "Coiled serverless functions allow us to easily spin up hundreds of small compute instances, which are great for individual file reference generation. We were able to process 924 netCDF files into a 1TB virtual xarray dataset in 9 minutes for ~$0.24."
+    "## Parallelizing `Virtualizarr` reference generation with coiled serverless functions\n",
+    "Coiled serverless functions allow us to easily spin up hundreds of small VMs, which are great for individual file reference generation. We were able to process 924 netCDF files into a virtual xarray dataset in about 18 minutes for ~$0.43."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Installation and environment\n",
-    "\n",
-    "You should install the Python requirements in a clean virtual environment of your choice. Each coiled serverless function will reuse this environment, so it's best to start with a clean slate.\n",
+    "## Requirements\n",
     "\n",
     "```bash\n",
-    "pip install 'virtualizarr['icechunk','hdf']' coiled ipykernel bokeh\n",
+    "pip install 'virtualizarr[icechunk,hdf]' coiled jupyter bokeh jupyter-server-proxy obstore\n",
     "```"
   ]
  },
@@ -58,8 +56,11 @@
    "import icechunk\n",
    "import numpy as np\n",
    "import xarray as xr\n",
+    "from obstore.store import from_url\n",
    "\n",
-    "from virtualizarr import open_virtual_dataset"
+    "from virtualizarr import open_virtual_dataset\n",
+    "from virtualizarr.parsers import HDFParser\n",
+    "from virtualizarr.registry import ObjectStoreRegistry"
   ]
  },
@@ -99,13 +100,32 @@
    "max_year = 2023\n",
    "time_list = np.arange(min_year, max_year + 1, 1)\n",
    "\n",
-    "combinations = [\n",
+    "urls = [\n",
    "    f\"https://climate.northwestknowledge.net/TERRACLIMATE-DATA/TerraClimate_{var}_{year}.nc\"\n",
    "    for year in time_list\n",
    "    for var in tvars\n",
    "]"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define our Virtualizarr `Parser` and `ObjectStoreRegistry`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bucket = \"https://climate.northwestknowledge.net\"\n",
+    "store = from_url(bucket)\n",
+    "registry = ObjectStoreRegistry({bucket: store})\n",
+    "parser = HDFParser()"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -114,11 +134,10 @@
    "\n",
    "### Serverless function setup notes:\n",
    "- This coiled function is tailored to AWS\n",
-    "- `vm_type=[\"t4g.small\"]` - This is a small instance, you shouldn't need large machines for reference generation\n",
+    "- `vm_type=[\"m8g.medium\"]` - This is a small instance, you shouldn't need large machines for reference generation\n",
    "- `spot_policy=\"spot_with_fallback\"` is cheaper, but might have unintended consequences\n",
-    "- `arm=True` uses VMs with ARM architecture, which is cheaper\n",
    "- `idle_timeout=\"10 minutes\"` workers will shut down after 10 minutes of inactivity\n",
-    "- `n_workers=[100, 300]` adaptive scaling between 100 & 300 workers\n",
+    "- `n_workers=[10, 100]` adaptive scaling between 10 & 100 workers\n",
    "- `name` [optional] if you want to keep track of your cluster in the coiled dashboard\n",
    "\n",
    "More details can be found in the [serverless function API](https://docs.coiled.io/user_guide/functions.html#api)."
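Before fanning out to hundreds of workers, it can be worth validating reference generation on a single file. A minimal local smoke test, reusing the `parser`, `registry`, and `urls` defined in the cells above (running this outside coiled is our suggestion, not part of the notebook; the `loadable_variables` list mirrors the serverless function in the next hunk):

```python
# Hypothetical local check: build references for one TerraClimate file
# before mapping over all 924 urls with coiled.
vds = open_virtual_dataset(
    url=urls[0],
    parser=parser,
    registry=registry,
    loadable_variables=["time", "lat", "lon", "crs"],
)
print(vds)  # inspect the ManifestArrays for a single year/variable
```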
@@ -132,17 +151,18 @@
    "source": [
     "@coiled.function(\n",
     "    region=\"us-west-2\",\n",
-    "    vm_type=[\"t4g.small\"],\n",
+    "    vm_type=[\"m8g.medium\"],\n",
     "    spot_policy=\"spot_with_fallback\",\n",
     "    arm=True,\n",
     "    idle_timeout=\"10 minutes\",\n",
     "    n_workers=[10, 100],\n",
     "    name=\"parallel_reference_generation\",\n",
     ")\n",
-    "def process(filename):\n",
+    "def process(url):\n",
     "    vds = open_virtual_dataset(\n",
-    "        filename,\n",
-    "        decode_times=True,\n",
+    "        url=url,\n",
+    "        parser=parser,\n",
+    "        registry=registry,\n",
     "        loadable_variables=[\"time\", \"lat\", \"lon\", \"crs\"],\n",
     "    )\n",
     "    return vds\n",
@@ -150,7 +170,7 @@
     "\n",
     "# process.map distributes out the input file urls to coiled functions\n",
     "# retries=10 allows for individual task retries, which can be useful for inconsistent server behavior\n",
-    "results = process.map(combinations[0:2], retries=10)"
+    "results = process.map(urls, retries=10)"
   ]
  },
  {
@@ -171,22 +191,13 @@
    "vds_list = [result for result in results]\n",
    "\n",
    "# combine individual refs into a virtual Xarray dataset\n",
+    "\n",
    "mds = xr.combine_by_coords(\n",
-    "    vds_list, coords=\"minimal\", compat=\"override\", combine_attrs=\"drop\"\n",
+    "    vds_list, combine_attrs=\"drop_conflicts\", coords=\"minimal\", compat=\"override\"\n",
    ")\n",
-    "\n",
    "mds"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(str(\"{0:.2f}\".format(mds.nbytes / 1e12)), \" TB\")"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -215,16 +226,32 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "mds.virtualize.to_icechunk(store=session.store)"
+    "config = icechunk.RepositoryConfig.default()\n",
+    "config.set_virtual_chunk_container(\n",
+    "    icechunk.VirtualChunkContainer(\n",
+    "        url_prefix=\"https://climate.northwestknowledge.net/\",\n",
+    "        store=icechunk.http_store(),\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "\n",
+    "storage = icechunk.in_memory_storage()\n",
+    "repo = icechunk.Repository.create(storage, config)\n",
+    "session = repo.writable_session(\"main\")\n",
+    "\n",
+    "mds.vz.to_icechunk(session.store)\n",
+    "\n",
+    "snapshot_id = session.commit(\"terraclimate reference\")\n",
+    "print(snapshot_id)\n",
+    "\n",
+    "repo.save_config()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Open the Icechunk store with Xarray\n",
-    "\n",
-    "**Warning:** Calling `to_zarr` on this dataset will try to write out 1TB of data.\n"
+    "## Open the Icechunk store with Xarray"
   ]
  },
@@ -248,7 +275,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -262,9 +289,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.8"
+   "version": "3.11.11"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}

From f368f73a7a5d6130fc6604914265c6d5c8534515 Mon Sep 17 00:00:00 2001
From: norlandrhagen
Date: Wed, 3 Sep 2025 11:52:52 -0600
Subject: [PATCH 2/4] adds container credentials to append example

---
 examples/V2/append/noaa-cdr-sst.ipynb | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/examples/V2/append/noaa-cdr-sst.ipynb b/examples/V2/append/noaa-cdr-sst.ipynb
index 2fe5fad67..1ff8bd39d 100644
--- a/examples/V2/append/noaa-cdr-sst.ipynb
+++ b/examples/V2/append/noaa-cdr-sst.ipynb
@@ -157,7 +157,9 @@
    "id": "05f41a0b-6292-419d-a9d3-d8ddf8c0c15b",
    "metadata": {},
    "source": [
-    "## Initialize the Icechunk Store"
+    "## Initialize the Icechunk Store\n",
+    "We need to configure the `virtual_chunk_container` and make sure the icechunk container credentials allow for anonymous access.\n",
+    "Details on this can be found [here](https://icechunk.io/en/stable/virtual/)."
   ]
  },
  {
@@ -171,15 +173,21 @@
    "\n",
    "config = icechunk.RepositoryConfig.default()\n",
    "\n",
-    "\n",
    "config.set_virtual_chunk_container(\n",
    "    icechunk.VirtualChunkContainer(\n",
    "        url_prefix=\"s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/\",\n",
    "        store=icechunk.s3_store(region=\"us-east-1\", anonymous=True),\n",
    "    ),\n",
    ")\n",
+    "credentials = icechunk.containers_credentials(\n",
+    "    {\n",
+    "        \"s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/\": icechunk.s3_credentials(\n",
+    "            anonymous=True\n",
+    "        )\n",
+    "    }\n",
+    ")\n",
    "\n",
-    "repo = icechunk.Repository.create(storage, config)\n",
+    "repo = icechunk.Repository.create(storage, config, credentials)\n",
    "\n",
    "session = repo.writable_session(\"main\")"
   ]
  },
@@ -337,7 +345,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "virtualizarr (3.13.6)",
   "language": "python",
   "name": "python3"
  },
@@ -351,7 +359,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.11"
+   "version": "3.13.6"
  }
 },
 "nbformat": 4,

From a2c0d1d16e930bbf6f36c3394201ec0c7b70a436 Mon Sep 17 00:00:00 2001
From: norlandrhagen
Date: Fri, 5 Sep 2025 09:42:47 -0600
Subject: [PATCH 3/4] moved icechunk containers_credentials to read_only
 session

---
 examples/V2/append/noaa-cdr-sst.ipynb | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/examples/V2/append/noaa-cdr-sst.ipynb b/examples/V2/append/noaa-cdr-sst.ipynb
index 1ff8bd39d..fb1d5424e 100644
--- a/examples/V2/append/noaa-cdr-sst.ipynb
+++ b/examples/V2/append/noaa-cdr-sst.ipynb
@@ -179,15 +179,9 @@
    "        store=icechunk.s3_store(region=\"us-east-1\", anonymous=True),\n",
    "    ),\n",
    ")\n",
-    "credentials = icechunk.containers_credentials(\n",
-    "    {\n",
-    "        \"s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/\": icechunk.s3_credentials(\n",
-    "            anonymous=True\n",
-    "        )\n",
-    "    }\n",
-    ")\n",
    "\n",
-    "repo = icechunk.Repository.create(storage, config, credentials)\n",
+    "\n",
+    "repo = icechunk.Repository.create(storage, config)\n",
    "\n",
    "session = repo.writable_session(\"main\")"
   ]
  },
@@ -312,7 +318,8 @@
    "id": "e1384e99-c284-4942-a49b-7799802728b0",
    "metadata": {},
    "source": [
-    "# Check that it worked!"
+    "# Check that it worked!\n",
+    "Let's create a read-only icechunk session and pass in the authorization credentials for the [Virtual Chunk Containers](https://icechunk.io/en/latest/configuration/#virtual-chunk-credentials) to Icechunk."
   ]
  },
  {
@@ -328,7 +323,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "read_session = repo.readonly_session(branch=\"main\")"
+    "credentials = icechunk.containers_credentials(\n",
+    "    {\n",
+    "        \"s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/\": icechunk.s3_credentials(\n",
+    "            anonymous=True\n",
+    "        )\n",
+    "    }\n",
+    ")\n",
+    "read_repo = icechunk.Repository.open(\n",
+    "    storage, config, authorize_virtual_chunk_access=credentials\n",
+    ")\n",
+    "\n",
+    "read_session = read_repo.readonly_session(\"main\")"
   ]
  },

From 357a91416383737c781775497a5d28c82aa5cf1e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 9 Sep 2025 18:05:59 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 docs/examples.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/examples.md b/docs/examples.md
index f1459f193..aeffeba20 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -11,4 +11,3 @@ The following examples demonstrate the use of VirtualiZarr to create virtual dat
 2. [Parallel reference generation using Coiled Functions](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/V2/coiled/terraclimate.ipynb)
 3. [Serverless parallel reference generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/V1/virtualizarr-with-lithops)
 4. [MUR SST Virtual and Zarr Icechunk Store Generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/V1/mursst-icechunk-with-lithops)
-
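Taken together, patches 2-4 land on supplying virtual-chunk credentials when re-opening the repository rather than at creation time. A minimal read-back sketch assembled from the snippets above; it assumes the append notebook has already been run, so `./noaa-cdr-icechunk` exists locally:

```python
import icechunk
import xarray as xr

# Re-open the existing repository, authorizing anonymous reads against the
# NOAA bucket that the virtual chunks reference.
storage = icechunk.local_filesystem_storage("./noaa-cdr-icechunk")

config = icechunk.RepositoryConfig.default()
config.set_virtual_chunk_container(
    icechunk.VirtualChunkContainer(
        url_prefix="s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/",
        store=icechunk.s3_store(region="us-east-1", anonymous=True),
    ),
)
credentials = icechunk.containers_credentials(
    {
        "s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/": icechunk.s3_credentials(
            anonymous=True
        )
    }
)
read_repo = icechunk.Repository.open(
    storage, config, authorize_virtual_chunk_access=credentials
)

# After the initial write (2 days) and the append (2 more), the time axis
# should hold four daily steps.
ds = xr.open_zarr(
    read_repo.readonly_session("main").store, consolidated=False, zarr_format=3
)
assert ds.sizes["time"] == 4
```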