diff --git a/user-guide/notebooks/tutorials/virtualizarr-timeseries-RASI.ipynb b/user-guide/notebooks/tutorials/virtualizarr-timeseries-RASI.ipynb new file mode 100644 index 00000000..a22287c0 --- /dev/null +++ b/user-guide/notebooks/tutorials/virtualizarr-timeseries-RASI.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "75852006-3105-47c3-9e7d-ced39b06f4e3", + "metadata": {}, + "source": [ + "---\n", + "title: Reading RASI Virtualizarr for Rio Grande\n", + "author: Maheshwari Neelam\n", + "date: 'November 19, 2025'\n", + "execute:\n", + " cache: true\n", + " freeze: true\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "d710002e-6956-42d9-8b06-d464566c02ba", + "metadata": {}, + "source": [ + "You can launch this notbook using mybinder, by clicking the button below.\n", + "\n", + "\n", + "\"Binder\" \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "4a3d60f1-91fa-45f6-99c5-20d19b34608d", + "metadata": {}, + "source": [ + "## Approach" + ] + }, + { + "cell_type": "markdown", + "id": "a2efb7c4-a7d7-44d4-8b6d-ce20a6ced7ff", + "metadata": {}, + "source": [ + "[Virtual Zarr with Icechunk](https://icechunk.io/en/stable/virtual/) is a cloud-optimized format for storing and accessing large geospatial datasets without duplicating the underlying data. It provides fast, efficient access to chunked array data stored in cloud object storage.\n", + "\n", + "Reading RASI data from virtual Zarr stores enables rapid analysis and visualization of hydrological variables across specific watersheds and HUCs (Hydrologic Unit Codes).\n", + "\n", + "This tutorial shows how to read virtual Zarr data from S3 and create spatial and temporal visualizations for the Rio Grande HUC using [Xarray](https://github.com/pydata/xarray), [Icechunk](https://icechunk.io/), and [Cartopy](https://scitools.org.uk/cartopy/).\n", + "\n", + "1. Step-by-step guide to accessing RASI virtual Zarr stores from S3\n", + "2. Subsetting data for Rio Grande basin (HUC 13)\n", + "3. Creating spatial mean time series and temporal mean maps\n", + "4. Handling fill values and data quality issues" + ] + }, + { + "cell_type": "markdown", + "id": "caff5814-552c-4cbb-9664-bc26cd64c410", + "metadata": {}, + "source": [ + "## Step by step" + ] + }, + { + "cell_type": "markdown", + "id": "3f442b60-b857-474a-afd7-d01714077a8f", + "metadata": {}, + "source": [ + "### Step 0 - Installs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "321f9841-3c04-4aff-9276-0a2ca1425b9b", + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import matplotlib.pyplot as plt\n", + "import cartopy.crs as ccrs\n", + "import cartopy.feature as cfeature\n", + "import icechunk\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "6c8ae8bb-6f3c-4d5c-9d9b-4fad6cc3d0d5", + "metadata": {}, + "source": [ + "### Step 1 - # Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8e5464f-6868-46e4-9351-85169aa7b32c", + "metadata": {}, + "outputs": [], + "source": [ + "s3_bucket = \"nasa-waterinsight\"\n", + "s3_prefix = \"virtual-zarr-store/icechunk/RASI/HISTORICAL\"\n", + "s3_region = \"us-west-2\"\n", + "virtual_data_path = \"s3://nasa-waterinsight/RASI/\" # Where actual data chunks are stored\n" + ] + }, + { + "cell_type": "markdown", + "id": "8cac5ba7", + "metadata": {}, + "source": [ + "### Step 2 - # Open Icechunk repository with virtual chunk authorization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78c9a1ef-6574-4f0b-9b61-5413cb98f525", + "metadata": {}, + "outputs": [], + "source": [ + "storage = icechunk.s3_storage(\n", + " bucket=s3_bucket,\n", + " region=s3_region,\n", + " prefix=s3_prefix,\n", + " anonymous=True\n", + ")\n", + "\n", + "config = icechunk.RepositoryConfig.default()\n", + "config.set_virtual_chunk_container(\n", + " icechunk.VirtualChunkContainer(\n", + " url_prefix=virtual_data_path,\n", + " store=icechunk.s3_store(region=s3_region, anonymous=True)\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8e5cd0ab-9481-47ca-8844-ab0e606751c9", + "metadata": {}, + "source": [ + "### Step 3 - Set up credentials for anonymous access to virtual chunks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d197dfa-3e0c-459d-931d-d88bf13ceea9", + "metadata": {}, + "outputs": [], + "source": [ + "virtual_credentials = icechunk.containers_credentials(\n", + " {virtual_data_path: icechunk.s3_anonymous_credentials()}\n", + ")\n", + "\n", + "repo = icechunk.Repository.open(\n", + " storage=storage, \n", + " config=config,\n", + " authorize_virtual_chunk_access=virtual_credentials\n", + ")\n", + "session = repo.readonly_session(branch=\"main\")" + ] + }, + { + "cell_type": "markdown", + "id": "dad5ab09", + "metadata": {}, + "source": [ + "### Step 4 - # Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec644956-6ebd-47f3-b5a6-efd18b3fa8e2", + "metadata": {}, + "outputs": [], + "source": [ + "ds = xr.open_zarr(session.store, consolidated=False)\n", + "print(\"Dataset loaded:\")\n", + "print(ds)\n", + "\n", + "# Use TotalPrecip_Percentiles variable\n", + "data_var = \"TotalPrecip_Percentiles\"\n", + "print(f\"\\nVisualizing: {data_var}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "56f31fc8", + "metadata": {}, + "source": [ + "### Step 5 - # Check the dataset " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7283aa52-8ae1-4b0a-90fa-a3671c066fae", + "metadata": {}, + "outputs": [], + "source": [ + "# Check variable attributes\n", + "print(f\"\\nVariable attributes:\")\n", + "for attr, value in ds[data_var].attrs.items():\n", + " print(f\" {attr}: {value}\")\n", + "\n", + "# Check raw data range before any processing\n", + "print(f\"\\nRaw data info:\")\n", + "print(f\" Data type: {ds[data_var].dtype}\")\n", + "print(f\" Shape: {ds[data_var].shape}\")\n", + "print(f\" Chunk size: {ds[data_var].chunks}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1726432a", + "metadata": {}, + "source": [ + "### Step 6 - # Subset for Rio Grande basin (approximate HUC 13 extent)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4df5df19-91dd-42b4-8b9f-b62a93f705f3", + "metadata": {}, + "outputs": [], + "source": [ + "# Rio Grande basin coordinates: ~26°N to 39°N, ~107°W to 97°W\n", + "rio_grande = ds.sel(\n", + " lat=slice(26, 39),\n", + " lon=slice(-107, -97)\n", + ")\n", + "print(f\"\\nSubset to Rio Grande basin:\")\n", + "print(f\" Lat range: {float(rio_grande.lat.min()):.2f}° to {float(rio_grande.lat.max()):.2f}°\")\n", + "print(f\" Lon range: {float(rio_grande.lon.min()):.2f}° to {float(rio_grande.lon.max()):.2f}°\")\n", + "\n", + "# Select median percentile (50th)\n", + "data_median = rio_grande[data_var].sel(percentile=50)\n", + "\n", + "# Load a sample to check actual values\n", + "sample = data_median.isel(time=0).compute()\n", + "print(f\"\\nSample data (first timestep) - BEFORE masking:\")\n", + "print(f\" Min: {float(sample.min()):.4f}\")\n", + "print(f\" Max: {float(sample.max()):.4f}\")\n", + "print(f\" Mean: {float(sample.mean()):.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b6042fc2-2205-4758-be61-d4d39174a5f8", + "metadata": {}, + "source": [ + "### Step 7. Mask fill values (-9999)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eae56c2a-449a-4f44-90f6-96539b7dde04", + "metadata": {}, + "outputs": [], + "source": [ + "# Mask fill values (-9999)\n", + "data_median_masked = data_median.where(data_median > -9000)\n", + "\n", + "sample_masked = data_median_masked.isel(time=0).compute()\n", + "print(f\"\\nSample data (first timestep) - AFTER masking fill values:\")\n", + "print(f\" Min: {float(sample_masked.min()):.4f}\")\n", + "print(f\" Max: {float(sample_masked.max()):.4f}\")\n", + "print(f\" Mean: {float(sample_masked.mean()):.4f}\")\n", + "print(f\" Valid pixels: {(~np.isnan(sample_masked)).sum().values} / {sample_masked.size}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ea49d017", + "metadata": {}, + "source": [ + "### Step 7 - # Calculate spatial and temporal metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c18e78a2-b7c0-416e-9f0e-98b0c073b9d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate spatial mean time series (simple mean over Rio Grande basin for each time step)\n", + "# Using masked data to exclude fill values\n", + "spatial_mean_ts = data_median_masked.mean(dim=['lat', 'lon'], skipna=True)\n", + "\n", + "# Convert to pandas\n", + "ts_df = spatial_mean_ts.to_pandas()\n", + "\n", + "# Calculate temporal mean (mean over time for each grid cell)\n", + "temporal_mean = data_median_masked.mean(dim='time', skipna=True)\n" + ] + }, + { + "cell_type": "markdown", + "id": "f67ec1ff", + "metadata": {}, + "source": [ + "### Step 8 - # Create figure with two subplots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edf96b30-1c96-4128-8f4d-4f0e634d5964", + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(18, 10))\n", + "\n", + "# 1. Spatial Mean Time Series (top)\n", + "ax1 = plt.subplot(2, 1, 1)\n", + "ax1.plot(ts_df.index, ts_df.values, linewidth=2, marker='o', markersize=8, \n", + " alpha=0.8, color='steelblue', label='Spatial Mean')\n", + "ax1.set_xlabel('Date', fontsize=12, fontweight='bold')\n", + "ax1.set_ylabel(f\"{data_var.replace('_', ' ')} (m³/s)\", fontsize=12, fontweight='bold')\n", + "ax1.set_title(f'Rio Grande Basin - {data_var.replace(\"_\", \" \")} Spatial Mean Time Series\\n(Note: Units metadata shows m³/s but may represent precipitation)', \n", + " fontsize=13, fontweight='bold')\n", + "ax1.legend()\n", + "ax1.grid(True, alpha=0.3)\n", + "plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45, ha='right')\n", + "\n", + "# 2. Temporal Mean Map (bottom)\n", + "ax2 = plt.subplot(2, 1, 2, projection=ccrs.PlateCarree())\n", + "im = temporal_mean.plot(\n", + " ax=ax2,\n", + " transform=ccrs.PlateCarree(),\n", + " cmap='Blues',\n", + " cbar_kwargs={'label': f'{data_var.replace(\"_\", \" \")} (m³/s)', 'shrink': 0.8}\n", + ")\n", + "ax2.coastlines(resolution='50m', linewidth=0.5)\n", + "ax2.add_feature(cfeature.BORDERS, linewidth=0.5, edgecolor='gray')\n", + "ax2.add_feature(cfeature.STATES, linewidth=0.3, edgecolor='gray', alpha=0.5)\n", + "# ax2.add_feature(cfeature.RIVERS, linewidth=1, edgecolor='blue', alpha=0.5)\n", + "ax2.gridlines(draw_labels=True, linewidth=0.5, alpha=0.5)\n", + "ax2.set_title(f'Rio Grande Basin - {data_var.replace(\"_\", \" \")} Temporal Mean', \n", + " fontsize=14, fontweight='bold')\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig('rasi_rio_grande_analysis.png', dpi=300, bbox_inches='tight')\n", + "print(\"\\n✓ Plot saved as rasi_rio_grande_analysis.png\")\n", + "print(f\"\\nSpatial Mean Statistics (after masking fill values):\")\n", + "print(f\" Mean: {ts_df.mean():.4f} m³/s\")\n", + "print(f\" Std Dev: {ts_df.std():.4f} m³/s\")\n", + "print(f\" Min: {ts_df.min():.4f} (on {ts_df.idxmin().strftime('%Y-%m')})\")\n", + "print(f\" Max: {ts_df.max():.4f} (on {ts_df.idxmax().strftime('%Y-%m')})\")\n", + "print(f\"\\n⚠️ Note: Metadata says units are 'm³/s' which is unusual for precipitation.\")\n", + "print(f\" Values now look reasonable after masking -9999 fill values.\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}