Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f19bafb
added data and alleghey folders
zherbz Oct 29, 2025
7223e02
memory and pool connection cleanup
zherbz Oct 29, 2025
2f6dc36
added devcontainer
zherbz Oct 29, 2025
c53f07b
configs folder
zherbz Oct 29, 2025
61323a1
notebooks folder and setup file
zherbz Oct 29, 2025
0b41029
runners folder and run_sst file
zherbz Oct 29, 2025
150ecf2
Merge branch 'main' into feature/allegheny
zherbz Oct 30, 2025
22f3e5c
resolve merge conflict
zherbz Oct 30, 2025
6e86a88
updated default moving window to 24 hours
zherbz Oct 30, 2025
14dee55
updated settings
zherbz Oct 30, 2025
5b941a7
resolve merge conflict
zherbz Oct 30, 2025
53c5976
updated thread implementation
zherbz Oct 31, 2025
86d0e01
updated runner
zherbz Oct 31, 2025
5b9242b
Add storm dss creation
sray014 Nov 6, 2025
02f138f
Force a non interactive backend when threading.
sray014 Nov 6, 2025
df49659
Allow optional dss output directory
sray014 Nov 11, 2025
7ad7a1a
Add example dss storms creation
sray014 Nov 11, 2025
72f7ac2
Add save dataarray to geotiff function
sray014 Nov 20, 2025
7f6fc5b
Return output filepath for ranked storms csv
sray014 Nov 20, 2025
92f1b5e
Create normal precip grid
sray014 Nov 20, 2025
d86e24c
Add defaults, docs
sray014 Nov 20, 2025
8980268
Fix make file
sray014 Nov 20, 2025
b353628
Add latest additions to docs
sray014 Nov 20, 2025
78b7bd4
Add path manager to function params
sray014 Nov 20, 2025
06f507b
Remove unneeded config
sray014 Nov 20, 2025
38ce7a7
Remove unneeded files
sray014 Nov 20, 2025
76b5aa0
Ruff fixes
sray014 Nov 20, 2025
2ce02ba
Merge pull request #27 from fema-ffrd/feature/allegheny
sray014 Nov 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Devcontainer base image: Ubuntu 22.04 (jammy) with common developer tooling.
FROM mcr.microsoft.com/devcontainers/base:jammy

# Install the AWS CLI and GDAL (command-line tools plus development headers),
# then clean apt caches in the same layer to keep the image small.
RUN apt-get update && apt-get upgrade -y && \
apt-get install -y awscli && \
apt-get install -y gdal-bin libgdal-dev && \
apt-get clean -y && \
rm -rf /var/lib/apt/lists/*

# Stage the conda environment spec where the micromamba devcontainer feature
# expects to find it (see devcontainer.json "envFile").
COPY env.yaml /tmp/env.yaml
16 changes: 16 additions & 0 deletions .devcontainer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Devcontainer Template
Devcontainers allow interactive development inside of a docker container using VSCode.


This devcontainer creates a reproducible environment for python projects using micromamba
environments (faster/more robust version of conda). To add this devcontainer template to your project, copy this .devcontainer folder
into the parent directory of your repository, and copy this [.gitattributes file](https://github.com/Michael-Baker-International-Lakewood/mbi_templates/blob/main/.gitattributes) into the same parent directory.

When opening this repository in VSCode, you may be prompted to re-open the project in devcontainer.
Alternatively, you may access this option through the
View menu -> Command Palette -> DevContainers: Reopen in Container.

Other requirements:
1. An environment file (env.yaml) must be placed in the root folder of the
project for a reproducible Python environment to be successfully built.
2. Docker installed on the local machine (Linux).
44 changes: 44 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"name": "stormhub-devcontainer",
"build": {
"dockerfile": "Dockerfile",
"context": ".."
},
"features": {
"ghcr.io/devcontainers/features/git:1": {},
"ghcr.io/mamba-org/devcontainer-features/micromamba:1": {
"envFile": "/tmp/env.yaml",
"envName": "stormhub-base",
"autoActivate": true,
"version": "1.5.6"
},
"ghcr.io/devcontainers/features/node:1": {
"version": "lts"
},
"ghcr.io/devcontainers/features/aws-cli:1": {
"version": "latest"
}
},
// "runArgs": [
// "--gpus",
// "all"
// ],
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"charliermarsh.ruff",
"GitHub.copilot",
"ms-toolsai.jupyter"
],
"settings": {
"python.defaultInterpreterPath": "/opt/conda/envs/stormhub-base/bin/python"
}
}
},
// be sure to have the workspace folder owned by vscode user
"postCreateCommand": "sudo chown -R vscode:vscode ${containerWorkspaceFolder}",
// start the dev container with the stormhub-base environment activated
// avoid dubious ownership of the workspace folder https://www.kenmuse.com/blog/avoiding-dubious-ownership-in-dev-containers/
"postStartCommand": "micromamba shell init --shell=bash && echo 'micromamba activate stormhub-base' >> ~/.bashrc && git config --global --add safe.directory ${containerWorkspaceFolder}"
}
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,6 @@ holding
.DS_Store

bighorn
indian-creek
indian-creek
allegheny
data
4 changes: 2 additions & 2 deletions docs/make.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Rebuild the Sphinx documentation. Run from the repository root.
# Remove the previous build output so stale pages do not linger.
rm -rf docs/build
# Regenerate API stub pages from the stormhub package into docs/source.
sphinx-apidoc -o docs/source stormhub
# Build HTML using the configuration directory docs/source.
sphinx-build -M html docs/source docs/build -c docs/source
10 changes: 10 additions & 0 deletions docs/source/user_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,18 @@ The following snippet provides an example of how to build and create a storm cat
storm_duration_hours,
min_precip_threshold,
top_n_events,
use_threads=True, #True for Linux/WSL, False for Windows
num_workers=8, # Number of parallel workers
check_every_n_hours=6,
)
# Optionally, add DSS files to storm items
add_storm_dss_files(storm_catalog)
# Optionally, create normal precipitation grid
create_normal_precip(storm_catalog, duration_hours=storm_duration_hours)

.. note::
If using Windows, set ``use_threads`` to ``False`` in order to avoid issues with multiprocessing. On Linux/WSL, set ``use_threads`` to ``True``.
The use of ProcessPoolExecutor on Linux can lead to complications due to the way processes are spawned. More investigation is needed on this.

Viewing Results
----------------
Expand Down
29 changes: 29 additions & 0 deletions env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: base
channels:
- conda-forge
dependencies:
- dask=2025.10.0
- fiona=1.10.1
- geopandas=1.1.1
- ipykernel=7.0.1
- matplotlib=3.10.7
- numpy=2.3.3
- pandas=2.3.3
- pip=25.2
- pyarrow=21.0.0
- pytest=8.4.2
- python=3.12.12
- requests=2.32.5
- rioxarray=0.19.0
- ruff=0.14.1
- scipy=1.16.2
- xarray=2025.10.1
- zarr=3.1.3
- pip:
- pystac==1.14.1
- boto3==1.40.54
- s3fs==0.4.2
- hecdss==0.1.28
- python-dotenv==1.1.1
- shapely==2.1.2
- contextily==1.6.2
5 changes: 3 additions & 2 deletions stormhub/met/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import datetime
import logging
import os
from typing import Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -101,14 +102,14 @@ def _rank_storm_ids(self) -> list[str]:
sorted_indices = np.argsort(self.metrics_df["mean"].values)[::-1]
return self.metrics_df["storm_date"].dt.strftime("%Y-%m-%dT%H").iloc[sorted_indices].tolist()

def rank_and_save(self, collection_id: str, spm: StacPathManager) -> tuple[pd.DataFrame, str]:
    """Rank the storms, persist them to a CSV file, and return both results.

    Args:
        collection_id: Identifier of the collection the ranking belongs to.
        spm: Path manager used to resolve the collection's output directory.

    Returns:
        Tuple of (ranked-storms dataframe with ``storm_date`` parsed to
        datetime, path of the written CSV file).
    """
    csv_path = os.path.join(spm.collection_dir(collection_id), "ranked-storms.csv")
    ranked = self.rank_and_filter_storms()
    # Write the CSV before the datetime conversion so the file keeps the
    # original storm_date formatting.
    ranked.to_csv(csv_path, index=False)
    ranked["storm_date"] = pd.to_datetime(ranked["storm_date"])
    logging.info("Saved ranked storm data to %s", csv_path)
    return ranked, csv_path


class StormFilter:
Expand Down
33 changes: 30 additions & 3 deletions stormhub/met/aorc/aorc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""AORC Item class."""

import datetime
import gc
import json
import logging
import os
Expand Down Expand Up @@ -115,6 +116,16 @@ def __init__(
self._transposition_transform: Affine | None = None
self._stats: dict | None = None

def clear_cached_data(self) -> None:
    """Drop all cached datasets held on this item so memory can be reclaimed.

    Intended to be called once processing is complete.
    """
    # Release references to the large cached objects, then trigger an
    # immediate collection pass to return memory promptly.
    for attr in ("_aorc_source_data", "_transpose", "_sum_aorc"):
        setattr(self, attr, None)
    gc.collect()

def _register_extensions(self) -> None:
"""Register item extensions."""
ProjectionExtension.add_to(self)
Expand Down Expand Up @@ -145,8 +156,10 @@ def aorc_source_data(self) -> xr.Dataset:
- adds ZARR files to assets if they don't exist already
"""
if self._aorc_source_data is None:
s3_out = s3fs.S3FileSystem(anon=True)
# Increase connection pool and configure for better memory management
s3_out = s3fs.S3FileSystem(anon=True, config_kwargs={"max_pool_connections": 50})
fileset = [s3fs.S3Map(root=aorc_path, s3=s3_out, check=False) for aorc_path in self.aorc_paths]
# Use auto chunks to align with Zarr storage, then rechunk if needed
ds = xr.open_mfdataset(fileset, engine="zarr", chunks="auto", consolidated=True)

transposition_geom_for_clip = self.transposition_domain_geometry
Expand All @@ -158,7 +171,20 @@ def aorc_source_data(self) -> xr.Dataset:
longitude=slice(bounds[0], bounds[2]),
latitude=slice(bounds[1], bounds[3]),
)
self._aorc_source_data = subsection.rio.clip([transposition_geom_for_clip], drop=True, all_touched=True)

# Clip to geometry
clipped = subsection.rio.clip([transposition_geom_for_clip], drop=True, all_touched=True)

# Rechunk after loading to optimize memory usage for downstream operations
# This avoids the warning about misaligned chunks
self._aorc_source_data = clipped.chunk({"time": -1, "latitude": "auto", "longitude": "auto"})

# Clean up to free memory
del ds
del subsection
del clipped
gc.collect()

for aorc_path in self.aorc_paths:
aorc_year = int(os.path.basename(aorc_path).replace(".zarr", ""))
aorc_start_datetime = datetime.datetime(
Expand Down Expand Up @@ -314,7 +340,8 @@ def aorc_thumbnail(

def valid_spaces_item(watershed: Item, transposition_region: Item, storm_duration: int = 72) -> Polygon:
"""Search a sample zarr dataset to identify valid spaces for transposition. datetime.datetime(1980, 5, 1) is used as a start time for the search."""
s3 = s3fs.S3FileSystem(anon=True)
# Increase connection pool to avoid warnings
s3 = s3fs.S3FileSystem(anon=True, config_kwargs={"max_pool_connections": 50})
start_time = datetime.datetime(1980, 5, 1)
sample_data = s3fs.S3Map(root=f"{NOAA_AORC_S3_BASE_URL}/{start_time.year}.zarr", s3=s3)
ds = xr.open_dataset(sample_data, engine="zarr", chunks="auto", consolidated=True)
Expand Down
Loading