Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f19bafb
added data and alleghey folders
zherbz Oct 29, 2025
7223e02
memory and pool connection cleanup
zherbz Oct 29, 2025
2f6dc36
added devcontainer
zherbz Oct 29, 2025
c53f07b
configs folder
zherbz Oct 29, 2025
61323a1
notebooks folder and setup file
zherbz Oct 29, 2025
0b41029
runners folder and run_sst file
zherbz Oct 29, 2025
150ecf2
Merge branch 'main' into feature/allegheny
zherbz Oct 30, 2025
22f3e5c
resolve merge conflict
zherbz Oct 30, 2025
6e86a88
updated default moving window to 24 hours
zherbz Oct 30, 2025
14dee55
updated settings
zherbz Oct 30, 2025
5b941a7
resolve merge conflict
zherbz Oct 30, 2025
53c5976
updated thread implementation
zherbz Oct 31, 2025
86d0e01
updated runner
zherbz Oct 31, 2025
5b9242b
Add storm dss creation
sray014 Nov 6, 2025
02f138f
Force a non interactive backend when threading.
sray014 Nov 6, 2025
df49659
Allow optional dss output directory
sray014 Nov 11, 2025
7ad7a1a
Add example dss storms creation
sray014 Nov 11, 2025
72f7ac2
Add save dataarray to geotiff function
sray014 Nov 20, 2025
7f6fc5b
Return output filepath for ranked storms csv
sray014 Nov 20, 2025
92f1b5e
Create normal precip grid
sray014 Nov 20, 2025
d86e24c
Add defaults, docs
sray014 Nov 20, 2025
8980268
Fix make file
sray014 Nov 20, 2025
b353628
Add latest additions to docs
sray014 Nov 20, 2025
78b7bd4
Add path manager to function params
sray014 Nov 20, 2025
06f507b
Remove unneeded config
sray014 Nov 20, 2025
38ce7a7
Remove unneeded files
sray014 Nov 20, 2025
76b5aa0
Ruff fixes
sray014 Nov 20, 2025
2ce02ba
Merge pull request #27 from fema-ffrd/feature/allegheny
sray014 Nov 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Devcontainer base image: Ubuntu 22.04 (jammy) with common developer tooling.
FROM mcr.microsoft.com/devcontainers/base:jammy

# Install the AWS CLI and GDAL (command-line tools plus development headers),
# then clean apt caches in the same layer to keep the image small.
RUN apt-get update && apt-get upgrade -y && \
apt-get install -y awscli && \
apt-get install -y gdal-bin libgdal-dev && \
apt-get clean -y && \
rm -rf /var/lib/apt/lists/*

# Stage the conda environment spec where the micromamba devcontainer feature
# expects to find it (see devcontainer.json "envFile").
COPY env.yaml /tmp/env.yaml
16 changes: 16 additions & 0 deletions .devcontainer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Devcontainer Template
Devcontainers allow interactive development inside of a docker container using VSCode.


This devcontainer creates a reproducible environment for python projects using micromamba
environments (faster/more robust version of conda). To add this devcontainer template to your project, copy this .devcontainer folder
into the parent directory of your repository, and copy this [.gitattributes file](https://github.com/Michael-Baker-International-Lakewood/mbi_templates/blob/main/.gitattributes) into the same parent directory.

When opening this repository in VSCode, you may be prompted to re-open the project in devcontainer.
Alternatively, you may access this option through the
View menu -> Command Palette -> DevContainers: Reopen in Container.

Other requirements:
1. An environment file (env.yaml) must be placed in the root folder of the
project for a reproducible Python environment to be successfully built.
2. Docker installed on the local machine (Linux).
44 changes: 44 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"name": "stormhub-devcontainer",
"build": {
"dockerfile": "Dockerfile",
"context": ".."
},
"features": {
"ghcr.io/devcontainers/features/git:1": {},
"ghcr.io/mamba-org/devcontainer-features/micromamba:1": {
"envFile": "/tmp/env.yaml",
"envName": "stormhub-base",
"autoActivate": true,
"version": "1.5.6"
},
"ghcr.io/devcontainers/features/node:1": {
"version": "lts"
},
"ghcr.io/devcontainers/features/aws-cli:1": {
"version": "latest"
}
},
// "runArgs": [
// "--gpus",
// "all"
// ],
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"charliermarsh.ruff",
"GitHub.copilot",
"ms-toolsai.jupyter"
],
"settings": {
"python.defaultInterpreterPath": "/opt/conda/envs/stormhub-base/bin/python"
}
}
},
// be sure to have the workspace folder owned by vscode user
"postCreateCommand": "sudo chown -R vscode:vscode ${containerWorkspaceFolder}",
// start the dev container with the stormhub-base environment activated
// avoid dubious ownership of the workspace folder https://www.kenmuse.com/blog/avoiding-dubious-ownership-in-dev-containers/
"postStartCommand": "micromamba shell init --shell=bash && echo 'micromamba activate stormhub-base' >> ~/.bashrc && git config --global --add safe.directory ${containerWorkspaceFolder}"
}
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,6 @@ holding
.DS_Store

bighorn
indian-creek
indian-creek
allegheny
data
4 changes: 2 additions & 2 deletions docs/make.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Rebuild the Sphinx documentation. Run from the repository root.
# Remove the previous build output so stale pages do not linger.
rm -rf docs/build
# Regenerate API stub pages from the stormhub package into docs/source.
sphinx-apidoc -o docs/source stormhub
# Build HTML using the configuration directory docs/source.
sphinx-build -M html docs/source docs/build -c docs/source
10 changes: 10 additions & 0 deletions docs/source/user_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,18 @@ The following snippet provides an example of how to build and create a storm cat
storm_duration_hours,
min_precip_threshold,
top_n_events,
use_threads=True, #True for Linux/WSL, False for Windows
num_workers=8, # Number of parallel workers
check_every_n_hours=6,
)
# Optionally, add DSS files to storm items
add_storm_dss_files(storm_catalog)
# Optionally, create normal precipitation grid
create_normal_precip(storm_catalog, duration_hours=storm_duration_hours)

.. note::
If using Windows, set ``use_threads`` to ``False`` in order to avoid issues with multiprocessing. On Linux/WSL, set ``use_threads`` to ``True``.
The use of ProcessPoolExecutor on Linux can lead to complications due to the way processes are spawned. More investigation is needed on this.

Viewing Results
----------------
Expand Down
29 changes: 29 additions & 0 deletions env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: base
channels:
- conda-forge
dependencies:
- dask=2025.10.0
- fiona=1.10.1
- geopandas=1.1.1
- ipykernel=7.0.1
- matplotlib=3.10.7
- numpy=2.3.3
- pandas=2.3.3
- pip=25.2
- pyarrow=21.0.0
- pytest=8.4.2
- python=3.12.12
- requests=2.32.5
- rioxarray=0.19.0
- ruff=0.14.1
- scipy=1.16.2
- xarray=2025.10.1
- zarr=3.1.3
- pip:
- pystac==1.14.1
- boto3==1.40.54
- s3fs==0.4.2
- hecdss==0.1.28
- python-dotenv==1.1.1
- shapely==2.1.2
- contextily==1.6.2
5 changes: 3 additions & 2 deletions stormhub/met/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import datetime
import logging
import os
from typing import Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -101,14 +102,14 @@ def _rank_storm_ids(self) -> list[str]:
sorted_indices = np.argsort(self.metrics_df["mean"].values)[::-1]
return self.metrics_df["storm_date"].dt.strftime("%Y-%m-%dT%H").iloc[sorted_indices].tolist()

def rank_and_save(self, collection_id: str, spm: StacPathManager) -> tuple[pd.DataFrame, str]:
    """Rank the storms, persist them to a CSV file, and return both results.

    Args:
        collection_id: Identifier of the collection the ranking belongs to.
        spm: Path manager used to resolve the collection's output directory.

    Returns:
        Tuple of (ranked-storms dataframe with ``storm_date`` parsed to
        datetime, path of the written CSV file).
    """
    csv_path = os.path.join(spm.collection_dir(collection_id), "ranked-storms.csv")
    ranked = self.rank_and_filter_storms()
    # Write the CSV before the datetime conversion so the file keeps the
    # original storm_date formatting.
    ranked.to_csv(csv_path, index=False)
    ranked["storm_date"] = pd.to_datetime(ranked["storm_date"])
    logging.info("Saved ranked storm data to %s", csv_path)
    return ranked, csv_path


class StormFilter:
Expand Down
33 changes: 30 additions & 3 deletions stormhub/met/aorc/aorc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""AORC Item class."""

import datetime
import gc
import json
import logging
import os
Expand Down Expand Up @@ -115,6 +116,16 @@ def __init__(
self._transposition_transform: Affine | None = None
self._stats: dict | None = None

def clear_cached_data(self) -> None:
    """Drop all cached datasets held on this item so memory can be reclaimed.

    Intended to be called once processing is complete.
    """
    # Release references to the large cached objects, then trigger an
    # immediate collection pass to return memory promptly.
    for attr in ("_aorc_source_data", "_transpose", "_sum_aorc"):
        setattr(self, attr, None)
    gc.collect()

def _register_extensions(self) -> None:
"""Register item extensions."""
ProjectionExtension.add_to(self)
Expand Down Expand Up @@ -145,8 +156,10 @@ def aorc_source_data(self) -> xr.Dataset:
- adds ZARR files to assets if they don't exist already
"""
if self._aorc_source_data is None:
s3_out = s3fs.S3FileSystem(anon=True)
# Increase connection pool and configure for better memory management
s3_out = s3fs.S3FileSystem(anon=True, config_kwargs={"max_pool_connections": 50})
fileset = [s3fs.S3Map(root=aorc_path, s3=s3_out, check=False) for aorc_path in self.aorc_paths]
# Use auto chunks to align with Zarr storage, then rechunk if needed
ds = xr.open_mfdataset(fileset, engine="zarr", chunks="auto", consolidated=True)

transposition_geom_for_clip = self.transposition_domain_geometry
Expand All @@ -158,7 +171,20 @@ def aorc_source_data(self) -> xr.Dataset:
longitude=slice(bounds[0], bounds[2]),
latitude=slice(bounds[1], bounds[3]),
)
self._aorc_source_data = subsection.rio.clip([transposition_geom_for_clip], drop=True, all_touched=True)

# Clip to geometry
clipped = subsection.rio.clip([transposition_geom_for_clip], drop=True, all_touched=True)

# Rechunk after loading to optimize memory usage for downstream operations
# This avoids the warning about misaligned chunks
self._aorc_source_data = clipped.chunk({"time": -1, "latitude": "auto", "longitude": "auto"})

# Clean up to free memory
del ds
del subsection
del clipped
gc.collect()

for aorc_path in self.aorc_paths:
aorc_year = int(os.path.basename(aorc_path).replace(".zarr", ""))
aorc_start_datetime = datetime.datetime(
Expand Down Expand Up @@ -314,7 +340,8 @@ def aorc_thumbnail(

def valid_spaces_item(watershed: Item, transposition_region: Item, storm_duration: int = 72) -> Polygon:
"""Search a sample zarr dataset to identify valid spaces for transposition. datetime.datetime(1980, 5, 1) is used as a start time for the search."""
s3 = s3fs.S3FileSystem(anon=True)
# Increase connection pool to avoid warnings
s3 = s3fs.S3FileSystem(anon=True, config_kwargs={"max_pool_connections": 50})
start_time = datetime.datetime(1980, 5, 1)
sample_data = s3fs.S3Map(root=f"{NOAA_AORC_S3_BASE_URL}/{start_time.year}.zarr", s3=s3)
ds = xr.open_dataset(sample_data, engine="zarr", chunks="auto", consolidated=True)
Expand Down
Loading