Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
*.egg-info/
dist/
build/
*.egg

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/

# Type checking
.mypy_cache/
.dmypy.json
dmypy.json

# Linting
.ruff_cache/

# Environment
.env
.env.local
.env.*.local
.venv
env/
venv/
ENV/
*.local.yaml

# IDE
.vscode/
.idea/
*.swp
*.swo

# Jupyter
.ipynb_checkpoints/

# Kubernetes
.work/
kubeconfig*
.kube/

# Temporary files
*.tmp
*.log
runs/
generated/
.archive/

# OS
.DS_Store
Thumbs.db

# Project-specific
*.zarr
out/
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,24 @@ kubectl get wf -n devseed -w
[![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
[![Tests](https://github.com/EOPF-Explorer/data-pipeline/workflows/Tests/badge.svg)](https://github.com/EOPF-Explorer/data-pipeline/actions)

- **Multi-sensor support**: Sentinel-1 GRD and Sentinel-2 L2A
- STAC item registration with retry logic
- GeoZarr format conversion
- Cloud-native workflows
- GeoZarr format conversion with cloud-optimized overviews
- Cloud-native workflows with Argo
- Interactive visualization with TiTiler

## What It Does

Transforms Sentinel-2 satellite data into web-ready visualizations:
Transforms Sentinel satellite data into web-ready visualizations:

**Input:** STAC item URL → **Output:** Interactive web map (~5-10 min)

**Pipeline:** Convert (5 min) → Register (30 sec) → Augment (10 sec)

**Supported sensors:**
- **Sentinel-1** L1 GRD: SAR backscatter (VH/VV polarizations)
- **Sentinel-2** L2A: Multispectral reflectance (10m/20m/60m)

## Quick Start

📖 **New to the project?** See [GETTING_STARTED.md](GETTING_STARTED.md) for complete setup (15 min).
Expand Down Expand Up @@ -262,10 +268,12 @@ pytest -v -k e2e # End-to-end tests only
1. **Edit workflow:** `workflows/template.yaml`
2. **Update scripts:** `scripts/*.py`
3. **Test locally:** `pytest tests/ -v`
4. **Build image:** `docker build -t ghcr.io/eopf-explorer/data-pipeline:dev -f docker/Dockerfile .`
4. **Build image:** `docker buildx build --platform linux/amd64 -t ghcr.io/eopf-explorer/data-pipeline:dev -f docker/Dockerfile . --push`
5. **Deploy:** `kubectl apply -f workflows/template.yaml -n devseed`
6. **Monitor:** `kubectl get wf -n devseed -w`

⚠️ **Important:** Always use `--platform linux/amd64` when building images for Kubernetes clusters.

See [CONTRIBUTING.md](CONTRIBUTING.md) for coding standards and development workflow.

## License
Expand Down
9 changes: 5 additions & 4 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Build for linux/amd64: docker buildx build --platform linux/amd64 -t <tag> . --push
FROM python:3.11-slim

# System dependencies (including GDAL for rasterio)
Expand All @@ -18,13 +19,13 @@ RUN pip install -U pip uv
# Cachebust for data-model installation (change timestamp to force fresh install)
ARG CACHEBUST=2025-10-06T11:00:00Z

# Install eopf-geozarr from minimal fix branch
# Includes critical set_spatial_dims() fix before write_crs() calls
# Install eopf-geozarr from fix/s1-encoding-conflict branch (temporary until merged)
# NOTE: version specifiers are quoted because shell-form RUN passes the line to
# /bin/sh -c, where an unquoted `pkg>=1.0` is parsed as `pkg` with stdout
# redirected to a file named `=1.0` (the version floor is silently dropped).
RUN uv pip install --system --no-cache \
git+https://github.com/EOPF-Explorer/data-model.git@fix/spatial-dims-minimal \
git+https://github.com/EOPF-Explorer/data-model.git@fix/s1-encoding-conflict \
"pystac>=1.10.0" \
"httpx>=0.27.0" \
"boto3>=1.34.0"
"boto3>=1.34.0" \
"tenacity>=8.0.0"

# Force fresh copy of scripts (invalidate cache)
ARG SCRIPTS_VERSION=2025-10-06T02:05:00Z
Expand Down
93 changes: 87 additions & 6 deletions scripts/augment_stac_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,69 @@ def _encode_quicklook_query() -> str:

DEFAULT_QUICKLOOK_QUERY = _encode_quicklook_query()


def _get_s1_polarization(item: Item) -> str:
    """Return the first S1 polarization that has an asset on *item*.

    Candidate asset keys are probed in the order given by
    ``_S1_POLARIZATIONS``; the first key present in ``item.assets`` wins.

    Args:
        item: PySTAC Item whose asset keys are inspected.

    Returns:
        The matching polarization key in upper case, or ``"VH"`` when none
        of the candidate keys is present.
    """
    found = next((code for code in _S1_POLARIZATIONS if code in item.assets), None)
    if found is None:
        # No polarization asset at all: fall back to the default.
        return "VH"
    return found.upper()


def _encode_s1_preview_query(item: Item) -> str:
    """Generate the S1 GRD preview query string for TiTiler.

    S1 GRD structure in converted GeoZarr:
        /S01SIWGRD_{timestamp}_{id}_VH/measurements with grd variable

    The ``variables`` parameter is derived from the href of the asset for
    the polarization chosen by ``_get_s1_polarization``: everything after
    the first ``.zarr/`` is treated as the group path, e.g.
    ``s3://.../S01SIWGRD_..._VH/measurements`` ->
    ``/S01SIWGRD_..._VH/measurements:grd``.

    When the asset is missing, has no href, the href contains no
    ``.zarr/`` marker, or nothing follows the marker, the generic
    ``/measurements:grd`` reference is used instead.

    Args:
        item: PySTAC Item with S1 GRD data.

    Returns:
        URL-encoded query string with ``variables``, ``bidx`` and
        ``rescale`` parameters, joined by ``&``.
    """
    # Fallback reference, overridden below when a group path can be extracted.
    variable_path = "/measurements:grd"

    pol = _get_s1_polarization(item)
    asset = item.assets.get(pol.lower())
    href = asset.href if asset else None
    if href:
        # partition() splits on the FIRST ".zarr/" only, so a group path that
        # itself contains ".zarr/" is kept whole instead of being truncated.
        _, marker, group_path = href.partition(".zarr/")
        # Drop stray slashes so a trailing "/" does not yield ".../:grd".
        group_path = group_path.strip("/")
        if marker and group_path:
            variable_path = f"/{group_path}:grd"

    pairs = (
        ("variables", variable_path),
        ("bidx", "1"),
        ("rescale", "0,219"),  # Typical S1 GRD range
    )
    return "&".join(f"{key}={urllib.parse.quote_plus(value)}" for key, value in pairs)


_ALLOWED_SCHEMES = {"http", "https"}
_USER_AGENT = "augment-stac-item/1.0"
_DEFAULT_TIMEOUT = float(os.getenv("HTTP_TIMEOUT", "30"))
Expand All @@ -65,6 +128,14 @@ def _encode_quicklook_query() -> str:
_S2_DATASET_KEYS = ("SR_10m", "SR_20m", "SR_60m")
_S2_QUICKLOOK_KEYS = ("TCI_10m", "TCI", "TCI_20m")

_S1_COLLECTION_ID = "sentinel-1-l1-grd"
_S1_POLARIZATIONS = ("vh", "vv", "hh", "hv")


def _is_s1_collection(collection_id: str) -> bool:
"""Check if collection is Sentinel-1 GRD."""
return collection_id.startswith("sentinel-1-l1-grd")


def _coerce_epsg(value: Any) -> int | None:
if isinstance(value, bool):
Expand Down Expand Up @@ -462,17 +533,27 @@ def add_visualization_links(
item.links = [link for link in item.links if link.rel not in filtered_rels]
item_id = item.id
viewer_href = f"{base_raster_url}/collections/{coll}/items/{item_id}/viewer"
asset_key = _select_preview_asset(item)
preview_asset = item.assets.get(asset_key) if asset_key else None
is_quicklook = _is_quicklook_asset(preview_asset)
default_query = DEFAULT_QUICKLOOK_QUERY if is_quicklook else DEFAULT_TRUE_COLOR_QUERY

# Determine preview query based on collection type
asset_key: str | None
if _is_s1_collection(coll):
# Sentinel-1: Use GRD polarization preview
default_query = _encode_s1_preview_query(item)
xyz_title = os.getenv("PREVIEW_XYZ_TITLE", f"GRD {_get_s1_polarization(item)}")
asset_key = _get_s1_polarization(item).lower() # vh or vv
else:
# Sentinel-2: Use quicklook or true color
asset_key = _select_preview_asset(item)
preview_asset = item.assets.get(asset_key) if asset_key else None
is_quicklook = _is_quicklook_asset(preview_asset)
default_query = DEFAULT_QUICKLOOK_QUERY if is_quicklook else DEFAULT_TRUE_COLOR_QUERY
xyz_title = os.getenv("PREVIEW_XYZ_TITLE", "True Color Image (10m)")

xyz_query = _resolve_preview_query(
os.getenv("PREVIEW_XYZ_QUERY"),
default_query=default_query,
)

xyz_title = os.getenv("PREVIEW_XYZ_TITLE", "True Color Image (10m)")

def _add_link(rel: str, target: str, media_type: str, title: str | None = None) -> None:
item.add_link(
Link(
Expand Down
30 changes: 30 additions & 0 deletions scripts/get_zarr_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
import json
import sys
from urllib.request import urlopen


def _asset_href(asset: object) -> str:
    """Return the asset's ``href`` as a string, or ``""`` if absent/empty/malformed."""
    if not isinstance(asset, dict):
        return ""
    href = asset.get("href")
    return str(href) if href else ""


def get_zarr_url(stac_item_url: str, *, timeout: float = 30.0) -> str:
    """Fetch a STAC item and return the href of its Zarr asset.

    Asset selection order:
      1. the ``product`` asset, if it has a non-empty href;
      2. the ``zarr`` asset, if it has a non-empty href;
      3. the first asset (in document order) whose href contains ``.zarr``.

    Args:
        stac_item_url: URL of the STAC item JSON document.
        timeout: Seconds to wait on the request before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The selected asset href.

    Raises:
        RuntimeError: If no asset qualifies.
    """
    with urlopen(stac_item_url, timeout=timeout) as response:
        item = json.loads(response.read())

    # Tolerate `"assets": null` or a non-object payload: treat as "no assets".
    assets = item.get("assets") if isinstance(item, dict) else None
    if not isinstance(assets, dict):
        assets = {}

    # Priority: product, zarr, then any .zarr asset
    for key in ("product", "zarr"):
        href = _asset_href(assets.get(key))
        if href:
            return href

    # Fallback: _asset_href never returns None, so the substring test is safe
    # even for assets with a null or missing href.
    for asset in assets.values():
        href = _asset_href(asset)
        if ".zarr" in href:
            return href

    raise RuntimeError("No Zarr asset found")


if __name__ == "__main__":
    # Exit with a usage message (non-zero status) instead of an IndexError
    # traceback when the STAC item URL argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <stac-item-url>")
    print(get_zarr_url(sys.argv[1]))
Loading