Commit a3625ee

Authored by Kevin Moore (kevinemoore), with fiskus, sir-sigurd, and claude
Preview Anndata (.h5ad) (#4636)
Co-authored-by: Kevin Moore <kevin@quiltdata.io>
Co-authored-by: Maksim Chervonnyi <mail@redmax.dev>
Co-authored-by: Sergey Fedoseev <fedoseev.sergey@quiltdata.io>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Dr. Ernie Prabhakar <ernest@quilt.bio>
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Co-authored-by: Alexei Mochalov <nl_0@quiltdata.io>
1 parent 45d4cdf commit a3625ee

File tree

8 files changed: +1032 −41 lines
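For orientation, a minimal sketch of a request the new preview accepts, per the SCHEMA in the lambda diff below; the bucket and key are placeholders, and only `url` and `input` are required:

```python
# Hypothetical request arguments for the tabular_preview lambda.
# The URL must be an HTTPS S3 virtual-host URL (see is_s3_url below).
request_args = {
    "url": "https://my-bucket.s3.amazonaws.com/data/sample.h5ad",  # placeholder
    "input": "h5ad",  # routes to the new preview_h5ad handler
}
```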

.github/workflows/deploy-lambdas.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,6 @@ jobs:
           - s3hash
           - status_reports
           - manifest_indexer
-          - tabular_preview
           - transcode
     runs-on: ubuntu-latest
     # These permissions are needed to interact with GitHub's OIDC Token endpoint.
@@ -70,6 +69,7 @@ jobs:
         path:
           - indexer
           - thumbnail
+          - tabular_preview
     runs-on: ubuntu-latest
     # These permissions are needed to interact with GitHub's OIDC Token endpoint.
     permissions:
```

lambdas/tabular_preview/CHANGELOG.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -17,6 +17,7 @@ where verb is one of

 ## Changes

+- [Added] Preview h5ad (anndata) files ([#4636](https://github.com/quiltdata/quilt/pull/4636))
 - [Changed] Switch to uv ([#4654](https://github.com/quiltdata/quilt/pull/4654))
 - [Changed] Upgrade to Python 3.13 ([#4654](https://github.com/quiltdata/quilt/pull/4654))
 - [Changed] Upgrade to Python 3.11 ([#4241](https://github.com/quiltdata/quilt/pull/4241))
```

lambdas/tabular_preview/Dockerfile

Lines changed: 50 additions & 0 deletions

```diff
@@ -0,0 +1,50 @@
+ARG FUNCTION_DIR="/function"
+
+FROM public.ecr.aws/lambda/python:3.13.2025.12.10.20@sha256:27f9d657bbd39aa94a5a32ad0c98b797c7f92fa4fddf983f06ab58c4cc931722 AS base
+
+FROM base AS build
+
+# Include global arg in this stage of the build
+ARG FUNCTION_DIR
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+
+ENV UV_LINK_MODE=copy \
+    UV_COMPILE_BYTECODE=1 \
+    UV_PYTHON_DOWNLOADS=never \
+    UV_PROJECT_ENVIRONMENT=${FUNCTION_DIR}
+
+RUN --mount=type=cache,target=/root/.cache \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    uv sync \
+        --locked \
+        --no-dev \
+        --group=prod \
+        --no-install-project
+
+COPY . /src
+WORKDIR /src
+RUN --mount=type=cache,target=/root/.cache \
+    uv sync \
+        --locked \
+        --no-dev \
+        --group=prod \
+        --no-editable
+
+FROM base
+
+# Include global arg in this stage of the build
+ARG FUNCTION_DIR
+# Set working directory to function root directory
+WORKDIR ${FUNCTION_DIR}
+
+COPY --from=build ${FUNCTION_DIR} ${FUNCTION_DIR}
+
+# Configure environment for Lambda runtime
+ENV NUMBA_DISABLE_JIT=1 \
+    MPLCONFIGDIR=/tmp/matplotlib \
+    MPLBACKEND=Agg
+
+ENTRYPOINT ["./bin/python3", "-m", "awslambdaric"]
+CMD ["t4_lambda_tabular_preview.lambda_handler"]
```
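The closing ENV block matters at runtime: in Lambda only /tmp is writable, and scanpy pulls in matplotlib and numba. A rough Python equivalent of what those variables buy, as a sketch (the import ordering is the key point; this is not code from the commit):

```python
# Sketch of the Dockerfile's runtime ENV, expressed in Python. The
# environment variables must be set before matplotlib/numba are imported.
import os

os.environ["NUMBA_DISABLE_JIT"] = "1"           # skip numba JIT compilation
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"  # writable config/cache dir
import matplotlib

matplotlib.use("Agg")  # headless backend, same effect as MPLBACKEND=Agg
```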

lambdas/tabular_preview/pyproject.toml

Lines changed: 7 additions & 6 deletions

```diff
@@ -7,12 +7,10 @@ dependencies = [
     "pandas ~= 2.2",
     "xlrd >=2,< 3",
     "openpyxl >=3,<4 ",
-    "fsspec[http] >= 2022.1.0",
-    # Stripping numpy.libs in numpy 2 results in
-    # libscipy_openblas64_-ff651d7f.so: ELF load command address/offset not properly aligned
-    # which is probably caused by some bug in glibc (it happens on Ubuntu 22.04, Amazon Linux 2/2023,
-    # but not on Ubuntu 24.04).
-    "numpy < 2",
+    "fsspec[http]>=2022.1.0",
+    "anndata[lazy]>=0.8.0",
+    "scanpy >= 1.8.0",
+    "numpy ~= 2.3",
     "t4-lambda-shared",
 ]

@@ -27,6 +25,9 @@ test = [
     "pytest-env~=1.1",
     "pytest-mock~=3.15",
 ]
+prod = [
+    "awslambdaric~=3.1",
+]

 [tool.uv]
 default-groups = ["test"]
```
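The new `anndata[lazy]` extra is what makes the metadata-only path cheap: `read_lazy` opens the file without materializing the expression matrix. A minimal sketch against a local file (the path is a placeholder), using the same calls as the lambda source below:

```python
import anndata
import h5py

# Read an .h5ad lazily: shape and obs/var metadata are available
# without loading the X matrix into memory.
with h5py.File("sample.h5ad", "r") as f:  # hypothetical local file
    adata = anndata.experimental.read_lazy(f)
    n_obs, n_vars = adata.shape
    print(n_obs, n_vars, list(adata.var.keys()))
```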

lambdas/tabular_preview/src/t4_lambda_tabular_preview/__init__.py

Lines changed: 106 additions & 31 deletions

```diff
@@ -1,16 +1,22 @@
+import errno
 import functools
 import gzip
 import io
 import json
+import os
+import tempfile
 import urllib.request
 from urllib.parse import urlparse

+import anndata
 import fsspec
+import h5py
 import pandas
 import pyarrow
 import pyarrow.csv
 import pyarrow.json
 import pyarrow.parquet
+import scanpy as sc

 from t4_lambda_shared.decorator import QUILT_INFO_HEADER, api, validate
 from t4_lambda_shared.utils import (
@@ -19,8 +25,12 @@
     make_json_response,
 )

+H5AD_META_ONLY_SIZE = int(os.getenv("H5AD_META_ONLY_SIZE", 1_000_000))
+
+
 logger = get_quilt_logger()

+
 # Lambda's response must fit into 6 MiB, binary data must be encoded
 # with base64 (4.5 MiB limit). It's rounded down to leave some space for headers
 # and non-flushed gzip buffers.
@@ -215,23 +225,93 @@ def preview_parquet(url, compression, max_out_size):

     output_data, output_truncated = write_pandas_as_csv(df, max_out_size)

-    return 200, output_data, {
-        "Content-Type": "text/csv",
-        "Content-Encoding": "gzip",
-        QUILT_INFO_HEADER: json.dumps({
-            "truncated": output_truncated,
-            "meta": {
-                "created_by": meta.created_by,
-                "format_version": meta.format_version,
-                "num_row_groups": meta.num_row_groups,
-                "schema": {
-                    "names": meta.schema.names
-                },
-                "serialized_size": meta.serialized_size,
-                "shape": (meta.num_rows, meta.num_columns),
-            },
-        }),
-    }
+    return (
+        200,
+        output_data,
+        {
+            "Content-Type": "text/csv",
+            "Content-Encoding": "gzip",
+            QUILT_INFO_HEADER: json.dumps(
+                {
+                    "truncated": output_truncated,
+                    "meta": {
+                        "created_by": meta.created_by,
+                        "format_version": meta.format_version,
+                        "num_row_groups": meta.num_row_groups,
+                        "schema": {"names": meta.schema.names},
+                        "serialized_size": meta.serialized_size,
+                        "shape": (meta.num_rows, meta.num_columns),
+                    },
+                }
+            ),
+        },
+    )
+
+
+def preview_h5ad(url, compression, max_out_size):
+    with urlopen(url, compression=compression, seekable=True) as src:
+        with h5py.File(src, "r") as h5py_file:
+            adata = anndata.experimental.read_lazy(h5py_file)
+
+            # Get matrix dimensions to decide processing strategy
+            n_obs, n_vars = adata.shape
+
+            if meta_only := (n_obs * n_vars >= H5AD_META_ONLY_SIZE):
+                # For large files, skip intensive QC calculation that requires loading full matrix
+                logger.warning("Getting only meta for large matrix (%d x %d) to avoid OOM/timeout", n_obs, n_vars)
+
+                # Create empty dataframe
+                var_df = pandas.DataFrame(columns=list(adata.var.keys()))
+            else:
+                adata = anndata.read_h5ad(src)
+                # For smaller matrices, calculate full QC metrics using scanpy
+                sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
+                var_df = adata.var.copy()
+
+            # Add gene expression statistics from the QC metrics
+            # These columns are added by calculate_qc_metrics:
+            # - total_counts: total UMI counts for this gene across all cells
+            # - n_cells_by_counts: number of cells with non-zero counts for this gene
+            # - mean_counts: mean counts per cell for this gene
+            # - pct_dropout_by_counts: percentage of cells with zero counts
+
+            # Reset index to include gene IDs as a regular column
+            var_df_with_index = var_df.reset_index()
+            # XXX: doesn't that change the original column name?
+            var_df_with_index = var_df_with_index.rename(columns={"index": "gene_id"})
+
+            table = pyarrow.Table.from_pandas(var_df_with_index, preserve_index=False)
+            output_data, output_truncated = write_data_as_arrow(table, table.schema, max_out_size)
+
+            return (
+                200,
+                output_data,
+                {
+                    "Content-Type": "application/vnd.apache.arrow.file",
+                    "Content-Encoding": "gzip",
+                    QUILT_INFO_HEADER: json.dumps(
+                        {
+                            "truncated": output_truncated,
+                            "meta_only": meta_only,
+                            # H5AD-specific metadata format
+                            "meta": {
+                                "schema": {"names": list(var_df_with_index.columns)},
+                                "h5ad_obs_keys": list(adata.obs.columns),
+                                "h5ad_var_keys": list(adata.var.columns),
+                                "h5ad_uns_keys": list(adata.uns.keys()),
+                                "h5ad_obsm_keys": list(adata.obsm.keys()),
+                                "h5ad_varm_keys": list(adata.varm.keys()),
+                                "h5ad_layers_keys": list(adata.layers.keys()),
+                                "anndata_version": getattr(adata, "__version__", None),
+                                "n_cells": adata.n_obs,
+                                "n_genes": adata.n_vars,
+                                "matrix_type": "sparse" if hasattr(adata.X, "nnz") else "dense",
+                                "has_raw": adata.raw is not None,
+                            },
+                        }
+                    ),
+                },
+            )


 handlers = {
@@ -240,36 +320,33 @@ def preview_parquet(url, compression, max_out_size):
     "excel": preview_excel,
     "parquet": preview_parquet,
     "jsonl": preview_jsonl,
+    "h5ad": preview_h5ad,
 }

 SCHEMA = {
     "type": "object",
     "properties": {
-        "url": {
-            "type": "string"
-        },
+        "url": {"type": "string"},
         "input": {
             "enum": list(handlers),
         },
-        "compression": {
-            "enum": ["gz", "bz2"]
-        },
+        "compression": {"enum": ["gz", "bz2"]},
         "size": {
             "enum": list(OUTPUT_SIZES),
         },
     },
     "required": ["url", "input"],
-    "additionalProperties": False
+    "additionalProperties": False,
 }


 def is_s3_url(url: str) -> bool:
     parsed_url = urlparse(url, allow_fragments=False)
     return (
-        parsed_url.scheme == "https" and
-        parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX) and
-        parsed_url.username is None and
-        parsed_url.password is None
+        parsed_url.scheme == "https"
+        and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX)
+        and parsed_url.username is None
+        and parsed_url.password is None
     )
@@ -282,9 +359,7 @@ def lambda_handler(request):
     compression = request.args.get("compression")

     if not is_s3_url(url):
-        return make_json_response(400, {
-            "title": "Invalid url=. Expected S3 virtual-host URL."
-        })
+        return make_json_response(400, {"title": "Invalid url=. Expected S3 virtual-host URL."})

     handler = handlers[input_type]
     return handler(url, compression, OUTPUT_SIZES[output_size])
```
Binary file (21.8 KB) not shown.
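On the consuming side, the new h5ad response decodes the same way the test below does: gunzip the body, read it as an Arrow IPC file, and parse the JSON info header. A hedged sketch (the helper name and the `body`/`headers` inputs are assumptions; the calls mirror the test):

```python
import gzip
import io
import json

import pyarrow

from t4_lambda_shared.decorator import QUILT_INFO_HEADER


def decode_h5ad_preview(body: bytes, headers: dict):
    """Hypothetical helper: body/headers as returned by handlers["h5ad"](...)."""
    info = json.loads(headers[QUILT_INFO_HEADER])  # truncated / meta_only / meta
    with pyarrow.ipc.open_file(io.BytesIO(gzip.decompress(body))) as reader:
        table = reader.read_all()  # per-gene var table with QC columns
    return info, table.to_pandas()
```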

lambdas/tabular_preview/tests/test_index.py

Lines changed: 67 additions & 0 deletions

```diff
@@ -98,6 +98,73 @@ def test_preview_simple(filename, handler_name):
     )


+@pytest.mark.parametrize(
+    "meta_only",
+    [False, True],
+)
+def test_preview_h5ad(mocker, meta_only):
+    if meta_only:
+        mocker.patch(
+            "t4_lambda_tabular_preview.H5AD_META_ONLY_SIZE",
+            0,  # Force providing only meta
+        )
+    calculate_qc_metrics_mock = mocker.patch("t4_lambda_tabular_preview.sc.pp.calculate_qc_metrics")
+
+    code, body, headers = t4_lambda_tabular_preview.handlers["h5ad"](
+        url=str(pathlib.Path(__file__).parent / "data" / "simple/test.h5ad"),
+        compression=None,
+        max_out_size=None,
+    )
+
+    assert code == 200
+    assert headers["Content-Type"] == "application/vnd.apache.arrow.file"
+    assert headers["Content-Encoding"] == "gzip"
+
+    # Parse the QUILT_INFO_HEADER to check metadata
+    info = json.loads(headers[QUILT_INFO_HEADER])
+    assert "truncated" in info
+    assert "meta" in info
+    assert info.get("meta_only") is meta_only
+    # Check H5AD-specific metadata format
+    assert "h5ad_obs_keys" in info["meta"]  # H5AD-specific fields
+    assert "h5ad_var_keys" in info["meta"]
+    # Check new H5AD-specific fields
+    assert info["meta"]["n_cells"] == 2  # 2 cells in test data
+    assert info["meta"]["n_genes"] == 2  # 2 genes in test data
+    assert "matrix_type" in info["meta"]  # sparse or dense
+    assert "has_raw" in info["meta"]  # boolean indicating raw data presence
+
+    # Check that the Arrow data can be read and contains expected content
+    with pyarrow.ipc.open_file(io.BytesIO(gzip.decompress(body))) as reader:
+        table = reader.read_all()
+
+    # Convert back to pandas to check content
+    df = table.to_pandas()
+
+    # Should have gene-level QC metrics instead of expression matrix
+    assert "gene_id" in df.columns
+    assert "highly_variable" in df.columns
+    if not meta_only:
+        assert "ENSG001" in df["gene_id"].values
+        assert "ENSG002" in df["gene_id"].values
+
+    # Should have QC metric columns added by scanpy
+    expected_qc_columns = ["total_counts", "n_cells_by_counts", "mean_counts", "pct_dropout_by_counts"]
+    if meta_only:
+        calculate_qc_metrics_mock.assert_not_called()
+        for col in expected_qc_columns:
+            assert col not in df.columns, f"Unexpected QC column {col} found in {df.columns.tolist()}"
+    else:
+        for col in expected_qc_columns:
+            assert col in df.columns, f"Expected QC column {col} not found in {df.columns.tolist()}"
+
+    # Check that we have the right number of genes (rows)
+    if meta_only:
+        assert len(df) == 0  # no tabular data
+    else:
+        assert len(df) == 2  # Should have 2 genes from our test data
+
+
 def test_preview_simple_parquet():
     data = (pathlib.Path(__file__).parent / "data" / "simple/test.parquet").read_bytes()
     with patch_urlopen(data) as urlopen_mock:
```
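The `meta_only=True` case works by patching `H5AD_META_ONLY_SIZE` down to 0; in production the gate is just the product of the matrix dimensions against the threshold. A quick sketch of that arithmetic with the default of 1,000,000 (the example shapes are illustrative):

```python
# Default threshold from the lambda: cells x genes >= 1_000_000 triggers
# the metadata-only path instead of the full scanpy QC pass.
H5AD_META_ONLY_SIZE = 1_000_000

for n_obs, n_vars in [(10_000, 50), (50_000, 2_000)]:
    meta_only = n_obs * n_vars >= H5AD_META_ONLY_SIZE
    print(f"{n_obs} x {n_vars}: meta_only={meta_only}")
# 10000 x 50   -> meta_only=False (500,000 cells*genes: full QC metrics)
# 50000 x 2000 -> meta_only=True  (100,000,000 cells*genes: metadata only)
```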
