Commit a3625ee

Authored by Kevin Moore (kevinemoore), with fiskus, sir-sigurd, and claude
Preview Anndata (.h5ad) (#4636)
Co-authored-by: Kevin Moore <kevin@quiltdata.io>
Co-authored-by: Maksim Chervonnyi <mail@redmax.dev>
Co-authored-by: Sergey Fedoseev <fedoseev.sergey@quiltdata.io>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Dr. Ernie Prabhakar <ernest@quilt.bio>
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Co-authored-by: Alexei Mochalov <nl_0@quiltdata.io>
1 parent 45d4cdf commit a3625ee

File tree

8 files changed: +1032 −41 lines
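For orientation, a minimal sketch of a request the new preview accepts, per the SCHEMA in the lambda diff below; the bucket and key are placeholders, and only `url` and `input` are required:

```python
# Hypothetical request arguments for the tabular_preview lambda.
# The URL must be an HTTPS S3 virtual-host URL (see is_s3_url below).
request_args = {
    "url": "https://my-bucket.s3.amazonaws.com/data/sample.h5ad",  # placeholder
    "input": "h5ad",  # routes to the new preview_h5ad handler
}
```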

.github/workflows/deploy-lambdas.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,6 @@ jobs:
           - s3hash
           - status_reports
           - manifest_indexer
-          - tabular_preview
           - transcode
     runs-on: ubuntu-latest
     # These permissions are needed to interact with GitHub's OIDC Token endpoint.
@@ -70,6 +69,7 @@ jobs:
         path:
           - indexer
           - thumbnail
+          - tabular_preview
     runs-on: ubuntu-latest
     # These permissions are needed to interact with GitHub's OIDC Token endpoint.
     permissions:
```

lambdas/tabular_preview/CHANGELOG.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -17,6 +17,7 @@ where verb is one of

 ## Changes

+- [Added] Preview h5ad (anndata) files ([#4636](https://github.com/quiltdata/quilt/pull/4636))
 - [Changed] Switch to uv ([#4654](https://github.com/quiltdata/quilt/pull/4654))
 - [Changed] Upgrade to Python 3.13 ([#4654](https://github.com/quiltdata/quilt/pull/4654))
 - [Changed] Upgrade to Python 3.11 ([#4241](https://github.com/quiltdata/quilt/pull/4241))
```

lambdas/tabular_preview/Dockerfile

Lines changed: 50 additions & 0 deletions

```diff
@@ -0,0 +1,50 @@
+ARG FUNCTION_DIR="/function"
+
+FROM public.ecr.aws/lambda/python:3.13.2025.12.10.20@sha256:27f9d657bbd39aa94a5a32ad0c98b797c7f92fa4fddf983f06ab58c4cc931722 AS base
+
+FROM base AS build
+
+# Include global arg in this stage of the build
+ARG FUNCTION_DIR
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+
+ENV UV_LINK_MODE=copy \
+    UV_COMPILE_BYTECODE=1 \
+    UV_PYTHON_DOWNLOADS=never \
+    UV_PROJECT_ENVIRONMENT=${FUNCTION_DIR}
+
+RUN --mount=type=cache,target=/root/.cache \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    uv sync \
+        --locked \
+        --no-dev \
+        --group=prod \
+        --no-install-project
+
+COPY . /src
+WORKDIR /src
+RUN --mount=type=cache,target=/root/.cache \
+    uv sync \
+        --locked \
+        --no-dev \
+        --group=prod \
+        --no-editable
+
+FROM base
+
+# Include global arg in this stage of the build
+ARG FUNCTION_DIR
+# Set working directory to function root directory
+WORKDIR ${FUNCTION_DIR}
+
+COPY --from=build ${FUNCTION_DIR} ${FUNCTION_DIR}
+
+# Configure environment for Lambda runtime
+ENV NUMBA_DISABLE_JIT=1 \
+    MPLCONFIGDIR=/tmp/matplotlib \
+    MPLBACKEND=Agg
+
+ENTRYPOINT ["./bin/python3", "-m", "awslambdaric"]
+CMD ["t4_lambda_tabular_preview.lambda_handler"]
```
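The closing ENV block matters at runtime: in Lambda only /tmp is writable, and scanpy pulls in matplotlib and numba. A rough Python equivalent of what those variables buy, as a sketch (the import ordering is the key point; this is not code from the commit):

```python
# Sketch of the Dockerfile's runtime ENV, expressed in Python. The
# environment variables must be set before matplotlib/numba are imported.
import os

os.environ["NUMBA_DISABLE_JIT"] = "1"           # skip numba JIT compilation
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"  # writable config/cache dir
import matplotlib

matplotlib.use("Agg")  # headless backend, same effect as MPLBACKEND=Agg
```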

lambdas/tabular_preview/pyproject.toml

Lines changed: 7 additions & 6 deletions

```diff
@@ -7,12 +7,10 @@ dependencies = [
     "pandas ~= 2.2",
     "xlrd >=2,< 3",
     "openpyxl >=3,<4 ",
-    "fsspec[http] >= 2022.1.0",
-    # Stripping numpy.libs in numpy 2 results in
-    # libscipy_openblas64_-ff651d7f.so: ELF load command address/offset not properly aligned
-    # which is probably caused by some bug in glibc (it happens on Ubuntu 22.04, Amazon Linux 2/2023,
-    # but not on Ubuntu 24.04).
-    "numpy < 2",
+    "fsspec[http]>=2022.1.0",
+    "anndata[lazy]>=0.8.0",
+    "scanpy >= 1.8.0",
+    "numpy ~= 2.3",
     "t4-lambda-shared",
 ]

@@ -27,6 +25,9 @@ test = [
     "pytest-env~=1.1",
     "pytest-mock~=3.15",
 ]
+prod = [
+    "awslambdaric~=3.1",
+]

 [tool.uv]
 default-groups = ["test"]
```
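The new `anndata[lazy]` extra is what makes the metadata-only path cheap: `read_lazy` opens the file without materializing the expression matrix. A minimal sketch against a local file (the path is a placeholder), using the same calls as the lambda source below:

```python
import anndata
import h5py

# Read an .h5ad lazily: shape and obs/var metadata are available
# without loading the X matrix into memory.
with h5py.File("sample.h5ad", "r") as f:  # hypothetical local file
    adata = anndata.experimental.read_lazy(f)
    n_obs, n_vars = adata.shape
    print(n_obs, n_vars, list(adata.var.keys()))
```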

lambdas/tabular_preview/src/t4_lambda_tabular_preview/__init__.py

Lines changed: 106 additions & 31 deletions

```diff
@@ -1,16 +1,22 @@
+import errno
 import functools
 import gzip
 import io
 import json
+import os
+import tempfile
 import urllib.request
 from urllib.parse import urlparse

+import anndata
 import fsspec
+import h5py
 import pandas
 import pyarrow
 import pyarrow.csv
 import pyarrow.json
 import pyarrow.parquet
+import scanpy as sc

 from t4_lambda_shared.decorator import QUILT_INFO_HEADER, api, validate
 from t4_lambda_shared.utils import (
@@ -19,8 +25,12 @@
     make_json_response,
 )

+H5AD_META_ONLY_SIZE = int(os.getenv("H5AD_META_ONLY_SIZE", 1_000_000))
+
+
 logger = get_quilt_logger()

+
 # Lambda's response must fit into 6 MiB, binary data must be encoded
 # with base64 (4.5 MiB limit). It's rounded down to leave some space for headers
 # and non-flushed gzip buffers.
@@ -215,23 +225,93 @@ def preview_parquet(url, compression, max_out_size):

     output_data, output_truncated = write_pandas_as_csv(df, max_out_size)

-    return 200, output_data, {
-        "Content-Type": "text/csv",
-        "Content-Encoding": "gzip",
-        QUILT_INFO_HEADER: json.dumps({
-            "truncated": output_truncated,
-            "meta": {
-                "created_by": meta.created_by,
-                "format_version": meta.format_version,
-                "num_row_groups": meta.num_row_groups,
-                "schema": {
-                    "names": meta.schema.names
-                },
-                "serialized_size": meta.serialized_size,
-                "shape": (meta.num_rows, meta.num_columns),
-            },
-        }),
-    }
+    return (
+        200,
+        output_data,
+        {
+            "Content-Type": "text/csv",
+            "Content-Encoding": "gzip",
+            QUILT_INFO_HEADER: json.dumps(
+                {
+                    "truncated": output_truncated,
+                    "meta": {
+                        "created_by": meta.created_by,
+                        "format_version": meta.format_version,
+                        "num_row_groups": meta.num_row_groups,
+                        "schema": {"names": meta.schema.names},
+                        "serialized_size": meta.serialized_size,
+                        "shape": (meta.num_rows, meta.num_columns),
+                    },
+                }
+            ),
+        },
+    )
+
+
+def preview_h5ad(url, compression, max_out_size):
+    with urlopen(url, compression=compression, seekable=True) as src:
+        with h5py.File(src, "r") as h5py_file:
+            adata = anndata.experimental.read_lazy(h5py_file)
+
+            # Get matrix dimensions to decide processing strategy
+            n_obs, n_vars = adata.shape
+
+            if meta_only := (n_obs * n_vars >= H5AD_META_ONLY_SIZE):
+                # For large files, skip intensive QC calculation that requires loading full matrix
+                logger.warning("Getting only meta for large matrix (%d x %d) to avoid OOM/timeout", n_obs, n_vars)
+
+                # Create empty dataframe
+                var_df = pandas.DataFrame(columns=list(adata.var.keys()))
+            else:
+                adata = anndata.read_h5ad(src)
+                # For smaller matrices, calculate full QC metrics using scanpy
+                sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
+                var_df = adata.var.copy()
+
+            # Add gene expression statistics from the QC metrics
+            # These columns are added by calculate_qc_metrics:
+            # - total_counts: total UMI counts for this gene across all cells
+            # - n_cells_by_counts: number of cells with non-zero counts for this gene
+            # - mean_counts: mean counts per cell for this gene
+            # - pct_dropout_by_counts: percentage of cells with zero counts
+
+            # Reset index to include gene IDs as a regular column
+            var_df_with_index = var_df.reset_index()
+            # XXX: doesn't that change the original column name?
+            var_df_with_index = var_df_with_index.rename(columns={"index": "gene_id"})
+
+            table = pyarrow.Table.from_pandas(var_df_with_index, preserve_index=False)
+            output_data, output_truncated = write_data_as_arrow(table, table.schema, max_out_size)
+
+            return (
+                200,
+                output_data,
+                {
+                    "Content-Type": "application/vnd.apache.arrow.file",
+                    "Content-Encoding": "gzip",
+                    QUILT_INFO_HEADER: json.dumps(
+                        {
+                            "truncated": output_truncated,
+                            "meta_only": meta_only,
+                            # H5AD-specific metadata format
+                            "meta": {
+                                "schema": {"names": list(var_df_with_index.columns)},
+                                "h5ad_obs_keys": list(adata.obs.columns),
+                                "h5ad_var_keys": list(adata.var.columns),
+                                "h5ad_uns_keys": list(adata.uns.keys()),
+                                "h5ad_obsm_keys": list(adata.obsm.keys()),
+                                "h5ad_varm_keys": list(adata.varm.keys()),
+                                "h5ad_layers_keys": list(adata.layers.keys()),
+                                "anndata_version": getattr(adata, "__version__", None),
+                                "n_cells": adata.n_obs,
+                                "n_genes": adata.n_vars,
+                                "matrix_type": "sparse" if hasattr(adata.X, "nnz") else "dense",
+                                "has_raw": adata.raw is not None,
+                            },
+                        }
+                    ),
+                },
+            )


 handlers = {
@@ -240,36 +320,33 @@ def preview_parquet(url, compression, max_out_size):
     "excel": preview_excel,
     "parquet": preview_parquet,
     "jsonl": preview_jsonl,
+    "h5ad": preview_h5ad,
 }

 SCHEMA = {
     "type": "object",
     "properties": {
-        "url": {
-            "type": "string"
-        },
+        "url": {"type": "string"},
         "input": {
             "enum": list(handlers),
         },
-        "compression": {
-            "enum": ["gz", "bz2"]
-        },
+        "compression": {"enum": ["gz", "bz2"]},
         "size": {
             "enum": list(OUTPUT_SIZES),
         },
     },
     "required": ["url", "input"],
-    "additionalProperties": False
+    "additionalProperties": False,
 }


 def is_s3_url(url: str) -> bool:
     parsed_url = urlparse(url, allow_fragments=False)
     return (
-        parsed_url.scheme == "https" and
-        parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX) and
-        parsed_url.username is None and
-        parsed_url.password is None
+        parsed_url.scheme == "https"
+        and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX)
+        and parsed_url.username is None
+        and parsed_url.password is None
     )
@@ -282,9 +359,7 @@ def lambda_handler(request):
     compression = request.args.get("compression")

     if not is_s3_url(url):
-        return make_json_response(400, {
-            "title": "Invalid url=. Expected S3 virtual-host URL."
-        })
+        return make_json_response(400, {"title": "Invalid url=. Expected S3 virtual-host URL."})

     handler = handlers[input_type]
     return handler(url, compression, OUTPUT_SIZES[output_size])
```
Binary file (21.8 KB) not shown.
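On the consuming side, the new h5ad response decodes the same way the test below does: gunzip the body, read it as an Arrow IPC file, and parse the JSON info header. A hedged sketch (the helper name and the `body`/`headers` inputs are assumptions; the calls mirror the test):

```python
import gzip
import io
import json

import pyarrow

from t4_lambda_shared.decorator import QUILT_INFO_HEADER


def decode_h5ad_preview(body: bytes, headers: dict):
    """Hypothetical helper: body/headers as returned by handlers["h5ad"](...)."""
    info = json.loads(headers[QUILT_INFO_HEADER])  # truncated / meta_only / meta
    with pyarrow.ipc.open_file(io.BytesIO(gzip.decompress(body))) as reader:
        table = reader.read_all()  # per-gene var table with QC columns
    return info, table.to_pandas()
```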

lambdas/tabular_preview/tests/test_index.py

Lines changed: 67 additions & 0 deletions

```diff
@@ -98,6 +98,73 @@ def test_preview_simple(filename, handler_name):
     )


+@pytest.mark.parametrize(
+    "meta_only",
+    [False, True],
+)
+def test_preview_h5ad(mocker, meta_only):
+    if meta_only:
+        mocker.patch(
+            "t4_lambda_tabular_preview.H5AD_META_ONLY_SIZE",
+            0,  # Force providing only meta
+        )
+    calculate_qc_metrics_mock = mocker.patch("t4_lambda_tabular_preview.sc.pp.calculate_qc_metrics")
+
+    code, body, headers = t4_lambda_tabular_preview.handlers["h5ad"](
+        url=str(pathlib.Path(__file__).parent / "data" / "simple/test.h5ad"),
+        compression=None,
+        max_out_size=None,
+    )
+
+    assert code == 200
+    assert headers["Content-Type"] == "application/vnd.apache.arrow.file"
+    assert headers["Content-Encoding"] == "gzip"
+
+    # Parse the QUILT_INFO_HEADER to check metadata
+    info = json.loads(headers[QUILT_INFO_HEADER])
+    assert "truncated" in info
+    assert "meta" in info
+    assert info.get("meta_only") is meta_only
+    # Check H5AD-specific metadata format
+    assert "h5ad_obs_keys" in info["meta"]  # H5AD-specific fields
+    assert "h5ad_var_keys" in info["meta"]
+    # Check new H5AD-specific fields
+    assert info["meta"]["n_cells"] == 2  # 2 cells in test data
+    assert info["meta"]["n_genes"] == 2  # 2 genes in test data
+    assert "matrix_type" in info["meta"]  # sparse or dense
+    assert "has_raw" in info["meta"]  # boolean indicating raw data presence
+
+    # Check that the Arrow data can be read and contains expected content
+    with pyarrow.ipc.open_file(io.BytesIO(gzip.decompress(body))) as reader:
+        table = reader.read_all()
+
+    # Convert back to pandas to check content
+    df = table.to_pandas()
+
+    # Should have gene-level QC metrics instead of expression matrix
+    assert "gene_id" in df.columns
+    assert "highly_variable" in df.columns
+    if not meta_only:
+        assert "ENSG001" in df["gene_id"].values
+        assert "ENSG002" in df["gene_id"].values
+
+    # Should have QC metric columns added by scanpy
+    expected_qc_columns = ["total_counts", "n_cells_by_counts", "mean_counts", "pct_dropout_by_counts"]
+    if meta_only:
+        calculate_qc_metrics_mock.assert_not_called()
+        for col in expected_qc_columns:
+            assert col not in df.columns, f"Unexpected QC column {col} found in {df.columns.tolist()}"
+    else:
+        for col in expected_qc_columns:
+            assert col in df.columns, f"Expected QC column {col} not found in {df.columns.tolist()}"
+
+    # Check that we have the right number of genes (rows)
+    if meta_only:
+        assert len(df) == 0  # no tabular data
+    else:
+        assert len(df) == 2  # Should have 2 genes from our test data
+
+
 def test_preview_simple_parquet():
     data = (pathlib.Path(__file__).parent / "data" / "simple/test.parquet").read_bytes()
     with patch_urlopen(data) as urlopen_mock:
```
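The `meta_only=True` case works by patching `H5AD_META_ONLY_SIZE` down to 0; in production the gate is just the product of the matrix dimensions against the threshold. A quick sketch of that arithmetic with the default of 1,000,000 (the example shapes are illustrative):

```python
# Default threshold from the lambda: cells x genes >= 1_000_000 triggers
# the metadata-only path instead of the full scanpy QC pass.
H5AD_META_ONLY_SIZE = 1_000_000

for n_obs, n_vars in [(10_000, 50), (50_000, 2_000)]:
    meta_only = n_obs * n_vars >= H5AD_META_ONLY_SIZE
    print(f"{n_obs} x {n_vars}: meta_only={meta_only}")
# 10000 x 50   -> meta_only=False (500,000 cells*genes: full QC metrics)
# 50000 x 2000 -> meta_only=True  (100,000,000 cells*genes: metadata only)
```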
