Skip to content

Commit 40830e2

Browse files
Müller-Bötticherniklasmueboe
authored andcommitted
add VisiumHD IO support
1 parent a5b2fc6 commit 40830e2

File tree

3 files changed

+122
-1
lines changed

3 files changed

+122
-1
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ authors = [
1616
]
1717
dependencies = [
1818
"anndata>=0.9",
19+
"h5py>=3",
1920
"matplotlib",
2021
"matplotlib-scalebar",
2122
"numba>=0.44",

sainsc/io/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
read_gem_header,
1111
read_StereoSeq,
1212
read_StereoSeq_bins,
13+
read_VisiumHD,
1314
read_Vizgen,
1415
read_Xenium,
1516
)
@@ -21,6 +22,7 @@
2122
"read_gem_header",
2223
"read_StereoSeq",
2324
"read_StereoSeq_bins",
25+
"read_VisiumHD",
2426
"read_Vizgen",
2527
"read_Xenium",
2628
]

sainsc/io/_io.py

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
from pathlib import Path
33
from typing import TYPE_CHECKING, Literal, get_args
44

5+
import h5py
56
import numpy as np
67
import pandas as pd
78
import polars as pl
89
from anndata import AnnData
9-
from scipy.sparse import csr_matrix
10+
from scipy.sparse import csc_array, csr_matrix
1011

1112
from .._typealias import _PathLike
1213
from .._utils import _get_coordinate_index, _raise_module_load_error, validate_threads
@@ -271,6 +272,123 @@ def read_Xenium(
271272
)
272273

273274

275+
# VisiumHD
276+
277+
278+
@validate_threads
279+
def read_VisiumHD(
280+
path: _PathLike,
281+
*,
282+
gene_name: bool = False,
283+
raw: bool = False,
284+
n_threads: int | None = None,
285+
) -> LazyKDE:
286+
"""
287+
Read a VisiumHD sample.
288+
289+
Parameters
290+
----------
291+
path : os.PathLike or str
292+
Path to the 2um bin directory of the sample. The filtered or raw h5 file and the
293+
spatial/tissue_positions.parquet file are required.
294+
gene_name : bool, optional
295+
If True will use the 'name' column otherwise the 'id' column is used as gene
296+
identifier.
297+
raw : bool, optional
298+
If `False` the filtered barcode-feature matrix will be used. If `True` the raw
299+
matrix is loaded.
300+
n_threads : int | None, optional
301+
Number of threads used for reading file and processing. If `None` or 0 this will
302+
default to the number of available CPUs.
303+
304+
Returns
305+
-------
306+
sainsc.LazyKDE
307+
308+
Raises
309+
------
310+
FileNotFoundError
311+
If either the h5-file or the barcode position file are not found at their
312+
default path.
313+
"""
314+
315+
path = Path(path)
316+
matrix_file = path / (
317+
"raw_feature_bc_matrix.h5" if raw else "filtered_feature_bc_matrix.h5"
318+
)
319+
position_file = path / "spatial" / "tissue_positions.parquet"
320+
321+
if not matrix_file.is_file():
322+
raise FileNotFoundError(f"{matrix_file} does not exist")
323+
if not position_file.is_file():
324+
raise FileNotFoundError(f"{position_file} does not exist")
325+
326+
gene_col = "name" if gene_name else "id"
327+
328+
barcode_position = pl.scan_parquet(position_file).rename(
329+
{"array_row": "y", "array_col": "x"}
330+
)
331+
332+
with h5py.File(matrix_file, "r") as h5file:
333+
matrix_group = h5file["matrix"]
334+
assert isinstance(matrix_group, h5py.Group)
335+
336+
barcodes = (
337+
pl.LazyFrame(
338+
{"barcode": matrix_group["barcodes"][:]}, schema={"barcode": pl.String}
339+
)
340+
.join(barcode_position, how="left", on="barcode", validate="1:1")
341+
.select(["x", "y"])
342+
.collect()
343+
)
344+
345+
feature_group = matrix_group["features"]
346+
features = pl.DataFrame(
347+
{
348+
"gene": feature_group[gene_col][:],
349+
"feature_type": feature_group["feature_type"][:],
350+
},
351+
schema={"gene": pl.String, "feature_type": pl.String},
352+
).with_columns(pl.col("feature_type").cast(pl.Categorical))
353+
354+
gene_expression = csc_array(
355+
(
356+
matrix_group["data"][:],
357+
matrix_group["indices"][:],
358+
matrix_group["indptr"][:],
359+
),
360+
shape=matrix_group["shape"][:],
361+
)
362+
363+
# only keep "Gene Expression" features not "Antibody Capture"
364+
keep_features = features["feature_type"] == "Gene Expression"
365+
features = features.filter(keep_features)
366+
gene_expression = gene_expression.tocsr()
367+
gene_expression = gene_expression[keep_features, :]
368+
369+
gene_expression = gene_expression.tocoo()
370+
371+
schema = {
372+
"gene": pl.Categorical,
373+
"x": barcodes["x"].dtype,
374+
"y": barcodes["y"].dtype,
375+
"count": pl.Int32,
376+
}
377+
378+
coordinates = barcodes[gene_expression.col]
379+
380+
df = pl.DataFrame(
381+
{
382+
"gene": features["gene"][gene_expression.row],
383+
"x": coordinates["x"],
384+
"y": coordinates["y"],
385+
"count": gene_expression.data,
386+
},
387+
schema=schema,
388+
)
389+
return LazyKDE.from_dataframe(df, resolution=2_000, n_threads=n_threads)
390+
391+
274392
# Vizgen
275393

276394
_VIZGEN_COLUMNS = {"gene": "gene", "global_x": "x", "global_y": "y"}

0 commit comments

Comments
 (0)