|
2 | 2 | from pathlib import Path
|
3 | 3 | from typing import TYPE_CHECKING, Literal, get_args
|
4 | 4 |
|
| 5 | +import h5py |
5 | 6 | import numpy as np
|
6 | 7 | import pandas as pd
|
7 | 8 | import polars as pl
|
8 | 9 | from anndata import AnnData
|
9 |
| -from scipy.sparse import csr_matrix |
| 10 | +from scipy.sparse import csc_array, csr_matrix |
10 | 11 |
|
11 | 12 | from .._typealias import _PathLike
|
12 | 13 | from .._utils import _get_coordinate_index, _raise_module_load_error, validate_threads
|
@@ -271,6 +272,123 @@ def read_Xenium(
|
271 | 272 | )
|
272 | 273 |
|
273 | 274 |
|
| 275 | +# VisiumHD |
| 276 | + |
| 277 | + |
| 278 | +@validate_threads |
| 279 | +def read_VisiumHD( |
| 280 | + path: _PathLike, |
| 281 | + *, |
| 282 | + gene_name: bool = False, |
| 283 | + raw: bool = False, |
| 284 | + n_threads: int | None = None, |
| 285 | +) -> LazyKDE: |
| 286 | + """ |
| 287 | + Read a VisiumHD sample. |
| 288 | +
|
| 289 | + Parameters |
| 290 | + ---------- |
| 291 | + path : os.PathLike or str |
| 292 | + Path to the 2um bin directory of the sample. The filtered or raw h5 file and the |
| 293 | + spatial/tissue_positions.parquet file are required. |
| 294 | + gene_name : bool, optional |
| 295 | + If True will use the 'name' column otherwise the 'id' column is used as gene |
| 296 | + identifier. |
| 297 | + raw : bool, optional |
| 298 | + If `False` the filtered barcode-feature matrix will be used. If `True` the raw |
| 299 | + matrix is loaded. |
| 300 | + n_threads : int | None, optional |
| 301 | + Number of threads used for reading file and processing. If `None` or 0 this will |
| 302 | + default to the number of available CPUs. |
| 303 | +
|
| 304 | + Returns |
| 305 | + ------- |
| 306 | + sainsc.LazyKDE |
| 307 | +
|
| 308 | + Raises |
| 309 | + ------ |
| 310 | + FileNotFoundError |
| 311 | + If either the h5-file or the barcode position file are not found at their |
| 312 | + default path. |
| 313 | + """ |
| 314 | + |
| 315 | + path = Path(path) |
| 316 | + matrix_file = path / ( |
| 317 | + "raw_feature_bc_matrix.h5" if raw else "filtered_feature_bc_matrix.h5" |
| 318 | + ) |
| 319 | + position_file = path / "spatial" / "tissue_positions.parquet" |
| 320 | + |
| 321 | + if not matrix_file.is_file(): |
| 322 | + raise FileNotFoundError(f"{matrix_file} does not exist") |
| 323 | + if not position_file.is_file(): |
| 324 | + raise FileNotFoundError(f"{position_file} does not exist") |
| 325 | + |
| 326 | + gene_col = "name" if gene_name else "id" |
| 327 | + |
| 328 | + barcode_position = pl.scan_parquet(position_file).rename( |
| 329 | + {"array_row": "y", "array_col": "x"} |
| 330 | + ) |
| 331 | + |
| 332 | + with h5py.File(matrix_file, "r") as h5file: |
| 333 | + matrix_group = h5file["matrix"] |
| 334 | + assert isinstance(matrix_group, h5py.Group) |
| 335 | + |
| 336 | + barcodes = ( |
| 337 | + pl.LazyFrame( |
| 338 | + {"barcode": matrix_group["barcodes"][:]}, schema={"barcode": pl.String} |
| 339 | + ) |
| 340 | + .join(barcode_position, how="left", on="barcode", validate="1:1") |
| 341 | + .select(["x", "y"]) |
| 342 | + .collect() |
| 343 | + ) |
| 344 | + |
| 345 | + feature_group = matrix_group["features"] |
| 346 | + features = pl.DataFrame( |
| 347 | + { |
| 348 | + "gene": feature_group[gene_col][:], |
| 349 | + "feature_type": feature_group["feature_type"][:], |
| 350 | + }, |
| 351 | + schema={"gene": pl.String, "feature_type": pl.String}, |
| 352 | + ).with_columns(pl.col("feature_type").cast(pl.Categorical)) |
| 353 | + |
| 354 | + gene_expression = csc_array( |
| 355 | + ( |
| 356 | + matrix_group["data"][:], |
| 357 | + matrix_group["indices"][:], |
| 358 | + matrix_group["indptr"][:], |
| 359 | + ), |
| 360 | + shape=matrix_group["shape"][:], |
| 361 | + ) |
| 362 | + |
| 363 | + # only keep "Gene Expression" features not "Antibody Capture" |
| 364 | + keep_features = features["feature_type"] == "Gene Expression" |
| 365 | + features = features.filter(keep_features) |
| 366 | + gene_expression = gene_expression.tocsr() |
| 367 | + gene_expression = gene_expression[keep_features, :] |
| 368 | + |
| 369 | + gene_expression = gene_expression.tocoo() |
| 370 | + |
| 371 | + schema = { |
| 372 | + "gene": pl.Categorical, |
| 373 | + "x": barcodes["x"].dtype, |
| 374 | + "y": barcodes["y"].dtype, |
| 375 | + "count": pl.Int32, |
| 376 | + } |
| 377 | + |
| 378 | + coordinates = barcodes[gene_expression.col] |
| 379 | + |
| 380 | + df = pl.DataFrame( |
| 381 | + { |
| 382 | + "gene": features["gene"][gene_expression.row], |
| 383 | + "x": coordinates["x"], |
| 384 | + "y": coordinates["y"], |
| 385 | + "count": gene_expression.data, |
| 386 | + }, |
| 387 | + schema=schema, |
| 388 | + ) |
| 389 | + return LazyKDE.from_dataframe(df, resolution=2_000, n_threads=n_threads) |
| 390 | + |
| 391 | + |
274 | 392 | # Vizgen
|
275 | 393 |
|
276 | 394 | _VIZGEN_COLUMNS = {"gene": "gene", "global_x": "x", "global_y": "y"}
|
|
0 commit comments