-
Notifications
You must be signed in to change notification settings - Fork 7
Added numpy_to_h5sc function #353
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
04046ef
1b4280b
0b1ba29
814281a
32d0bcc
ab5be07
8c1eb74
90d85d3
99cf95b
7a3e6ed
a1f5dce
a2573a5
54b0bce
6817966
6543925
23dfc1d
2197d8c
a5e274f
832ddad
d4e4230
aa9e3c3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,10 +2,18 @@ | |
| from typing import Literal | ||
|
|
||
| import h5py | ||
| import numpy as np | ||
| import pandas as pd | ||
| from anndata import AnnData | ||
| from anndata._io.h5ad import _clean_uns, _read_raw, read_dataframe, read_elem | ||
|
|
||
| from scportrait.pipeline._utils.constants import DEFAULT_NAME_SINGLE_CELL_IMAGES, IMAGE_DATACONTAINER_NAME | ||
| from scportrait.pipeline._utils.constants import ( | ||
| DEFAULT_CELL_ID_NAME, | ||
| DEFAULT_NAME_SINGLE_CELL_IMAGES, | ||
| DEFAULT_SEGMENTATION_DTYPE, | ||
| DEFAULT_SINGLE_CELL_IMAGE_DTYPE, | ||
| IMAGE_DATACONTAINER_NAME, | ||
| ) | ||
|
|
||
|
|
||
| def read_h5sc(filename: str | Path) -> AnnData: | ||
|
|
@@ -45,3 +53,200 @@ def read_h5sc(filename: str | Path) -> AnnData: | |
|
|
||
| adata.obsm[DEFAULT_NAME_SINGLE_CELL_IMAGES] = f.get(IMAGE_DATACONTAINER_NAME) | ||
| return adata | ||
|
|
||
|
|
||
def numpy_to_h5sc(
    mask_names: list[str] | np.ndarray,
    channel_names: list[str] | np.ndarray,
    mask_imgs: np.ndarray,
    channel_imgs: np.ndarray,
    output_path: str | Path,
    cell_ids: np.ndarray,
    cell_metadata: pd.DataFrame | None = None,
    compression_type: Literal["gzip", "lzf"] = "gzip",
) -> None:
    """Write single-cell masks and image channels to an scPortrait-style `.h5sc` file.

    The file is written in two steps: an AnnData object holding `obs`, `var` and
    `uns` is written with `AnnData.write`, and the image tensor is then added to
    the same HDF5 file as the dataset `IMAGE_DATACONTAINER_NAME`.

    The image tensor has shape `(N, C, H, W)` where

        N = number of cells
        C = n_masks + n_image_channels
        H = image height
        W = image width

    Mask channels are stored first, followed by the image channels. Masks and
    images share one dataset and are both cast to
    `DEFAULT_SINGLE_CELL_IMAGE_DTYPE` before writing.

    Metadata are written redundantly:
        - at the AnnData level in `adata.uns[...]`
        - at the HDF5 level as attributes on the image dataset

    This allows the file to be read both via AnnData and as a standalone HDF5
    image container.

    Parameters
    ----------
    mask_names
        Names of the mask channels. Length must match `mask_imgs.shape[1]`.
    channel_names
        Names of the image channels. Length must match `channel_imgs.shape[1]`.
    mask_imgs
        Array of mask images with shape `(N, n_masks, H, W)`. Masks are expected
        to be binary (0 or 1).
    channel_imgs
        Array of image channels with shape `(N, n_image_channels, H, W)`.
        Images should already be normalized (e.g. to [0, 1]) before writing.
    output_path
        Path of the `.h5sc` file to create, e.g. `"/path/to/file.h5sc"`.
        An existing file at this location is overwritten.
    cell_ids
        Segmentation cell identifiers with shape `(N,)`. They are cast to
        `DEFAULT_SEGMENTATION_DTYPE` and written to
        `adata.obs[DEFAULT_CELL_ID_NAME]`, defining the mapping between row
        index and original segmentation label.
    cell_metadata
        Optional per-cell metadata with exactly `N` rows. Every column is copied
        into `adata.obs` by position; the index of the DataFrame is ignored.
    compression_type
        HDF5 compression filter used for the image tensor.
            - "gzip": better compression, slower I/O
            - "lzf" : faster I/O, lower compression ratio

    File layout created
    -------------------
    /obs
        Cell IDs and optional user-provided per-cell metadata.
    /var
        Channel metadata (`channels` and `channel_mapping` columns).
    /uns
        Metadata describing the image container.
    IMAGE_DATACONTAINER_NAME
        HDF5 dataset with shape (N, C, H, W), chunked as (1, 1, H, W) and
        compressed with `compression_type`.

    Notes
    -----
    - The file is technically an AnnData `.h5ad` file with a `.h5sc` extension.
    - The write is single-threaded and no file locking is used.
    - Normalization settings are not recorded; the caller is expected to have
      normalized the images beforehand.

    Raises
    ------
    ValueError
        If
            - `mask_imgs` or `channel_imgs` is not a 4D array,
            - `mask_imgs` and `channel_imgs` have different numbers of cells or
              different image sizes,
            - the number of provided names does not match the array shapes,
            - `cell_ids` does not have shape `(N,)`,
            - `cell_metadata` does not have `N` rows,
            - an unsupported compression type is requested.
    """
    mask_imgs = np.asarray(mask_imgs)
    channel_imgs = np.asarray(channel_imgs)
    cell_ids = np.asarray(cell_ids)

    # --- validate inputs before anything is written to disk -----------------
    if mask_imgs.ndim != 4 or channel_imgs.ndim != 4:
        raise ValueError("mask_imgs and channel_imgs must both be 4D arrays. The expected shape is (N, C, H, W).")
    if mask_imgs.shape[0] != channel_imgs.shape[0]:
        raise ValueError(
            "mask_imgs and channel_imgs do not contain the same number of cells. The expected shape is (N, C, H, W)."
        )
    if len(mask_names) != mask_imgs.shape[1]:
        raise ValueError(
            "mask_names needs to match mask_imgs.shape[1]. You need to pass the same number of masks and labels."
        )
    if len(channel_names) != channel_imgs.shape[1]:
        raise ValueError(
            "channel_names needs to match channel_imgs.shape[1]. "
            "You need to pass the same number of image channels and labels."
        )
    if compression_type not in ("gzip", "lzf"):
        raise ValueError("Compression needs to be lzf or gzip.")
    if mask_imgs.shape[2:] != channel_imgs.shape[2:]:
        raise ValueError(
            f"mask_imgs and channel_imgs must share the same image size, got {mask_imgs.shape[2:]} "
            f"and {channel_imgs.shape[2:]}."
        )

    num_cells, n_image_channels = channel_imgs.shape[:2]
    n_masks = mask_imgs.shape[1]
    size_x, size_y = channel_imgs.shape[2:4]

    if cell_ids.shape != (num_cells,):
        raise ValueError(f"cell_ids needs to have shape ({num_cells},) but has shape {cell_ids.shape}.")
    if cell_metadata is not None and len(cell_metadata) != num_cells:
        raise ValueError(f"cell_metadata needs to have {num_cells} rows but has {len(cell_metadata)}.")

    # --- assemble the data ---------------------------------------------------
    # Masks come first, then image channels; names and mapping follow that order.
    channels = np.asarray([*mask_names, *channel_names], dtype=str)
    channel_mapping = ["mask"] * n_masks + ["image_channel"] * n_image_channels

    cell_ids = cell_ids.astype(DEFAULT_SEGMENTATION_DTYPE, copy=False)
    all_imgs = np.concatenate([mask_imgs, channel_imgs], axis=1)
    all_imgs = all_imgs.astype(DEFAULT_SINGLE_CELL_IMAGE_DTYPE, copy=False)

    # `var` describes each channel and whether it is a mask or an image channel.
    var = pd.DataFrame(index=np.arange(len(channels)).astype("str"))
    var["channels"] = channels
    var["channel_mapping"] = channel_mapping

    obs = pd.DataFrame({DEFAULT_CELL_ID_NAME: cell_ids})
    obs.index = obs.index.values.astype("str")
    if cell_metadata is not None:
        for col in cell_metadata.columns:
            # `.values` copies by position so the metadata index is ignored.
            obs[col] = cell_metadata[col].values

    adata = AnnData(obs=obs, var=var)

    # Scalar metadata stored identically in `uns` and as HDF5 attributes.
    container_metadata = {
        "n_cells": num_cells,
        "n_channels": len(channels),
        "n_masks": n_masks,
        "n_image_channels": n_image_channels,
        "image_size_x": size_x,
        "image_size_y": size_y,
        "compression": compression_type,
    }
    for key, value in container_metadata.items():
        adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/{key}"] = value
    adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/channel_names"] = channels
    adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/channel_mapping"] = np.array(channel_mapping, dtype="<U15")

    # --- write to disk -------------------------------------------------------
    adata.write(output_path)

    # Append the image tensor to the file AnnData just wrote. Creating the
    # dataset with `data=` writes the images in the same call.
    with h5py.File(output_path, "a") as hf:
        container = hf.create_dataset(
            IMAGE_DATACONTAINER_NAME,
            data=all_imgs,
            chunks=(1, 1, size_x, size_y),
            compression=compression_type,
            dtype=DEFAULT_SINGLE_CELL_IMAGE_DTYPE,
        )

        # Encoding attributes required by the anndata package for array elements.
        container.attrs["encoding-type"] = "array"
        container.attrs["encoding-version"] = "0.2.0"

        # Mirror the metadata on the dataset so it is usable without AnnData.
        for key, value in container_metadata.items():
            container.attrs[key] = value
        container.attrs["channel_names"] = np.array([name.encode("utf-8") for name in channels])
        container.attrs["channel_mapping"] = np.array([kind.encode("utf-8") for kind in channel_mapping])
sophiamaedler marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
Uh oh!
There was an error while loading. Please reload this page.