|
| 1 | +import warnings |
| 2 | +from collections.abc import Sequence |
1 | 3 | from pathlib import Path |
2 | | -from typing import Literal |
| 4 | +from typing import Any, Literal |
3 | 5 |
|
4 | 6 | import h5py |
| 7 | +import numpy as np |
| 8 | +import numpy.typing as npt |
| 9 | +import pandas as pd |
5 | 10 | from anndata import AnnData |
6 | 11 | from anndata._io.h5ad import _clean_uns, _read_raw, read_dataframe, read_elem |
7 | 12 |
|
8 | | -from scportrait.pipeline._utils.constants import DEFAULT_NAME_SINGLE_CELL_IMAGES, IMAGE_DATACONTAINER_NAME |
| 13 | +from scportrait.pipeline._utils.constants import ( |
| 14 | + DEFAULT_CELL_ID_NAME, |
| 15 | + DEFAULT_NAME_SINGLE_CELL_IMAGES, |
| 16 | + DEFAULT_SEGMENTATION_DTYPE, |
| 17 | + DEFAULT_SINGLE_CELL_IMAGE_DTYPE, |
| 18 | + IMAGE_DATACONTAINER_NAME, |
| 19 | +) |
9 | 20 |
|
10 | 21 |
|
11 | 22 | def read_h5sc(filename: str | Path) -> AnnData: |
@@ -45,3 +56,220 @@ def read_h5sc(filename: str | Path) -> AnnData: |
45 | 56 |
|
46 | 57 | adata.obsm[DEFAULT_NAME_SINGLE_CELL_IMAGES] = f.get(IMAGE_DATACONTAINER_NAME) |
47 | 58 | return adata |
| 59 | + |
| 60 | + |
| 61 | +def numpy_to_h5sc( |
| 62 | + mask_names: Sequence[str], |
| 63 | + channel_names: Sequence[str], |
| 64 | + mask_imgs: npt.NDArray, |
| 65 | + channel_imgs: npt.NDArray, |
| 66 | + output_path: str | Path, |
| 67 | + cell_ids: npt.NDArray[np.integer[Any]], |
| 68 | + cell_metadata: pd.DataFrame | None = None, |
| 69 | + image_dtype=DEFAULT_SINGLE_CELL_IMAGE_DTYPE, |
| 70 | + compression_type: Literal["gzip", "lzf"] = "gzip", |
| 71 | +) -> None: |
| 72 | + """Create and write an scPortrait-style `.h5sc` file from NumPy arrays of single-cell |
| 73 | + masks and image channels, with optional per-cell metadata. |
| 74 | +
|
| 75 | + This function builds a valid AnnData-backed HDF5 container following the scPortrait |
| 76 | + “H5SC” convention. Internally, the file is a standard AnnData `.h5ad` structure whose |
| 77 | + filename ends in `.h5sc`, and which contains a 4D image tensor stored at: |
| 78 | +
|
| 79 | + /obsm/single_cell_images |
| 80 | +
|
| 81 | + with shape: |
| 82 | +
|
| 83 | + (N, C, H, W) |
| 84 | +
|
| 85 | + where: |
| 86 | + N = number of cells |
| 87 | + C = n_masks + n_image_channels |
| 88 | + H = image height |
| 89 | + W = image width |
| 90 | +
|
| 91 | + The mask channels are stored first, followed by the image channels. All data are |
| 92 | + written as a single float16 HDF5 dataset, with mask values encoded as 0.0 and 1.0. |
| 93 | +
|
| 94 | + Cell identifiers and optional per-cell metadata are written to `adata.obs`. |
| 95 | +
|
| 96 | + Metadata are written redundantly: |
| 97 | + - At the AnnData level in `adata.uns[...]` |
| 98 | + - At the HDF5 level as attributes on `/obsm/single_cell_images` |
| 99 | +
|
| 100 | + This allows the file to be read both via AnnData and as a standalone HDF5 image |
| 101 | + container. |
| 102 | +
|
| 103 | + Args: |
| 104 | + mask_names: Names of the mask channels. Length must match `mask_imgs.shape[1]`. |
| 105 | + channel_names: Names of the image channels. Length must match `channel_imgs.shape[1]`. |
| 106 | + mask_imgs: Array of mask images with shape `(N, n_masks, H, W)`. |
| 107 | + Masks are expected to be binary (0 or 1) and will be stored as float16. |
| 108 | + channel_imgs: Array of image channels with shape `(N, n_image_channels, H, W)`. |
| 109 | + Images should already be normalized (e.g., to [0, 1]) before writing. |
| 110 | + output_path: Path of the `.h5sc` file to create, e.g. `"/path/to/file.h5sc"`. |
| 111 | + The file will be overwritten if it already exists. |
| 112 | + cell_ids: Array of segmentation cell identifiers with shape `(N,)`. |
| 113 | + These values are written into `adata.obs[DEFAULT_CELL_ID_NAME]` and define the |
| 114 | + mapping between row index and original segmentation label. |
| 115 | + cell_metadata: Optional per-cell metadata to be written into `adata.obs`. |
| 116 | + Must have exactly `N` rows. Columns will be merged into `obs` alongside the |
| 117 | + cell ID column. The index is ignored and replaced by AnnData’s internal index. |
| 118 | + compression_type: HDF5 compression algorithm used for the image tensor. |
| 119 | + - "gzip": better compression, slower I/O |
| 120 | + - "lzf" : faster I/O, lower compression ratio |
| 121 | +
|
| 122 | + File layout created: |
| 123 | + The resulting file contains: |
| 124 | +
|
| 125 | + /obs |
| 126 | + Per-cell metadata including cell IDs and optional user-provided metadata. |
| 127 | + /var |
| 128 | + Channel metadata (channel names and channel mapping). |
| 129 | + /uns |
| 130 | + scPortrait metadata describing the image container. |
| 131 | + /obsm/single_cell_images |
| 132 | + HDF5 dataset with shape (N, C, H, W), dtype float16, chunked as |
| 133 | + (1, 1, H, W), compressed. |
| 134 | +
|
| 135 | + Notes: |
| 136 | + - The file is technically an AnnData `.h5ad` file with a `.h5sc` extension. |
| 137 | + - Masks and image channels share a single dataset and dtype (`float16`). |
| 138 | + - The function performs a single-threaded write; no file locking is used. |
| 139 | + - All input arrays are cast to the storage dtype before writing. |
| 140 | +
|
| 141 | + Warnings: |
| 142 | + UserWarning: If `mask_imgs` or `channel_imgs` contain values outside [0, 1]. |
| 143 | + Mask images or channel images are outside the expected [0, 1] range. This does not align with |
| 144 | + scPortrait's convention and unscaled data can produce unexpected results in downstream |
| 145 | + functions or require additional preprocessing before passing images to deep learning models. |
| 146 | +
|
| 147 | + Raises: |
| 148 | + Exception: If: |
| 149 | + - `mask_imgs` or `channel_imgs` do not have 4 dimensions `(N, C, H, W)`, |
| 150 | + - `mask_imgs` and `channel_imgs` have different numbers of cells, |
| 151 | + - `mask_imgs` and `channel_imgs` have different image sizes, |
| 152 | + - the number of provided channel names does not match the array shapes, |
| 153 | + - `cell_metadata` does not have `N` rows, |
| 154 | + - an unsupported compression type is requested. |
| 155 | + """ |
| 156 | + if mask_imgs.ndim != 4 or channel_imgs.ndim != 4: |
| 157 | + raise Exception("mask_imgs and channel_imgs must have shape (N, C, H, W) with exactly 4 dimensions.") |
| 158 | + if mask_imgs.shape[0] != channel_imgs.shape[0]: |
| 159 | + raise Exception( |
| 160 | + "mask_imgs and channel_imgs do not contain the same number of cells. The expected shape is (N, C, H, W)." |
| 161 | + ) |
| 162 | + if mask_imgs.shape[2:4] != channel_imgs.shape[2:4]: |
| 163 | + raise Exception( |
| 164 | + "mask_imgs and channel_imgs do not contain the same image size. The expected shape is (N, C, H, W)." |
| 165 | + ) |
| 166 | + # check mask_names and channel_names fit to imgs shape-wise |
| 167 | + if len(mask_names) != mask_imgs.shape[1]: |
| 168 | + raise Exception( |
| 169 | + "mask_names needs to match mask_imgs.shape[1]. You need to pass the same number of masks and labels." |
| 170 | + ) |
| 171 | + if len(channel_names) != channel_imgs.shape[1]: |
| 172 | + raise Exception( |
| 173 | + "channel_names needs to match channel_imgs.shape[1]. You need to pass the same number of image channels and labels." |
| 174 | + ) |
| 175 | + if compression_type not in ["gzip", "lzf"]: |
| 176 | + raise Exception("Compression needs to be lzf or gzip.") |
| 177 | + |
| 178 | + # prepare metadata |
| 179 | + channels = np.concatenate([mask_names, channel_names]) |
| 180 | + num_cells = channel_imgs.shape[0] |
| 181 | + img_size = channel_imgs.shape[2:4] |
| 182 | + cell_ids = cell_ids.astype(DEFAULT_SEGMENTATION_DTYPE, copy=False) |
| 183 | + channel_mapping = ["mask" for x in mask_names] + ["image_channel" for x in channel_names] |
| 184 | + |
| 185 | + # prepare images |
| 186 | + if mask_imgs.min() < 0 or mask_imgs.max() > 1: |
| 187 | + warnings.warn( |
| 188 | + "Mask images are outside the expected [0, 1] range. This does not align with " |
| 189 | + "scPortrait's convention and unscaled data can produce unexpected results in downstream " |
| 190 | + "functions or require additional preprocessing before passing images to " |
| 191 | + "deep learning models.", |
| 192 | + UserWarning, |
| 193 | + stacklevel=2, |
| 194 | + ) |
| 195 | + |
| 196 | + if channel_imgs.min() < 0 or channel_imgs.max() > 1: |
| 197 | + warnings.warn( |
| 198 | + " Image channels are outside the expected [0, 1] range. This does not align with" |
| 199 | + "scPortrait's convention and unscaled data can produce unexpected results in downstream " |
| 200 | + "functions or require additional preprocessing before passing images to " |
| 201 | + "deep learning models.", |
| 202 | + UserWarning, |
| 203 | + stacklevel=2, |
| 204 | + ) |
| 205 | + all_imgs = np.concatenate([mask_imgs, channel_imgs], axis=1) |
| 206 | + all_imgs = all_imgs.astype(image_dtype, copy=False) |
| 207 | + |
| 208 | + # create var object with channel names and their mapping to mask or image channels |
| 209 | + vars = pd.DataFrame(index=np.arange(len(channels)).astype("str")) |
| 210 | + vars["channels"] = channels |
| 211 | + vars["channel_mapping"] = channel_mapping |
| 212 | + |
| 213 | + obs = pd.DataFrame({DEFAULT_CELL_ID_NAME: cell_ids}) |
| 214 | + obs.index = obs.index.values.astype("str") |
| 215 | + if cell_metadata is not None: |
| 216 | + if len(cell_metadata) != num_cells: |
| 217 | + raise Exception( |
| 218 | + f"cell_metadata must have {num_cells} rows to match the number of cells, got {len(cell_metadata)}." |
| 219 | + ) |
| 220 | + for col in cell_metadata.columns: |
| 221 | + obs[col] = cell_metadata[col].values |
| 222 | + |
| 223 | + # create anndata object |
| 224 | + adata = AnnData(obs=obs, var=vars) |
| 225 | + |
| 226 | + # add additional metadata to `uns` |
| 227 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/n_cells"] = num_cells |
| 228 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/n_channels"] = len(channels) |
| 229 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/n_masks"] = mask_imgs.shape[1] |
| 230 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/n_image_channels"] = channel_imgs.shape[1] |
| 231 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/image_size_x"] = img_size[0] |
| 232 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/image_size_y"] = img_size[1] |
| 233 | + # adata.uns[f"{self.DEFAULT_NAME_SINGLE_CELL_IMAGES}/normalization"] = self.normalization |
| 234 | + # adata.uns[f"{self.DEFAULT_NAME_SINGLE_CELL_IMAGES}/normalization_range_lower"] = self.normalization_range[0] |
| 235 | + # adata.uns[f"{self.DEFAULT_NAME_SINGLE_CELL_IMAGES}/normalization_range_upper"] = self.normalization_range[1] |
| 236 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/channel_names"] = channels |
| 237 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/channel_mapping"] = np.array(channel_mapping, dtype="<U15") |
| 238 | + adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/compression"] = compression_type |
| 239 | + |
| 240 | + # write to file |
| 241 | + adata.write(output_path) |
| 242 | + |
| 243 | + # add an empty HDF5 dataset to the obsm group of the anndata object |
| 244 | + with h5py.File(output_path, "a") as hf: |
| 245 | + hf.create_dataset( |
| 246 | + IMAGE_DATACONTAINER_NAME, |
| 247 | + shape=all_imgs.shape, |
| 248 | + chunks=(1, 1, img_size[0], img_size[1]), |
| 249 | + compression=compression_type, |
| 250 | + dtype=image_dtype, |
| 251 | + ) |
| 252 | + |
| 253 | + # add required metadata from anndata package |
| 254 | + hf[IMAGE_DATACONTAINER_NAME].attrs["encoding-type"] = "array" |
| 255 | + hf[IMAGE_DATACONTAINER_NAME].attrs["encoding-version"] = "0.2.0" |
| 256 | + |
| 257 | + # add relevant metadata to the single-cell image container |
| 258 | + hf[IMAGE_DATACONTAINER_NAME].attrs["n_cells"] = num_cells |
| 259 | + hf[IMAGE_DATACONTAINER_NAME].attrs["n_channels"] = len(channels) |
| 260 | + hf[IMAGE_DATACONTAINER_NAME].attrs["n_masks"] = mask_imgs.shape[1] |
| 261 | + hf[IMAGE_DATACONTAINER_NAME].attrs["n_image_channels"] = channel_imgs.shape[1] |
| 262 | + hf[IMAGE_DATACONTAINER_NAME].attrs["image_size_x"] = img_size[0] |
| 263 | + hf[IMAGE_DATACONTAINER_NAME].attrs["image_size_y"] = img_size[1] |
| 264 | + # hf[IMAGE_DATACONTAINER_NAME].attrs["normalization"] = self.normalization |
| 265 | + # hf[IMAGE_DATACONTAINER_NAME].attrs["normalization_range"] = self.normalization_range |
| 266 | + hf[IMAGE_DATACONTAINER_NAME].attrs["channel_names"] = np.array([x.encode("utf-8") for x in channels]) |
| 267 | + mapping_values = ["mask" for x in mask_names] + ["image_channel" for x in channel_names] |
| 268 | + hf[IMAGE_DATACONTAINER_NAME].attrs["channel_mapping"] = np.array([x.encode("utf-8") for x in mapping_values]) |
| 269 | + hf[IMAGE_DATACONTAINER_NAME].attrs["compression"] = compression_type |
| 270 | + |
| 271 | + # Write images to .h5sc file, single thread |
| 272 | + single_cell_data_container: h5py.Dataset = hf[IMAGE_DATACONTAINER_NAME] |
| 273 | + |
| 274 | + for save_index, img in enumerate(all_imgs): |
| 275 | + single_cell_data_container[save_index] = img |
0 commit comments