Skip to content

Commit 374ec85

Browse files
Merge pull request #353 from MannLabs/numpy_to_h5sc
Added numpy_to_h5sc function
2 parents a0449dd + aa9e3c3 commit 374ec85

File tree

3 files changed

+312
-2
lines changed

3 files changed

+312
-2
lines changed

src/scportrait/io/h5sc.py

Lines changed: 230 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,22 @@
1+
import warnings
2+
from collections.abc import Sequence
13
from pathlib import Path
2-
from typing import Literal
4+
from typing import Any, Literal
35

46
import h5py
7+
import numpy as np
8+
import numpy.typing as npt
9+
import pandas as pd
510
from anndata import AnnData
611
from anndata._io.h5ad import _clean_uns, _read_raw, read_dataframe, read_elem
712

8-
from scportrait.pipeline._utils.constants import DEFAULT_NAME_SINGLE_CELL_IMAGES, IMAGE_DATACONTAINER_NAME
13+
from scportrait.pipeline._utils.constants import (
14+
DEFAULT_CELL_ID_NAME,
15+
DEFAULT_NAME_SINGLE_CELL_IMAGES,
16+
DEFAULT_SEGMENTATION_DTYPE,
17+
DEFAULT_SINGLE_CELL_IMAGE_DTYPE,
18+
IMAGE_DATACONTAINER_NAME,
19+
)
920

1021

1122
def read_h5sc(filename: str | Path) -> AnnData:
@@ -45,3 +56,220 @@ def read_h5sc(filename: str | Path) -> AnnData:
4556

4657
adata.obsm[DEFAULT_NAME_SINGLE_CELL_IMAGES] = f.get(IMAGE_DATACONTAINER_NAME)
4758
return adata
59+
60+
61+
def numpy_to_h5sc(
62+
mask_names: Sequence[str],
63+
channel_names: Sequence[str],
64+
mask_imgs: npt.NDArray,
65+
channel_imgs: npt.NDArray,
66+
output_path: str | Path,
67+
cell_ids: npt.NDArray[np.integer[Any]],
68+
cell_metadata: pd.DataFrame | None = None,
69+
image_dtype=DEFAULT_SINGLE_CELL_IMAGE_DTYPE,
70+
compression_type: Literal["gzip", "lzf"] = "gzip",
71+
) -> None:
72+
"""Create and write an scPortrait-style `.h5sc` file from NumPy arrays of single-cell
73+
masks and image channels, with optional per-cell metadata.
74+
75+
This function builds a valid AnnData-backed HDF5 container following the scPortrait
76+
“H5SC” convention. Internally, the file is a standard AnnData `.h5ad` structure whose
77+
filename ends in `.h5sc`, and which contains a 4D image tensor stored at:
78+
79+
/obsm/single_cell_images
80+
81+
with shape:
82+
83+
(N, C, H, W)
84+
85+
where:
86+
N = number of cells
87+
C = n_masks + n_image_channels
88+
H = image height
89+
W = image width
90+
91+
The mask channels are stored first, followed by the image channels. All data are
92+
written as a single float16 HDF5 dataset, with mask values encoded as 0.0 and 1.0.
93+
94+
Cell identifiers and optional per-cell metadata are written to `adata.obs`.
95+
96+
Metadata are written redundantly:
97+
- At the AnnData level in `adata.uns[...]`
98+
- At the HDF5 level as attributes on `/obsm/single_cell_images`
99+
100+
This allows the file to be read both via AnnData and as a standalone HDF5 image
101+
container.
102+
103+
Args:
104+
mask_names: Names of the mask channels. Length must match `mask_imgs.shape[1]`.
105+
channel_names: Names of the image channels. Length must match `channel_imgs.shape[1]`.
106+
mask_imgs: Array of mask images with shape `(N, n_masks, H, W)`.
107+
Masks are expected to be binary (0 or 1) and will be stored as float16.
108+
channel_imgs: Array of image channels with shape `(N, n_image_channels, H, W)`.
109+
Images should already be normalized (e.g., to [0, 1]) before writing.
110+
output_path: Path of the `.h5sc` file to create, e.g. `"/path/to/file.h5sc"`.
111+
The file will be overwritten if it already exists.
112+
cell_ids: Array of segmentation cell identifiers with shape `(N,)`.
113+
These values are written into `adata.obs[DEFAULT_CELL_ID_NAME]` and define the
114+
mapping between row index and original segmentation label.
115+
cell_metadata: Optional per-cell metadata to be written into `adata.obs`.
116+
Must have exactly `N` rows. Columns will be merged into `obs` alongside the
117+
cell ID column. The index is ignored and replaced by AnnData’s internal index.
118+
compression_type: HDF5 compression algorithm used for the image tensor.
119+
- "gzip": better compression, slower I/O
120+
- "lzf" : faster I/O, lower compression ratio
121+
122+
File layout created:
123+
The resulting file contains:
124+
125+
/obs
126+
Per-cell metadata including cell IDs and optional user-provided metadata.
127+
/var
128+
Channel metadata (channel names and channel mapping).
129+
/uns
130+
scPortrait metadata describing the image container.
131+
/obsm/single_cell_images
132+
HDF5 dataset with shape (N, C, H, W), dtype float16, chunked as
133+
(1, 1, H, W), compressed.
134+
135+
Notes:
136+
- The file is technically an AnnData `.h5ad` file with a `.h5sc` extension.
137+
- Masks and image channels share a single dataset and dtype (`float16`).
138+
- The function performs a single-threaded write; no file locking is used.
139+
- All input arrays are cast to the storage dtype before writing.
140+
141+
Warnings:
142+
UserWarning: If `mask_imgs` or `channel_imgs` contain values outside [0, 1].
143+
Mask images or channel images are outside the expected [0, 1] range. This does not align with
144+
scPortrait's convention and unscaled data can produce unexpected results in downstream
145+
functions or require additional preprocessing before passing images to deep learning models.
146+
147+
Raises:
148+
Exception: If:
149+
- `mask_imgs` or `channel_imgs` do not have 4 dimensions `(N, C, H, W)`,
150+
- `mask_imgs` and `channel_imgs` have different numbers of cells,
151+
- `mask_imgs` and `channel_imgs` have different image sizes,
152+
- the number of provided channel names does not match the array shapes,
153+
- `cell_metadata` does not have `N` rows,
154+
- an unsupported compression type is requested.
155+
"""
156+
if mask_imgs.ndim != 4 or channel_imgs.ndim != 4:
157+
raise Exception("mask_imgs and channel_imgs must have shape (N, C, H, W) with exactly 4 dimensions.")
158+
if mask_imgs.shape[0] != channel_imgs.shape[0]:
159+
raise Exception(
160+
"mask_imgs and channel_imgs do not contain the same number of cells. The expected shape is (N, C, H, W)."
161+
)
162+
if mask_imgs.shape[2:4] != channel_imgs.shape[2:4]:
163+
raise Exception(
164+
"mask_imgs and channel_imgs do not contain the same image size. The expected shape is (N, C, H, W)."
165+
)
166+
# check mask_names and channel_names fit to imgs shape-wise
167+
if len(mask_names) != mask_imgs.shape[1]:
168+
raise Exception(
169+
"mask_names needs to match mask_imgs.shape[1]. You need to pass the same number of masks and labels."
170+
)
171+
if len(channel_names) != channel_imgs.shape[1]:
172+
raise Exception(
173+
"channel_names needs to match channel_imgs.shape[1]. You need to pass the same number of image channels and labels."
174+
)
175+
if compression_type not in ["gzip", "lzf"]:
176+
raise Exception("Compression needs to be lzf or gzip.")
177+
178+
# prepare metadata
179+
channels = np.concatenate([mask_names, channel_names])
180+
num_cells = channel_imgs.shape[0]
181+
img_size = channel_imgs.shape[2:4]
182+
cell_ids = cell_ids.astype(DEFAULT_SEGMENTATION_DTYPE, copy=False)
183+
channel_mapping = ["mask" for x in mask_names] + ["image_channel" for x in channel_names]
184+
185+
# prepare images
186+
if mask_imgs.min() < 0 or mask_imgs.max() > 1:
187+
warnings.warn(
188+
"Mask images are outside the expected [0, 1] range. This does not align with "
189+
"scPortrait's convention and unscaled data can produce unexpected results in downstream "
190+
"functions or require additional preprocessing before passing images to "
191+
"deep learning models.",
192+
UserWarning,
193+
stacklevel=2,
194+
)
195+
196+
if channel_imgs.min() < 0 or channel_imgs.max() > 1:
197+
warnings.warn(
198+
" Image channels are outside the expected [0, 1] range. This does not align with"
199+
"scPortrait's convention and unscaled data can produce unexpected results in downstream "
200+
"functions or require additional preprocessing before passing images to "
201+
"deep learning models.",
202+
UserWarning,
203+
stacklevel=2,
204+
)
205+
all_imgs = np.concatenate([mask_imgs, channel_imgs], axis=1)
206+
all_imgs = all_imgs.astype(image_dtype, copy=False)
207+
208+
# create var object with channel names and their mapping to mask or image channels
209+
vars = pd.DataFrame(index=np.arange(len(channels)).astype("str"))
210+
vars["channels"] = channels
211+
vars["channel_mapping"] = channel_mapping
212+
213+
obs = pd.DataFrame({DEFAULT_CELL_ID_NAME: cell_ids})
214+
obs.index = obs.index.values.astype("str")
215+
if cell_metadata is not None:
216+
if len(cell_metadata) != num_cells:
217+
raise Exception(
218+
f"cell_metadata must have {num_cells} rows to match the number of cells, got {len(cell_metadata)}."
219+
)
220+
for col in cell_metadata.columns:
221+
obs[col] = cell_metadata[col].values
222+
223+
# create anndata object
224+
adata = AnnData(obs=obs, var=vars)
225+
226+
# add additional metadata to `uns`
227+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/n_cells"] = num_cells
228+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/n_channels"] = len(channels)
229+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/n_masks"] = mask_imgs.shape[1]
230+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/n_image_channels"] = channel_imgs.shape[1]
231+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/image_size_x"] = img_size[0]
232+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/image_size_y"] = img_size[1]
233+
# adata.uns[f"{self.DEFAULT_NAME_SINGLE_CELL_IMAGES}/normalization"] = self.normalization
234+
# adata.uns[f"{self.DEFAULT_NAME_SINGLE_CELL_IMAGES}/normalization_range_lower"] = self.normalization_range[0]
235+
# adata.uns[f"{self.DEFAULT_NAME_SINGLE_CELL_IMAGES}/normalization_range_upper"] = self.normalization_range[1]
236+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/channel_names"] = channels
237+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/channel_mapping"] = np.array(channel_mapping, dtype="<U15")
238+
adata.uns[f"{DEFAULT_NAME_SINGLE_CELL_IMAGES}/compression"] = compression_type
239+
240+
# write to file
241+
adata.write(output_path)
242+
243+
# add an empty HDF5 dataset to the obsm group of the anndata object
244+
with h5py.File(output_path, "a") as hf:
245+
hf.create_dataset(
246+
IMAGE_DATACONTAINER_NAME,
247+
shape=all_imgs.shape,
248+
chunks=(1, 1, img_size[0], img_size[1]),
249+
compression=compression_type,
250+
dtype=image_dtype,
251+
)
252+
253+
# add required metadata from anndata package
254+
hf[IMAGE_DATACONTAINER_NAME].attrs["encoding-type"] = "array"
255+
hf[IMAGE_DATACONTAINER_NAME].attrs["encoding-version"] = "0.2.0"
256+
257+
# add relevant metadata to the single-cell image container
258+
hf[IMAGE_DATACONTAINER_NAME].attrs["n_cells"] = num_cells
259+
hf[IMAGE_DATACONTAINER_NAME].attrs["n_channels"] = len(channels)
260+
hf[IMAGE_DATACONTAINER_NAME].attrs["n_masks"] = mask_imgs.shape[1]
261+
hf[IMAGE_DATACONTAINER_NAME].attrs["n_image_channels"] = channel_imgs.shape[1]
262+
hf[IMAGE_DATACONTAINER_NAME].attrs["image_size_x"] = img_size[0]
263+
hf[IMAGE_DATACONTAINER_NAME].attrs["image_size_y"] = img_size[1]
264+
# hf[IMAGE_DATACONTAINER_NAME].attrs["normalization"] = self.normalization
265+
# hf[IMAGE_DATACONTAINER_NAME].attrs["normalization_range"] = self.normalization_range
266+
hf[IMAGE_DATACONTAINER_NAME].attrs["channel_names"] = np.array([x.encode("utf-8") for x in channels])
267+
mapping_values = ["mask" for x in mask_names] + ["image_channel" for x in channel_names]
268+
hf[IMAGE_DATACONTAINER_NAME].attrs["channel_mapping"] = np.array([x.encode("utf-8") for x in mapping_values])
269+
hf[IMAGE_DATACONTAINER_NAME].attrs["compression"] = compression_type
270+
271+
# Write images to .h5sc file, single thread
272+
single_cell_data_container: h5py.Dataset = hf[IMAGE_DATACONTAINER_NAME]
273+
274+
for save_index, img in enumerate(all_imgs):
275+
single_cell_data_container[save_index] = img

src/scportrait/plotting/h5sc.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from matplotlib.axes import Axes
1010
from matplotlib.figure import Figure
1111
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
12+
from skimage import measure
1213

1314
from scportrait.pipeline._utils.constants import DEFAULT_CELL_ID_NAME
1415
from scportrait.tools.h5sc import get_image_with_cellid
@@ -119,9 +120,56 @@ def _plot_image_grid(
119120
ax_sub.set_title(f"{image_titles[i]}", fontsize=image_titles_fontsize, pad=2, rotation=image_title_rotation)
120121

121122

123+
def _plot_contour_grid(
124+
ax: Axes,
125+
masks: np.ndarray,
126+
level: float = 0.5,
127+
linewidth: float = 0.5,
128+
) -> None:
129+
"""Helper function to plot contour overlays on an existing image grid.
130+
131+
This function assumes that `_plot_image_grid` has already been called on the same
132+
`ax`, and that the inset axes created there are available in `ax.child_axes` in the
133+
same order as the images and masks.
134+
135+
Args:
136+
ax: The matplotlib axes object that already contains the image grid.
137+
masks: The masks to plot contours from. Shape should be (N, H, W) where N matches
138+
the number of images in the grid, or (H, W) for a single mask.
139+
level: The contour level passed to `skimage.measure.find_contours`. For binary masks,
140+
this is typically 0.5.
141+
linewidth: The line width used when plotting the contour outlines.
142+
143+
Returns:
144+
None
145+
"""
146+
147+
sub_axes = ax.child_axes
148+
if not sub_axes:
149+
raise RuntimeError("No inset axes found. Call _plot_image_grid(...) before calling this function.")
150+
151+
# Convert single mask (H, W) to (N, H, W)
152+
if masks.ndim == 2:
153+
masks = masks[None, ...]
154+
155+
if len(sub_axes) != masks.shape[0]:
156+
raise ValueError(f"Number of masks ({masks.shape[0]}) must match number of grid cells ({len(sub_axes)}).")
157+
158+
for ax_sub, mask in zip(sub_axes, masks, strict=False):
159+
contours = measure.find_contours(mask, level=level)
160+
for contour in contours:
161+
ax_sub.plot(
162+
contour[:, 1], # x = column
163+
contour[:, 0], # y = row
164+
linewidth=linewidth,
165+
color="white",
166+
)
167+
168+
122169
def cell_grid_single_channel(
123170
adata,
124171
select_channel: int | str,
172+
mask_id: int | None = None,
125173
n_cells: int = 16,
126174
cell_ids: int | list[int] | None = None,
127175
cell_labels: list[str] | None = None,
@@ -133,6 +181,8 @@ def cell_grid_single_channel(
133181
ncols: int | None = None,
134182
nrows: int | None = None,
135183
single_cell_size: int = 2,
184+
vmin: float = 0,
185+
vmax: float = 1,
136186
spacing: float = 0.025,
137187
ax: Axes = None,
138188
return_fig: bool = False,
@@ -143,6 +193,7 @@ def cell_grid_single_channel(
143193
Args:
144194
adata: An scPortrait single-cell image dataset.
145195
select_channel: The channel to visualize.
196+
mask_id: The index of a mask to visualize as outlines.
146197
n_cells: The number of cells to visualize. This number of cells will randomly be selected. If `None`, `cell_ids` must be provided.
147198
cell_ids: cell IDs for the specific cells that should be visualized. If `None`, `n_cells` must be provided.
148199
cell_labels: Label to plot as title for each single-cell image if provided.
@@ -231,7 +282,15 @@ def cell_grid_single_channel(
231282
image_titles=cell_labels,
232283
cmap=cmap,
233284
spacing=spacing,
285+
vmin=vmin,
286+
vmax=vmax,
234287
)
288+
if mask_id is not None:
289+
masks = get_image_with_cellid(adata, _cell_ids, mask_id)
290+
_plot_contour_grid(
291+
ax,
292+
masks,
293+
)
235294

236295
if return_fig:
237296
return fig

tests/unit_tests/plotting/test_h5sc.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
rng = np.random.default_rng()
1010

1111
from scportrait.plotting.h5sc import (
12+
_plot_contour_grid,
1213
_plot_image_grid,
1314
_reshape_image_array,
1415
cell_grid,
@@ -70,6 +71,28 @@ def test_plot_image_grid(input_shape, nrows, ncols, col_labels, col_labels_rotat
7071
break
7172

7273

74+
# ---------- _plot_contour_grid ----------
75+
76+
77+
def test_plot_contour_grid_requires_image_grid():
78+
fig, ax = plt.subplots(1, 1)
79+
masks = rng.random((2, 10, 10)) > 0.5
80+
with pytest.raises(RuntimeError):
81+
_plot_contour_grid(ax, masks)
82+
83+
84+
def test_plot_contour_grid_draws_on_existing_grid():
85+
images = rng.random((4, 10, 10))
86+
masks = np.zeros((4, 10, 10), dtype=float)
87+
masks[:, 2:8, 2:8] = 1.0
88+
fig, ax = plt.subplots(1, 1)
89+
_plot_image_grid(ax, images, nrows=2, ncols=2)
90+
_plot_contour_grid(ax, masks)
91+
# Verify contours were added to each inset axis
92+
for ax_sub in ax.child_axes:
93+
assert len(ax_sub.lines) > 0
94+
95+
7396
# ---------- cell_grid_single_channel ----------
7497

7598

0 commit comments

Comments
 (0)