Skip to content

Commit 5f829f5

Browse files
Merge pull request #268 from scverse/seqfish-v2-fixes
Fixes seqfish reader v2
2 parents 26fb99b + 6592674 commit 5f829f5

File tree

6 files changed

+117
-54
lines changed

6 files changed

+117
-54
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,4 @@ data
4141
# data folder
4242
data/
4343
tests/data
44+
uv.lock

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ dependencies = [
3636
"readfcs",
3737
"tifffile>=2023.8.12",
3838
"ome-types",
39+
"xmltodict",
3940
]
4041

4142
[project.optional-dependencies]

src/spatialdata_io/_constants/_constants.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ class SeqfishKeys(ModeEnum):
6868
TIFF_FILE = ".tiff"
6969
GEOJSON_FILE = ".geojson"
7070
# file identifiers
71-
ROI = "Roi"
7271
TRANSCRIPT_COORDINATES = "TranscriptList"
7372
DAPI = "DAPI"
7473
COUNTS_FILE = "CellxGene"
@@ -78,6 +77,7 @@ class SeqfishKeys(ModeEnum):
7877
# transcripts
7978
TRANSCRIPTS_X = "x"
8079
TRANSCRIPTS_Y = "y"
80+
TRANSCRIPTS_Z = "z"
8181
FEATURE_KEY = "name"
8282
INSTANCE_KEY_POINTS = "cell"
8383
# cells
@@ -88,8 +88,6 @@ class SeqfishKeys(ModeEnum):
8888
SPATIAL_KEY = "spatial"
8989
REGION_KEY = "region"
9090
INSTANCE_KEY_TABLE = "instance_id"
91-
SCALEFEFACTOR_X = "PhysicalSizeX"
92-
SCALEFEFACTOR_Y = "PhysicalSizeY"
9391

9492

9593
@unique

src/spatialdata_io/readers/_utils/_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pathlib import Path
55
from typing import TYPE_CHECKING, Any, Union
66

7+
from anndata import AnnData
78
from anndata.io import read_text
89
from h5py import File
910
from ome_types import from_tiff

src/spatialdata_io/readers/seqfish.py

Lines changed: 110 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
import os
44
import re
5+
import warnings
56
import xml.etree.ElementTree as ET
7+
from collections.abc import Mapping
68
from pathlib import Path
79
from types import MappingProxyType
810
from typing import TYPE_CHECKING, Any
@@ -11,8 +13,10 @@
1113
import numpy as np
1214
import pandas as pd
1315
import tifffile
16+
import xmltodict
1417
from dask_image.imread import imread
1518
from spatialdata import SpatialData
19+
from spatialdata._logging import logger
1620
from spatialdata.models import (
1721
Image2DModel,
1822
Labels2DModel,
@@ -30,28 +34,30 @@
3034

3135
__all__ = ["seqfish"]
3236

37+
LARGE_IMAGE_THRESHOLD = 100_000_000
3338

34-
@inject_docs(vx=SK)
39+
40+
@inject_docs(vx=SK, megapixels_value=str(int(LARGE_IMAGE_THRESHOLD / 1e6)))
3541
def seqfish(
3642
path: str | Path,
3743
load_images: bool = True,
3844
load_labels: bool = True,
3945
load_points: bool = True,
4046
load_shapes: bool = True,
4147
cells_as_circles: bool = False,
42-
rois: list[int] | None = None,
48+
rois: list[str] | None = None,
4349
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
44-
raster_models_scale_factors: list[float] | None = None,
50+
raster_models_scale_factors: list[int] | None = None,
4551
) -> SpatialData:
4652
"""Read *seqfish* formatted dataset.
4753
4854
This function reads the following files:
4955
50-
- ```{vx.ROI!r}{vx.COUNTS_FILE!r}{vx.CSV_FILE!r}```: Counts and metadata file.
51-
- ```{vx.ROI!r}{vx.CELL_COORDINATES!r}{vx.CSV_FILE!r}```: Cell coordinates file.
52-
- ```{vx.ROI!r}{vx.DAPI!r}{vx.TIFF_FILE!r}```: High resolution tiff image.
53-
- ```{vx.ROI!r}{vx.SEGMENTATION!r}{vx.TIFF_FILE!r}```: Cell mask file.
54-
- ```{vx.ROI!r}{vx.TRANSCRIPT_COORDINATES!r}{vx.CSV_FILE!r}```: Transcript coordinates file.
56+
- ```<roi_prefix>_{vx.COUNTS_FILE!r}{vx.CSV_FILE!r}```: Counts and metadata file.
57+
- ```<roi_prefix>_{vx.CELL_COORDINATES!r}{vx.CSV_FILE!r}```: Cell coordinates file.
58+
- ```<roi_prefix>_{vx.DAPI!r}{vx.TIFF_FILE!r}```: High resolution tiff image.
59+
- ```<roi_prefix>_{vx.SEGMENTATION!r}{vx.TIFF_FILE!r}```: Cell mask file.
60+
- ```<roi_prefix>_{vx.TRANSCRIPT_COORDINATES!r}{vx.CSV_FILE!r}```: Transcript coordinates file.
5561
5662
.. seealso::
5763
@@ -72,9 +78,13 @@ def seqfish(
7278
cells_as_circles
7379
Whether to read cells also as circles instead of labels.
7480
rois
75-
Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present.
81+
Which ROIs (specified as strings, without trailing "_") to load (the ROI strings are used as prefixes for the
82+
filenames). If `None`, all ROIs are loaded.
7683
imread_kwargs
7784
Keyword arguments to pass to :func:`dask_image.imread.imread`.
85+
raster_models_scale_factors
86+
Scale factors to downscale high-resolution images and labels. The scale factors will be automatically set to
87+
obtain a multi-scale image for all the images and labels that are larger than {megapixels_value} megapixels.
7888
7989
Returns
8090
-------
@@ -93,24 +103,29 @@ def seqfish(
93103
>>> sdata.write("path/to/data.zarr")
94104
"""
95105
path = Path(path)
96-
count_file_pattern = re.compile(rf"(.*?){re.escape(SK.CELL_COORDINATES)}{re.escape(SK.CSV_FILE)}$")
106+
count_file_pattern = re.compile(rf"(.*?)_{re.escape(SK.CELL_COORDINATES)}{re.escape(SK.CSV_FILE)}$")
97107
count_files = [f for f in os.listdir(path) if count_file_pattern.match(f)]
98108
if not count_files:
99109
raise ValueError(
100110
f"No files matching the pattern {count_file_pattern} were found. Cannot infer the naming scheme."
101111
)
102112

103-
roi_pattern = re.compile(f"^{SK.ROI}(\\d+)")
104-
found_rois = {m.group(1) for i in os.listdir(path) if (m := roi_pattern.match(i))}
105-
if rois is None:
106-
rois_str = [f"{SK.ROI}{roi}" for roi in found_rois]
107-
elif isinstance(rois, list):
113+
rois_str_set = set()
114+
for count_file in count_files:
115+
found = count_file_pattern.match(count_file)
116+
if found is None:
117+
raise ValueError(f"File {count_file} does not match the expected pattern.")
118+
rois_str_set.add(found.group(1))
119+
logger.info(f"Found ROIs: {rois_str_set}")
120+
rois_str = list(rois_str_set)
121+
122+
if isinstance(rois, list):
108123
for roi in rois:
109-
if str(roi) not in found_rois:
124+
if str(roi) not in rois_str_set:
110125
raise ValueError(f"ROI{roi} not found.")
111-
rois_str = [f"{SK.ROI}{roi}" for roi in rois]
112-
else:
113-
raise ValueError("Invalid type for 'roi'. Must be list[int] or None.")
126+
rois_str = rois
127+
elif rois is not None:
128+
raise ValueError("Invalid type for 'roi'. Must be list[str] or None.")
114129

115130
def get_cell_file(roi: str) -> str:
116131
return f"{roi}_{SK.CELL_COORDINATES}{SK.CSV_FILE}"
@@ -168,33 +183,44 @@ def get_transcript_file(roi: str) -> str:
168183
scaled = {}
169184
for roi_str in rois_str:
170185
scaled[roi_str] = Scale(
171-
np.array(_get_scale_factors(path / get_dapi_file(roi_str), SK.SCALEFEFACTOR_X, SK.SCALEFEFACTOR_Y)),
186+
np.array(_get_scale_factors_scale0(path / get_dapi_file(roi_str))),
172187
axes=("y", "x"),
173188
)
174189

190+
def _get_scale_factors(raster_path: Path, raster_models_scale_factors: list[int] | None) -> list[int] | None:
191+
n_pixels = _get_n_pixels(raster_path)
192+
if n_pixels > LARGE_IMAGE_THRESHOLD and raster_models_scale_factors is None:
193+
return [2, 2, 2]
194+
else:
195+
return raster_models_scale_factors
196+
175197
if load_images:
176-
images = {
177-
f"{os.path.splitext(get_dapi_file(x))[0]}": Image2DModel.parse(
178-
imread(path / get_dapi_file(x), **imread_kwargs),
198+
images = {}
199+
for x in rois_str:
200+
image_path = path / get_dapi_file(x)
201+
scale_factors = _get_scale_factors(image_path, raster_models_scale_factors)
202+
203+
images[f"{os.path.splitext(get_dapi_file(x))[0]}"] = Image2DModel.parse(
204+
imread(image_path, **imread_kwargs),
179205
dims=("c", "y", "x"),
180-
scale_factors=raster_models_scale_factors,
181-
transformations={"global": scaled[x]},
206+
scale_factors=scale_factors,
207+
transformations={x: scaled[x]},
182208
)
183-
for x in rois_str
184-
}
185209
else:
186210
images = {}
187211

188212
if load_labels:
189-
labels = {
190-
f"{os.path.splitext(get_cell_segmentation_labels_file(x))[0]}": Labels2DModel.parse(
191-
imread(path / get_cell_segmentation_labels_file(x), **imread_kwargs).squeeze(),
213+
labels = {}
214+
for x in rois_str:
215+
labels_path = path / get_cell_segmentation_labels_file(x)
216+
scale_factors = _get_scale_factors(labels_path, raster_models_scale_factors)
217+
218+
labels[f"{os.path.splitext(get_cell_segmentation_labels_file(x))[0]}"] = Labels2DModel.parse(
219+
imread(labels_path, **imread_kwargs).squeeze(),
192220
dims=("y", "x"),
193-
scale_factors=raster_models_scale_factors,
194-
transformations={"global": scaled[x]},
221+
scale_factors=scale_factors,
222+
transformations={x: scaled[x]},
195223
)
196-
for x in rois_str
197-
}
198224
else:
199225
labels = {}
200226

@@ -206,32 +232,39 @@ def get_transcript_file(roi: str) -> str:
206232
p = pd.read_csv(path / get_transcript_file(x), delimiter=",")
207233
instance_key_points = SK.INSTANCE_KEY_POINTS.value if SK.INSTANCE_KEY_POINTS.value in p.columns else None
208234

235+
coordinates = {"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y, "z": SK.TRANSCRIPTS_Z}
236+
if SK.TRANSCRIPTS_Z not in p.columns:
237+
coordinates.pop("z")
238+
warnings.warn(
239+
f"Column {SK.TRANSCRIPTS_Z} not found in {get_transcript_file(x)}.", UserWarning, stacklevel=2
240+
)
241+
209242
# call parser
210243
points[name] = PointsModel.parse(
211244
p,
212-
coordinates={"x": SK.TRANSCRIPTS_X, "y": SK.TRANSCRIPTS_Y},
245+
coordinates=coordinates,
213246
feature_key=SK.FEATURE_KEY.value,
214247
instance_key=instance_key_points,
215-
transformations={"global": Identity()},
248+
transformations={x: Identity()},
216249
)
217250

218251
shapes = {}
219252
if cells_as_circles:
220-
for x, adata in zip(rois_str, tables.values(), strict=False):
253+
for x, adata in zip(rois_str, tables.values(), strict=True):
221254
shapes[f"{os.path.splitext(get_cell_file(x))[0]}"] = ShapesModel.parse(
222255
adata.obsm[SK.SPATIAL_KEY],
223256
geometry=0,
224257
radius=np.sqrt(adata.obs[SK.AREA].to_numpy() / np.pi),
225258
index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(),
226-
transformations={"global": Identity()},
259+
transformations={x: Identity()},
227260
)
228261
if load_shapes:
229-
for x in rois_str:
262+
for x, adata in zip(rois_str, tables.values(), strict=True):
230263
# this assumes that the index matches the instance key of the table. A more robust approach could be
231264
# implemented, as described here https://github.com/scverse/spatialdata-io/issues/249
232265
shapes[f"{os.path.splitext(get_cell_segmentation_shapes_file(x))[0]}"] = ShapesModel.parse(
233266
path / get_cell_segmentation_shapes_file(x),
234-
transformations={"global": scaled[x]},
267+
transformations={x: scaled[x]},
235268
index=adata.obs[SK.INSTANCE_KEY_TABLE].copy(),
236269
)
237270

@@ -240,12 +273,40 @@ def get_transcript_file(roi: str) -> str:
240273
return sdata
241274

242275

243-
def _get_scale_factors(DAPI_path: Path, scalefactor_x_key: str, scalefactor_y_key: str) -> list[float]:
244-
with tifffile.TiffFile(DAPI_path) as tif:
245-
ome_metadata = tif.ome_metadata
246-
root = ET.fromstring(ome_metadata)
247-
for element in root.iter():
248-
if scalefactor_x_key in element.attrib.keys():
249-
scalefactor_x = element.attrib[scalefactor_x_key]
250-
scalefactor_y = element.attrib[scalefactor_y_key]
251-
return [float(scalefactor_x), float(scalefactor_y)]
276+
def _is_ome_tiff_multiscale(ome_tiff_file: Path) -> bool:
277+
"""Check if the OME-TIFF file is multi-scale.
278+
279+
Parameters
280+
----------
281+
ome_tiff_file
282+
Path to the OME-TIFF file.
283+
284+
Returns
285+
-------
286+
Whether the OME-TIFF file is multi-scale.
287+
"""
288+
# for some image files we couldn't find the multiscale information in the omexml metadata, and this method proves to
289+
# be more robust
290+
try:
291+
zarr_tiff_store = tifffile.imread(ome_tiff_file, is_ome=True, level=1, aszarr=True)
292+
zarr_tiff_store.close()
293+
except IndexError:
294+
return False
295+
return True
296+
297+
298+
def _get_n_pixels(ome_tiff_file: Path) -> int:
299+
with tifffile.TiffFile(ome_tiff_file, is_ome=True) as tif:
300+
page = tif.pages[0]
301+
shape = page.shape
302+
n_pixels = np.array(shape).prod().item()
303+
assert isinstance(n_pixels, int)
304+
return n_pixels
305+
306+
307+
def _get_scale_factors_scale0(DAPI_path: Path) -> list[float]:
308+
with tifffile.TiffFile(DAPI_path, is_ome=True) as tif:
309+
ome_metadata = xmltodict.parse(tif.ome_metadata)
310+
scalefactor_x = ome_metadata["OME"]["Image"]["Pixels"]["@PhysicalSizeX"]
311+
scalefactor_y = ome_metadata["OME"]["Image"]["Pixels"]["@PhysicalSizeY"]
312+
return [float(scalefactor_x), float(scalefactor_y)]

tests/test_seqfish.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,17 @@
1717
@pytest.mark.parametrize(
1818
"dataset,expected", [("seqfish-2-test-dataset/instrument 2 official", "{'y': (0, 108), 'x': (0, 108)}")]
1919
)
20-
@pytest.mark.parametrize("rois", [[1], None])
20+
@pytest.mark.parametrize("rois", [["Roi1"], None])
2121
@pytest.mark.parametrize("cells_as_circles", [False, True])
2222
def test_example_data(dataset: str, expected: str, rois: list[int] | None, cells_as_circles: bool) -> None:
2323
f = Path("./data") / dataset
2424
assert f.is_dir()
2525
sdata = seqfish(f, cells_as_circles=cells_as_circles, rois=rois)
2626
from spatialdata import get_extent
2727

28-
extent = get_extent(sdata, exact=False)
28+
extent = get_extent(sdata, exact=False, coordinate_system="Roi1")
2929
extent = {ax: (math.floor(extent[ax][0]), math.ceil(extent[ax][1])) for ax in extent}
30+
del extent["z"]
3031
if cells_as_circles:
3132
# manual correction required to take into account for the circle radii
3233
expected = "{'y': (-2, 109), 'x': (-2, 109)}"

0 commit comments

Comments
 (0)