22
33import os
44import re
5+ import warnings
56import xml .etree .ElementTree as ET
7+ from collections .abc import Mapping
68from pathlib import Path
79from types import MappingProxyType
810from typing import TYPE_CHECKING , Any
1113import numpy as np
1214import pandas as pd
1315import tifffile
16+ import xmltodict
1417from dask_image .imread import imread
1518from spatialdata import SpatialData
19+ from spatialdata ._logging import logger
1620from spatialdata .models import (
1721 Image2DModel ,
1822 Labels2DModel ,
3034
3135__all__ = ["seqfish" ]
3236
37+ LARGE_IMAGE_THRESHOLD = 100_000_000
3338
34- @inject_docs (vx = SK )
39+
40+ @inject_docs (vx = SK , megapixels_value = str (int (LARGE_IMAGE_THRESHOLD / 1e6 )))
3541def seqfish (
3642 path : str | Path ,
3743 load_images : bool = True ,
3844 load_labels : bool = True ,
3945 load_points : bool = True ,
4046 load_shapes : bool = True ,
4147 cells_as_circles : bool = False ,
42- rois : list [int ] | None = None ,
48+ rois : list [str ] | None = None ,
4349 imread_kwargs : Mapping [str , Any ] = MappingProxyType ({}),
44- raster_models_scale_factors : list [float ] | None = None ,
50+ raster_models_scale_factors : list [int ] | None = None ,
4551) -> SpatialData :
4652 """Read *seqfish* formatted dataset.
4753
4854 This function reads the following files:
4955
50- - ```{vx.ROI!r} {vx.COUNTS_FILE!r}{vx.CSV_FILE!r}```: Counts and metadata file.
51- - ```{vx.ROI!r} {vx.CELL_COORDINATES!r}{vx.CSV_FILE!r}```: Cell coordinates file.
52- - ```{vx.ROI!r} {vx.DAPI!r}{vx.TIFF_FILE!r}```: High resolution tiff image.
53- - ```{vx.ROI!r} {vx.SEGMENTATION!r}{vx.TIFF_FILE!r}```: Cell mask file.
54- - ```{vx.ROI!r} {vx.TRANSCRIPT_COORDINATES!r}{vx.CSV_FILE!r}```: Transcript coordinates file.
56+ - ```<roi_prefix>_ {vx.COUNTS_FILE!r}{vx.CSV_FILE!r}```: Counts and metadata file.
57+ - ```<roi_prefix>_ {vx.CELL_COORDINATES!r}{vx.CSV_FILE!r}```: Cell coordinates file.
58+ - ```<roi_prefix>_ {vx.DAPI!r}{vx.TIFF_FILE!r}```: High resolution tiff image.
59+ - ```<roi_prefix>_ {vx.SEGMENTATION!r}{vx.TIFF_FILE!r}```: Cell mask file.
60+ - ```<roi_prefix>_ {vx.TRANSCRIPT_COORDINATES!r}{vx.CSV_FILE!r}```: Transcript coordinates file.
5561
5662 .. seealso::
5763
@@ -72,9 +78,13 @@ def seqfish(
7278 cells_as_circles
7379 Whether to read cells also as circles instead of labels.
7480 rois
75- Which ROIs (specified as integers) to load. Only necessary if multiple ROIs present.
81+ Which ROIs (specified as strings, without trailing "_") to load (the ROI strings are used as prefixes for the
82+ filenames). If `None`, all ROIs are loaded.
7683 imread_kwargs
7784 Keyword arguments to pass to :func:`dask_image.imread.imread`.
85+ raster_models_scale_factors
86+ Scale factors to downscale high-resolution images and labels. The scale factors will be automatically set to
87+ obtain a multi-scale image for all the images and labels that are larger than {megapixels_value} megapixels.
7888
7989 Returns
8090 -------
@@ -93,24 +103,29 @@ def seqfish(
93103 >>> sdata.write("path/to/data.zarr")
94104 """
95105 path = Path (path )
96- count_file_pattern = re .compile (rf"(.*?){ re .escape (SK .CELL_COORDINATES )} { re .escape (SK .CSV_FILE )} $" )
106+ count_file_pattern = re .compile (rf"(.*?)_ { re .escape (SK .CELL_COORDINATES )} { re .escape (SK .CSV_FILE )} $" )
97107 count_files = [f for f in os .listdir (path ) if count_file_pattern .match (f )]
98108 if not count_files :
99109 raise ValueError (
100110 f"No files matching the pattern { count_file_pattern } were found. Cannot infer the naming scheme."
101111 )
102112
103- roi_pattern = re .compile (f"^{ SK .ROI } (\\ d+)" )
104- found_rois = {m .group (1 ) for i in os .listdir (path ) if (m := roi_pattern .match (i ))}
105- if rois is None :
106- rois_str = [f"{ SK .ROI } { roi } " for roi in found_rois ]
107- elif isinstance (rois , list ):
113+ rois_str_set = set ()
114+ for count_file in count_files :
115+ found = count_file_pattern .match (count_file )
116+ if found is None :
117+ raise ValueError (f"File { count_file } does not match the expected pattern." )
118+ rois_str_set .add (found .group (1 ))
119+ logger .info (f"Found ROIs: { rois_str_set } " )
120+ rois_str = list (rois_str_set )
121+
122+ if isinstance (rois , list ):
108123 for roi in rois :
109- if str (roi ) not in found_rois :
124+ if str (roi ) not in rois_str_set :
110125 raise ValueError (f"ROI{ roi } not found." )
111- rois_str = [ f" { SK . ROI } { roi } " for roi in rois ]
112- else :
113- raise ValueError ("Invalid type for 'roi'. Must be list[int ] or None." )
126+ rois_str = rois
127+ elif rois is not None :
128+ raise ValueError ("Invalid type for 'roi'. Must be list[str ] or None." )
114129
115130 def get_cell_file (roi : str ) -> str :
116131 return f"{ roi } _{ SK .CELL_COORDINATES } { SK .CSV_FILE } "
@@ -168,33 +183,44 @@ def get_transcript_file(roi: str) -> str:
168183 scaled = {}
169184 for roi_str in rois_str :
170185 scaled [roi_str ] = Scale (
171- np .array (_get_scale_factors (path / get_dapi_file (roi_str ), SK . SCALEFEFACTOR_X , SK . SCALEFEFACTOR_Y )),
186+ np .array (_get_scale_factors_scale0 (path / get_dapi_file (roi_str ))),
172187 axes = ("y" , "x" ),
173188 )
174189
190+ def _get_scale_factors (raster_path : Path , raster_models_scale_factors : list [int ] | None ) -> list [int ] | None :
191+ n_pixels = _get_n_pixels (raster_path )
192+ if n_pixels > LARGE_IMAGE_THRESHOLD and raster_models_scale_factors is None :
193+ return [2 , 2 , 2 ]
194+ else :
195+ return raster_models_scale_factors
196+
175197 if load_images :
176- images = {
177- f"{ os .path .splitext (get_dapi_file (x ))[0 ]} " : Image2DModel .parse (
178- imread (path / get_dapi_file (x ), ** imread_kwargs ),
198+ images = {}
199+ for x in rois_str :
200+ image_path = path / get_dapi_file (x )
201+ scale_factors = _get_scale_factors (image_path , raster_models_scale_factors )
202+
203+ images [f"{ os .path .splitext (get_dapi_file (x ))[0 ]} " ] = Image2DModel .parse (
204+ imread (image_path , ** imread_kwargs ),
179205 dims = ("c" , "y" , "x" ),
180- scale_factors = raster_models_scale_factors ,
181- transformations = {"global" : scaled [x ]},
206+ scale_factors = scale_factors ,
207+ transformations = {x : scaled [x ]},
182208 )
183- for x in rois_str
184- }
185209 else :
186210 images = {}
187211
188212 if load_labels :
189- labels = {
190- f"{ os .path .splitext (get_cell_segmentation_labels_file (x ))[0 ]} " : Labels2DModel .parse (
191- imread (path / get_cell_segmentation_labels_file (x ), ** imread_kwargs ).squeeze (),
213+ labels = {}
214+ for x in rois_str :
215+ labels_path = path / get_cell_segmentation_labels_file (x )
216+ scale_factors = _get_scale_factors (labels_path , raster_models_scale_factors )
217+
218+ labels [f"{ os .path .splitext (get_cell_segmentation_labels_file (x ))[0 ]} " ] = Labels2DModel .parse (
219+ imread (labels_path , ** imread_kwargs ).squeeze (),
192220 dims = ("y" , "x" ),
193- scale_factors = raster_models_scale_factors ,
194- transformations = {"global" : scaled [x ]},
221+ scale_factors = scale_factors ,
222+ transformations = {x : scaled [x ]},
195223 )
196- for x in rois_str
197- }
198224 else :
199225 labels = {}
200226
@@ -206,32 +232,39 @@ def get_transcript_file(roi: str) -> str:
206232 p = pd .read_csv (path / get_transcript_file (x ), delimiter = "," )
207233 instance_key_points = SK .INSTANCE_KEY_POINTS .value if SK .INSTANCE_KEY_POINTS .value in p .columns else None
208234
235+ coordinates = {"x" : SK .TRANSCRIPTS_X , "y" : SK .TRANSCRIPTS_Y , "z" : SK .TRANSCRIPTS_Z }
236+ if SK .TRANSCRIPTS_Z not in p .columns :
237+ coordinates .pop ("z" )
238+ warnings .warn (
239+ f"Column { SK .TRANSCRIPTS_Z } not found in { get_transcript_file (x )} ." , UserWarning , stacklevel = 2
240+ )
241+
209242 # call parser
210243 points [name ] = PointsModel .parse (
211244 p ,
212- coordinates = { "x" : SK . TRANSCRIPTS_X , "y" : SK . TRANSCRIPTS_Y } ,
245+ coordinates = coordinates ,
213246 feature_key = SK .FEATURE_KEY .value ,
214247 instance_key = instance_key_points ,
215- transformations = {"global" : Identity ()},
248+ transformations = {x : Identity ()},
216249 )
217250
218251 shapes = {}
219252 if cells_as_circles :
220- for x , adata in zip (rois_str , tables .values (), strict = False ):
253+ for x , adata in zip (rois_str , tables .values (), strict = True ):
221254 shapes [f"{ os .path .splitext (get_cell_file (x ))[0 ]} " ] = ShapesModel .parse (
222255 adata .obsm [SK .SPATIAL_KEY ],
223256 geometry = 0 ,
224257 radius = np .sqrt (adata .obs [SK .AREA ].to_numpy () / np .pi ),
225258 index = adata .obs [SK .INSTANCE_KEY_TABLE ].copy (),
226- transformations = {"global" : Identity ()},
259+ transformations = {x : Identity ()},
227260 )
228261 if load_shapes :
229- for x in rois_str :
262+ for x , adata in zip ( rois_str , tables . values (), strict = True ) :
230263 # this assumes that the index matches the instance key of the table. A more robust approach could be
231264 # implemented, as described here https://github.com/scverse/spatialdata-io/issues/249
232265 shapes [f"{ os .path .splitext (get_cell_segmentation_shapes_file (x ))[0 ]} " ] = ShapesModel .parse (
233266 path / get_cell_segmentation_shapes_file (x ),
234- transformations = {"global" : scaled [x ]},
267+ transformations = {x : scaled [x ]},
235268 index = adata .obs [SK .INSTANCE_KEY_TABLE ].copy (),
236269 )
237270
@@ -240,12 +273,40 @@ def get_transcript_file(roi: str) -> str:
240273 return sdata
241274
242275
243- def _get_scale_factors (DAPI_path : Path , scalefactor_x_key : str , scalefactor_y_key : str ) -> list [float ]:
244- with tifffile .TiffFile (DAPI_path ) as tif :
245- ome_metadata = tif .ome_metadata
246- root = ET .fromstring (ome_metadata )
247- for element in root .iter ():
248- if scalefactor_x_key in element .attrib .keys ():
249- scalefactor_x = element .attrib [scalefactor_x_key ]
250- scalefactor_y = element .attrib [scalefactor_y_key ]
251- return [float (scalefactor_x ), float (scalefactor_y )]
276+ def _is_ome_tiff_multiscale (ome_tiff_file : Path ) -> bool :
277+ """Check if the OME-TIFF file is multi-scale.
278+
279+ Parameters
280+ ----------
281+ ome_tiff_file
282+ Path to the OME-TIFF file.
283+
284+ Returns
285+ -------
286+ Whether the OME-TIFF file is multi-scale.
287+ """
288+ # for some image files we couldn't find the multiscale information in the omexml metadata, and this method proves to
289+ # be more robust
290+ try :
291+ zarr_tiff_store = tifffile .imread (ome_tiff_file , is_ome = True , level = 1 , aszarr = True )
292+ zarr_tiff_store .close ()
293+ except IndexError :
294+ return False
295+ return True
296+
297+
298+ def _get_n_pixels (ome_tiff_file : Path ) -> int :
299+ with tifffile .TiffFile (ome_tiff_file , is_ome = True ) as tif :
300+ page = tif .pages [0 ]
301+ shape = page .shape
302+ n_pixels = np .array (shape ).prod ().item ()
303+ assert isinstance (n_pixels , int )
304+ return n_pixels
305+
306+
307+ def _get_scale_factors_scale0 (DAPI_path : Path ) -> list [float ]:
308+ with tifffile .TiffFile (DAPI_path , is_ome = True ) as tif :
309+ ome_metadata = xmltodict .parse (tif .ome_metadata )
310+ scalefactor_x = ome_metadata ["OME" ]["Image" ]["Pixels" ]["@PhysicalSizeX" ]
311+ scalefactor_y = ome_metadata ["OME" ]["Image" ]["Pixels" ]["@PhysicalSizeY" ]
312+ return [float (scalefactor_x ), float (scalefactor_y )]
0 commit comments