1+
2+
3+ """
4+ Notes about supported data structure:
5+
6+ The directory structure that we need, looks as follows (expected by the sopa.io.cosmx function):
7+ ├── CellStatsDir ( = `DATA_DIR`)
8+ │ ├── CellLabels
9+ │ │ ├── CellLabels_F001.tif
10+ │ │ ├── ...
11+ │ │ └── CellLabels_F<last_fov_id>.tif
12+ │ ├── Morphology2D
13+ │ │ ├── <some_id>_F001.tif
14+ │ │ ├── ...
15+ │ │ └── <some_id>_F<last_fov_id>.tif
16+ │ ├── <dataset_id>_exprMat_file.csv
17+ │ ├── <dataset_id>_fov_positions_file.csv
18+ │ ├── <dataset_id>_metadata_file.csv
19+ │ ├── <dataset_id>_tx_file.csv
20+ │ └── <dataset_id>-polygons.csv
21+ ├── (AnalysisResults)
22+ └── (RunSummary)
23+
24+ The "flat files" are the csv files that start with `<dataset_id>_`. They are generated by the AtoMx software.
25+
26+
27+ For different datasets we face slightly different data structures, the inputs to the component need to be chosen accordingly.
28+ If the raw files and the flat files are in different zip files, we need to set par["input_flat_files"]. The other structural
29+ differences are handled by the component.
30+
31+ ### Version 1 (Example: CosMx Liver and Liver cancer) ###
32+ ├── CellStatsDir
33+ │ ├── FOV001
34+ │ │ └── CellLabels_F001.tif
35+ │ ├── ...
36+ │ ├── FOV130
37+ │ │ └── CellLabels_F130.tif
38+ │ ├── Morphology2D
39+ │ │ └── ...
40+ │ ├── <dataset_id>_exprMat_file.csv
41+ │ ├── <dataset_id>_fov_positions_file.csv
42+ │ ├── <dataset_id>_metadata_file.csv
43+ │ ├── <dataset_id>_tx_file.csv
44+ │ └── <dataset_id>-polygons.csv
45+ ├── (AnalysisResults)
46+ └── (RunSummary)
47+ --> the CellLabels folder is not present, but the CellLabels_FXXX.tif files are in the FOV folders.
48+ They are moved to a newly created CellLabels folder.
49+
50+ ### Version 2 (Example: CosMx Mouse brain) ###
51+ ├── CellStatsDir
52+ │ ├── FOV001
53+ │ │ └── CellLabels_F001.tif
54+ │ ├── ...
55+ │ ├── FOV130
56+ │ │ └── CellLabels_F130.tif
57+ │ ├── Morphology2D
58+ │ │ └── ...
59+ ├── (AnalysisResults)
60+ └── (RunSummary)
61+ &
62+ ├── <dataset_id>_exprMat_file.csv
63+ ├── <dataset_id>_fov_positions_file.csv
64+ ├── <dataset_id>_metadata_file.csv
65+ ├── <dataset_id>_tx_file.csv
66+ └── <dataset_id>-polygons.csv
67+ --> the flat files are in a separate zip, they need to be moved to CellStatsDir ( = `DATA_DIR`)
68+ --> as in version 1, the CellLabels folder is not present, but the CellLabels_FXXX.tif files are in the FOV folders.
69+
70+
71+
72+ ### Version 3 (Example: CosMx lung cancer) ###
73+ this has subdirectories for each sample and an extra sub directories for morphology images. Also,
74+ the images are given for each z plane. This dataset is covered with the bruker_cosmx_nsclc dataloader.
75+
76+
77+
78+
79+
80+ """
81+
82+ import os
83+ import shutil
84+ import zipfile
85+ from pathlib import Path
86+ from datetime import datetime
87+ import sopa
88+
89+ ## VIASH START
90+ par = {
91+ "input_raw" : "https://smi-public.objects.liquidweb.services/HalfBrain.zip" ,
92+ "input_flat_files" : "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip" ,
93+ "segmentation_id" : ["cell" ],
94+ "output" : "output.zarr" ,
95+ "dataset_id" : "bruker_cosmx/bruker_mouse_brain_cosmx/rep1" ,
96+ "dataset_name" : "value" ,
97+ "dataset_url" : "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/cosmx-smi-mouse-brain-ffpe-dataset/" ,
98+ "dataset_reference" : "value" ,
99+ "dataset_summary" : "value" ,
100+ "dataset_description" : "value" ,
101+ "dataset_organism" : "human" ,
102+ }
103+ meta = {
104+ #"temp_dir": "./temp/datasets/bruker_cosmx",
105+ "temp_dir" : "/Volumes/Sandisk2TB/G3_temp/bruker_cosmx/test_folder"
106+ }
107+
108+ ## VIASH END
109+
110+ assert ("cell" in par ["segmentation_id" ]) and (len (par ["segmentation_id" ]) == 1 ), "Currently cell labels are definitely assigned in this script. And cosmx does not provide other segmentations."
111+
112+ t0 = datetime .now ()
113+
114+ # Define temp dir and file names
115+ TMP_DIR = Path (meta ["temp_dir" ] or "/tmp" )
116+ TMP_DIR .mkdir (parents = True , exist_ok = True )
117+ FILE_NAME_RAW = TMP_DIR / par ["input_raw" ].split ("/" )[- 1 ]
118+ DATA_DIR = FILE_NAME_RAW .parent / FILE_NAME_RAW .stem / "CellStatsDir"
119+
120+ if par ["input_flat_files" ] is not None :
121+ FILE_NAME_FLAT = TMP_DIR / par ["input_flat_files" ].split ("/" )[- 1 ].replace ("%20" , " " )
122+
123+ # Download raw files
124+ print (datetime .now () - t0 , "Download raw files" , flush = True )
125+ os .system (f"wget { par ['input_raw' ]} -O '{ FILE_NAME_RAW } '" )
126+
127+ # Extract zip files
128+ print (datetime .now () - t0 , "Extract zip of raw files" , flush = True )
129+ with zipfile .ZipFile (FILE_NAME_RAW , 'r' ) as zip_ref :
130+ zip_ref .extractall (TMP_DIR )
131+
132+ # Download and extract flat files if they are not already present
133+ FLAT_FILES_ENDINGS = ["_exprMat_file.csv" , "_fov_positions_file.csv" , "_metadata_file.csv" , "_tx_file.csv" ] #, "polygons.csv"]
134+ flat_files_count = 0
135+ for ending in FLAT_FILES_ENDINGS :
136+ if any (f .endswith (ending ) for f in os .listdir (DATA_DIR )):
137+ print (f"Flat file with ending { ending } already present in extracted raw files" , flush = True )
138+ flat_files_count += 1
139+
140+ if flat_files_count == len (FLAT_FILES_ENDINGS ):
141+ print (datetime .now () - t0 , "All flat files already present in extracted raw files" , flush = True )
142+ else :
143+ print (datetime .now () - t0 , "Download and extract flat files" , flush = True )
144+ os .system (f"wget { par ['input_flat_files' ]} -O '{ FILE_NAME_FLAT } '" )
145+
146+ with zipfile .ZipFile (FILE_NAME_FLAT , 'r' ) as zip_ref :
147+ zip_ref .extractall (TMP_DIR )
148+
149+ print (datetime .now () - t0 , f"Move flat files to { DATA_DIR } " , flush = True )
150+ source_dir = FILE_NAME_FLAT .parent / FILE_NAME_FLAT .stem
151+
152+ file_names = os .listdir (source_dir )
153+ for file_name in file_names :
154+ if not (DATA_DIR / file_name ).exists ():
155+ shutil .move (source_dir / file_name , DATA_DIR )
156+ else :
157+ print (datetime .now () - t0 , f"File { file_name } already present in { DATA_DIR } " , flush = True )
158+
159+
160+ # Move CellLabels_FXXX.tif files to CellLabels folder if they are not already present
161+ labels_dir = DATA_DIR / "CellLabels"
162+
163+ if not labels_dir .exists ():
164+ print (datetime .now () - t0 , "Create CellLabels folder with CellLabels tif" , flush = True )
165+ # Create CellLabels folder with CellLabels tif (somehow this folder name is expected and this is not always present)
166+ # see e.g. late discussion in https://github.com/gustaveroussy/sopa/issues/285
167+
168+ labels_dir .mkdir (parents = True , exist_ok = True )
169+
170+ # Get all folders in data_dir that start with "FOV" and move the CellLabels_FXXX.tif file to the CellLabels folder
171+ print (datetime .now () - t0 , "Move CellLabels_FXXX.tif files to CellLabels folder" , flush = True )
172+ for fov_dir in DATA_DIR .glob ("FOV*" ):
173+ fov_id = str (fov_dir )[- 3 :]
174+ shutil .copy (fov_dir / f"CellLabels_F{ fov_id } .tif" , labels_dir / f"CellLabels_F{ fov_id } .tif" )
175+ else :
176+ print (datetime .now () - t0 , "CellLabels folder already present" , flush = True )
177+
178+
179+
180+ #########################################
181+ # Convert raw files to spatialdata zarr #
182+ #########################################
183+
184+
185+ #from pathlib import Path
186+ #import sopa
187+ #data_dir = Path("/Volumes/Sandisk2TB/G3_temp/bruker_cosmx/HalfBrain/CellStatsDir")
188+
189+ print (datetime .now () - t0 , "Convert raw files to spatialdata zarr" , flush = True )
190+
191+ sdata = sopa .io .cosmx (
192+ DATA_DIR ,
193+ dataset_id = None ,
194+ fov = None ,
195+ read_proteins = False ,
196+ cells_labels = True ,
197+ cells_table = True ,
198+ cells_polygons = True ,
199+ flip_image = False
200+ )
201+
202+
203+ ###############
204+ # Rename keys #
205+ ###############
206+ print (datetime .now () - t0 , "Rename keys" , flush = True )
207+
208+ elements_renaming_map = {
209+ "stitched_image" : "morphology_mip" ,
210+ "stitched_labels" : "cell_labels" ,
211+ "points" : "transcripts" ,
212+ "cells_polygons" : "cell_boundaries" ,
213+ "table" : "metadata" ,
214+ }
215+
216+ for old_key , new_key in elements_renaming_map .items ():
217+ sdata [new_key ] = sdata [old_key ]
218+ del sdata [old_key ]
219+
220+ # Rename transcript column (somehow overwriting the 'target' column leads to an error, so instead we add a duplicate with the right name)
221+ #sdata['transcripts'] = sdata['transcripts'].rename(columns={"global_cell_id":"cell_id", "target":"feature_name"})
222+ sdata ['transcripts' ] = sdata ['transcripts' ].rename (columns = {"global_cell_id" :"cell_id" })
223+ sdata ['transcripts' ]["feature_name" ] = sdata ['transcripts' ]["target" ]
224+
225+ #########################################
226+ # Throw out all channels except of DAPI #
227+ ######################################### NOTE: We assume the "DNA" stain is comparable to DAPI.
228+ print (datetime .now () - t0 , "Throw out all channels except of 'DNA' (DAPI?)" , flush = True )
229+
230+ # TODO: in the future we want to keep PolyT and Cellbound1/2/3 stains. Note however, that somehow saving or plotting the sdata fails when
231+ # these channels aren't excluded, not sure why...
232+ sdata ["morphology_mip" ] = sdata ["morphology_mip" ].sel (c = ["DNA" ])
233+
234+
235+ ##############################
236+ # Add info to metadata table #
237+ ##############################
238+ print (datetime .now () - t0 , "Add info to metadata table" , flush = True )
239+
240+ #TODO: values as input variables
241+ for key in ["dataset_id" , "dataset_name" , "dataset_url" , "dataset_reference" , "dataset_summary" , "dataset_description" , "dataset_organism" , "segmentation_id" ]:
242+ sdata ["metadata" ].uns [key ] = par [key ]
243+
244+ #########
245+ # Write #
246+ #########
247+ print (datetime .now () - t0 , f"Writing to { par ['output' ]} " , flush = True )
248+
249+ sdata .write (par ["output" ])
250+
251+ print (datetime .now () - t0 , "Done" , flush = True )
0 commit comments