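# Assign transcripts to cells with Baysor: export the transcripts to CSV and
# the prior segmentation to TIF, run the Baysor CLI, then map the resulting
# cell ids back onto the SpatialData transcripts element.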
import os
import shutil
from pathlib import Path
from tifffile import imwrite
# dask submodules are not loaded by a bare `import dask`, so import them explicitly
import dask.array
import dask.dataframe
import numpy as np
import pandas as pd
import anndata as ad
import spatialdata as sd


## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
    'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
    'transcripts_key': 'transcripts',
    'coordinate_system': 'global',
    'output': './temp/methods/baysor/baysor_assigned_transcripts.zarr',

    'force_2d': 'false',
    'min_molecules_per_cell': 50,
    'scale': -1.0,
    'scale_std': "25%",
    'n_clusters': 4,
    'prior_segmentation_confidence': 0.8,
}
meta = {
    'name': 'baysor_transcript_assignment',
    'temp_dir': "./temp/methods/baysor"
}
## VIASH END

TMP_DIR = Path(meta["temp_dir"] or "/tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)

TRANSCRIPTS_CSV = TMP_DIR / "transcripts.csv"
SEGMENTATION_TIF = TMP_DIR / "segmentation.tif"
CONFIG_TOML = TMP_DIR / "config.toml"
BAYSOR_OUTPUT = TMP_DIR / "baysor_output.csv"


# Read input
print('Reading input files', flush=True)
sdata = sd.read_zarr(par['input_ist'])
sdata_segm = sd.read_zarr(par['input_segmentation'])

# Check that the coordinate system is available in both inputs
transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys()
assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in transcripts data."
segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in segmentation data."

# Transform transcript coordinates to the target coordinate system
print('Transforming transcripts coordinates', flush=True)
transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])

# If the segmentation carries a transformation (e.g. a translation from a crop
# of the data), apply its inverse so the transcripts align with the
# segmentation's pixel coordinates.
trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()
transcripts = sd.transform(transcripts, trans, par['coordinate_system'])


# Write transcripts to csv
print('Writing transcripts to csv', flush=True)
transcripts[['x', 'y', 'z', 'feature_name']].compute().to_csv(TRANSCRIPTS_CSV)

# Write the full-resolution level ("scale0") of the multiscale segmentation to tif
print('Writing segmentation to tif', flush=True)
imwrite(SEGMENTATION_TIF, sdata_segm["segmentation"]["scale0"].image.to_numpy())

# Write config to toml
print('Writing config to toml', flush=True)
toml_str = f"""[data]
x = "x"
y = "y"
z = "z"
gene = "feature_name"
force_2d = {par['force_2d']}
min_molecules_per_cell = {int(par['min_molecules_per_cell'])}
exclude_genes = ""

[segmentation]
scale = {float(par['scale'])}
scale_std = "{par['scale_std']}"
n_clusters = {int(par['n_clusters'])}
prior_segmentation_confidence = {float(par['prior_segmentation_confidence'])}
"""
with open(CONFIG_TOML, "w") as toml_file:
    toml_file.write(toml_str)


# Run Baysor
print('Running Baysor', flush=True)
baysor_cmd = f"baysor run -c {CONFIG_TOML} -o {BAYSOR_OUTPUT} {TRANSCRIPTS_CSV} {SEGMENTATION_TIF}"
print("\t" + baysor_cmd, flush=True)
exit_status = os.system(baysor_cmd)
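# os.system does not raise on failure, so fail fast if Baysor exited with a
# non-zero status or did not produce the expected output file.
assert exit_status == 0, f"Baysor exited with non-zero status {exit_status}"
assert BAYSOR_OUTPUT.exists(), f"Baysor did not produce {BAYSOR_OUTPUT}"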


# Read Baysor output
print('Reading Baysor output', flush=True)
df_baysor = pd.read_csv(BAYSOR_OUTPUT)

# Formatting of Baysor output
print('Formatting Baysor output', flush=True)

def convert_str_ids_to_ints(df, file_path_for_error_messages=None):
    """Convert cell ids like "CR4b68f93d8-27" to 27.

    The file path argument is only used to produce more informative error messages.
    """
    df = df.copy()
    file = file_path_for_error_messages
    not_null = ~df["cell"].isnull()

    unique_cell_values = df.loc[not_null, "cell"].unique()
    unique_types = {type(value) for value in unique_cell_values}
    n_cells_pre = len(unique_cell_values)
    if (len(unique_types) == 1) and (str in unique_types):
        # Keep only the integer suffix of each id, e.g. "CR4b68f93d8-27" -> 27
        df.loc[not_null, "cell"] = df.loc[not_null, "cell"].apply(lambda i: i.split("-")[-1]).astype(int)
    elif len(unique_types) != 1:
        raise ValueError(f"Non-NaN values of column 'cell' in file {file} have multiple types: {unique_types}")
    n_cells_post = len(df.loc[not_null, "cell"].unique())

    if n_cells_pre != n_cells_post:
        raise ValueError(
            f"Number of cells changed after conversion to integers; Baysor probably used "
            f"different id prefixes with the same integer suffix for some cells. Check file: {file}"
        )

    # Convert NaN values to 0 (background)
    df.loc[df["cell"].isnull(), "cell"] = 0

    return df

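# A minimal sanity check of the conversion on synthetic data (hypothetical
# ids, not taken from actual Baysor output): string ids map to their integer
# suffixes and NaN maps to the background id 0.
_example = pd.DataFrame({"cell": ["CR4b68f93d8-27", "CR4b68f93d8-3", None]})
assert convert_str_ids_to_ints(_example)["cell"].tolist() == [27, 3, 0]
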
df_baysor = convert_str_ids_to_ints(df_baysor, file_path_for_error_messages=BAYSOR_OUTPUT)


# Add cell ids to transcripts
print('Adding cell ids to transcripts', flush=True)
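# Hedged sanity check: mapping ids back positionally assumes Baysor preserves
# the row count and order of the input CSV; at minimum, verify the counts match.
assert len(df_baysor) == len(sdata[par['transcripts_key']]), \
    "Baysor output row count does not match the number of input transcripts"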
# Wrap the Baysor cell ids in a dask series whose chunks match the partition
# sizes of the transcripts dataframe, so the column can be assigned directly.
cell_id_dask_series = dask.dataframe.from_dask_array(
    dask.array.from_array(
        df_baysor['cell'].values, chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
    ),
    index=sdata[par['transcripts_key']].index
)
sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series


# Create objects for cells table
print('Creating objects for cells table', flush=True)
# create a new .obs for cells based on the segmentation output (corresponding
# with the transcripts 'cell_id')
unique_cells = np.unique(cell_id_dask_series)

# remove the '0' (noise/background) cell id if present
unique_cells = unique_cells[unique_cells != 0]

# transform into a pandas series and check
cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
assert 0 not in cell_id_col.values, "Found '0' in cell_id column of assignment output cell matrix"


# Create a transcripts-only SpatialData object
print('Subsetting to transcripts and cell id data', flush=True)
sdata_transcripts_only = sd.SpatialData(
    points={
        "transcripts": sdata[par['transcripts_key']]
    },
    tables={
        "table": ad.AnnData(
            obs=pd.DataFrame(cell_id_col),
            # keep the gene index, but drop all var columns
            var=sdata.tables["table"].var[[]]
        )
    }
)

# Write output
print('Writing transcripts with cell ids', flush=True)
if os.path.exists(par["output"]):
    shutil.rmtree(par["output"])

sdata_transcripts_only.write(par['output'])
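
# Optional verification sketch: reload the written store and confirm that the
# expected elements are present (a lightweight check, assuming the store is
# readable in place right after writing).
sdata_check = sd.read_zarr(par['output'])
assert "transcripts" in sdata_check.points, "transcripts points missing from output"
assert "table" in sdata_check.tables, "cells table missing from output"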