import os
import shutil
from pathlib import Path
import xarray as xr
# Import the dask submodules used below explicitly; `import dask` alone does not
# expose `dask.array` or `dask.dataframe`
import dask.array
import dask.dataframe
import numpy as np
import pandas as pd
import anndata as ad
import spatialdata as sd
import sopa


## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
    'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
    'transcripts_key': 'transcripts',
    'coordinate_system': 'global',
    'output': './temp/methods/baysor/baysor_assigned_transcripts.zarr',

    'force_2d': 'false',
    'min_molecules_per_cell': 50,
    'scale': -1.0,  # NOTE: for parameter selection, see https://github.com/gustaveroussy/sopa/tree/main/workflow/config
    'scale_std': "25%",
    'n_clusters': 4,
    'prior_segmentation_confidence': 0.8,
}
meta = {
    'name': 'baysor_transcript_assignment',
    'temp_dir': "./temp/methods/baysor",
    'cpus': 4,
}
## VIASH END

TMP_DIR = Path(meta["temp_dir"] or "/tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)

CONFIG_TOML = TMP_DIR / "config.toml"


##############################
# Basic assignment for prior #
##############################

# Sopa expects the prior segmentation as a `cell_id` column in the transcripts table.
# Generate this column with a basic pixel-lookup assignment:
print('Reading input files', flush=True)
sdata = sd.read_zarr(par['input_ist'])
sdata_segm = sd.read_zarr(par['input_segmentation'])

# Check that the requested coordinate system is available in both inputs
transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys()
assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in transcripts data."
segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in segmentation data."

print('Transforming transcripts coordinates', flush=True)
transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])

# In case of a translation transformation of the segmentation (e.g. crop of the data),
# we need to adjust the transcript coordinates
trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()
transcripts = sd.transform(transcripts, trans, par['coordinate_system'])
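
# Net effect of the two transforms above: transcript coordinates are mapped from their
# intrinsic space into the chosen coordinate system, then back into the segmentation's
# pixel grid (e.g. undoing the translation of a cropped segmentation), so they can be
# used directly as indices into the label image below.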

print('Assigning transcripts to cell ids', flush=True)
# Cast transcript coordinates to integer pixel indices (float values are truncated)
y_coords = transcripts.y.compute().to_numpy(dtype=np.int64)
x_coords = transcripts.x.compute().to_numpy(dtype=np.int64)
if isinstance(sdata_segm["segmentation"], xr.DataTree):
    # Multiscale labels: take the full-resolution level (scale0)
    label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy()
else:
    label_image = sdata_segm["segmentation"].to_numpy()
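
# Added sanity check (not in the original pipeline): the fancy indexing below assumes
# every transcript falls inside the label image after the inverse transform; negative
# indices would silently wrap around and too-large ones would raise an IndexError.
assert (y_coords >= 0).all() and (x_coords >= 0).all(), "Negative transcript pixel coordinates"
assert y_coords.max() < label_image.shape[0] and x_coords.max() < label_image.shape[1], \
    "Transcript pixel coordinates fall outside the segmentation image"
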
cell_id_dask_series = dask.dataframe.from_dask_array(
    dask.array.from_array(
        # Look up the label value at each transcript position; chunk to match the
        # partitioning of the transcripts dataframe
        label_image[y_coords, x_coords],
        chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
    ),
    index=sdata[par['transcripts_key']].index
)
sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series


########################
# Run baysor with sopa #
########################

# Create reduced sdata
sdata_sopa = sd.SpatialData(
    points={
        "transcripts": sdata[par['transcripts_key']]
    },
)

# Write config to toml
print('Writing config to toml', flush=True)
# Lower-case force_2d so that both the string 'false' and a boolean False render
# as a valid TOML boolean
toml_str = f"""[data]
x = "x"
y = "y"
z = "z"
gene = "feature_name"
force_2d = {str(par['force_2d']).lower()}
min_molecules_per_cell = {int(par['min_molecules_per_cell'])}
exclude_genes = ""

[segmentation]
scale = {float(par['scale'])}
scale_std = "{par['scale_std']}"
n_clusters = {int(par['n_clusters'])}
prior_segmentation_confidence = {float(par['prior_segmentation_confidence'])}
"""
with open(CONFIG_TOML, "w") as toml_file:
    toml_file.write(toml_str)
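
# With the default `par` values above, the rendered config.toml reads:
#
#   [data]
#   x = "x"
#   y = "y"
#   z = "z"
#   gene = "feature_name"
#   force_2d = false
#   min_molecules_per_cell = 50
#   exclude_genes = ""
#
#   [segmentation]
#   scale = -1.0
#   scale_std = "25%"
#   n_clusters = 4
#   prior_segmentation_confidence = 0.8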


# Make transcript patches
sopa.make_transcript_patches(sdata_sopa, patch_width=2000, patch_overlap=50, prior_shapes_key="cell_id")
sopa.settings.parallelization_backend = "dask"
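# With the "dask" backend set above, sopa executes the per-patch Baysor runs in
# parallel; the `cell_id` column selected via `prior_shapes_key` serves as the
# segmentation prior, weighted by `prior_segmentation_confidence` in the config.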
| 118 | + |
| 119 | +# Run baysor |
| 120 | +sopa.segmentation.baysor(sdata_sopa, config=str(CONFIG_TOML)) |
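# Baysor's cell polygons are written back to `sdata_sopa` as the "baysor_boundaries"
# shapes element, which the assignment step below relies on.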
| 121 | + |
| 122 | +# Assign transcripts to cell ids |
| 123 | +sopa.spatial.assign_transcript_to_cell( |
| 124 | + sdata_sopa, |
| 125 | + points_key="transcripts", |
| 126 | + shapes_key="baysor_boundaries", |
| 127 | + key_added="cell_id", |
| 128 | + unassigned_value=0 |
| 129 | +) |
| 130 | + |
| 131 | + |
| 132 | + |
# Create objects for cells table
print('Creating objects for cells table', flush=True)
# Create a new .obs for cells based on the segmentation output (matching the transcripts' 'cell_id')
unique_cells = np.unique(sdata_sopa["transcripts"]["cell_id"].compute())

# Remove the '0' (noise/background) label if present
unique_cells = unique_cells[unique_cells != 0]

# Transform into a pandas Series and check
cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
assert 0 not in cell_id_col.values, "Found '0' in cell_id column of assignment output cell matrix"


# Create transcripts only sdata
print('Subsetting to transcripts cell id data', flush=True)
sdata_transcripts_only = sd.SpatialData(
    points={
        "transcripts": sdata_sopa['transcripts']
    },
    tables={
        "table": ad.AnnData(
            obs=pd.DataFrame(cell_id_col),
            var=sdata.tables["table"].var[[]]
        )
    }
)
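# Note: `var=sdata.tables["table"].var[[]]` keeps the gene index from the input table
# but drops all of its columns; this component produces only cell ids (obs), not a
# counts matrix.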

# Write output
print('Write transcripts with cell ids', flush=True)
if os.path.exists(par["output"]):
    shutil.rmtree(par["output"])

sdata_transcripts_only.write(par['output'])
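
# Optional round-trip check (hypothetical, not part of the original script): re-open
# the written zarr and confirm the transcripts and table elements survive the write.
# sdata_check = sd.read_zarr(par['output'])
# print(sdata_check)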