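# Assign transcripts to cells with Baysor: export the transcripts to CSV and
# the prior segmentation to TIF, run the Baysor CLI, then map the resulting
# cell ids back onto the SpatialData transcripts element.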
import os
import shutil
from pathlib import Path
from tifffile import imwrite
# dask submodules are not loaded by a bare `import dask`, so import them explicitly
import dask.array
import dask.dataframe
import numpy as np
import pandas as pd
import anndata as ad
import spatialdata as sd


## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
    'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
    'transcripts_key': 'transcripts',
    'coordinate_system': 'global',
    'output': './temp/methods/baysor/baysor_assigned_transcripts.zarr',

    'force_2d': 'false',
    'min_molecules_per_cell': 50,
    'scale': -1.0,
    'scale_std': "25%",
    'n_clusters': 4,
    'prior_segmentation_confidence': 0.8,
}
meta = {
    'name': 'baysor_transcript_assignment',
    'temp_dir': "./temp/methods/baysor"
}
## VIASH END

TMP_DIR = Path(meta["temp_dir"] or "/tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)

TRANSCRIPTS_CSV = TMP_DIR / "transcripts.csv"
SEGMENTATION_TIF = TMP_DIR / "segmentation.tif"
CONFIG_TOML = TMP_DIR / "config.toml"
BAYSOR_OUTPUT = TMP_DIR / "baysor_output.csv"


# Read input
print('Reading input files', flush=True)
sdata = sd.read_zarr(par['input_ist'])
sdata_segm = sd.read_zarr(par['input_segmentation'])

# Check that the coordinate system is available in both inputs
transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys()
assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in transcripts data."
segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in segmentation data."

# Transform transcript coordinates to the target coordinate system
print('Transforming transcripts coordinates', flush=True)
transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])

# If the segmentation carries a transformation (e.g. a translation from a crop
# of the data), apply its inverse so the transcripts align with the
# segmentation's pixel coordinates.
trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()
transcripts = sd.transform(transcripts, trans, par['coordinate_system'])


# Write transcripts to csv
print('Writing transcripts to csv', flush=True)
transcripts[['x', 'y', 'z', 'feature_name']].compute().to_csv(TRANSCRIPTS_CSV)

# Write the full-resolution level ("scale0") of the multiscale segmentation to tif
print('Writing segmentation to tif', flush=True)
imwrite(SEGMENTATION_TIF, sdata_segm["segmentation"]["scale0"].image.to_numpy())

# Write config to toml
print('Writing config to toml', flush=True)
toml_str = f"""[data]
x = "x"
y = "y"
z = "z"
gene = "feature_name"
force_2d = {par['force_2d']}
min_molecules_per_cell = {int(par['min_molecules_per_cell'])}
exclude_genes = ""

[segmentation]
scale = {float(par['scale'])}
scale_std = "{par['scale_std']}"
n_clusters = {int(par['n_clusters'])}
prior_segmentation_confidence = {float(par['prior_segmentation_confidence'])}
"""
with open(CONFIG_TOML, "w") as toml_file:
    toml_file.write(toml_str)


# Run Baysor
print('Running Baysor', flush=True)
baysor_cmd = f"baysor run -c {CONFIG_TOML} -o {BAYSOR_OUTPUT} {TRANSCRIPTS_CSV} {SEGMENTATION_TIF}"
print("\t" + baysor_cmd, flush=True)
exit_status = os.system(baysor_cmd)
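# os.system does not raise on failure, so fail fast if Baysor exited with a
# non-zero status or did not produce the expected output file.
assert exit_status == 0, f"Baysor exited with non-zero status {exit_status}"
assert BAYSOR_OUTPUT.exists(), f"Baysor did not produce {BAYSOR_OUTPUT}"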


# Read Baysor output
print('Reading Baysor output', flush=True)
df_baysor = pd.read_csv(BAYSOR_OUTPUT)

# Formatting of Baysor output
print('Formatting Baysor output', flush=True)

def convert_str_ids_to_ints(df, file_path_for_error_messages=None):
    """Convert cell ids like "CR4b68f93d8-27" to 27.

    The file path argument is only used to produce more informative error messages.
    """
    df = df.copy()
    file = file_path_for_error_messages
    not_null = ~df["cell"].isnull()

    unique_cell_values = df.loc[not_null, "cell"].unique()
    unique_types = {type(value) for value in unique_cell_values}
    n_cells_pre = len(unique_cell_values)
    if (len(unique_types) == 1) and (str in unique_types):
        # Keep only the integer suffix of each id, e.g. "CR4b68f93d8-27" -> 27
        df.loc[not_null, "cell"] = df.loc[not_null, "cell"].apply(lambda i: i.split("-")[-1]).astype(int)
    elif len(unique_types) != 1:
        raise ValueError(f"Non-NaN values of column 'cell' in file {file} have multiple types: {unique_types}")
    n_cells_post = len(df.loc[not_null, "cell"].unique())

    if n_cells_pre != n_cells_post:
        raise ValueError(
            f"Number of cells changed after conversion to integers; Baysor probably used "
            f"different id prefixes with the same integer suffix for some cells. Check file: {file}"
        )

    # Convert NaN values to 0 (background)
    df.loc[df["cell"].isnull(), "cell"] = 0

    return df

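# A minimal sanity check of the conversion on synthetic data (hypothetical
# ids, not taken from actual Baysor output): string ids map to their integer
# suffixes and NaN maps to the background id 0.
_example = pd.DataFrame({"cell": ["CR4b68f93d8-27", "CR4b68f93d8-3", None]})
assert convert_str_ids_to_ints(_example)["cell"].tolist() == [27, 3, 0]
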
df_baysor = convert_str_ids_to_ints(df_baysor, file_path_for_error_messages=BAYSOR_OUTPUT)


# Add cell ids to transcripts
print('Adding cell ids to transcripts', flush=True)
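# Hedged sanity check: mapping ids back positionally assumes Baysor preserves
# the row count and order of the input CSV; at minimum, verify the counts match.
assert len(df_baysor) == len(sdata[par['transcripts_key']]), \
    "Baysor output row count does not match the number of input transcripts"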
# Wrap the Baysor cell ids in a dask series whose chunks match the partition
# sizes of the transcripts dataframe, so the column can be assigned directly.
cell_id_dask_series = dask.dataframe.from_dask_array(
    dask.array.from_array(
        df_baysor['cell'].values, chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
    ),
    index=sdata[par['transcripts_key']].index
)
sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series


# Create objects for cells table
print('Creating objects for cells table', flush=True)
# create a new .obs for cells based on the segmentation output (corresponding
# with the transcripts 'cell_id')
unique_cells = np.unique(cell_id_dask_series)

# remove the '0' (noise/background) cell id if present
unique_cells = unique_cells[unique_cells != 0]

# transform into a pandas series and check
cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
assert 0 not in cell_id_col.values, "Found '0' in cell_id column of assignment output cell matrix"


# Create a transcripts-only SpatialData object
print('Subsetting to transcripts and cell id data', flush=True)
sdata_transcripts_only = sd.SpatialData(
    points={
        "transcripts": sdata[par['transcripts_key']]
    },
    tables={
        "table": ad.AnnData(
            obs=pd.DataFrame(cell_id_col),
            # keep the gene index, but drop all var columns
            var=sdata.tables["table"].var[[]]
        )
    }
)

# Write output
print('Writing transcripts with cell ids', flush=True)
if os.path.exists(par["output"]):
    shutil.rmtree(par["output"])

sdata_transcripts_only.write(par['output'])
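
# Optional verification sketch: reload the written store and confirm that the
# expected elements are present (a lightweight check, assuming the store is
# readable in place right after writing).
sdata_check = sd.read_zarr(par['output'])
assert "transcripts" in sdata_check.points, "transcripts points missing from output"
assert "table" in sdata_check.tables, "cells table missing from output"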