Comseg integration (#45)

Kraftfahrzeughaftpflichtversicherung · web-flow · commit ca85a33038ed · 2025-09-19T00:00:02.000+02:00
diff --git a/src/methods_transcript_assignment/comseg/config.vsh.yaml b/src/methods_transcript_assignment/comseg/config.vsh.yaml
@@ -0,0 +1,98 @@
+name: comseg
+label: "ComSeg Segmentation"
+summary: "Spatial segmentation using ComSeg method"
+description: |
+  ComSeg is a spatial transcriptomics segmentation method that uses transcript locations 
+  and morphological information to define cell boundaries. It is particularly effective 
+  for high-resolution spatial transcriptomics data.
+links:
+  documentation: "https://github.com/openproblems-bio/task_ist_preprocessing"
+  repository: "https://github.com/openproblems-bio/task_ist_preprocessing"
+references:
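+  doi: "10.1038/s41592-020-01018-x"  # NOTE(review): this DOI resolves to the Cellpose paper; ComSeg is likely 10.1038/s42003-024-06480-3 — confirm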
+
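+__merge__: /src/api/comp_method_segmentation.yaml  # NOTE(review): component sits under methods_transcript_assignment — confirm this is the intended API file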
+
+arguments:
+  - name: --transcripts_key
+    type: string
+    default: "transcripts"
+    description: "Key for transcripts in the points layer"
+  - name: --shapes_key
+    type: string
+    default: "cell_boundaries"
+    description: "Key for cell boundaries in the shapes layer"
+  - name: --images_key
+    type: string
+    default: "morphology_mip"
+    description: "Key for morphology image in the images layer"
+  - name: --patch_width
+    type: integer
+    default: 1200
+    description: "Width of image patches for processing"
+  - name: --patch_overlap
+    type: integer
+    default: 50
+    description: "Overlap between patches"
+  - name: --transcript_patch_width
+    type: integer
+    default: 200
+    description: "Width of transcript patches"
+  - name: --mean_cell_diameter
+    type: double
+    default: 15.0
+    description: "Expected mean cell diameter in micrometers"
+  - name: --max_cell_radius
+    type: double
+    default: 25.0
+    description: "Maximum cell radius in micrometers"
+  - name: --alpha
+    type: double
+    default: 0.5
+    description: "Alpha parameter for ComSeg algorithm"
+  - name: --min_rna_per_cell
+    type: integer
+    default: 5
+    description: "Minimum number of transcripts per cell"
+  - name: --gene_column
+    type: string
+    default: "feature_name"
+    description: "Column name for gene identifiers in transcripts data"
+  - name: --norm_vector
+    type: boolean
+    default: false
+    description: "Whether to normalize vectors in ComSeg"
+  - name: --allow_disconnected_polygon
+    type: boolean
+    default: true
+    description: "Allow disconnected polygons in segmentation"
+
+
+resources:
+  - type: python_script
+    path: script.py
+
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        pypi: 
+          - spatialdata
+          - sopa
+          - anndata
+          - pandas
+          - numpy
+          - xarray
+          - scikit-image
+          - comseg
+          - scipy
+
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ hightime, midcpu, highmem ]
diff --git a/src/methods_transcript_assignment/comseg/script.py b/src/methods_transcript_assignment/comseg/script.py
@@ -0,0 +1,116 @@
+import spatialdata as sd
+import sopa
+import anndata as ad
+import pandas as pd
+import numpy as np
+from scipy import sparse
+
+## VIASH START
+par = {  # development defaults; Viash replaces everything between the VIASH markers when it builds the component
+    "input": "resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr",
+    "output": "transcripts.zarr",
+    # The entries below mirror the argument defaults declared in config.vsh.yaml.
+    "transcripts_key": "transcripts",
+    "shapes_key": "cell_boundaries",
+    "images_key": "morphology_mip",
+    "patch_width": 1200,
+    "patch_overlap": 50,
+    "transcript_patch_width": 200,
+    "mean_cell_diameter": 15.0,
+    "max_cell_radius": 25.0,
+    "alpha": 0.5,
+    "min_rna_per_cell": 5,
+    "gene_column": "feature_name",
+    "norm_vector": False,
+    "allow_disconnected_polygon": True,
+}
+## VIASH END
+
+def fixed_count_transcripts_aligned(geo_df, points, value_key):
+    """Count transcripts per cell and gene; returns an AnnData whose ``X`` is a scipy ``csr_matrix``.
+
+    Same as ``sopa.aggregation.transcripts._count_transcripts_aligned``, except that ``X`` is built as a
+    ``csr_matrix`` to avoid an error in the ComSeg call. Recasts ``points[value_key]`` to categorical in place.
+    """
+    from scipy.sparse import csr_matrix
+    from anndata import AnnData
+    from dask.diagnostics import ProgressBar
+    from functools import partial
+    from sopa._settings import settings
+    import geopandas as gpd
+    def _add_csr(X_partitions, geo_df, partition, gene_column, gene_names ):  # applied to each dask partition
+        if settings.gene_exclude_pattern is not None:
+            partition = partition[~partition[gene_column].str.match(settings.gene_exclude_pattern, case=False, na=False)]
+        # Spatial join of the rows of geo_df with this partition's transcript coordinates.
+        points_gdf = gpd.GeoDataFrame(partition, geometry=gpd.points_from_xy(partition["x"], partition["y"]))
+        joined = geo_df.sjoin(points_gdf)
+        cells_indices, column_indices = joined.index, joined[gene_column].cat.codes
+        cells_indices = cells_indices[column_indices >= 0]  # a code of -1 marks a missing category
+        column_indices = column_indices[column_indices >= 0]
+        X_partition = csr_matrix((np.full(len(cells_indices), 1), (cells_indices, column_indices)),
+            shape=(len(geo_df), len(gene_names)),
+        )  # repeated (row, column) pairs are summed on construction, which yields the counts
+        X_partitions.append(X_partition)
+
+    # Make the gene column a known categorical so that its codes can be used as column indices.
+    points[value_key] = points[value_key].astype("category").cat.as_known()
+    gene_names = points[value_key].cat.categories.astype(str)
+    X = csr_matrix((len(geo_df), len(gene_names)), dtype=int)
+    adata = AnnData(X=X, var=pd.DataFrame(index=gene_names))
+    adata.obs_names = geo_df.index.astype(str)
+    geo_df = geo_df.reset_index()  # index becomes 0..n-1, so joined.index can address the rows of X
+    X_partitions = []  # filled in place by _add_csr; map_partitions' own result is not used
+    with ProgressBar():
+        points.map_partitions(
+            partial(_add_csr, X_partitions, geo_df, gene_column=value_key, gene_names=gene_names),
+            meta=(),
+        ).compute()
+    for X_partition in X_partitions:
+        adata.X += X_partition
+    if settings.gene_exclude_pattern is not None:
+        adata = adata[:, ~adata.var_names.str.match(settings.gene_exclude_pattern, case=False, na=False)].copy()
+    return adata
+
+
+# Read input SpatialData
+sdata = sd.read_zarr(par["input"])
+sopa.make_image_patches(sdata, patch_width=par["patch_width"], patch_overlap=par["patch_overlap"])
+
+transcript_patch_args = {
+    "sdata": sdata,
+    "write_cells_centroids": True,
+    "patch_width": par["transcript_patch_width"],
+}
+transcript_patch_args["prior_shapes_key"] = par["shapes_key"]
+
+sopa.make_transcript_patches(**transcript_patch_args)
+
+config = {
+    "dict_scale": {"x": 1, "y": 1, "z": 1},
+    "mean_cell_diameter": par["mean_cell_diameter"],
+    "max_cell_radius": par["max_cell_radius"],
+    "norm_vector": par["norm_vector"],
+    "alpha": par["alpha"], 
+    "allow_disconnected_polygon": par["allow_disconnected_polygon"],
+    "min_rna_per_cell": par["min_rna_per_cell"],
+    "gene_column": par["gene_column"],
+}
+
+
+sopa.aggregation.transcripts._count_transcripts_aligned = fixed_count_transcripts_aligned
+sopa.segmentation.comseg(sdata, config)
+
+# Create output SpatialData 
+sd_output = sd.SpatialData()
+
+cell_id_col = sdata["transcripts"][f"cell_id"]
+sdata.tables["table"]=ad.AnnData(obs=pd.DataFrame({"cell_id":cell_id_col}), var=sdata.tables["table"].var[[]])
+sdata_new = sd.SpatialData(
+    points=sdata.points,  
+    tables=sdata.tables   
+) 
+
+output_path = par['output']
+sdata_new.write(output_path, overwrite=True)
+
+