Add split (#125)

habibrehman2002 · Habib Rehman · LouisK92 · web-flow · commit f64ec2759547 · 2026-02-23T18:11:56.000+01:00
* Added SPLIT, tentatively works

* Fixed filtering and container for SPLIT

* Save uncorrected counts in split script

* Rename split_correction to split and add method to workflow and scripts

---------

Co-authored-by: Habib Rehman <harehman@iu.edu>
Co-authored-by: LouisK92 <louiskuemmerle@googlemail.com>
diff --git a/scripts/run_benchmark/config.yaml b/scripts/run_benchmark/config.yaml
diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh
@@ -65,6 +65,7 @@ expression_correction_methods:
   - no_correction
   # - gene_efficiency_correction
   # - resolvi_correction
+  # - split
 method_parameters_yaml: /tmp/method_params.yaml
 HERE
 
diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh
@@ -57,6 +57,7 @@ expression_correction_methods:
   - no_correction
   - gene_efficiency_correction
   - resolvi_correction
+  - split
 method_parameters_yaml: /tmp/method_params.yaml
 HERE
 
diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh
@@ -60,6 +60,7 @@ expression_correction_methods:
   - no_correction
   # - gene_efficiency_correction
   # - resolvi_correction
+  # - split
 method_parameters_yaml: /tmp/method_params.yaml
 HERE
 
diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh
@@ -56,6 +56,7 @@ expression_correction_methods:
   - no_correction
   - gene_efficiency_correction
   - resolvi_correction
+  - split
 #method_parameters_yaml: /tmp/method_params.yaml
 HERE
 
diff --git a/src/methods_expression_correction/split/config.vsh.yaml b/src/methods_expression_correction/split/config.vsh.yaml
@@ -0,0 +1,51 @@
+__merge__: /src/api/comp_method_expression_correction.yaml
+
+name: split
+label: "SPLIT"
+summary: "Correct doublet/mis-segmented cells using SPLIT"
+description: "SPLIT (Spatial Purification of Layered Intracellular Transcripts) is a novel method that integrates snRNA-seq with RCTD deconvolution to enhance signal purity. SPLIT effectively resolves mixed transcriptomic signals, improving background correction and cell-type resolution."
+links:
+  documentation: "https://github.com/bdsc-tds/SPLIT"
+  repository: "https://github.com/bdsc-tds/SPLIT"
+references:
+  doi: "10.1101/2025.04.23.649965"
+
+arguments:
+  - name: --keep_all_cells
+    required: false
+    direction: input
+    type: boolean
+    default: false
+    description: Whether to keep cells with 0 counts (may cause errors if set to TRUE)
+
+resources:
+  - type: r_script
+    path: script.R
+
+engines:
+  - type: docker
+    image: openproblems/base_r:1
+    setup:
+      - type: docker
+        run: |
+          apt-get update
+      - type: r
+        bioc: [anndataR, rhdf5, devtools, scater]
+      - type: docker
+        run: |
+          Rscript -e "BiocManager::install('SingleCellExperiment', type = 'source', force = TRUE, ask = FALSE); options(timeout = 600000000); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE); devtools::install_github('bdsc-tds/SPLIT')"
+
+      # SingleCellExperiment part can probably be left out again in the future. It currently fixes a bug described in these issues:
+      # https://github.com/drighelli/SpatialExperiment/issues/171
+      # https://github.com/satijalab/seurat/issues/9889
+      # The reinstall of SingleCellExperiment triggers the correct re-install of SpatialExperiment.
+
+      # Using a large timeout here to reduce failures during GitHub package installation.
+
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ hightime, highcpu, highmem ]
diff --git a/src/methods_expression_correction/split/script.R b/src/methods_expression_correction/split/script.R
@@ -0,0 +1,105 @@
+library(spacexr)
+library(Matrix)
+library(SingleCellExperiment)
+library(anndataR)
+library(SPLIT)
+library(Seurat)
+library(scuttle)
+
+## VIASH START
+par <- list( # development defaults; NOTE(review): Viash presumably replaces this section at build time
+  "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_with_celltypes.h5ad",
+  "input_scrnaseq_reference" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad",
+  "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad",
+  "keep_all_cells" = FALSE
+)
+# meta$cpus is read further down to pick the number of cores; list() needs `name = value` and no trailing comma
+meta <- list(
+  "cpus" = 4
+)
+
+## VIASH END
+
+# Load the same annotated spatial h5ad twice: once as a SingleCellExperiment and once as a Seurat object
+sce <- read_h5ad(par$input_spatial_with_cell_types, as = "SingleCellExperiment")
+xe <- read_h5ad(par$input_spatial_with_cell_types, as = "Seurat")
+
+# Unless keep_all_cells is set, drop cells with zero total counts from both objects
+if (!par$keep_all_cells) {
+  cat("Filtering cells with 0 counts\n")
+  sce <- sce[, colSums(counts(sce)) > 0]
+  xe <- subset(xe, subset = nCount_RNA > 0) # NOTE(review): presumably Seurat's per-cell total for an assay named "RNA" - confirm
+}
+
+# Build the RCTD query object (SpatialRNA) from per-cell centroids and the raw counts assay
+cell_meta <- colData(sce)
+coords <- data.frame(
+  centroid_x = cell_meta$centroid_x, centroid_y = cell_meta$centroid_y
+)
+rownames(coords) <- cell_meta$cell_id # coordinates are keyed by the cell_id column
+puck <- SpatialRNA(coords, assay(sce, "counts"))
+
+# Read the scRNA-seq reference as a SingleCellExperiment
+ref <- read_h5ad(par$input_scrnaseq_reference, as = "SingleCellExperiment")
+
+# Keep only reference cell types with at least 25 cells (minimum for RCTD)
+celltype_sizes <- table(colData(ref)$cell_type)
+valid_celltypes <- names(celltype_sizes)[celltype_sizes >= 25]
+filtered_ref <- ref[, colData(ref)$cell_type %in% valid_celltypes]
+ref_counts <- assay(filtered_ref, "counts")
+# Re-factor so that cell types removed by the filter are dropped as levels
+colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type)
+cell_types <- colData(filtered_ref)$cell_type
+names(cell_types) <- colnames(ref_counts)
+reference <- Reference(ref_counts, cell_types, min_UMI = 0)
+
+# Use the CPU count supplied in meta when present and non-NULL; otherwise fall back to one core
+requested_cpus <- if ("cpus" %in% names(meta)) meta[["cpus"]] else NULL
+cores <- if (is.null(requested_cpus)) 1 else requested_cpus
+cat(sprintf("Number of cores: %s\n", cores))
+
+# Run RCTD in doublet mode on the spatial query against the filtered reference
+cat("Running RCTD\n")
+myRCTD <- create.RCTD(puck, reference, max_cores = cores)
+myRCTD <- run.RCTD(myRCTD, doublet_mode = "doublet")
+
+# Disabled: copy the RCTD "spot_class" annotation into colData(sce)$RCTD_class
+# cat("Saving RCTD spot_class\n")
+# results <- myRCTD@results
+# rctd_spot_class <- results$results_df$spot_class
+# names(rctd_spot_class) <- rownames(results$results_df)
+# colData(sce)$RCTD_class <- "not_included"
+# colData(sce)[names(rctd_spot_class),"RCTD_class"] <- as.character(rctd_spot_class)
+
+# Post-process the RCTD result with SPLIT's helper; the result is passed to purify() below
+RCTD <- SPLIT::run_post_process_RCTD(myRCTD)
+
+# Run SPLIT purification on the Seurat counts layer, guided by the post-processed RCTD result
+cat("Running SPLIT\n")
+res_split <- SPLIT::purify(
+  counts = GetAssayData(xe, assay = 'RNA', layer = 'counts'), # or any gene x cells counts matrix
+  rctd = RCTD,
+  DO_purify_singlets = TRUE # optional; NOTE(review): presumably also purifies cells RCTD calls singlets - confirm in SPLIT docs
+)
+
+
+# Add SPLIT-corrected counts to the original SingleCellExperiment and re-normalize
+cat("Normalizing counts\n")
+
+# Keep the input "normalized" assay under a new name, since "normalized" is overwritten below
+assay(sce, "normalized_uncorrected") <- assay(sce, "normalized")
+
+# Start "corrected_counts" as a copy of the raw counts
+assay(sce, "corrected_counts") <- assay(sce, "counts")
+
+# Overwrite only the genes/cells present in SPLIT's purified matrix (matched by row and column names)
+assay(sce, "corrected_counts")[rownames(res_split$purified_counts), colnames(res_split$purified_counts)] <- res_split$purified_counts
+
+# Library-size normalization of the corrected counts - see note in the resolVI method
+size_factors <- librarySizeFactors(assay(sce, "corrected_counts"))
+assay(sce, "normalized") <- assay(logNormCounts(sce, size_factors=size_factors, assay.type = "corrected_counts"),"logcounts")
+
+# Create the output directory if needed and write the final object as h5ad
+cat("Writing to h5ad\n")
+dir.create(dirname(par$output), showWarnings = FALSE, recursive = TRUE)
+write_h5ad(sce, par$output, mode = "w")
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
@@ -104,7 +104,7 @@ argument_groups:
           A list of expression correction methods to run.
         type: string
         multiple: true
-        default: "no_correction:gene_efficiency_correction:resolvi_correction"
+        default: "no_correction:gene_efficiency_correction:resolvi_correction:split"
   - name: Method parameters
     description: |
       Use these arguments to control the parameter sets that are run for each
@@ -175,6 +175,7 @@ dependencies:
   - name: methods_expression_correction/no_correction
   - name: methods_expression_correction/gene_efficiency_correction
   - name: methods_expression_correction/resolvi_correction
+  - name: methods_expression_correction/split
   - name: methods_data_aggregation/aggregate_spatial_data
   - name: metrics/similarity
   - name: metrics/quality
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
@@ -414,7 +414,8 @@ workflow run_wf {
   expr_corr_methods = [
     no_correction,
     gene_efficiency_correction,
-    resolvi_correction
+    resolvi_correction,
+    split
   ]
   
   expr_corr_ch = cta_ch

Original file line number	Diff line number	Diff line change
`@@ -414,7 +414,8 @@ workflow run_wf {`
`414`	`414`	`expr_corr_methods = [`
`415`	`415`	`no_correction,`
`416`	`416`	`gene_efficiency_correction,`
`417`		`- resolvi_correction`
	`417`	`+ resolvi_correction,`
	`418`	`+ split`
`418`	`419`	`]`
`419`	`420`
`420`	`421`	`expr_corr_ch = cta_ch`