Changed transcripts handling from CSV to Parquet files

khersameesh24 · khersameesh24 · commit 248b7b2d7050 · 2025-05-19T13:47:29.000Z
diff --git a/conf/modules.config b/conf/modules.config
@@ -27,6 +27,10 @@ process {
         ]
     }
 
+    withName: GUNZIP {
+        ext.prefix = "transcripts.csv"
+    }
+
     withName: XENIUMRANGER_RESEGMENT {
         publishDir = [
             path: "${params.outdir}/xeniumranger/resegment",
diff --git a/modules.json b/modules.json
@@ -14,7 +14,8 @@
                     "gunzip": {
                         "branch": "master",
                         "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
-                        "installed_by": ["modules"]
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/gunzip/gunzip.diff"
                     },
                     "multiqc": {
                         "branch": "master",
diff --git a/modules/local/spatialconverter/parquet_to_csv/main.nf b/modules/local/spatialconverter/parquet_to_csv/main.nf
@@ -4,21 +4,21 @@ process PARQUET_TO_CSV {
 
     container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"
 
-    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
-        error "PARQUET_TO_CSV module does not support Conda. Please use Docker / Singularity / Podman instead."
-    }
-
     input:
     tuple val(meta), path(transcripts)
 
     output:
-    tuple val(meta), path("*.csv")   , emit: transcripts_csv
-    path("versions.yml")             , emit: versions
+    tuple val(meta), path("*.csv"), emit: transcripts_csv
+    path("versions.yml")          , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "PARQUET_TO_CSV module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+
     template 'parquet_to_csv.py'
 
     stub:
diff --git a/modules/nf-core/gunzip/gunzip.diff b/modules/nf-core/gunzip/gunzip.diff
diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf
diff --git a/nextflow.config b/nextflow.config
@@ -51,9 +51,6 @@ params {
     features                   = null
 
     // Baysor specific
-    baysor_run_image           = true       // run baysor with image/seg-mask
-    baysor_run_transcripts     = false      // run baysor with transcripts.csv.gz
-    baysor_preview             = false      // generate preview with baysor preview cmd
 
     // MultiQC options
     multiqc_config             = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -85,7 +85,7 @@
             "properties": {
                 "generate_preview": {
                     "type": "boolean",
-                    "description": "Whether to generate a preview of the dataset with the transcripts.csv.gz."
+                    "description": "Whether to generate a preview of the dataset with the transcripts.parquet."
                 },
                 "segmentation_refinement": {
                     "type": "boolean",
@@ -175,19 +175,6 @@
                 "features": {
                     "type": "string",
                     "description": "List of features to be passed to the ficture method. (eg: TP53,OCIAD1,BCAS3,SOX)"
-                },
-                "baysor_run_image": {
-                    "type": "boolean",
-                    "default": true,
-                    "description": "Whether to run bayor with image/segmentation-mask."
-                },
-                "baysor_run_transcripts": {
-                    "type": "boolean",
-                    "description": "Whether to run baysor with transcripts.csv.gz."
-                },
-                "baysor_preview": {
-                    "type": "boolean",
-                    "description": "Whether to create a preview of the dataset with transcripts.csv.gz."
                 }
             }
         },
diff --git a/subworkflows/local/baysor_generate_preview/main.nf b/subworkflows/local/baysor_generate_preview/main.nf
@@ -2,31 +2,29 @@
 // Run baysor create_dataset & preview
 //
 
-include { GUNZIP                } from '../../../modules/nf-core/gunzip/main'
 include { BAYSOR_PREVIEW        } from '../../../modules/local/baysor/preview/main'
 include { BAYSOR_CREATE_DATASET } from '../../../modules/local/baysor/create_dataset/main'
+include { PARQUET_TO_CSV        } from '../../../modules/local/spatialconverter/parquet_to_csv/main'
 
 workflow BAYSOR_GENERATE_PREVIEW {
 
     take:
 
-    ch_transcripts_csv // channel: [ val(meta), ["path-to-transcripts.csv.gz"] ]
-    ch_config          // channel: ["path-to-xenium.toml"]
+    ch_transcripts_parquet // channel: [ val(meta), ["path-to-transcripts.parquet"] ]
+    ch_config              // channel: ["path-to-xenium.toml"]
 
     main:
 
     ch_versions             = Channel.empty()
     ch_preview_html         = Channel.empty()
 
 
-    // unzip transcripts.csv.gz
-    GUNZIP ( ch_transcripts_csv )
-    ch_versions = ch_versions.mix ( GUNZIP.out.versions )
-
-    ch_unzipped_transcripts = GUNZIP.out.gunzip
+    // run parquet to csv
+    PARQUET_TO_CSV ( ch_transcripts_parquet )
+    ch_versions = ch_versions.mix ( PARQUET_TO_CSV.out.versions )
 
     // generate randomised sample data
-    BAYSOR_CREATE_DATASET ( ch_unzipped_transcripts, "0.3" )
+    BAYSOR_CREATE_DATASET ( PARQUET_TO_CSV.out.transcripts_csv, "0.3" )
     ch_versions = ch_versions.mix ( BAYSOR_CREATE_DATASET.out.versions )
 
     // run baysor preview if param - generate_preview is true
diff --git a/subworkflows/local/baysor_generate_segfree/main.nf b/subworkflows/local/baysor_generate_segfree/main.nf
@@ -10,27 +10,27 @@ workflow BAYSOR_GENERATE_SEGFREE {
 
     take:
 
-    ch_transcripts // channel: [ val(meta), ["transcripts.csv.gz"] ]
+    ch_transcripts_parquet // channel: [ val(meta), ["transcripts.parquet"] ]
+    ch_config
 
     main:
 
     ch_versions = Channel.empty()
 
     ch_ncvs     = Channel.empty()
 
-    // unzip transcripts.csv.gz
-    GUNZIP ( ch_transcripts )
-    ch_versions = ch_versions.mix ( GUNZIP.out.versions )
-
     // run baysor segfree
     BAYSOR_SEGFREE (
-        GUNZIP.out.gunzip
+        ch_transcripts_parquet,
+        ch_config
     )
     ch_versions = ch_versions.mix( BAYSOR_SEGFREE.out.versions )
 
+    ch_ncvs = BAYSOR_SEGFREE.out.ncvs
+
     emit:
 
-    ncvs     = ch_ncvs
+    ncvs     = ch_ncvs      // channel: [ val(meta), ["ncvs.loom"] ]
 
-    versions = ch_versions                    // channel: [ versions.yml ]
+    versions = ch_versions  // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/baysor_run_prior_segmentation_mask/main.nf b/subworkflows/local/baysor_run_prior_segmentation_mask/main.nf
@@ -2,20 +2,18 @@
 // Run baysor run & import-segmentation
 //
 
-include { GUNZIP                               } from '../../../modules/nf-core/gunzip/main'
-include { RESOLIFT                             } from '../../../modules/local/resolift/main'
-include { BAYSOR_RUN as BAYSOR_RUN_IMAGE       } from '../../../modules/local/baysor/run/main'
-include { XENIUMRANGER_IMPORT_SEGMENTATION     } from '../../../modules/nf-core/xeniumranger/import-segmentation/main'
+include { BAYSOR_RUN as BAYSOR_RUN_IMAGE   } from '../../../modules/local/baysor/run/main'
+include { XENIUMRANGER_IMPORT_SEGMENTATION } from '../../../modules/nf-core/xeniumranger/import-segmentation/main'
 
 
 workflow BAYSOR_RUN_PRIOR_SEGMENTATION_MASK {
 
     take:
 
-    ch_bundle_path       // channel: [ val(meta), ["path-to-xenium-bundle"] ]
-    ch_transcripts_csv   // channel: [ val(meta), ["path-to-transcripts.csv.gz"] ]
-    ch_segmentation_mask // channel: [ ["path-to-prior-segmentation-mask"] ]
-    ch_config            // channel: ["path-to-xenium.toml"]
+    ch_bundle_path         // channel: [ val(meta), ["path-to-xenium-bundle"] ]
+    ch_transcripts_parquet // channel: [ val(meta), ["path-to-transcripts.parquet"] ]
+    ch_segmentation_mask   // channel: [ ["path-to-prior-segmentation-mask"] ]
+    ch_config              // channel: [ "path-to-xenium.toml" ]
 
     main:
 
@@ -26,17 +24,11 @@ workflow BAYSOR_RUN_PRIOR_SEGMENTATION_MASK {
     ch_htmls                = Channel.empty()
 
     ch_redefined_bundle     = Channel.empty()
-    ch_unzipped_transcripts = Channel.empty()
 
-    // unzip transcripts.csv.gz
-    GUNZIP ( ch_transcripts_csv )
-    ch_versions = ch_versions.mix ( GUNZIP.out.versions )
-
-    ch_unzipped_transcripts = GUNZIP.out.gunzip
 
     // run baysor with morphology.tiff
     BAYSOR_RUN_IMAGE (
-        ch_unzipped_transcripts,
+        ch_transcripts_parquet,
         ch_segmentation_mask,
         ch_config,
         30
diff --git a/subworkflows/local/baysor_run_transcripts_csv/main.nf b/subworkflows/local/baysor_run_transcripts_csv/main.nf
@@ -11,9 +11,9 @@ workflow BAYSOR_RUN_TRANSCRIPTS_CSV {
 
     take:
 
-    ch_bundle_path     // channel: [ val(meta), ["xenium-bundle"] ]
-    ch_transcripts_csv // channel: [ val(meta), ["transcripts.csv.gz"] ]
-    ch_config          // channel: ["path-to-xenium.toml"]
+    ch_bundle_path          // channel: [ val(meta), ["xenium-bundle"] ]
+    ch_transcripts_parquet  // channel: [ val(meta), ["transcripts.parquet"] ]
+    ch_config               // channel: ["path-to-xenium.toml"]
 
     main:
 
@@ -24,18 +24,10 @@ workflow BAYSOR_RUN_TRANSCRIPTS_CSV {
     ch_htmls                = Channel.empty()
 
     ch_redefined_bundle     = Channel.empty()
-    ch_unzipped_transcripts = Channel.empty()
-
-
-    // unzip transcripts.csv.gz
-    GUNZIP ( ch_transcripts_csv )
-    ch_versions = ch_versions.mix ( GUNZIP.out.versions )
-
-    ch_unzipped_transcripts = GUNZIP.out.gunzip
 
     // run baysor with transcripts.csv
     BAYSOR_RUN_TRANSCRIPTS (
-        ch_unzipped_transcripts,
+        ch_transcripts_parquet,
         [],
         ch_config,
         30
diff --git a/subworkflows/local/proseg_preset_proseg2baysor/main.nf b/subworkflows/local/proseg_preset_proseg2baysor/main.nf
@@ -2,26 +2,35 @@
 // Runs proseg for the xenium format and proseg2baysor to generate cell polygons
 //
 
+include { GUNZIP                           } from '../../../modules/nf-core/gunzip/main'
 include { PROSEG                           } from '../../../modules/local/proseg/preset/main'
 include { PROSEG2BAYSOR                    } from '../../../modules/local/proseg/proseg2baysor/main'
+include { PARQUET_TO_CSV                   } from '../../../modules/local/spatialconverter/parquet_to_csv/main'
 include { XENIUMRANGER_IMPORT_SEGMENTATION } from '../../../modules/nf-core/xeniumranger/import-segmentation/main'
 
 workflow PROSEG_PRESET_PROSEG2BAYSOR {
 
     take:
 
-    ch_bundle_path     // channel: [ val(meta), ["path-to-xenium-bundle"] ]
-    ch_transcripts_csv // channel: [ val(meta), [ "transcripts.csv.gz" ] ]
+    ch_bundle_path         // channel: [ val(meta), ["path-to-xenium-bundle"] ]
+    ch_transcripts_parquet // channel: [ val(meta), [ "transcripts.parquet" ] ]
 
     main:
 
     ch_versions = Channel.empty()
 
+    // run parquet-to-csv
+    PARQUET_TO_CSV ( ch_transcripts_parquet )
+    ch_versions = ch_versions.mix( PARQUET_TO_CSV.out.versions )
+
+    // run the locally patched GUNZIP module (see modules/nf-core/gunzip/gunzip.diff) to create `transcripts.csv.gz`
+    GUNZIP ( PARQUET_TO_CSV.out.transcripts_csv )
+    ch_versions = ch_versions.mix( GUNZIP.out.versions )
+
     // run proseg with the xenium format
-    PROSEG ( ch_transcripts_csv )
+    PROSEG ( GUNZIP.out.gunzip )
     ch_versions = ch_versions.mix( PROSEG.out.versions )
 
-
     // run proseg-to-baysor on the data generated with the proseg run
     PROSEG2BAYSOR ( PROSEG.out.cell_polygons_2d, PROSEG.out.transcript_metadata )
     ch_versions = ch_versions.mix( PROSEG2BAYSOR.out.versions )
diff --git a/workflows/spatialxe.nf b/workflows/spatialxe.nf

Original file line number	Diff line number	Diff line change
`@@ -27,6 +27,10 @@ process {`
`27`	`27`	`]`
`28`	`28`	`}`
`29`	`29`
	`30`	`+ withName: GUNZIP {`
	`31`	`+ ext.prefix = "transcripts.csv"`
	`32`	`+ }`
	`33`	`+`
`30`	`34`	`withName: XENIUMRANGER_RESEGMENT {`
`31`	`35`	`publishDir = [`
`32`	`36`	`path: "${params.outdir}/xeniumranger/resegment",`