From fcf1b9b4fac7ecb8b1a7c1dbd76c51b467946733 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 13 Jun 2025 10:41:36 -0500 Subject: [PATCH 1/2] Workflow outputs (third preview) Signed-off-by: Ben Sherman --- bin/fastqc.sh | 6 ++-- data/allreads.csv | 4 +++ data/gut.csv | 1 + main.nf | 63 +++++++++++++++++++++++++++++++---------- modules/fastqc/main.nf | 10 +++---- modules/multiqc/main.nf | 2 -- modules/quant/main.nf | 10 +++---- modules/rnaseq.nf | 20 ++++++------- nextflow.config | 34 +++++++--------------- 9 files changed, 86 insertions(+), 64 deletions(-) create mode 100644 data/allreads.csv create mode 100644 data/gut.csv diff --git a/bin/fastqc.sh b/bin/fastqc.sh index 93f38b6..55bc33b 100755 --- a/bin/fastqc.sh +++ b/bin/fastqc.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -sample_id="$1" +id="$1" reads="$2" -mkdir fastqc_${sample_id}_logs -fastqc -o fastqc_${sample_id}_logs -f fastq -q ${reads} +mkdir fastqc_${id}_logs +fastqc -o fastqc_${id}_logs -f fastq -q ${reads} diff --git a/data/allreads.csv b/data/allreads.csv new file mode 100644 index 0000000..db40aac --- /dev/null +++ b/data/allreads.csv @@ -0,0 +1,4 @@ +gut,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_gut_1.fq,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_gut_2.fq +liver,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_liver_1.fq,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_liver_2.fq +lung,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_lung_1.fq,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_lung_2.fq +spleen,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_spleen_1.fq,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_spleen_2.fq \ No newline at end of file diff --git 
a/data/gut.csv b/data/gut.csv new file mode 100644 index 0000000..e9c7353 --- /dev/null +++ b/data/gut.csv @@ -0,0 +1 @@ +gut,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_gut_1.fq,https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_gut_2.fq \ No newline at end of file diff --git a/main.nf b/main.nf index f585d53..7c9a46e 100755 --- a/main.nf +++ b/main.nf @@ -4,16 +4,17 @@ * Proof of concept of a RNAseq pipeline implemented with Nextflow */ +nextflow.preview.output = true /* * Default pipeline parameters. They can be overriden on the command line eg. - * given `params.foo` specify on the run command line `--foo some_value`. + * given `params.reads` specify on the run command line `--reads some_value`. */ -params.reads = "$baseDir/data/ggal/ggal_gut_{1,2}.fq" -params.transcriptome = "$baseDir/data/ggal/ggal_1_48850000_49020000.Ggal71.500bpflank.fa" +params.reads = null +params.transcriptome = null params.outdir = "results" -params.multiqc = "$baseDir/multiqc" +params.multiqc = "$projectDir/multiqc" // import modules @@ -24,16 +25,48 @@ include { MULTIQC } from './modules/multiqc' * main script flow */ workflow { + main: + log.info """\ + R N A S E Q - N F P I P E L I N E + =================================== + transcriptome: ${params.transcriptome} + reads : ${params.reads} + outdir : ${params.outdir} + """.stripIndent() -log.info """\ - R N A S E Q - N F P I P E L I N E - =================================== - transcriptome: ${params.transcriptome} - reads : ${params.reads} - outdir : ${params.outdir} - """ - - read_pairs_ch = channel.fromFilePairs( params.reads, checkIfExists: true ) - RNASEQ( params.transcriptome, read_pairs_ch ) - MULTIQC( RNASEQ.out, params.multiqc ) + inputs_ch = channel.fromPath(params.reads) + .splitCsv() + .map { id, fastq_1, fastq_2 -> + tuple(id, file(fastq_1, checkIfExists: true), file(fastq_2, checkIfExists: true)) + } + + samples_ch = RNASEQ( 
params.transcriptome, inputs_ch ) + .map { id, fastqc, quant -> + [id: id, fastqc: fastqc, quant: quant] + } + + multiqc_files_ch = samples_ch + .flatMap { sample -> [sample.fastqc, sample.quant] } + .collect() + multiqc_report = MULTIQC( multiqc_files_ch, params.multiqc ) + + publish: + samples = samples_ch + multiqc_report = multiqc_report +} + +output { + samples { + path { sample -> + sample.fastqc >> "fastqc/${sample.id}" + sample.quant >> "quant/${sample.id}" + } + index { + path 'samples.csv' + header true + } + } + + multiqc_report { + } } diff --git a/modules/fastqc/main.nf b/modules/fastqc/main.nf index 57c0477..5d013c9 100644 --- a/modules/fastqc/main.nf +++ b/modules/fastqc/main.nf @@ -1,18 +1,16 @@ -params.outdir = 'results' process FASTQC { - tag "FASTQC on $sample_id" + tag "$id" conda 'bioconda::fastqc=0.12.1' - publishDir params.outdir, mode:'copy' input: - tuple val(sample_id), path(reads) + tuple val(id), path(fastq_1), path(fastq_2) output: - path "fastqc_${sample_id}_logs", emit: logs + tuple val(id), path("fastqc_${id}_logs") script: """ - fastqc.sh "$sample_id" "$reads" + fastqc.sh "$id" "$fastq_1 $fastq_2" """ } diff --git a/modules/multiqc/main.nf b/modules/multiqc/main.nf index 43d7450..ac80a9e 100644 --- a/modules/multiqc/main.nf +++ b/modules/multiqc/main.nf @@ -1,8 +1,6 @@ -params.outdir = 'results' process MULTIQC { conda 'bioconda::multiqc=1.27.1' - publishDir params.outdir, mode:'copy' input: path '*' diff --git a/modules/quant/main.nf b/modules/quant/main.nf index 7e7286f..2ab0a3d 100644 --- a/modules/quant/main.nf +++ b/modules/quant/main.nf @@ -1,17 +1,17 @@ process QUANT { - tag "$pair_id" + tag "$id" conda 'bioconda::salmon=1.10.3' input: - path index - tuple val(pair_id), path(reads) + path index + tuple val(id), path(fastq_1), path(fastq_2) output: - path pair_id + tuple val(id), path("quant_${id}") script: """ - salmon quant --threads $task.cpus --libType=U -i $index -1 ${reads[0]} -2 ${reads[1]} -o $pair_id + salmon quant 
--threads $task.cpus --libType=U -i $index -1 ${fastq_1} -2 ${fastq_2} -o quant_$id """ } diff --git a/modules/rnaseq.nf b/modules/rnaseq.nf index 2f607c1..6849a9e 100644 --- a/modules/rnaseq.nf +++ b/modules/rnaseq.nf @@ -1,19 +1,19 @@ -params.outdir = 'results' include { INDEX } from './index' include { QUANT } from './quant' include { FASTQC } from './fastqc' workflow RNASEQ { - take: + take: transcriptome - read_pairs_ch - - main: - INDEX(transcriptome) - FASTQC(read_pairs_ch) - QUANT(INDEX.out, read_pairs_ch) + samples_ch - emit: - QUANT.out | concat(FASTQC.out) | collect + main: + index = INDEX(transcriptome) + fastqc_ch = FASTQC(samples_ch) + quant_ch = QUANT(index, samples_ch) + samples_ch = fastqc_ch.join(quant_ch) + + emit: + samples_ch } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index c07d123..922745e 100755 --- a/nextflow.config +++ b/nextflow.config @@ -17,16 +17,20 @@ manifest { } /* - * default params + * params for default test data */ -params.outdir = "results" -params.reads = "${projectDir}/data/ggal/ggal_gut_{1,2}.fq" -params.transcriptome = "${projectDir}/data/ggal/ggal_1_48850000_49020000.Ggal71.500bpflank.fa" -params.multiqc = "${projectDir}/multiqc" +params.reads = "${projectDir}/data/gut.csv" +params.transcriptome = "https://raw.githubusercontent.com/nextflow-io/rnaseq-nf/refs/heads/master/data/ggal/ggal_1_48850000_49020000.Ggal71.500bpflank.fa" /* - * defines execution profiles for different environments + * publish settings + */ + +workflow.output.mode = 'copy' + +/* + * execution profiles for different environments */ profiles { @@ -35,7 +39,7 @@ profiles { } 'all-reads' { - params.reads = "${projectDir}/data/ggal/ggal_*_{1,2}.fq" + params.reads = "${projectDir}/data/allreads.csv" } 'arm64' { @@ -84,8 +88,6 @@ profiles { } 'batch' { - params.reads = 's3://rnaseq-nf/data/ggal/lung_{1,2}.fq' - params.transcriptome = 's3://rnaseq-nf/data/ggal/transcript.fa' process.container = 
'docker.io/nextflow/rnaseq-nf:v1.3.1' process.executor = 'awsbatch' process.queue = 'nextflow-ci' @@ -94,15 +96,7 @@ profiles { aws.batch.cliPath = '/home/ec2-user/miniconda/bin/aws' } - 's3-data' { - process.container = 'docker.io/nextflow/rnaseq-nf:v1.3.1' - params.reads = 's3://rnaseq-nf/data/ggal/lung_{1,2}.fq' - params.transcriptome = 's3://rnaseq-nf/data/ggal/transcript.fa' - } - 'google-batch' { - params.transcriptome = 'gs://rnaseq-nf/data/ggal/transcript.fa' - params.reads = 'gs://rnaseq-nf/data/ggal/gut_{1,2}.fq' params.multiqc = 'gs://rnaseq-nf/multiqc' process.executor = 'google-batch' process.container = 'docker.io/nextflow/rnaseq-nf:v1.3.1' @@ -113,12 +107,6 @@ profiles { google.region = 'europe-west2' } - 'gs-data' { - process.container = 'docker.io/nextflow/rnaseq-nf:v1.3.1' - params.transcriptome = 'gs://rnaseq-nf/data/ggal/transcript.fa' - params.reads = 'gs://rnaseq-nf/data/ggal/gut_{1,2}.fq' - } - 'azure-batch' { process.container = 'docker.io/nextflow/rnaseq-nf:v1.3.1' workDir = 'az://nf-scratch/work' From 484bfed5224bf9b248b767518c3b6d6eff7d46eb Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 13 Jun 2025 12:43:07 -0500 Subject: [PATCH 2/2] Static types --- main.nf | 69 +++++++++++++++++++++++++---------------- modules/fastqc/main.nf | 7 +++-- modules/index/main.nf | 4 +-- modules/multiqc/main.nf | 6 ++-- modules/quant/main.nf | 9 ++++-- modules/rnaseq.nf | 27 +++++++++++----- nextflow_schema.json | 29 ++++++++++++----- 7 files changed, 100 insertions(+), 51 deletions(-) diff --git a/main.nf b/main.nf index 7c9a46e..c9fb0c6 100755 --- a/main.nf +++ b/main.nf @@ -4,59 +4,75 @@ * Proof of concept of a RNAseq pipeline implemented with Nextflow */ -nextflow.preview.output = true - -/* - * Default pipeline parameters. They can be overriden on the command line eg. - * given `params.reads` specify on the run command line `--reads some_value`. 
- */ - -params.reads = null -params.transcriptome = null -params.outdir = "results" -params.multiqc = "$projectDir/multiqc" +// enable v2 operators (required for static type checking) +nextflow.preview.operators = true +// enable static type checking +nextflow.preview.typeChecking = true // import modules include { RNASEQ } from './modules/rnaseq' +include { FastqPair ; Sample } from './modules/rnaseq' include { MULTIQC } from './modules/multiqc' +/* + * Pipeline parameters. They can be overridden on the command line, e.g. + * `params.reads` can be specified as `--reads '...'`. + */ +params { + // The input read-pair files + reads: List + + // The input transcriptome file + transcriptome: Path + + // Directory containing multiqc configuration + multiqc: Path = "${projectDir}/multiqc" +} + /* - * main script flow + * Entry workflow */ workflow { main: log.info """\ R N A S E Q - N F P I P E L I N E =================================== + reads : ${params.reads*.id.join(',')} transcriptome: ${params.transcriptome} - reads : ${params.reads} - outdir : ${params.outdir} + outdir : ${workflow.outputDir} """.stripIndent() - inputs_ch = channel.fromPath(params.reads) - .splitCsv() - .map { id, fastq_1, fastq_2 -> - tuple(id, file(fastq_1, checkIfExists: true), file(fastq_2, checkIfExists: true)) - } - - samples_ch = RNASEQ( params.transcriptome, inputs_ch ) - .map { id, fastqc, quant -> - [id: id, fastqc: fastqc, quant: quant] - } + (samples_ch, index) = RNASEQ( channel.fromList(params.reads), params.transcriptome ) multiqc_files_ch = samples_ch .flatMap { sample -> [sample.fastqc, sample.quant] } .collect() + multiqc_report = MULTIQC( multiqc_files_ch, params.multiqc ) publish: + index = index samples = samples_ch multiqc_report = multiqc_report + + onComplete: + log.info( + workflow.success + ? "\nDone! Open the following report in your browser --> ${workflow.outputDir}/multiqc_report.html\n" + : "Oops .. something went wrong" + ) } +/* + * Pipeline outputs. 
By default they will be saved to the 'results' directory. + */ output { - samples { + index: Path { + path '.' + } + + samples: Channel { path { sample -> sample.fastqc >> "fastqc/${sample.id}" sample.quant >> "quant/${sample.id}" @@ -67,6 +83,7 @@ output { } } - multiqc_report { + multiqc_report: Path { + path '.' } } diff --git a/modules/fastqc/main.nf b/modules/fastqc/main.nf index 5d013c9..8634f9f 100644 --- a/modules/fastqc/main.nf +++ b/modules/fastqc/main.nf @@ -4,10 +4,13 @@ process FASTQC { conda 'bioconda::fastqc=0.12.1' input: - tuple val(id), path(fastq_1), path(fastq_2) + id : String + fastq_1 : Path + fastq_2 : Path output: - tuple val(id), path("fastqc_${id}_logs") + id : String = id + fastqc : Path = file("fastqc_${id}_logs") script: """ diff --git a/modules/index/main.nf b/modules/index/main.nf index 1d99b1b..5b1e305 100644 --- a/modules/index/main.nf +++ b/modules/index/main.nf @@ -4,10 +4,10 @@ process INDEX { conda 'bioconda::salmon=1.10.3' input: - path transcriptome + transcriptome : Path output: - path 'index' + file('index') script: """ diff --git a/modules/multiqc/main.nf b/modules/multiqc/main.nf index ac80a9e..7efb9a1 100644 --- a/modules/multiqc/main.nf +++ b/modules/multiqc/main.nf @@ -3,11 +3,11 @@ process MULTIQC { conda 'bioconda::multiqc=1.27.1' input: - path '*' - path config + inputs : Bag + config : Path output: - path 'multiqc_report.html', emit: report + file('multiqc_report.html') script: """ diff --git a/modules/quant/main.nf b/modules/quant/main.nf index 2ab0a3d..0e59326 100644 --- a/modules/quant/main.nf +++ b/modules/quant/main.nf @@ -4,11 +4,14 @@ process QUANT { conda 'bioconda::salmon=1.10.3' input: - path index - tuple val(id), path(fastq_1), path(fastq_2) + id : String + fastq_1 : Path + fastq_2 : Path + index : Path output: - tuple val(id), path("quant_${id}") + id : String = id + quant : Path = file("quant_${id}") script: """ diff --git a/modules/rnaseq.nf b/modules/rnaseq.nf index 6849a9e..9a4d0cc 100644 --- 
a/modules/rnaseq.nf +++ b/modules/rnaseq.nf @@ -5,15 +5,28 @@ include { FASTQC } from './fastqc' workflow RNASEQ { take: - transcriptome - samples_ch + reads : Channel + transcriptome : Path main: index = INDEX(transcriptome) - fastqc_ch = FASTQC(samples_ch) - quant_ch = QUANT(index, samples_ch) - samples_ch = fastqc_ch.join(quant_ch) + fastqc_ch = reads.map(FASTQC) + quant_ch = reads.map(QUANT, index: index) + samples_ch = fastqc_ch.join(quant_ch, 'id') emit: - samples_ch -} \ No newline at end of file + samples : Channel = samples_ch + index : Path = index +} + +record FastqPair { + id : String + fastq_1 : Path + fastq_2 : Path +} + +record Sample { + id : String + fastqc : Path + quant : Path +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 823417a..c0cab4f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,15 +11,27 @@ "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", "properties": { - "outdir": { - "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open", - "default": "results" - }, "reads": { - "type": "string", + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "fastq_1": { + "type": "string", + "format": "file-path", + "exists": true + }, + "fastq_2": { + "type": "string", + "format": "file-path", + "exists": true + } + }, + "required": ["id", "fastq_1", "fastq_2"] + }, "description": "The input read-pair files", "fa_icon": "fas fa-folder-open", - "default": "${projectDir}/data/ggal/ggal_gut_{1,2}.fq" + "default": "${projectDir}/data/gut.csv" @@ -32,6 +44,7 @@ }, "multiqc": { "type": "string", + "description": "Directory containing multiqc configuration", "fa_icon": "fas fa-folder-open", "default": "${projectDir}/multiqc" }