diff --git a/CHANGELOG.md b/CHANGELOG.md index c682d31..f7a3306 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#20](https://github.com/nf-core/seqinspector/pull/20) Use tags to generate group reports - [#13](https://github.com/nf-core/seqinspector/pull/13) Generate reports per run, per project and per lane. - [#49](https://github.com/nf-core/seqinspector/pull/49) Merge with template 3.0.2. +- [#50](https://github.com/nf-core/seqinspector/pull/50) Add an optional subsampling step. - [#51](https://github.com/nf-core/seqinspector/pull/51) Add nf-test to CI. ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index ecbfb16..8a4e350 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,6 +18,8 @@ > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [Seqtk](https://github.com/lh3/seqtk) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index f647af0..6cf36dc 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,9 @@ workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. --> -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1. Subsample reads ([`Seqtk`](https://github.com/lh3/seqtk)) +2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +3. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Usage diff --git a/conf/modules.config b/conf/modules.config index c883822..d3c597b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: SEQTK_SAMPLE { + ext.args = '-s100' + } + withName: FASTQC { ext.args = '--quiet' } diff --git a/docs/output.md b/docs/output.md index e14c3ad..2d4efb0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,10 +10,23 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +- [Seqtk](#seqtk) - Subsample a specific number of reads per sample - [FastQC](#fastqc) - Raw read QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +### Seqtk + +
+Output files + +- `seqtk/` + - `*_fastq`: FastQ file after being subsampled to the sample_size value. + +
+ +[Seqtk](https://github.com/lh3/seqtk) samples sequences by number. + ### FastQC
diff --git a/docs/usage.md b/docs/usage.md index d75e0fb..31ab91e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -93,6 +93,12 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +Optionally, the `sample_size` parameter allows you to subset a random number of reads to be analysed. Note that it refers to an absolute number. + +```bash +nextflow run nf-core/seqinspector --input ./samplesheet.csv --outdir ./results --sample_size 1000000 -profile docker +``` + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/modules.json b/modules.json index a27c750..7e57ea1 100644 --- a/modules.json +++ b/modules.json @@ -14,6 +14,11 @@ "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] + }, + "seqtk/sample": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/seqtk/sample/environment.yml b/modules/nf-core/seqtk/sample/environment.yml new file mode 100644 index 0000000..693aa5c --- /dev/null +++ b/modules/nf-core/seqtk/sample/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqtk=1.4 diff --git a/modules/nf-core/seqtk/sample/main.nf b/modules/nf-core/seqtk/sample/main.nf new file mode 100644 index 0000000..ea9b839 --- /dev/null +++ b/modules/nf-core/seqtk/sample/main.nf @@ -0,0 +1,58 @@ +process SEQTK_SAMPLE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : + 'biocontainers/seqtk:1.4--he4a0461_1' }" + + input: + tuple val(meta), path(reads), val(sample_size) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (!(args ==~ /.*-s[0-9]+.*/)) { + args += " -s100" + } + if ( !sample_size ) { + error "SEQTK/SAMPLE must have a sample_size value included" + } + """ + printf "%s\\n" $reads | while read f; + do + seqtk \\ + sample \\ + $args \\ + \$f \\ + $sample_size \\ + | gzip --no-name > ${prefix}_\$(basename \$f) + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + echo "" | gzip > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + +} diff --git a/modules/nf-core/seqtk/sample/meta.yml b/modules/nf-core/seqtk/sample/meta.yml new file mode 100644 index 0000000..42f67d8 --- /dev/null +++ b/modules/nf-core/seqtk/sample/meta.yml @@ -0,0 +1,52 @@ +name: seqtk_sample +description: Subsample reads from FASTQ files +keywords: + - sample + - fastx + - reads +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in + the FASTA or FASTQ format. Seqtk sample command subsamples sequences. + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] + identifier: biotools:seqtk +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files + pattern: "*.{fastq.gz}" + - sample_size: + type: integer + description: Number of reads to sample. +output: + - reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fastq.gz": + type: file + description: Subsampled FastQ files + pattern: "*.{fastq.gz}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kaurravneet4123" + - "@sidorov-si" + - "@adamrtalbot" +maintainers: + - "@kaurravneet4123" + - "@sidorov-si" + - "@adamrtalbot" diff --git a/modules/nf-core/seqtk/sample/tests/main.nf.test b/modules/nf-core/seqtk/sample/tests/main.nf.test new file mode 100644 index 0000000..c121c9d --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/main.nf.test @@ -0,0 +1,80 @@ +nextflow_process { + + name "Test Process SEQTK_SAMPLE" + script "modules/nf-core/seqtk/sample/main.nf" + process "SEQTK_SAMPLE" + config "./standard.config" + + tag "modules" + tag "modules_nfcore" + tag "seqtk" + tag "seqtk/sample" + + test("sarscov2_sample_singleend_fqgz") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2_sample_pairedend_fqgz") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2_sample_singlend_fqgz_stub") { + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/seqtk/sample/tests/main.nf.test.snap b/modules/nf-core/seqtk/sample/tests/main.nf.test.snap new file mode 100644 index 0000000..a9fec3c --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "sarscov2_sample_singlend_fqgz_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:45.902956" + }, + "sarscov2_sample_pairedend_fqgz": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:37.679954" + }, + "sarscov2_sample_singleend_fqgz": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:29.474491" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/sample/tests/standard.config b/modules/nf-core/seqtk/sample/tests/standard.config new file mode 100644 index 0000000..b2dd4b9 --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/standard.config @@ -0,0 +1,6 @@ +process { + withName: SEQTK_SAMPLE { + ext.args = '-s100' + ext.prefix = { "${meta.id}.sampled" } + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/sample/tests/tags.yml b/modules/nf-core/seqtk/sample/tests/tags.yml new file mode 100644 index 0000000..e5d113b --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/tags.yml @@ -0,0 +1,2 @@ +seqtk/sample: + - "modules/nf-core/seqtk/sample/**" diff --git a/nextflow.config b/nextflow.config index 50c1ecb..38eb312 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null - + sample_size = 0 // References genome = null fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 88fd607..49742b2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -23,8 +23,15 @@ "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/seqinspector/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, + "sample_size": { + "type": "integer", + "default": 0, + "description": "Take this number of reads as a subset.", + "help_text": "Choose the size of the subset or 0, if no subsampling shall be performed. Note that it refers to an absolute number." + }, "outdir": { "type": "string", + "default": null, "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" diff --git a/tests/MiSeq.main.nf.test.snap b/tests/MiSeq.main.nf.test.snap index 0742b5a..de0afa2 100644 --- a/tests/MiSeq.main.nf.test.snap +++ b/tests/MiSeq.main.nf.test.snap @@ -6,9 +6,9 @@ "multiqc_general_stats.txt:md5,5b28a83b14cb2fe88d084d08900ebdbf" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:18:10.3675973" + "timestamp": "2024-10-30T09:08:29.692511055" } -} +} \ No newline at end of file diff --git a/tests/NovaSeq6000.main.nf.test.snap b/tests/NovaSeq6000.main.nf.test.snap index fc38ac5..62ccd4a 100644 --- a/tests/NovaSeq6000.main.nf.test.snap +++ b/tests/NovaSeq6000.main.nf.test.snap @@ -18,9 +18,9 @@ "multiqc_general_stats.txt:md5,6a1c16f068d7ba3a9225a17eb570ed9a" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:19:13.226135825" + "timestamp": "2024-10-30T09:09:57.158871165" } -} +} \ No newline at end of file diff --git a/tests/NovaSeq6000.main_subsample.nf.test b/tests/NovaSeq6000.main_subsample.nf.test new file mode 100644 index 0000000..fe2b068 --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test @@ -0,0 +1,45 @@ +nextflow_pipeline { + + name "Test Workflow main.nf on NovaSeq6000 data sample size 90" + script "../main.nf" + tag "seqinspector" + tag "PIPELINE" + + test("NovaSeq6000 data test sample size") { + + when { + config "./NovaSeq6000.main_subsample.nf.test.config" + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt"), + + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_general_stats.txt"), + + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_general_stats.txt"), + + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_general_stats.txt"), + + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_general_stats.txt"), + ).match() + }, + ) + } + } +} diff --git a/tests/NovaSeq6000.main_subsample.nf.test.config b/tests/NovaSeq6000.main_subsample.nf.test.config new file mode 100644 index 0000000..acda74d --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test.config @@ -0,0 +1,8 @@ +// Load the basic test config +includeConfig 'nextflow.config' + +// Load the correct samplesheet for that test +params { + input = params.pipelines_testdata_base_path + 'seqinspector/testdata/NovaSeq6000/samplesheet.csv' + sample_size = 90 +} diff --git a/tests/NovaSeq6000.main_subsample.nf.test.snap b/tests/NovaSeq6000.main_subsample.nf.test.snap new file mode 100644 index 0000000..651973b --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test.snap @@ -0,0 +1,26 @@ +{ + "NovaSeq6000 data test sample size": { + "content": [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,aba942d1e6996b579f19798e5673f514", + "multiqc_general_stats.txt:md5,ad1ec9c64cbdb1131a26aeb6de51e31c", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,aa1b8d6adae86005ea7a8b2e901099b8", + "multiqc_general_stats.txt:md5,c73c8d10568a56f6534d280fff701e60", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,ff996e1d3dc4a46e0c9535e54d51ccab", + "multiqc_general_stats.txt:md5,834e1868b887171cfda72029bbbe2d3f", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,3df36ecfe76b25b0c22dcda84bce2b3b", + "multiqc_general_stats.txt:md5,274a001b007521970f14d68bd176e5be", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,ce61b4ce4b1d76ec3f20de3bf0c9ec7f", + "multiqc_general_stats.txt:md5,d476ad59458a035a329605d5284b6012" + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-30T09:37:46.182191597" + } +} \ No newline at end of file diff --git a/tests/PromethION.main.nf.test.snap b/tests/PromethION.main.nf.test.snap index 875673c..dfa4eb6 100644 --- a/tests/PromethION.main.nf.test.snap +++ b/tests/PromethION.main.nf.test.snap @@ -6,9 +6,9 @@ "multiqc_general_stats.txt:md5,409cefc7f17f95d176ced6032bf8fb32" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:19:57.261730412" + "timestamp": "2024-10-30T09:12:03.048502046" } -} +} \ No newline at end of file diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index ab6cda4..7a2dfae 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -3,6 +3,8 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + +include { SEQTK_SAMPLE } from '../modules/nf-core/seqtk/sample/main' include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC as MULTIQC_GLOBAL } from '../modules/nf-core/multiqc/main' @@ -30,11 +32,28 @@ workflow SEQINSPECTOR { ch_multiqc_extra_files = Channel.empty() ch_multiqc_reports = Channel.empty() + // + // MODULE: Run Seqtk sample to perform subsampling + // + if (params.sample_size > 0 ) { + ch_sample_sized = SEQTK_SAMPLE( + ch_samplesheet.map { + meta, reads -> [meta, reads, params.sample_size] + } + ).reads + ch_versions = ch_versions.mix(SEQTK_SAMPLE.out.versions.first()) + } else { + // No do subsample + ch_sample_sized = ch_samplesheet + } + // // MODULE: Run FastQC // FASTQC ( - ch_samplesheet + ch_sample_sized.map { + meta, subsampled -> [meta, subsampled] + } ) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip) ch_versions = ch_versions.mix(FASTQC.out.versions.first())