From d51006be60f12716ab0c9e5a008c25be75699aaf Mon Sep 17 00:00:00 2001 From: Tanaes Date: Thu, 14 Aug 2025 15:04:38 -0700 Subject: [PATCH 1/4] Adding direct AWS download for SRA files --- CITATIONS.md | 6 ++ docs/usage.md | 5 +- .../local/sra_aws_download/environment.yml | 7 ++ modules/local/sra_aws_download/main.nf | 55 +++++++++++ .../local/sra_aws_download/nextflow.config | 8 ++ .../local/sra_aws_download/tests/main.nf.test | 56 +++++++++++ .../sra_aws_download/tests/main.nf.test.snap | 72 ++++++++++++++ nextflow_schema.json | 6 +- .../local/fastq_download_aws_sratools/main.nf | 39 ++++++++ .../tests/main.nf.test | 41 ++++++++ .../tests/main.nf.test.snap | 97 +++++++++++++++++++ workflows/sra/main.nf | 16 +++ workflows/sra/nextflow.config | 1 + 13 files changed, 405 insertions(+), 4 deletions(-) create mode 100644 modules/local/sra_aws_download/environment.yml create mode 100644 modules/local/sra_aws_download/main.nf create mode 100644 modules/local/sra_aws_download/nextflow.config create mode 100644 modules/local/sra_aws_download/tests/main.nf.test create mode 100644 modules/local/sra_aws_download/tests/main.nf.test.snap create mode 100644 subworkflows/local/fastq_download_aws_sratools/main.nf create mode 100644 subworkflows/local/fastq_download_aws_sratools/tests/main.nf.test create mode 100644 subworkflows/local/fastq_download_aws_sratools/tests/main.nf.test.snap diff --git a/CITATIONS.md b/CITATIONS.md index 62235e72..2575be3e 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,6 +12,8 @@ - [Aspera CLI](https://github.com/IBM/aspera-cli) +- [AWS CLI](https://aws.amazon.com/cli/) + - [Python](http://www.python.org) - [Requests](https://docs.python-requests.org/) @@ -20,6 +22,10 @@ ## Pipeline resources +- [AWS Open Data Program - SRA](https://registry.opendata.aws/ncbi-sra/) + + > The Sequence Read Archive (SRA) is mirrored on AWS S3 as part of the AWS Open Data Program, providing free access to SRA data. + - [ENA](https://pubmed.ncbi.nlm.nih.gov/33175160/) > Harrison PW, Ahamed A, Aslam R, Alako BTF, Burgin J, Buso N, Courtot M, Fan J, Gupta D, Haseeb M, Holt S, Ibrahim T, Ivanov E, Jayathilaka S, Kadhirvelu VB, Kumar M, Lopez R, Kay S, Leinonen R, Liu X, O'Cathail C, Pakseresht A, Park Y, Pesant S, Rahman N, Rajan J, Sokolov A, Vijayaraja S, Waheed Z, Zyoud A, Burdett T, Cochrane G. The European Nucleotide Archive in 2020. Nucleic Acids Res. 2021 Jan 8;49(D1):D82-D85. doi: 10.1093/nar/gkaa1028. PubMed PMID: 33175160; PubMed Central PMCID: PMC7778925. diff --git a/docs/usage.md b/docs/usage.md index 8f27c3d5..689318dd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -72,7 +72,10 @@ See [issue #260](https://github.com/nf-core/fetchngs/issues/260) for more detail ### Primary options for downloading data -If the appropriate download links are available, the pipeline uses FTP by default to download FastQ files by setting the `--download_method ftp` parameter. If you are having issues and prefer to use sra-tools or Aspera instead, you can set the [`--download_method`](https://nf-co.re/fetchngs/parameters#download_method) parameter to `--download_method sratools` or `--download_method aspera`, respectively. +If the appropriate download links are available, the pipeline uses FTP by default to download FastQ files by setting the `--download_method ftp` parameter. If you are having issues and prefer to use alternative methods, you can set the [`--download_method`](https://nf-co.re/fetchngs/parameters#download_method) parameter to: +- `--download_method sratools`: Uses NCBI's sra-tools to download SRA files and convert to FastQ +- `--download_method aspera`: Uses Aspera CLI for faster downloads from ENA +- `--download_method aws`: Downloads SRA files from the AWS S3 Open Data Program mirror and converts to FastQ ### Downloading dbGAP data with JWT diff --git a/modules/local/sra_aws_download/environment.yml b/modules/local/sra_aws_download/environment.yml new file mode 100644 index 00000000..7d00cd7b --- /dev/null +++ b/modules/local/sra_aws_download/environment.yml @@ -0,0 +1,7 @@ +name: sra_aws_download +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::awscli=2.15.0 \ No newline at end of file diff --git a/modules/local/sra_aws_download/main.nf b/modules/local/sra_aws_download/main.nf new file mode 100644 index 00000000..a849e43c --- /dev/null +++ b/modules/local/sra_aws_download/main.nf @@ -0,0 +1,55 @@ +process SRA_AWS_DOWNLOAD { + tag "$meta.id" + label 'process_low' + label 'error_retry' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/awscli:1.8.3--py35_0' : + 'quay.io/biocontainers/awscli:1.8.3--py35_0' }" + + input: + tuple val(meta), val(run_accession) + + output: + tuple val(meta), path("*.sra"), emit: sra + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${run_accession}" + """ + # Download SRA file from AWS S3 Open Data Program + aws s3 cp \\ + --region us-east-1 \\ + --no-sign-request \\ + ${args} \\ + s3://sra-pub-run-odp/sra/${run_accession}/${run_accession} \\ + ${prefix}.sra + + # Verify download + if [ ! -f "${prefix}.sra" ]; then + echo "ERROR: Failed to download ${run_accession} from AWS S3" + exit 1 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aws-cli: \$(aws --version 2>&1 | sed 's/aws-cli\\///; s/ Python.*//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${run_accession}" + """ + touch ${prefix}.sra + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aws-cli: \$(aws --version 2>&1 | sed 's/aws-cli\\///; s/ Python.*//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/sra_aws_download/nextflow.config b/modules/local/sra_aws_download/nextflow.config new file mode 100644 index 00000000..dfe95467 --- /dev/null +++ b/modules/local/sra_aws_download/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: 'SRA_AWS_DOWNLOAD' { + publishDir = [ + path: { "${params.outdir}/sra" }, + enabled: false + ] + } +} \ No newline at end of file diff --git a/modules/local/sra_aws_download/tests/main.nf.test b/modules/local/sra_aws_download/tests/main.nf.test new file mode 100644 index 00000000..aaaa6ff3 --- /dev/null +++ b/modules/local/sra_aws_download/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process SRA_AWS_DOWNLOAD" + script "../main.nf" + process "SRA_AWS_DOWNLOAD" + tag "modules" + tag "modules_local" + tag "sra_aws_download" + + test("Should download SRA file from AWS") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + 'DRR028935' + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Should download SRA file from AWS - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + 'DRR028935' + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/local/sra_aws_download/tests/main.nf.test.snap b/modules/local/sra_aws_download/tests/main.nf.test.snap new file mode 100644 index 00000000..742d6f64 --- /dev/null +++ b/modules/local/sra_aws_download/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "Should download SRA file from AWS": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "DRR028935.sra:md5,bc88b59c510081d85448416f05094ed5" + ] + ], + "1": [ + "versions.yml:md5,ce0676c62bd6864661cf98777e7c2896" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + "DRR028935.sra:md5,bc88b59c510081d85448416f05094ed5" + ] + ], + "versions": [ + "versions.yml:md5,ce0676c62bd6864661cf98777e7c2896" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T14:59:02.578113" + }, + "Should download SRA file from AWS - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "DRR028935.sra:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,ce0676c62bd6864661cf98777e7c2896" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + "DRR028935.sra:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,ce0676c62bd6864661cf98777e7c2896" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T14:59:07.021124" + } +} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json index 29f7b710..072d85ee 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -51,9 +51,9 @@ "type": "string", "default": "ftp", "fa_icon": "fas fa-download", - "enum": ["aspera", "ftp", "sratools"], - "description": "Method to download FastQ files. Available options are 'aspera', 'ftp' or 'sratools'. Default is 'ftp'.", - "help_text": "FTP and Aspera CLI download FastQ files directly from the ENA FTP whereas sratools uses sra-tools to download *.sra files and convert to FastQ." + "enum": ["aspera", "ftp", "sratools", "aws"], + "description": "Method to download FastQ files. Available options are 'aspera', 'ftp', 'sratools', or 'aws'. Default is 'ftp'.", + "help_text": "FTP and Aspera CLI download FastQ files directly from the ENA FTP. sratools uses sra-tools to download *.sra files and convert to FastQ. aws uses AWS CLI to download *.sra files from the SRA mirror on AWS S3 Open Data Program and convert to FastQ." }, "skip_fastq_download": { "type": "boolean", diff --git a/subworkflows/local/fastq_download_aws_sratools/main.nf b/subworkflows/local/fastq_download_aws_sratools/main.nf new file mode 100644 index 00000000..85ad5d25 --- /dev/null +++ b/subworkflows/local/fastq_download_aws_sratools/main.nf @@ -0,0 +1,39 @@ +include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main' +include { SRA_AWS_DOWNLOAD } from '../../../modules/local/sra_aws_download/main' +include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/sratools/fasterqdump/main' + +// +// Download FASTQ sequencing reads from AWS S3 SRA mirror +// +workflow FASTQ_DOWNLOAD_AWS_SRATOOLS { + take: + ch_sra_ids // channel: [ val(meta), val(id) ] + ch_dbgap_key // channel: [ path(dbgap_key) ] + + main: + + ch_versions = Channel.empty() + + // + // Detect existing NCBI user settings or create new ones. + // + CUSTOM_SRATOOLSNCBISETTINGS ( ch_sra_ids.collect() ) + ch_ncbi_settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings + ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions) + + // + // Download SRA files from AWS S3 + // + SRA_AWS_DOWNLOAD ( ch_sra_ids ) + ch_versions = ch_versions.mix(SRA_AWS_DOWNLOAD.out.versions.first()) + + // + // Convert the SRA format into one or more compressed FASTQ files. + // + SRATOOLS_FASTERQDUMP ( SRA_AWS_DOWNLOAD.out.sra, ch_ncbi_settings, ch_dbgap_key ) + ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) + + emit: + reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/subworkflows/local/fastq_download_aws_sratools/tests/main.nf.test b/subworkflows/local/fastq_download_aws_sratools/tests/main.nf.test new file mode 100644 index 00000000..d4477df0 --- /dev/null +++ b/subworkflows/local/fastq_download_aws_sratools/tests/main.nf.test @@ -0,0 +1,41 @@ +nextflow_workflow { + + name "Test workflow: fastq_download_aws_sratools/main.nf" + script "../main.nf" + workflow "FASTQ_DOWNLOAD_AWS_SRATOOLS" + + tag "CUSTOM_SRATOOLSNCBISETTINGS" + tag "SRA_AWS_DOWNLOAD" + tag "SRATOOLS_FASTERQDUMP" + + test("Parameters: default") { + + when { + workflow { + """ + input[0] = Channel.of( + [[ id:'test_single_end', single_end:true ], 'DRR000774'], + [[ id:'test_paired_end', single_end:false ], 'SRR11140744'] + ) + input[1] = [] + """ + } + } + + then { + def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip + def pelines2 = path(workflow.out.reads[0][1][1]).linesGzip + def selines = path(workflow.out.reads[1][1]).linesGzip + assertAll( + { assert workflow.success }, + { assert snapshot(pelines1[0..5]).match("test_pe_reads_1_lines") }, + { assert snapshot(pelines1.size()).match("test_pe_reads_1_size") }, + { assert snapshot(pelines2[0..5]).match("test_pe_reads_2_lines") }, + { assert snapshot(pelines2.size()).match("test_pe_reads_2_size") }, + { assert snapshot(selines[0..5]).match("test_se_reads_lines") }, + { assert snapshot(selines.size()).match("test_se_reads_size") }, + { assert snapshot(workflow.out.versions).match("versions") } + ) + } + } +} \ No newline at end of file diff --git a/subworkflows/local/fastq_download_aws_sratools/tests/main.nf.test.snap b/subworkflows/local/fastq_download_aws_sratools/tests/main.nf.test.snap new file mode 100644 index 00000000..f16b5987 --- /dev/null +++ b/subworkflows/local/fastq_download_aws_sratools/tests/main.nf.test.snap @@ -0,0 +1,97 @@ +{ + "test_se_reads_size": { + "content": [ + 19996 + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T15:00:55.000747" + }, + "test_pe_reads_2_lines": { + "content": [ + [ + "@SRR11140744.1 M01472:285:000000000-CYHNP:1:1101:12117:3295 length=251", + "ACAGGACACGAGTAACTCGTCTATCTTCTGCTGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAA", + "+SRR11140744.1 M01472:285:000000000-CYHNP:1:1101:12117:3295 length=251", + "ABAAAFBFFBDBGGGGGGGGGGHHHHHHHHHHCHGHGGGHHHGGHGGHGHGGGHFHHHHHHHHGGGGGHHHHHHHHHFHHHHGHHHGHGGGGGEFGDGHHGFGGGHHHHHGHHGGHHFHHHHGHHHHHHHHHHHHHHGFFGGHHHHHHGGHHGGHHHHHEGHHHHHHHGHHGHHFHHHHHGGGGGGGGGGGGAGGG9BEFFFFFFFFFFFFFFEEFFFFFFFA.FFFFFFFEFEFFFFFFF.BFFFFFFFB", + "@SRR11140744.2 M01472:285:000000000-CYHNP:1:1101:20752:3564 length=238", + "GTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACG" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T15:00:54.994204" + }, + "test_pe_reads_2_size": { + "content": [ + 2011460 + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T15:00:54.996252" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,4146bec4feafc4feada81bcd86180836", + "versions.yml:md5,44c44e0430f2f8aff8aef894c79ae2c8", + "versions.yml:md5,9c64ac49745ab1738b7edeecee34f559" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T15:00:55.003931" + }, + "test_pe_reads_1_size": { + "content": [ + 2013376 + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T15:00:54.990405" + }, + "test_se_reads_lines": { + "content": [ + [ + "@DRR000774.1 1 length=421", + "ACGCAGGTGCCAGCAGCCGCGGTAATACGTAGGATCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGTGCGTAGGCGGCTTGTCAAGTCTCATGTGAAATCTCCCGGCTCAACTGGGAGGGTCATGGGAAACTGATGAGCTCGAGGGCAGTAGAGGGAAGCGGAATTCCGAGAGTAGTGGTGAAATGCGTAGATACTCGGAGGAACACCAGTGGCGAAAGCGGCTTCCTGGACTGTACCTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACCGGATTAGATACCCGGGTAGTCCACGCCCTAAACGATGGATACTAGATATAGGGGGTATCGACCCTCTGTGTCGAAGCTAACGCATTAAGTATCCCGCCTGAGGAGTACGGCCGCAAGGCTAAAACTTAAGGAATTGACGGCTGCGT", + "+DRR000774.1 1 length=421", + "FFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHHFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:88FFF888???DBBBBB666F222ADDDFFF::;FFFFFFFFFFFFFFFFFFFFFFFFFFFF9:::FFFFCCCFFFFDDDFFFFF<<<<<8888886623//38><83238@B@@<;855557,,,,,,,0/0;;8:==DDDDDDDDD9:", + "@DRR000774.2 2 length=126", + "ACGCAGGTGCCAGCAGCCGCGGTAATACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTTCAAGTCAGGGGTGGAAATACCCGGGGCCGTCAACCCGACCG" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T15:00:54.998236" + }, + "test_pe_reads_1_lines": { + "content": [ + [ + "@SRR11140744.1 M01472:285:000000000-CYHNP:1:1101:12117:3295 length=251", + "ACATAGGGCTGTTCAAGTTGAGGCAAAACGCCTTTTTCAACTTCTACTAAGCCACAAGTGCCATCTTTAAGATGTTGACGTGCCTCTGATAAGACCTCCTCCACGGAGTCTCCAAAGCCACGTACGAGCACGTCGCGAACCTGTAAAACAGGCAAACTGAGTTGGACGTGTGTTTTCTCGTTGAAACCAGGGACAAGGCTCTCCATCTTACCTTTCGGTCACACCCGGACGAAACCTAGATGTGCTGATGA", + "+SRR11140744.1 M01472:285:000000000-CYHNP:1:1101:12117:3295 length=251", + "BCCCCFFFFFCFGGGGGGGGGGHGGHHHHGGGHGHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHGGGHHHHHGHHGHHHHHHHHHHHHHGGGGGHHHHHHHHHHHHGHHHGGGGGHGHHGGGGGGGHHHHHHHHHHHGGHHHHHFHHHHHHHGGGHHHHHHHHHGGGHHHHHHHHGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGFFFFFFFFFDFFFFFFFFFFFFFFFFFFFFB", + "@SRR11140744.2 M01472:285:000000000-CYHNP:1:1101:20752:3564 length=238", + "CGTACGAGCACGTCGCGAACCTGTAAAACAGGCAAACTGAGTTGGACGTGTGTTTTCTCGTTGAAACCAGGGACAAGGCTCTCCATCTTACCTTTCGGTCACACCCGGACGAAACCTAGATGTGCTGATGATCGGCTGCAACACGGACGAAACCGTAAGCAGCCTGCAGAAGATAGACGAGTTACTCGTGTCCTGTCAACGACAGTAATTAGTTATTAATTATACTGCGTGAGTGCAC" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T15:00:54.985177" + } +} \ No newline at end of file diff --git a/workflows/sra/main.nf b/workflows/sra/main.nf index 0c8cac0c..e837088a 100644 --- a/workflows/sra/main.nf +++ b/workflows/sra/main.nf @@ -11,6 +11,7 @@ include { SRA_RUNINFO_TO_FTP } from '../../modules/local/sra_runinfo_to_ftp include { ASPERA_CLI } from '../../modules/local/aspera_cli' include { SRA_TO_SAMPLESHEET } from '../../modules/local/sra_to_samplesheet' include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' +include { FASTQ_DOWNLOAD_AWS_SRATOOLS } from '../../subworkflows/local/fastq_download_aws_sratools' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -76,6 +77,9 @@ workflow SRA { if (meta.fastq_aspera && params.download_method == 'aspera') { download_method = 'aspera' } + if (params.download_method == 'aws') { + download_method = 'aws' + } if ((!meta.fastq_aspera && !meta.fastq_1) || params.download_method == 'sratools') { download_method = 'sratools' } @@ -86,6 +90,8 @@ workflow SRA { return [ meta, [ meta.fastq_1, meta.fastq_2 ] ] sratools: download_method == 'sratools' return [ meta, meta.run_accession ] + aws: download_method == 'aws' + return [ meta, meta.run_accession ] } .set { ch_sra_reads } @@ -115,12 +121,22 @@ workflow SRA { ) ch_versions = ch_versions.mix(ASPERA_CLI.out.versions.first()) + // + // SUBWORKFLOW: Download sequencing reads from AWS S3 SRA mirror + // + FASTQ_DOWNLOAD_AWS_SRATOOLS ( + ch_sra_reads.aws, + params.dbgap_key ? file(params.dbgap_key, checkIfExists: true) : [] + ) + ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_AWS_SRATOOLS.out.versions.first()) + // Isolate FASTQ channel which will be added to emit block SRA_FASTQ_FTP .out .fastq .mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.reads) .mix(ASPERA_CLI.out.fastq) + .mix(FASTQ_DOWNLOAD_AWS_SRATOOLS.out.reads) .map { meta, fastq -> def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] diff --git a/workflows/sra/nextflow.config b/workflows/sra/nextflow.config index d242c238..8f3a65a5 100644 --- a/workflows/sra/nextflow.config +++ b/workflows/sra/nextflow.config @@ -4,5 +4,6 @@ includeConfig "../../modules/local/sra_fastq_ftp/nextflow.config" includeConfig "../../modules/local/sra_ids_to_runinfo/nextflow.config" includeConfig "../../modules/local/sra_runinfo_to_ftp/nextflow.config" includeConfig "../../modules/local/sra_to_samplesheet/nextflow.config" +includeConfig "../../modules/local/sra_aws_download/nextflow.config" includeConfig "../../modules/nf-core/sratools/prefetch/nextflow.config" includeConfig "../../subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/nextflow.config" From bfb598c8d5b022189af266b0854392b885fe4347 Mon Sep 17 00:00:00 2001 From: Tanaes Date: Thu, 14 Aug 2025 22:56:46 -0700 Subject: [PATCH 2/4] add workflow test --- .../sra/tests/sra_download_method_aws.nf.test | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 workflows/sra/tests/sra_download_method_aws.nf.test diff --git a/workflows/sra/tests/sra_download_method_aws.nf.test b/workflows/sra/tests/sra_download_method_aws.nf.test new file mode 100644 index 00000000..11066d79 --- /dev/null +++ b/workflows/sra/tests/sra_download_method_aws.nf.test @@ -0,0 +1,55 @@ +nextflow_workflow { + + name "Test workflow: sra/main.nf" + script "../main.nf" + workflow "SRA" + tag "SRA_DOWNLOAD_METHOD_AWS" + + // Dependencies + tag "FASTQ_DOWNLOAD_AWS_SRATOOLS" + tag "SRA_IDS_TO_RUNINFO" + tag "SRA_RUNINFO_TO_FTP" + tag "SRA_TO_SAMPLESHEET" + tag "MULTIQC_MAPPINGS_CONFIG" + + test("Parameters: --download_method aws") { + + when { + workflow { + """ + input[0] = Channel.from("DRX026011", "ERX1234253", "SRX6725035") + """ + } + params { + download_method = 'aws' + } + } + + then { + assert workflow.success + + assertAll( + { + with(workflow.out.samplesheet) { + assert path(get(0)).readLines().size() == 4 + assert path(get(0)).readLines()*.split(',')[0].take(4) == ['"sample"', '"fastq_1"', '"fastq_2"', '"run_accession"'] + assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"'] + assert path(get(0)).text.contains('Illumina HiSeq 2500') + } + }, + { + with(workflow.out.mappings) { + assert path(get(0)).readLines().size() == 4 + assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"'] + assert path(get(0)).text.contains('Illumina HiSeq 2500') + } + }, + { + with(workflow.out.sample_mappings) { + assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768" + } + } + ) + } + } +} \ No newline at end of file From be52c3e5d813a9270bfe18521013a58e57520aac Mon Sep 17 00:00:00 2001 From: Tanaes Date: Thu, 14 Aug 2025 23:09:35 -0700 Subject: [PATCH 3/4] changing aws workflow test to better match sra workflow test --- .../sra/tests/sra_download_method_aws.nf.test | 37 ++++++------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/workflows/sra/tests/sra_download_method_aws.nf.test b/workflows/sra/tests/sra_download_method_aws.nf.test index 11066d79..973e2746 100644 --- a/workflows/sra/tests/sra_download_method_aws.nf.test +++ b/workflows/sra/tests/sra_download_method_aws.nf.test @@ -15,40 +15,27 @@ nextflow_workflow { test("Parameters: --download_method aws") { when { + params { + outdir = "$outputDir" + download_method = 'aws' + } workflow { """ input[0] = Channel.from("DRX026011", "ERX1234253", "SRX6725035") """ } - params { - download_method = 'aws' - } } then { - assert workflow.success - assertAll( - { - with(workflow.out.samplesheet) { - assert path(get(0)).readLines().size() == 4 - assert path(get(0)).readLines()*.split(',')[0].take(4) == ['"sample"', '"fastq_1"', '"fastq_2"', '"run_accession"'] - assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"'] - assert path(get(0)).text.contains('Illumina HiSeq 2500') - } - }, - { - with(workflow.out.mappings) { - assert path(get(0)).readLines().size() == 4 - assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"'] - assert path(get(0)).text.contains('Illumina HiSeq 2500') - } - }, - { - with(workflow.out.sample_mappings) { - assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768" - } - } + { assert workflow.success}, + { assert snapshot( + file(workflow.out.samplesheet[0]).name, + workflow.out.mappings, + workflow.out.sample_mappings, + workflow.out.sra_metadata, + workflow.out.versions + ).match() } ) } } From 63a34dadf03460e1547204cd19599af18a83c837 Mon Sep 17 00:00:00 2001 From: Tanaes Date: Thu, 14 Aug 2025 23:11:59 -0700 Subject: [PATCH 4/4] added workflow test snapshot after dev merge --- .../sra_download_method_aws.nf.test.snap | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 workflows/sra/tests/sra_download_method_aws.nf.test.snap diff --git a/workflows/sra/tests/sra_download_method_aws.nf.test.snap b/workflows/sra/tests/sra_download_method_aws.nf.test.snap new file mode 100644 index 00000000..822c52b0 --- /dev/null +++ b/workflows/sra/tests/sra_download_method_aws.nf.test.snap @@ -0,0 +1,143 @@ +{ + "Parameters: --download_method aws": { + "content": [ + "samplesheet.csv", + [ + "id_mappings.csv:md5,3e41ce6ab19feb76f2b20fa77a910ad3" + ], + [ + "multiqc_config.yml:md5,1ac06bb95b503703430e74660bbdd768" + ], + [ + { + "base_count": "194930", + "experiment_accession": "DRX026011", + "experiment_alias": "DRX026011", + "experiment_title": "Illumina HiSeq 2500 paired end sequencing: Illumina HiSeq 2500 paired end sequencing of SAMD00024405", + "fastq_1": "DRX026011_DRR028935_1.fastq.gz:md5,1c3a691ea99767f25de2492440a02cb7", + "fastq_2": "DRX026011_DRR028935_2.fastq.gz:md5,6fa02d3e52613cfe3464cc7a29f227d4", + "fastq_aspera": "fasp.sra.ebi.ac.uk:/vol1/fastq/DRR028/DRR028935/DRR028935_1.fastq.gz;fasp.sra.ebi.ac.uk:/vol1/fastq/DRR028/DRR028935/DRR028935_2.fastq.gz", + "fastq_bytes": "60275;61610", + "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/DRR028/DRR028935/DRR028935_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/DRR028/DRR028935/DRR028935_2.fastq.gz", + "fastq_galaxy": "ftp.sra.ebi.ac.uk/vol1/fastq/DRR028/DRR028935/DRR028935_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/DRR028/DRR028935/DRR028935_2.fastq.gz", + "fastq_md5": "cc01df82a9354bb6b3be93483b20c35a;941c9998d746416dad53c94c480ddf30", + "id": "DRX026011_DRR028935", + "instrument_model": "Illumina HiSeq 2500", + "instrument_platform": "ILLUMINA", + "library_layout": "PAIRED", + "library_name": "day0_BbSQE-I", + "library_selection": "cDNA", + "library_source": "TRANSCRIPTOMIC", + "library_strategy": "RNA-Seq", + "md5_1": "cc01df82a9354bb6b3be93483b20c35a", + "md5_2": "941c9998d746416dad53c94c480ddf30", + "read_count": "965", + "run_accession": "DRR028935", + "run_alias": "DRR028935", + "sample_accession": "SAMD00024405", + "sample_alias": "SAMD00024405", + "sample_description": "Liquid culture", + "sample_title": "Botryococcus braunii Showa at day 0 after inoculation into fresh culture medium", + "scientific_name": "Botryococcus braunii Showa", + "secondary_sample_accession": "DRS019431", + "secondary_study_accession": "DRP002616", + "single_end": false, + "study_accession": "PRJDB3420", + "study_alias": "DRP002616", + "study_title": "Liquid culture of Botryococcus braunii, race B, Showa", + "submission_accession": "DRA002949", + "tax_id": "1202541" + }, + { + "base_count": "1996273", + "experiment_accession": "SRX6725035", + "experiment_alias": "Emb289P1_bin131", + "experiment_title": "Illumina HiSeq 2500 sequencing: Binning of metagenomic reads from the P1 gut compartment of Embiratermes neotenicus", + "fastq_1": "SRX6725035_SRR9984183.fastq.gz:md5,aadf8ac0a6a3282b52404aa4dd14497c", + "fastq_2": "", + "fastq_aspera": "fasp.sra.ebi.ac.uk:/vol1/fastq/SRR998/003/SRR9984183/SRR9984183.fastq.gz", + "fastq_bytes": "605358", + "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/SRR998/003/SRR9984183/SRR9984183.fastq.gz", + "fastq_galaxy": "ftp.sra.ebi.ac.uk/vol1/fastq/SRR998/003/SRR9984183/SRR9984183.fastq.gz", + "fastq_md5": "0b512d2dc31685983456bd56fd836544", + "id": "SRX6725035_SRR9984183", + "instrument_model": "Illumina HiSeq 2500", + "instrument_platform": "ILLUMINA", + "library_layout": "SINGLE", + "library_name": "Emb289P1_bin131", + "library_selection": "RANDOM", + "library_source": "METAGENOMIC", + "library_strategy": "WGS", + "md5_1": "0b512d2dc31685983456bd56fd836544", + "md5_2": "", + "read_count": "58", + "run_accession": "SRR9984183", + "run_alias": "Emb289P1_bin131.fastq", + "sample_accession": "SAMN12581720", + "sample_alias": "Emb289P1_bin131", + "sample_description": "Keywords: GSC:MIxS MIMAG:5.0", + "sample_title": "MIMAG Metagenome-assembled Genome sample from Defluviitaleaceae bacterium", + "scientific_name": "Defluviitaleaceae bacterium", + "secondary_sample_accession": "SRS5277011", + "secondary_study_accession": "SRP218535", + "single_end": true, + "study_accession": "PRJNA560329", + "study_alias": "PRJNA560329", + "study_title": "Phylogenomic analysis of 589 metagenome-assembled genomes encompassing all major prokaryotic lineages from the gut of higher termites", + "submission_accession": "SRA942061", + "tax_id": "2660712" + }, + { + "base_count": "35658", + "experiment_accession": "ERX1234253", + "experiment_alias": "qiita_ptid_1263:10317.BLANK.93.3E.r22", + "experiment_title": "Illumina HiSeq 2500 sequencing: qiita_ptid_1263:10317.BLANK.93.3E.r22", + "fastq_1": "ERX1234253_ERR1160846.fastq.gz:md5,98515d664854f1c96f55ac836fb671b9", + "fastq_2": "", + "fastq_aspera": "fasp.sra.ebi.ac.uk:/vol1/fastq/ERR116/006/ERR1160846/ERR1160846.fastq.gz", + "fastq_bytes": "18077", + "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR116/006/ERR1160846/ERR1160846.fastq.gz", + "fastq_galaxy": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR116/006/ERR1160846/ERR1160846.fastq.gz", + "fastq_md5": "5924f20ef547ebdfed7cad795bbab6e6", + "id": "ERX1234253_ERR1160846", + "instrument_model": "Illumina HiSeq 2500", + "instrument_platform": "ILLUMINA", + "library_layout": "SINGLE", + "library_name": "10317.BLANK.93.3E.r22", + "library_selection": "PCR", + "library_source": "METAGENOMIC", + "library_strategy": "AMPLICON", + "md5_1": "5924f20ef547ebdfed7cad795bbab6e6", + "md5_2": "", + "read_count": "283", + "run_accession": "ERR1160846", + "run_alias": "qiita_ppdid_706:10317.BLANK.93.3E.r22", + "sample_accession": "SAMEA3687214", + "sample_alias": "qiita_sid_10317:10317.BLANK.93.3E.r22", + "sample_description": "American Gut control", + "sample_title": "10317.BLANK.93.3E.r22", + "scientific_name": "metagenome", + "secondary_sample_accession": "ERS994363", + "secondary_study_accession": "ERP012803", + "single_end": true, + "study_accession": "PRJEB11419", + "study_alias": "qiita_sid_10317", + "study_title": "American Gut Project", + "submission_accession": "ERA541392", + "tax_id": "256318" + } + ], + [ + "versions.yml:md5,08f737bc2e21b301336285511488a41e", + "versions.yml:md5,1496d1cbc9041e07ab8a0c25f0b054d9", + "versions.yml:md5,9b17045ca8bdc272cb3f9d349a81d206", + "versions.yml:md5,b52279f7d6b891a6523d9321f3f85b47" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-14T23:08:09.974937" + } +} \ No newline at end of file