Merge pull request #810 from drpatelh/updates

mahesh-panchal · web-flow · commit 88f61f04efb3 · 2022-04-27T09:20:40.000+02:00
Auto-detect whether using AWS iGenome and run appropriate STAR version
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - [[#764](https://github.com/nf-core/rnaseq/issues/764)] - Test fails when using GCP due to missing tools in the basic biocontainer
 - [[#791](https://github.com/nf-core/rnaseq/issues/791)] - Add outputs for umitools dedup summary stats
+- [[#808](https://github.com/nf-core/rnaseq/issues/808)] - Auto-detect usage of Illumina iGenomes reference
 - Updated pipeline template to [nf-core/tools 2.3.2](https://github.com/nf-core/tools/releases/tag/2.3.2)
 
 ### Parameters
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# ![nf-core/rnaseq](docs/images/nf-core/rnaseq_logo_light.png#gh-light-mode-only) ![nf-core/rnaseq](docs/images/nf-core/rnaseq_logo_dark.png#gh-dark-mode-only)
+# ![nf-core/rnaseq](docs/images/nf-core-rnaseq_logo_light.png#gh-light-mode-only) ![nf-core/rnaseq](docs/images/nf-core-rnaseq_logo_dark.png#gh-dark-mode-only)
 
 [![GitHub Actions CI Status](https://github.com/nf-core/rnaseq/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/rnaseq/actions?query=workflow%3A%22nf-core+CI%22)
 [![GitHub Actions Linting Status](https://github.com/nf-core/rnaseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/rnaseq/actions?query=workflow%3A%22nf-core+linting%22)
diff --git a/conf/test.config b/conf/test.config
@@ -23,15 +23,14 @@ params {
     input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.4/samplesheet_test.csv'
 
     // Genome references
-    fasta              = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fa'
+    fasta              = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fasta'
     gtf                = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gtf.gz'
     gff                = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gff.gz'
     transcript_fasta   = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/transcriptome.fasta'
     additional_fasta   = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/gfp.fa.gz'
 
     bbsplit_fasta_list = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/bbsplit_fasta_list.txt'
     hisat2_index       = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/hisat2.tar.gz'
-    star_index         = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/star.tar.gz'
     salmon_index       = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/salmon.tar.gz'
     rsem_index         = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/rsem.tar.gz'
 
diff --git a/modules/local/star_align.nf b/modules/local/star_align.nf
@@ -2,39 +2,54 @@ process STAR_ALIGN {
     tag "$meta.id"
     label 'process_high'
 
-    // Note: 2.7X indices incompatible with AWS iGenomes.
-    conda (params.enable_conda ? "bioconda::star=2.6.1d" : null)
+    conda (params.enable_conda ? conda_str : null)
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/star:2.6.1d--0' :
-        'quay.io/biocontainers/star:2.6.1d--0' }"
+        "https://depot.galaxyproject.org/singularity/${container_id}" :
+        "quay.io/biocontainers/${container_id}" }"
 
     input:
     tuple val(meta), path(reads)
-    path  index
-    path  gtf
+    path index
+    path gtf
+    val star_ignore_sjdbgtf
+    val seq_platform
+    val seq_center
+    val is_aws_igenome
 
     output:
     tuple val(meta), path('*d.out.bam')       , emit: bam
     tuple val(meta), path('*Log.final.out')   , emit: log_final
     tuple val(meta), path('*Log.out')         , emit: log_out
     tuple val(meta), path('*Log.progress.out'), emit: log_progress
-    path "versions.yml"                       , emit: versions
+    path  "versions.yml"                      , emit: versions
 
     tuple val(meta), path('*sortedByCoord.out.bam')  , optional:true, emit: bam_sorted
     tuple val(meta), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript
     tuple val(meta), path('*Aligned.unsort.out.bam') , optional:true, emit: bam_unsorted
     tuple val(meta), path('*fastq.gz')               , optional:true, emit: fastq
     tuple val(meta), path('*.tab')                   , optional:true, emit: tab
+    tuple val(meta), path('*.out.junction')          , optional:true, emit: junction
+    tuple val(meta), path('*.out.sam')               , optional:true, emit: sam
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def ignore_gtf = params.star_ignore_sjdbgtf ? '' : "--sjdbGTFfile $gtf"
-    def seq_center = params.seq_center ? "--outSAMattrRGline ID:$prefix 'CN:$params.seq_center' 'SM:$prefix'" : "--outSAMattrRGline ID:$prefix 'SM:$prefix'"
-    def out_sam_type = (args.contains('--outSAMtype')) ? '' : '--outSAMtype BAM Unsorted'
+
+    // Note: STAR 2.7.X is incompatible with the AWS iGenomes indices, so use the older STAR version when an AWS iGenomes reference is detected
+    conda_str = "bioconda::star=2.7.9a bioconda::samtools=1.13 conda-forge::gawk=5.1.0"
+    container_id = 'mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:a7908dfb0485a80ca94e4d17b0ac991532e4e989-0'
+    if (is_aws_igenome) {
+        conda_str = "bioconda::star=2.6.1d bioconda::samtools=1.10 conda-forge::gawk=5.1.0"
+        container_id = 'mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0'
+    }
+
+    def ignore_gtf      = star_ignore_sjdbgtf ? '' : "--sjdbGTFfile $gtf"
+    def seq_platform    = seq_platform ? "'PL:$seq_platform'" : ""
+    def seq_center      = seq_center ? "--outSAMattrRGline ID:$prefix 'CN:$seq_center' 'SM:$prefix' $seq_platform " : "--outSAMattrRGline ID:$prefix 'SM:$prefix' $seq_platform "
+    def out_sam_type    = (args.contains('--outSAMtype')) ? '' : '--outSAMtype BAM Unsorted'
     def mv_unsorted_bam = (args.contains('--outSAMtype BAM Unsorted SortedByCoordinate')) ? "mv ${prefix}.Aligned.out.bam ${prefix}.Aligned.unsort.out.bam" : ''
     """
     STAR \\
@@ -61,6 +76,8 @@ process STAR_ALIGN {
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         star: \$(STAR --version | sed -e "s/STAR_//g")
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+        gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//')
     END_VERSIONS
     """
 }
diff --git a/modules/local/star_genomegenerate.nf b/modules/local/star_genomegenerate.nf
@@ -2,15 +2,15 @@ process STAR_GENOMEGENERATE {
     tag "$fasta"
     label 'process_high'
 
-    // Note: 2.7X indices incompatible with AWS iGenomes.
-    conda (params.enable_conda ? "bioconda::star=2.6.1d bioconda::samtools=1.10 conda-forge::gawk=5.1.0" : null)
+    conda (params.enable_conda ? conda_str : null)
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0' :
-        'quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0' }"
+        "https://depot.galaxyproject.org/singularity/${container_id}" :
+        "quay.io/biocontainers/${container_id}" }"
 
     input:
     path fasta
     path gtf
+    val  is_aws_igenome
 
     output:
     path "star"        , emit: index
@@ -20,9 +20,19 @@ process STAR_GENOMEGENERATE {
     task.ext.when == null || task.ext.when
 
     script:
-    def args   = (task.ext.args ?: '').tokenize()
+    def args = task.ext.args ?: ''
+    def args_list = args.tokenize()
+
+    // Note: STAR 2.7.X is incompatible with the AWS iGenomes indices, so use the older STAR version when an AWS iGenomes reference is detected
+    conda_str = "bioconda::star=2.7.9a bioconda::samtools=1.13 conda-forge::gawk=5.1.0"
+    container_id = 'mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:a7908dfb0485a80ca94e4d17b0ac991532e4e989-0'
+    if (is_aws_igenome) {
+        conda_str = "bioconda::star=2.6.1d bioconda::samtools=1.10 conda-forge::gawk=5.1.0"
+        container_id = 'mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0'
+    }
+
     def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : ''
-    if (args.contains('--genomeSAindexNbases')) {
+    if (args_list.contains('--genomeSAindexNbases')) {
         """
         mkdir star
         STAR \\
@@ -32,11 +42,13 @@ process STAR_GENOMEGENERATE {
             --sjdbGTFfile $gtf \\
             --runThreadN $task.cpus \\
             $memory \\
-            ${args.join(' ')}
+            $args
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
             star: \$(STAR --version | sed -e "s/STAR_//g")
+            samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+            gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//')
         END_VERSIONS
         """
     } else {
@@ -53,11 +65,13 @@ process STAR_GENOMEGENERATE {
             --runThreadN $task.cpus \\
             --genomeSAindexNbases \$NUM_BASES \\
             $memory \\
-            ${args.join(' ')}
+            $args
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
             star: \$(STAR --version | sed -e "s/STAR_//g")
+            samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+            gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//')
         END_VERSIONS
         """
     }
diff --git a/subworkflows/local/align_star.nf b/subworkflows/local/align_star.nf
@@ -7,9 +7,13 @@ include { BAM_SORT_SAMTOOLS } from '../nf-core/bam_sort_samtools'
 
 workflow ALIGN_STAR {
     take:
-    reads // channel: [ val(meta), [ reads ] ]
-    index // channel: /path/to/star/index/
-    gtf   // channel: /path/to/genome.gtf
+    reads               // channel: [ val(meta), [ reads ] ]
+    index               // channel: /path/to/star/index/
+    gtf                 // channel: /path/to/genome.gtf
+    star_ignore_sjdbgtf // boolean: when using pre-built STAR indices, do not re-extract and use splice junctions from the GTF file
+    seq_platform        // string : sequencing platform
+    seq_center          // string : sequencing center
+    is_aws_igenome      // boolean: whether the genome files are from AWS iGenomes
 
     main:
 
@@ -18,7 +22,7 @@ workflow ALIGN_STAR {
     //
     // Map reads with STAR
     //
-    STAR_ALIGN ( reads, index, gtf )
+    STAR_ALIGN ( reads, index, gtf, star_ignore_sjdbgtf, seq_platform, seq_center, is_aws_igenome )
     ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first())
 
     //
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
@@ -31,9 +31,9 @@ include { STAR_GENOMEGENERATE  } from '../../modules/local/star_genomegenerate'
 
 workflow PREPARE_GENOME {
     take:
-    prepare_tool_indices // list  : tools to prepare indices for
-    biotype              // string: if additional fasta file is provided
-                        //         biotype value to use when appending entries to GTF file
+    prepare_tool_indices // list   : tools to prepare indices for
+    biotype              // string : if additional fasta file is provided, biotype value to use when appending entries to GTF file
+    is_aws_igenome       // boolean: whether the genome files are from AWS iGenomes
 
     main:
 
@@ -166,7 +166,7 @@ workflow PREPARE_GENOME {
                 ch_star_index = file(params.star_index)
             }
         } else {
-            ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf ).index
+            ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf, is_aws_igenome ).index
             ch_versions   = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
         }
     }
diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf
@@ -66,6 +66,14 @@ if (anno_readme && file(anno_readme).exists()) {
 // Stage dummy file to be used as an optional input where required
 ch_dummy_file = file("$projectDir/assets/dummy_file.txt", checkIfExists: true)
 
+// Check if an AWS iGenome has been provided to use the appropriate version of STAR
+def is_aws_igenome = false
+if (params.fasta && params.gtf) {
+    if ((file(params.fasta).getName() - '.gz' == 'genome.fa') && (file(params.gtf).getName() - '.gz' == 'genes.gtf')) {
+        is_aws_igenome = true
+    }    
+}
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     CONFIG FILES
@@ -162,7 +170,9 @@ workflow RNASEQ {
     def biotype = params.gencode ? "gene_type" : params.featurecounts_group_type
     PREPARE_GENOME (
         prepareToolIndices,
-        biotype
+        biotype,
+        is_aws_igenome
+
     )
     ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)
 
@@ -267,7 +277,11 @@ workflow RNASEQ {
         ALIGN_STAR (
             ch_filtered_reads,
             PREPARE_GENOME.out.star_index,
-            PREPARE_GENOME.out.gtf
+            PREPARE_GENOME.out.gtf,
+            params.star_ignore_sjdbgtf,
+            '',
+            params.seq_center ?: '',
+            is_aws_igenome
         )
         ch_genome_bam        = ALIGN_STAR.out.bam
         ch_genome_bam_index  = ALIGN_STAR.out.bai

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# ![nf-core/rnaseq](docs/images/nf-core/rnaseq_logo_light.png#gh-light-mode-only) ![nf-core/rnaseq](docs/images/nf-core/rnaseq_logo_dark.png#gh-dark-mode-only)`
	`1`	`+# ![nf-core/rnaseq](docs/images/nf-core-rnaseq_logo_light.png#gh-light-mode-only) ![nf-core/rnaseq](docs/images/nf-core-rnaseq_logo_dark.png#gh-dark-mode-only)`
`2`	`2`
`3`	`3`	`[![GitHub Actions CI Status](https://github.com/nf-core/rnaseq/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/rnaseq/actions?query=workflow%3A%22nf-core+CI%22)`
`4`	`4`	`[![GitHub Actions Linting Status](https://github.com/nf-core/rnaseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/rnaseq/actions?query=workflow%3A%22nf-core+linting%22)`
Original file line number	Diff line number	Diff line change
`@@ -31,9 +31,9 @@ include { STAR_GENOMEGENERATE } from '../../modules/local/star_genomegenerate'`
`31`	`31`
`32`	`32`	`workflow PREPARE_GENOME {`
`33`	`33`	`take:`
`34`		`- prepare_tool_indices // list : tools to prepare indices for`
`35`		`- biotype // string: if additional fasta file is provided`
`36`		`- // biotype value to use when appending entries to GTF file`
	`34`	`+ prepare_tool_indices // list : tools to prepare indices for`
	`35`	`+ biotype // string : if additional fasta file is provided biotype value to use when appending entries to GTF file`
	`36`	`+ is_aws_igenome // boolean: whether the genome files are from AWS iGenomes`
`37`	`37`
`38`	`38`	`main:`
`39`	`39`
`@@ -166,7 +166,7 @@ workflow PREPARE_GENOME {`
`166`	`166`	`ch_star_index = file(params.star_index)`
`167`	`167`	`}`
`168`	`168`	`} else {`
`169`		`- ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf ).index`
	`169`	`+ ch_star_index = STAR_GENOMEGENERATE ( ch_fasta, ch_gtf, is_aws_igenome ).index`
`170`	`170`	`ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)`
`171`	`171`	`}`
`172`	`172`	`}`