Merge pull request #1107 from nf-core/stringtie_gtf

drpatelh · web-flow · commit 7a04c5d571bf · 2023-11-15T18:07:24.000Z
Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v3.13.0dev - [date]
+## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17
 
 ### Credits
 
@@ -30,8 +30,12 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
 - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10
 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability
 - [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification
-- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - MultiQC [version bump](https://github.com/nf-core/rnaseq/pull/1106/commits/aebad067a10a45510a2b421da852cb436ae65fd8)
+- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
 - [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting
+- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom
+- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for `filter_gtf_for_genes_in_genome.py`
+- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields
+  Ensembl genome
 
 ### Software dependencies
 
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -1,7 +1,7 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/nf-core/rnaseq/releases/tag/dev" target="_blank">nf-core/rnaseq</a>
+  This report has been generated by the <a href="https://github.com/nf-core/rnaseq/releases/tag/3.13.0" target="_blank">nf-core/rnaseq</a>
   analysis pipeline. For information about how to interpret these results, please see the
-  <a href="https://nf-co.re/rnaseq/dev/docs/output" target="_blank">documentation</a>.
+  <a href="https://nf-co.re/rnaseq/3.13.0/docs/output" target="_blank">documentation</a>.
 report_section_order:
   "nf-core-rnaseq-methods-description":
     order: -1000
diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+import logging
+import argparse
+import re
+import statistics
+from typing import Set
+
+# Create a logger
+logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger("fasta_gtf_filter")
+logger.setLevel(logging.INFO)
+
+
+def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
+    """Extracts the sequence names from a FASTA file."""
+    with open(fasta_name) as fasta:
+        return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")}
+
+
+def tab_delimited(file: str) -> float:
+    """Check if file is tab-delimited and return median number of tabs."""
+    with open(file, "r") as f:
+        data = f.read(1024)
+        return statistics.median(line.count("\t") for line in data.split("\n"))
+
+
+def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
+    """Filter GTF file based on FASTA sequence names."""
+    if tab_delimited(gtf_in) != 8:
+        raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.")
+
+    seq_names_in_genome = extract_fasta_seq_names(fasta)
+    logger.info(f"Extracted chromosome sequence names from {fasta}")
+    logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))
+
+    seq_names_in_gtf = set()
+    try:
+        with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out:
+            line_count = 0
+            for line in gtf:
+                seq_name = line.split("\t")[0]
+                seq_names_in_gtf.add(seq_name)  # Add sequence name to the set
+
+                if seq_name in seq_names_in_genome:
+                    if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line):
+                        out.write(line)
+                        line_count += 1
+
+            if line_count == 0:
+                raise ValueError("All GTF lines removed by filters")
+
+    except IOError as e:
+        logger.error(f"File operation failed: {e}")
+        return
+
+    logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf)))
+    logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
+    parser.add_argument("--gtf", type=str, required=True, help="GTF file")
+    parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
+    parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
+    parser.add_argument(
+        "--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file"
+    )
+
+    args = parser.parse_args()
+    filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check)
diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py
diff --git a/conf/modules.config b/conf/modules.config
@@ -117,7 +117,8 @@ process {
         ]
     }
 
-    withName: 'GTF_GENE_FILTER' {
+    withName: 'GTF_FILTER' {
+        ext.args   = { params.skip_gtf_transcript_filter ?: '--skip_transcript_id_check' }
         publishDir = [
             path: { "${params.outdir}/genome" },
             mode: params.publish_dir_mode,
diff --git a/docs/usage.md b/docs/usage.md
@@ -198,6 +198,10 @@ Notes:
 
 - As of v3.7 of the pipeline, if you are using a genome downloaded from AWS iGenomes and using `--aligner star_salmon` (default) the version of STAR to use for the alignment will be auto-detected (see [#808](https://github.com/nf-core/rnaseq/issues/808)).
 
+### GTF filtering
+
+By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter.
+
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:
diff --git a/modules.json b/modules.json
@@ -71,8 +71,8 @@
                         "installed_by": ["modules"]
                     },
                     "kallisto/quant": {
-                        "branch": "kallisto_updates",
-                        "git_sha": "bc4719dcd079fcdb650ddeac05739c2f7dd58c84",
+                        "branch": "master",
+                        "git_sha": "bdc2a97ced7adc423acfa390742db83cab98c1ad",
                         "installed_by": ["modules"]
                     },
                     "picard/markduplicates": {
diff --git a/modules/local/gtf_filter/main.nf b/modules/local/gtf_filter/main.nf
@@ -1,4 +1,4 @@
-process GTF_GENE_FILTER {
+process GTF_FILTER {
     tag "$fasta"
 
     conda "conda-forge::python=3.9.5"
@@ -11,18 +11,18 @@ process GTF_GENE_FILTER {
     path gtf
 
     output:
-    path "*.gtf"       , emit: gtf
-    path "versions.yml", emit: versions
+    path "*.filtered.gtf", emit: genome_gtf
+    path "versions.yml"  , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
-    script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/
+    script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/
     """
-    filter_gtf_for_genes_in_genome.py \\
+    filter_gtf.py \\
         --gtf $gtf \\
         --fasta $fasta \\
-        -o ${fasta.baseName}_genes.gtf
+        --prefix ${fasta.baseName}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf
@@ -3,8 +3,8 @@ process MULTIQC {
 
     conda "bioconda::multiqc=1.17"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' :
-        'biocontainers/multiqc:1.17--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_1' :
+        'biocontainers/multiqc:1.17--pyhdfd78af_1' }"
 
     input:
     path multiqc_config
diff --git a/modules/nf-core/kallisto/quant/main.nf b/modules/nf-core/kallisto/quant/main.nf
diff --git a/nextflow.config b/nextflow.config
@@ -17,6 +17,8 @@ params {
     splicesites                = null
     gtf_extra_attributes       = 'gene_name'
     gtf_group_features         = 'gene_id'
+    skip_gtf_filter            = false
+    skip_gtf_transcript_filter = false
     featurecounts_feature_type = 'exon'
     featurecounts_group_type   = 'gene_biotype'
     gencode                    = false
@@ -315,7 +317,7 @@ manifest {
     description     = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control."""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '3.13.0dev'
+    version         = '3.13.0'
     doi             = 'https://doi.org/10.5281/zenodo.1400710'
 }
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -520,6 +520,17 @@
             "fa_icon": "fas fa-fast-forward",
             "description": "Options to skip various steps within the workflow.",
             "properties": {
+                "skip_gtf_filter": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-forward",
+                    "description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.",
+                    "help_text": "If you're confident on the validity of the GTF with respect to the genome fasta file, or wish to disregard failures thriggered by the filtering module, activate this option."
+                },
+                "skip_gtf_transcript_filter": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-forward",
+                    "description": "Skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline."
+                },
                 "skip_bbsplit": {
                     "type": "boolean",
                     "default": true,
diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
@@ -30,7 +30,7 @@ include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA       } from '../../..
 include { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE } from '../../../modules/local/preprocess_transcripts_fasta_gencode'
 include { GTF2BED                              } from '../../../modules/local/gtf2bed'
 include { CAT_ADDITIONAL_FASTA                 } from '../../../modules/local/cat_additional_fasta'
-include { GTF_GENE_FILTER                      } from '../../../modules/local/gtf_gene_filter'
+include { GTF_FILTER                           } from '../../../modules/local/gtf_filter'
 include { STAR_GENOMEGENERATE_IGENOMES         } from '../../../modules/local/star_genomegenerate_igenomes'
 
 workflow PREPARE_GENOME {
@@ -53,6 +53,7 @@ workflow PREPARE_GENOME {
     is_aws_igenome       //   boolean: whether the genome files are from AWS iGenomes
     biotype              //    string: if additional fasta file is provided biotype value to use when appending entries to GTF file
     prepare_tool_indices //      list: tools to prepare indices for
+    filter_gtf           //   boolean: whether to filter GTF file
 
     main:
 
@@ -71,22 +72,30 @@ workflow PREPARE_GENOME {
     //
     // Uncompress GTF annotation file or create from GFF3 if required
     //
-    if (gtf) {
-        if (gtf.endsWith('.gz')) {
-            ch_gtf      = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
-        } else {
-            ch_gtf = Channel.value(file(gtf))
+    if (gtf || gff) {
+        if (gtf) {
+            if (gtf.endsWith('.gz')) {
+                ch_gtf      = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
+                ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
+            } else {
+                ch_gtf = Channel.value(file(gtf))
+            }
+        } else if (gff) {
+            if (gff.endsWith('.gz')) {
+                ch_gff      = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
+                ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
+            } else {
+                ch_gff = Channel.value(file(gff))
+            }
+            ch_gtf      = GFFREAD ( ch_gff ).gtf
+            ch_versions = ch_versions.mix(GFFREAD.out.versions)
         }
-    } else if (gff) {
-        if (gff.endsWith('.gz')) {
-            ch_gff      = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
-        } else {
-            ch_gff = Channel.value(file(gff))
+
+    	if (filter_gtf) {
+            GTF_FILTER ( ch_fasta, ch_gtf )
+            ch_gtf = GTF_FILTER.out.genome_gtf
+            ch_versions = ch_versions.mix(GTF_FILTER.out.versions)
         }
-        ch_gtf      = GFFREAD ( ch_gff ).gtf
-        ch_versions = ch_versions.mix(GFFREAD.out.versions)
     }
 
     //
@@ -136,9 +145,8 @@ workflow PREPARE_GENOME {
             ch_versions         = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions)
         }
     } else {
-        ch_filter_gtf = GTF_GENE_FILTER ( ch_fasta, ch_gtf ).gtf
-        ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_filter_gtf ).transcript_fasta
-        ch_versions         = ch_versions.mix(GTF_GENE_FILTER.out.versions)
+        ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta
+        ch_versions         = ch_versions.mix(GTF_FILTER.out.versions)
         ch_versions         = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions)
     }
 
@@ -293,6 +301,5 @@ workflow PREPARE_GENOME {
     hisat2_index     = ch_hisat2_index           // channel: path(hisat2/index/)
     salmon_index     = ch_salmon_index           // channel: path(salmon/index/)
     kallisto_index   = ch_kallisto_index         // channel: [ meta, path(kallisto/index/) ]
-
     versions         = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
 }
diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf

Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,8 @@ process {`
`117`	`117`	`]`
`118`	`118`	`}`
`119`	`119`
`120`		`- withName: 'GTF_GENE_FILTER' {`
	`120`	`+ withName: 'GTF_FILTER' {`
	`121`	`+ ext.args = { params.skip_gtf_transcript_filter ?: '--skip_transcript_id_check' }`
`121`	`122`	`publishDir = [`
`122`	`123`	`path: { "${params.outdir}/genome" },`
`123`	`124`	`mode: params.publish_dir_mode,`