Skip to content

Commit 7a04c5d

Browse files
authored
Merge pull request #1107 from nf-core/stringtie_gtf
Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
2 parents ab9df9a + b67558d commit 7a04c5d

File tree

14 files changed

+156
-115
lines changed

14 files changed

+156
-115
lines changed

CHANGELOG.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
44
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
55

6-
## v3.13.0dev - [date]
6+
## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17
77

88
### Credits
99

@@ -30,8 +30,12 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
3030
- [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10
3131
- [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability
3232
- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification
33-
- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - MultiQC [version bump](https://github.com/nf-core/rnaseq/pull/1106/commits/aebad067a10a45510a2b421da852cb436ae65fd8)
33+
- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
3434
- [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting
35+
- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom Ensembl genome
36+
- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for `filter_gtf_for_genes_in_genome.py`
37+
- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields
3539

3640
### Software dependencies
3741

assets/multiqc_config.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
report_comment: >
2-
This report has been generated by the <a href="https://github.com/nf-core/rnaseq/releases/tag/dev" target="_blank">nf-core/rnaseq</a>
2+
This report has been generated by the <a href="https://github.com/nf-core/rnaseq/releases/tag/3.13.0" target="_blank">nf-core/rnaseq</a>
33
analysis pipeline. For information about how to interpret these results, please see the
4-
<a href="https://nf-co.re/rnaseq/dev/docs/output" target="_blank">documentation</a>.
4+
<a href="https://nf-co.re/rnaseq/3.13.0/docs/output" target="_blank">documentation</a>.
55
report_section_order:
66
"nf-core-rnaseq-methods-description":
77
order: -1000

bin/filter_gtf.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python
2+
import logging
3+
import argparse
4+
import re
5+
import statistics
6+
from typing import Set
7+
8+
# Configure the log record format and create the logger used throughout this script
9+
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
10+
logger = logging.getLogger("fasta_gtf_filter")
11+
# INFO threshold: the logger.debug() calls in this script emit nothing unless this level is lowered
logger.setLevel(logging.INFO)
12+
13+
14+
def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
    """Extract the sequence names from a FASTA file.

    The name of a record is the first whitespace-delimited token that follows
    the leading ">" of its header line, e.g. ">chr1 some description" yields
    "chr1".

    Args:
        fasta_name: Path to the FASTA file.

    Returns:
        The set of sequence names found in the file.
    """
    seq_names = set()
    with open(fasta_name) as fasta:
        for line in fasta:
            if not line.startswith(">"):
                continue
            # A header with nothing after ">" splits to an empty list; skip it
            # rather than failing with an IndexError.
            tokens = line[1:].split(None, 1)
            if tokens:
                seq_names.add(tokens[0])
    return seq_names
18+
19+
20+
def tab_delimited(file: str) -> float:
    """Return the median number of tab characters per data line at the start of a file.

    Only the first 1024 characters are sampled. Blank lines and "#" comment
    lines are ignored so that header lines do not drag the median down, and a
    final line cut short by the sample limit is discarded.

    Args:
        file: Path to the file to inspect.

    Returns:
        The median tab count of the sampled data lines, or 0 when the sample
        contains no data lines at all.
    """
    sample_size = 1024
    with open(file, "r") as f:
        data = f.read(sample_size)

    lines = data.split("\n")
    # A full-size sample almost certainly ends mid-line; the last piece is
    # then a partial row whose tab count is not representative.
    if len(data) == sample_size and len(lines) > 1:
        lines.pop()

    tab_counts = [line.count("\t") for line in lines if line.strip() and not line.startswith("#")]
    if not tab_counts:
        return 0
    return statistics.median(tab_counts)
25+
26+
27+
def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
    """Filter a GTF file down to the rows usable with a given genome FASTA.

    A row is written to the output when its sequence name (first column) is
    one of the FASTA sequence names and, unless ``skip_transcript_id_check``
    is set, the row carries a non-empty ``transcript_id "..."`` attribute.

    Args:
        fasta: Path to the genome FASTA file.
        gtf_in: Path to the GTF file to filter.
        filtered_gtf_out: Path the filtered GTF is written to.
        skip_transcript_id_check: When true, rows are kept regardless of
            whether they have a transcript_id.

    Raises:
        ValueError: If the start of ``gtf_in`` does not look like a
            9-column tab-separated file, or if no row survives the filters.
    """
    # A GTF row has 9 columns, which means 8 tab characters per line.
    if tab_delimited(gtf_in) != 8:
        raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.")

    seq_names_in_genome = extract_fasta_seq_names(fasta)
    logger.info(f"Extracted chromosome sequence names from {fasta}")
    logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))

    # Compiled once up front instead of being looked up for every row.
    transcript_id_pattern = re.compile(r'transcript_id "([^"]+)"')

    seq_names_in_gtf = set()
    line_count = 0
    try:
        with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out:
            for line in gtf:
                # Comment and blank lines carry no sequence name. They were
                # never written out; skipping them here just keeps them out
                # of the debug listing of GTF sequence names.
                if line.startswith("#") or not line.strip():
                    continue

                seq_name = line.split("\t", 1)[0]
                seq_names_in_gtf.add(seq_name)

                if seq_name not in seq_names_in_genome:
                    continue
                if skip_transcript_id_check or transcript_id_pattern.search(line):
                    out.write(line)
                    line_count += 1

            if line_count == 0:
                raise ValueError("All GTF lines removed by filters")

    except IOError as e:
        # File errors are logged and the function returns normally; the
        # ValueError above is not an IOError and still propagates.
        logger.error(f"File operation failed: {e}")
        return

    logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf)))
    logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}")
58+
59+
60+
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the filter, writing
    # the result to "<prefix>.filtered.gtf".
    cli = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
    cli.add_argument("--gtf", type=str, required=True, help="GTF file")
    cli.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
    cli.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
    cli.add_argument(
        "--skip_transcript_id_check",
        action="store_true",
        help="Skip checking for transcript IDs in the GTF file",
    )

    options = cli.parse_args()
    output_gtf = options.prefix + ".filtered.gtf"
    filter_gtf(
        fasta=options.fasta,
        gtf_in=options.gtf,
        filtered_gtf_out=output_gtf,
        skip_transcript_id_check=options.skip_transcript_id_check,
    )

bin/filter_gtf_for_genes_in_genome.py

Lines changed: 0 additions & 78 deletions
This file was deleted.

conf/modules.config

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@ process {
117117
]
118118
}
119119

120-
withName: 'GTF_GENE_FILTER' {
120+
withName: 'GTF_FILTER' {
121+
// Pass the flag only when the user asked to skip the transcript_id check; otherwise pass nothing
ext.args = { params.skip_gtf_transcript_filter ? '--skip_transcript_id_check' : '' }
121122
publishDir = [
122123
path: { "${params.outdir}/genome" },
123124
mode: params.publish_dir_mode,

docs/usage.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@ Notes:
198198

199199
- As of v3.7 of the pipeline, if you are using a genome downloaded from AWS iGenomes and using `--aligner star_salmon` (default) the version of STAR to use for the alignment will be auto-detected (see [#808](https://github.com/nf-core/rnaseq/issues/808)).
200200

201+
### GTF filtering
202+
203+
By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter.
204+
201205
## Running the pipeline
202206

203207
The typical command for running the pipeline is as follows:

modules.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@
7171
"installed_by": ["modules"]
7272
},
7373
"kallisto/quant": {
74-
"branch": "kallisto_updates",
75-
"git_sha": "bc4719dcd079fcdb650ddeac05739c2f7dd58c84",
74+
"branch": "master",
75+
"git_sha": "bdc2a97ced7adc423acfa390742db83cab98c1ad",
7676
"installed_by": ["modules"]
7777
},
7878
"picard/markduplicates": {

modules/local/gtf_gene_filter/main.nf renamed to modules/local/gtf_filter/main.nf

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
process GTF_GENE_FILTER {
1+
process GTF_FILTER {
22
tag "$fasta"
33

44
conda "conda-forge::python=3.9.5"
@@ -11,18 +11,18 @@ process GTF_GENE_FILTER {
1111
path gtf
1212

1313
output:
14-
path "*.gtf" , emit: gtf
15-
path "versions.yml", emit: versions
14+
path "*.filtered.gtf", emit: genome_gtf
15+
path "versions.yml" , emit: versions
1616

1717
when:
1818
task.ext.when == null || task.ext.when
1919

20-
script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/
20+
script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/
2121
"""
22-
filter_gtf_for_genes_in_genome.py \\
22+
filter_gtf.py \\
2323
--gtf $gtf \\
2424
--fasta $fasta \\
25-
-o ${fasta.baseName}_genes.gtf
25+
--prefix ${fasta.baseName}
2626
2727
cat <<-END_VERSIONS > versions.yml
2828
"${task.process}":

modules/local/multiqc/main.nf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ process MULTIQC {
33

44
conda "bioconda::multiqc=1.17"
55
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
6-
'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' :
7-
'biocontainers/multiqc:1.17--pyhdfd78af_0' }"
6+
'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_1' :
7+
'biocontainers/multiqc:1.17--pyhdfd78af_1' }"
88

99
input:
1010
path multiqc_config

modules/nf-core/kallisto/quant/main.nf

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)