Skip to content

Commit c4a416d

Browse files
committed
Make fasta optional for gtf filtering
1 parent a69d1d2 commit c4a416d

File tree

2 files changed

+14 -9 lines changed

2 files changed

+14 -9 lines changed

bin/filter_gtf.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import argparse
77
import re
88
import statistics
9-
from typing import Set
9+
from typing import Optional, Set
1010

1111
# Create a logger
1212
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
@@ -27,14 +27,15 @@ def tab_delimited(file: str) -> float:
2727
return statistics.median(line.count("\t") for line in data.split("\n"))
2828

2929

30-
def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
30+
def filter_gtf(fasta: Optional[str], gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
3131
"""Filter GTF file based on FASTA sequence names."""
3232
if tab_delimited(gtf_in) != 8:
3333
raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.")
3434

35-
seq_names_in_genome = extract_fasta_seq_names(fasta)
36-
logger.info(f"Extracted chromosome sequence names from {fasta}")
37-
logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))
35+
if (fasta is not None):
36+
seq_names_in_genome = extract_fasta_seq_names(fasta)
37+
logger.info(f"Extracted chromosome sequence names from {fasta}")
38+
logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))
3839

3940
seq_names_in_gtf = set()
4041
try:
@@ -44,7 +45,7 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_i
4445
seq_name = line.split("\t")[0]
4546
seq_names_in_gtf.add(seq_name) # Add sequence name to the set
4647

47-
if seq_name in seq_names_in_genome:
48+
if fasta is None or seq_name in seq_names_in_genome:
4849
if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line):
4950
out.write(line)
5051
line_count += 1
@@ -63,7 +64,7 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_i
6364
if __name__ == "__main__":
6465
parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
6566
parser.add_argument("--gtf", type=str, required=True, help="GTF file")
66-
parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
67+
parser.add_argument("--fasta", type=str, required=False, help="Genome fasta file")
6768
parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
6869
parser.add_argument(
6970
"--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file"

modules/local/gtf_filter/main.nf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,15 @@ process GTF_FILTER {
1818
task.ext.when == null || task.ext.when
1919

2020
script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/
21+
fasta_text=''
22+
if (fasta){
23+
fasta_text="--fasta $fasta"
24+
}
2125
"""
2226
filter_gtf.py \\
2327
--gtf $gtf \\
24-
--fasta $fasta \\
25-
--prefix ${fasta.baseName}
28+
$fasta_text \\
29+
--prefix ${gtf.baseName}
2630
2731
cat <<-END_VERSIONS > versions.yml
2832
"${task.process}":

0 commit comments

Comments
 (0)