66import argparse
77import re
88import statistics
9- from typing import Set
9+ from typing import Optional , Set
1010
1111# Create a logger
1212logging .basicConfig (format = "%(name)s - %(asctime)s %(levelname)s: %(message)s" )
@@ -27,14 +27,15 @@ def tab_delimited(file: str) -> float:
2727 return statistics .median (line .count ("\t " ) for line in data .split ("\n " ))
2828
2929
30- def filter_gtf (fasta : str , gtf_in : str , filtered_gtf_out : str , skip_transcript_id_check : bool ) -> None :
30+ def filter_gtf (fasta : Optional [ str ] , gtf_in : str , filtered_gtf_out : str , skip_transcript_id_check : bool ) -> None :
3131 """Filter GTF file based on FASTA sequence names."""
3232 if tab_delimited (gtf_in ) != 8 :
3333 raise ValueError ("Invalid GTF file: Expected 9 tab-separated columns." )
3434
35- seq_names_in_genome = extract_fasta_seq_names (fasta )
36- logger .info (f"Extracted chromosome sequence names from { fasta } " )
37- logger .debug ("All sequence IDs from FASTA: " + ", " .join (sorted (seq_names_in_genome )))
35+ if (fasta is not None ):
36+ seq_names_in_genome = extract_fasta_seq_names (fasta )
37+ logger .info (f"Extracted chromosome sequence names from { fasta } " )
38+ logger .debug ("All sequence IDs from FASTA: " + ", " .join (sorted (seq_names_in_genome )))
3839
3940 seq_names_in_gtf = set ()
4041 try :
@@ -44,7 +45,7 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_i
4445 seq_name = line .split ("\t " )[0 ]
4546 seq_names_in_gtf .add (seq_name ) # Add sequence name to the set
4647
47- if seq_name in seq_names_in_genome :
48+ if fasta is None or seq_name in seq_names_in_genome :
4849 if skip_transcript_id_check or re .search (r'transcript_id "([^"]+)"' , line ):
4950 out .write (line )
5051 line_count += 1
@@ -63,7 +64,7 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_i
6364if __name__ == "__main__" :
6465 parser = argparse .ArgumentParser (description = "Filters a GTF file based on sequence names in a FASTA file." )
6566 parser .add_argument ("--gtf" , type = str , required = True , help = "GTF file" )
66- parser .add_argument ("--fasta" , type = str , required = True , help = "Genome fasta file" )
67+ parser .add_argument ("--fasta" , type = str , required = False , help = "Genome fasta file" )
6768 parser .add_argument ("--prefix" , dest = "prefix" , default = "genes" , type = str , help = "Prefix for output GTF files" )
6869 parser .add_argument (
6970 "--skip_transcript_id_check" , action = "store_true" , help = "Skip checking for transcript IDs in the GTF file"
0 commit comments