@@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
101101 else :
102102 SeqIO .write ([chrom_seqs [s ]], f , "fasta" )
103103 ## Read in new GFF features from in_gff, False means modify existing chrom
104- in_gff_lines = get_in_gff_lines (in_arg .in_gff [index ])
104+ in_gff_lines = get_in_gff_lines (in_gff = in_arg .in_gff [index ], existing_chrom = in_arg . chrom , new_chrom = None )
105105 ## Create a temp file for gff, if index is not equal to last iteration
106106 annotation_name , annotation_ext = get_ref_basename (in_arg .ref_gff )
107107 if index < iterations - 1 :
@@ -157,7 +157,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
157157 SeqIO .write ([new_record ], f , "fasta" )
158158 ## Read in new GFF features from in_gff
159159 ## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line
160- in_gff_lines = get_in_gff_lines (in_arg .in_gff [index ], in_arg .new_chrom [index ], len (new_seq ))
160+ in_gff_lines = get_in_gff_lines (in_gff = in_arg .in_gff [index ], new_chrom = in_arg .new_chrom [index ], sequence_length = len (new_seq ))
161161 ## Create a temp file for gff, if index is not equal to last iteration
162162 annotation_name , annotation_ext = get_ref_basename (in_arg .ref_gff )
163163 if index < iterations - 1 :
@@ -190,6 +190,19 @@ def read_fasta(in_arg, index, prev_fasta_path):
190190 raise FileNotFoundError (f"Error: File { filename_fa } does not exist." )
191191 real_path_fa = os .path .realpath (filename_fa )
192192 record = list (SeqIO .parse (in_arg .in_fasta [index ], "fasta" ))[0 ]
193+ # Check for mismatch between FASTA record ID and command line chromosome name
194+ if hasattr (in_arg , 'new_chrom' ) and in_arg .new_chrom is not None :
195+ if record .id != in_arg .new_chrom [index ]:
196+ print (f"** WARNING: Mismatch detected between chromosome name in input FASTA ({ record .id } ) "
197+ f"and command line parameter ({ in_arg .new_chrom [index ]} )." )
198+ print (f"Using command line chromosome name: { in_arg .new_chrom [index ]} " )
199+ # The actual override happens in add_new_chrom_seq where a new SeqRecord is created
200+ elif hasattr (in_arg , 'chrom' ) and in_arg .chrom is not None :
201+ if record .id != in_arg .chrom :
202+ print (f"** WARNING: Mismatch detected between chromosome name in input FASTA ({ record .id } ) "
203+ f"and command line parameter ({ in_arg .chrom } )." )
204+ print (f"Using command line chromosome name: { in_arg .chrom } " )
205+ # The actual override happens in modify_existing_chrom_seq where the existing sequence is modified
193206 except IndexError :
194207 raise ValueError (f"Error: { filename_fa } is not a valid FASTA file." )
195208 except Exception as e :
@@ -289,7 +302,7 @@ def valid_gff_line(line_elements):
289302 return False
290303 return True
291304
292- def get_in_gff_lines (in_gff , new_chrom = None , sequence_length = None ):
305+ def get_in_gff_lines (in_gff = None , existing_chrom = None , new_chrom = None , sequence_length = None ):
293306 '''
294307 Takes a gff file and returns a list of lists where
295308 each parent list item is a single line of the gff file
@@ -303,7 +316,7 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
303316 continue
304317
305318 # Handle differently based on whether we're adding a new chromosome or modifying existing
306- if line .startswith ("##sequence-region" ):
319+ if line .startswith ("##sequence-region" ) and new_chrom is not None :
307320 ## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome.
308321 ## Select user used delimiter based on content
309322 if '\t ' in line :
@@ -336,6 +349,11 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
336349 else :
337350 ## Split, check and add feature lines
338351 line_elements = line .split ('\t ' )
352+ chorme_id = existing_chrom if existing_chrom else new_chrom
353+ if line_elements [0 ] != chorme_id :
354+ print ("** Warning: The chromosome name in the GFF file does not match the new chromosome name." )
355+ print (f"Correct the chromosome name { line_elements [0 ]} to { chorme_id } " )
356+ line_elements [0 ] = chorme_id
339357 if not valid_gff_line (line_elements ):
340358 exit ()
341359 in_gff_lines .append (line_elements )
@@ -667,6 +685,7 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
667685 """
668686 Appends new annotations to an existing GFF file without modifying existing features.
669687 """
688+ gff_splitor = ''
670689 ref_gff_path = ref_gff
671690 ## Handle compressed .gz GFF files
672691 if ref_gff .endswith ('.gz' ):
@@ -681,15 +700,23 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
681700 ## Copy all existing annotations to new GFF file
682701 with open (ref_gff_path , "r" , encoding = "utf-8" ) as f :
683702 for line in f :
703+ if line .startswith ("##sequence-region" ) and gff_splitor == '' :
704+ ## Select user used delimiter based on content
705+ if '\t ' in line :
706+ gff_splitor = ('\t ' )
707+ else :
708+ gff_splitor = (' ' )
684709 gff_out .write (line )
685710 ## Append new annotations if present
686711 if in_gff_lines :
687712 print (f"Appending { len (in_gff_lines )} new annotations to chromosome { chrom_id } ." )
688713 for new_annotation in in_gff_lines :
689714 if new_annotation [0 ] == "##sequence-region" :
690715 ## Use predefined format for sequence-region line
716+ if gff_splitor == '' :
717+ gff_splitor = new_annotation [- 1 ]
691718 ## Remove format indicater, and add new line
692- gff_out .write (new_annotation [ - 1 ] .join (new_annotation [:- 1 ])+ '\n ' )
719+ gff_out .write (gff_splitor .join (new_annotation [:- 1 ])+ '\n ' )
693720 elif new_annotation :
694721 gff_out .write ("\t " .join (new_annotation ))
695722
0 commit comments