66from Bio import SeqIO
77from Bio .Seq import Seq
88from Bio .SeqRecord import SeqRecord
9+
10+ ## Importing gzip or pgzip module for file compression
11+ print ("------------------------------------------" )
12+ print (f"Compression Library Use:" )
913try :
1014 import pgzip as gzip_module
11- print (f"\n Using pgzip for gzip operations.\n " )
15+ print (f"Using pgzip for gzip operations." )
1216except ImportError :
1317 import gzip as gzip_module
14- print (f"\n pgzip not found, falling back to gzip.\n " )
18+ print (f"pgzip not found, falling back to gzip." )
1519
1620
1721def main ():
1822 ## Retrieve command line arguments and number of iterations
1923 in_arg , iterations = get_input_args ()
24+
25+ ## Print reference file paths at start of process
26+ print ("------------------------------------------" )
27+ print (f"Path to Reference Files:" )
28+ print (f"Reference FASTA: { os .path .realpath (in_arg .ref_fasta )} " )
29+ print (f"Reference Annotation: { os .path .realpath (in_arg .ref_gff )} " )
2030
2131 ## List for previous postion and modification length
2232 prev_modifications = []
@@ -28,24 +38,27 @@ def main():
2838 ## Sequential processing
2939 for index in range (iterations ):
3040 # Start interation
31- print ("-------------------------------------------" )
32- print (f"Begin modification from in{ index + 1 } .fa" )
33- print ("-------------------------------------------" )
3441 if hasattr (in_arg , 'chrom' ) and in_arg .chrom is not None :
3542 ## Modify existing chrom seq
43+ print ("-------------------------------------------" )
44+ print (f"Begin modification from { in_arg .in_fasta [index ]} " )
45+ print ("-------------------------------------------" )
3646 new_fasta , annotation_ext , new_gff_path , prev_fasta_path , prev_gff_path = \
3747 modify_existing_chrom_seq (in_arg , index , prev_fasta_path , prev_modifications , \
3848 iterations , prev_gff_path )
3949 else :
40- ## Add new chrom
50+ ## Add new chrom seq
51+ print ("-------------------------------------------" )
52+ print (f"Begin adding a new chromosome from { in_arg .in_fasta [index ]} " )
53+ print ("-------------------------------------------" )
4154 new_fasta , annotation_ext , new_gff_path , prev_fasta_path , prev_gff_path = \
4255 add_new_chrom_seq (in_arg , index , prev_fasta_path , prev_gff_path , iterations )
4356
4457 print ("------------------------------------------" )
4558 print (f"Reform Complete" )
46- print ("------------------------------------------" )
4759 print (f"New .fa file created: { os .path .realpath (new_fasta )} " )
4860 print (f"New { annotation_ext } file created: { os .path .realpath (new_gff_path )} " )
61+ print ("------------------------------------------" )
4962
5063def modify_existing_chrom_seq (in_arg , index , prev_fasta_path , prev_modifications , iterations , prev_gff_path ):
5164 """
@@ -72,7 +85,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
7285 prev_modifications .append ((position ,length_changed ))
7386 if position != down_position :
7487 print (f"Removing nucleotides from position { position } - { down_position } " )
75- print (f"Proceeding to insert sequence '{ record .description } ' from { in_arg .in_fasta [index ]} at position { position } on chromosome { in_arg .chrom } " )
88+ print (f"Proceeding to insert sequence '{ record .description . strip () } ' from { in_arg .in_fasta [index ]} at position { position } on chromosome { in_arg .chrom } " )
7689 ## Build the new chromosome sequence with the inserted_seq
7790 ## If the chromosome sequence length is in the header, replace it with new length
7891 new_seq = existing_seq_str [:position ] + str (record .seq ) + existing_seq_str [down_position :]
@@ -134,6 +147,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
134147 ## Build the new chromosome sequence by append new sequence below
135148 ## Using new_chrom as the id of the new chromosome
136149 new_seq = str (record .seq )
150+ new_seq_length = len (new_seq )
137151 new_record = SeqRecord (
138152 Seq (new_seq ),
139153 id = in_arg .new_chrom [index ], # Use new_chrom
@@ -156,24 +170,24 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
156170 SeqIO .write ([new_record ], f , "fasta" )
157171 ## Read in new GFF features from in_gff
158172 ## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line
159- in_gff_lines = get_in_gff_lines (in_gff = in_arg .in_gff [index ], new_chrom = in_arg .new_chrom [index ], sequence_length = len ( new_seq ) )
173+ in_gff_lines = get_in_gff_lines (in_gff = in_arg .in_gff [index ], new_chrom = in_arg .new_chrom [index ], sequence_length = new_seq_length )
160174 ## Create a temp file for gff, if index is not equal to last iteration
161175 annotation_name , annotation_ext = get_ref_basename (in_arg .ref_gff )
162176 if index < iterations - 1 :
163177 temp_gff = tempfile .NamedTemporaryFile (delete = False , mode = 'w' , suffix = annotation_ext )
164178 temp_gff_name = temp_gff .name
165179 temp_gff .close ()
166180 if prev_gff_path :
167- new_gff_path = create_new_gff_for_existing_gff (temp_gff_name , prev_gff_path , in_gff_lines , in_arg .new_chrom [index ])
181+ new_gff_path = create_new_gff_for_existing_gff (temp_gff_name , prev_gff_path , in_gff_lines , in_arg .new_chrom [index ], new_seq_length )
168182 os .remove (prev_gff_path )
169183 else :
170- new_gff_path = create_new_gff_for_existing_gff (temp_gff_name , in_arg .ref_gff , in_gff_lines , in_arg .new_chrom [index ])
184+ new_gff_path = create_new_gff_for_existing_gff (temp_gff_name , in_arg .ref_gff , in_gff_lines , in_arg .new_chrom [index ], new_seq_length )
171185 else :
172186 new_gff_name = annotation_name + '_reformed' + annotation_ext
173187 if prev_gff_path :
174- new_gff_path = create_new_gff_for_existing_gff (new_gff_name , prev_gff_path , in_gff_lines , in_arg .new_chrom [index ])
188+ new_gff_path = create_new_gff_for_existing_gff (new_gff_name , prev_gff_path , in_gff_lines , in_arg .new_chrom [index ], new_seq_length )
175189 else :
176- new_gff_path = create_new_gff_for_existing_gff (new_gff_name , in_arg .ref_gff , in_gff_lines , in_arg .new_chrom [index ])
190+ new_gff_path = create_new_gff_for_existing_gff (new_gff_name , in_arg .ref_gff , in_gff_lines , in_arg .new_chrom [index ], new_seq_length )
177191 prev_gff_path = new_gff_path
178192 return new_fasta , annotation_ext , new_gff_path , prev_fasta_path , prev_gff_path
179193
@@ -196,18 +210,13 @@ def read_fasta(in_arg, index, prev_fasta_path):
196210 f"and command line parameter ({ in_arg .new_chrom [index ]} )." )
197211 print (f"Using command line chromosome name: { in_arg .new_chrom [index ]} " )
198212 # The actual override happens in add_new_chrom_seq where a new SeqRecord is created
199- elif hasattr (in_arg , 'chrom' ) and in_arg .chrom is not None :
200- if record .id != in_arg .chrom :
201- print (f"** WARNING: Mismatch detected between chromosome name in input FASTA ({ record .id } ) "
202- f"and command line parameter ({ in_arg .chrom } )." )
203- print (f"Using command line chromosome name: { in_arg .chrom } " )
204- # The actual override happens in modify_existing_chrom_seq where the existing sequence is modified
213+
205214 except IndexError :
206215 raise ValueError (f"Error: { filename_fa } is not a valid FASTA file." )
207216 except Exception as e :
208217 raise ValueError (f"Error parsing FASTA file: { str (e )} " )
209218 print (f"Preparing to create new FASTA file" )
210- print (f"Original FASTA: { real_path_fa } " )
219+ print (f"Input FASTA: { real_path_fa } " )
211220 ## Generate index of sequences from ref reference fasta
212221 if prev_fasta_path :
213222 chrom_seqs = index_fasta (prev_fasta_path )
@@ -225,7 +234,7 @@ def check_gff(in_arg, index):
225234 raise FileNotFoundError (f"Error: File { filename_gff } does not exist." )
226235 real_path_gff = os .path .realpath (filename_gff )
227236 print ("Preparing to create new annotation file" )
228- print (f"Original Annotation: { real_path_gff } " )
237+ print (f"Input Annotation: { real_path_gff } " )
229238 print () ### print new line
230239
231240def index_fasta (fasta_path ):
@@ -350,8 +359,8 @@ def get_in_gff_lines(in_gff=None, existing_chrom=None, new_chrom=None, sequence_
350359 line_elements = line .split ('\t ' )
351360 chorme_id = existing_chrom if existing_chrom else new_chrom
352361 if line_elements [0 ] != chorme_id :
353- print ("** Warning: The chromosome name in the GFF file does not match the new chromosome name ." )
354- print (f"Correct the chromosome name { line_elements [ 0 ] } to { chorme_id } " )
362+ print (f "** Warning: Mismatch detected between chromosome name in input annotation ( { line_elements [ 0 ] } ) and command line parameter ( { chorme_id } ) ." )
363+ print (f"Using command line chromosome name: { chorme_id } " )
355364 line_elements [0 ] = chorme_id
356365 if not valid_gff_line (line_elements ):
357366 exit ()
@@ -413,36 +422,39 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo
413422 exit ()
414423 return {'position' : position , 'down_position' : down_position }
415424
416- def write_in_gff_lines ( gff_out , in_gff_lines , position , split_features , sequence_length , chrom ):
425+ def calculate_new_length_for_in_gff ( in_gff_lines , position , sequence_length ):
417426 '''
418- in_gff_lines: a list of lists where each nested list is a list of
419- columns (in gff format) associated with each new feature to insert
420- split_features: contains information about features in the original GFF
421- file that were split due to the insertion of the new sequence.
422- sequence_length: length of the inserted sequence, used to determine
423- the new end positions in the GFF file.
427+ Calculate the new length of the chromosome after modification.
428+ This function checks the start and end positions of the features in the GFF file
429+ and adjusts them based on the insertion position and the length of the inserted sequence.
424430 '''
425- ## Replace the chromosome ID from in_gff with the correct chromosome ID
426- for l in in_gff_lines :
427- l [0 ] = chrom
431+ ## List to store new gff lines
432+ new_gff_lines = []
428433 # Handling of single-line comments
429434 if len (in_gff_lines ) == 1 :
430- l = in_gff_lines [0 ]
431- ## Check length
432- ## l[3] is start position of fasta in in.gtf and l[4] is end position
433- seq_id = l [0 ]
434- if int (l [4 ]) - int (l [3 ]) + 1 != sequence_length :
435- print (f"** WARNING: Inconsistent length for { seq_id } . Correcting start position to 1 and end position to { sequence_length } ." )
436- ## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta
437- new_gff_line = modify_gff_line (
438- l , start = 1 + position , end = sequence_length + position )
439- gff_out .write (new_gff_line )
435+ ## Check if the line isn't a comment line
436+ if in_gff_lines [0 ][0 ].startswith ("##sequence-region" ):
437+ new_gff_lines .append (in_gff_lines [0 ])
438+ else :
439+ l = in_gff_lines [0 ]
440+ ## Check length
441+ ## l[3] is start position of fasta in in.gtf and l[4] is end position
442+ seq_id = l [0 ]
443+ if int (l [4 ]) - int (l [3 ]) + 1 != sequence_length :
444+ print (f"** WARNING: Annotation start and end positions do not match the sequence length in the FASTA input. Adjusting to match the input sequence: start=1, end={ sequence_length } .\n → Affected annotation: { in_gff_lines } " )
445+ ## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta
446+ new_gff_line = modify_gff_line (
447+ l , start = 1 + position , end = sequence_length + position )
448+ new_gff_lines .append (new_gff_line )
440449 # Handling of multiple-line comments
441450 else :
442451 ### Step1: extract all start and end into corresponding set()
443452 start_positions = set ()
444453 end_positions = set ()
445454 for l in in_gff_lines :
455+ ## Check length and ignore comment lines
456+ if l [0 ].startswith ("##sequence-region" ):
457+ continue
446458 start_positions .add (int (l [3 ]))
447459 end_positions .add (int (l [4 ]))
448460 ### Step2: Find min start and max end
@@ -459,11 +471,32 @@ def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence
459471 ### Step4: Adjust start and end. Offset will be 0 if no adjust need
460472 offset = min_start - 1
461473 for l in in_gff_lines :
474+ ## Update length and ignore comment lines for lenght calculation
475+ if l [0 ].startswith ("##sequence-region" ):
476+ new_gff_lines .append (l )
477+ continue
462478 ### Step5: Correct start(l[3]) and end(l[4]) by minus offset
463479 new_gff_line = modify_gff_line (
464480 l , start = int (l [3 ]) - offset + position , end = int (l [4 ]) - offset + position )
465- gff_out .write (new_gff_line )
481+ new_gff_lines .append (new_gff_line )
482+ return new_gff_lines
466483
484+ def write_in_gff_lines (gff_out , in_gff_lines , position , split_features , sequence_length , chrom ):
485+ '''
486+ in_gff_lines: a list of lists where each nested list is a list of
487+ columns (in gff format) associated with each new feature to insert
488+ split_features: contains information about features in the original GFF
489+ file that were split due to the insertion of the new sequence.
490+ sequence_length: length of the inserted sequence, used to determine
491+ the new end positions in the GFF file.
492+ '''
493+ ## Replace the chromosome ID from in_gff with the correct chromosome ID
494+ for l in in_gff_lines :
495+ l [0 ] = chrom
496+ ## Check if the lenght in_gff_lines are valid, and correct if needed
497+ new_gff_lines = calculate_new_length_for_in_gff (in_gff_lines , position , sequence_length )
498+ for gff_line in new_gff_lines :
499+ gff_out .write (gff_line )
467500 ## If insertion caused any existing features to be split, add
468501 ## the split features now immediately after adding the new features
469502 for sf in split_features :
@@ -678,7 +711,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
678711 os .remove (ref_gff_path )
679712 return new_gff_name
680713
681- def create_new_gff_for_existing_gff (new_gff_name , ref_gff , in_gff_lines , chrom_id ):
714+ def create_new_gff_for_existing_gff (new_gff_name , ref_gff , in_gff_lines , chrom_id , sequence_length ):
682715 """
683716 Appends new annotations to an existing GFF file without modifying existing features.
684717 """
@@ -706,16 +739,19 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
706739 gff_out .write (line )
707740 ## Append new annotations if present
708741 if in_gff_lines :
742+ ## Call calculate_new_length_for_in_gff to correct the length of the chromosome
743+ ## Since we are not modifying existing features, we can set position to 0
744+ new_gff_lines = calculate_new_length_for_in_gff (in_gff_lines , 0 , sequence_length )
709745 print (f"Appending { len (in_gff_lines )} new annotations to chromosome { chrom_id } ." )
710- for new_annotation in in_gff_lines :
746+ for new_annotation in new_gff_lines :
711747 if new_annotation [0 ] == "##sequence-region" :
712748 ## Use predefined format for sequence-region line
713749 if gff_splitor == '' :
714750 gff_splitor = new_annotation [- 1 ]
715751 ## Remove format indicater, and add new line
716752 gff_out .write (gff_splitor .join (new_annotation [:- 1 ])+ '\n ' )
717753 elif new_annotation :
718- gff_out .write (" \t " . join ( new_annotation ) )
754+ gff_out .write (new_annotation )
719755
720756 ## Cleanup temp file if needed
721757 if ref_gff .endswith ('.gz' ) and ref_gff_path != ref_gff :
@@ -791,7 +827,7 @@ def get_input_args():
791827 in_args .in_fasta = in_args .in_fasta .split (',' )
792828 in_args .in_gff = in_args .in_gff .split (',' )
793829 if (len (in_args .in_fasta ) != len (in_args .in_gff )):
794- print ("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align." )
830+ print ("** Error: The number of inserted FASTA files does not match the number of annotation files, or their counts and positions do not align." )
795831 exit ()
796832 else :
797833 iterations = len (in_args .in_fasta )
@@ -826,7 +862,7 @@ def get_input_args():
826862 parser .error ("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta." )
827863 exit ()
828864 ## Convert new_chrom from string to list
829- in_args .new_chrom = in_args .new_chrom .split (',' )
865+ in_args .new_chrom = [ x . strip () for x in in_args .new_chrom .split (',' )]
830866
831867 return in_args , iterations
832868
0 commit comments