Skip to content

Commit eb8d274

Browse files
Merge pull request #38 from YuWei-CH/Printing&Logic-improve
Printing&logic improve
2 parents 7735a96 + 9ee72e3 commit eb8d274

File tree

9 files changed

+367
-58
lines changed

9 files changed

+367
-58
lines changed

reform.py

Lines changed: 85 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,27 @@
66
from Bio import SeqIO
77
from Bio.Seq import Seq
88
from Bio.SeqRecord import SeqRecord
9+
10+
## Importing gzip or pgzip module for file compression
11+
print("------------------------------------------")
12+
print(f"Compression Library Use:")
913
try:
1014
import pgzip as gzip_module
11-
print(f"\nUsing pgzip for gzip operations.\n")
15+
print(f"Using pgzip for gzip operations.")
1216
except ImportError:
1317
import gzip as gzip_module
14-
print(f"\npgzip not found, falling back to gzip.\n")
18+
print(f"pgzip not found, falling back to gzip.")
1519

1620

1721
def main():
1822
## Retrieve command line arguments and number of iterations
1923
in_arg, iterations = get_input_args()
24+
25+
## Print reference file paths at start of process
26+
print("------------------------------------------")
27+
print(f"Path to Reference Files:")
28+
print(f"Reference FASTA: {os.path.realpath(in_arg.ref_fasta)}")
29+
print(f"Reference Annotation: {os.path.realpath(in_arg.ref_gff)}")
2030

2131
## List for previous postion and modification length
2232
prev_modifications = []
@@ -28,24 +38,27 @@ def main():
2838
## Sequential processing
2939
for index in range(iterations):
3040
# Start interation
31-
print("-------------------------------------------")
32-
print(f"Begin modification from in{index+1}.fa")
33-
print("-------------------------------------------")
3441
if hasattr(in_arg, 'chrom') and in_arg.chrom is not None:
3542
## Modify existing chrom seq
43+
print("-------------------------------------------")
44+
print(f"Begin modification from {in_arg.in_fasta[index]}")
45+
print("-------------------------------------------")
3646
new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \
3747
modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \
3848
iterations, prev_gff_path)
3949
else:
40-
## Add new chrom
50+
## Add new chrom seq
51+
print("-------------------------------------------")
52+
print(f"Begin adding a new chromosome from {in_arg.in_fasta[index]}")
53+
print("-------------------------------------------")
4154
new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \
4255
add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
4356

4457
print("------------------------------------------")
4558
print(f"Reform Complete")
46-
print("------------------------------------------")
4759
print(f"New .fa file created: {os.path.realpath(new_fasta)}")
4860
print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}")
61+
print("------------------------------------------")
4962

5063
def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, iterations, prev_gff_path):
5164
"""
@@ -72,7 +85,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
7285
prev_modifications.append((position,length_changed))
7386
if position != down_position:
7487
print(f"Removing nucleotides from position {position} - {down_position}")
75-
print(f"Proceeding to insert sequence '{record.description}' from {in_arg.in_fasta[index]} at position {position} on chromosome {in_arg.chrom}")
88+
print(f"Proceeding to insert sequence '{record.description.strip()}' from {in_arg.in_fasta[index]} at position {position} on chromosome {in_arg.chrom}")
7689
## Build the new chromosome sequence with the inserted_seq
7790
## If the chromosome sequence length is in the header, replace it with new length
7891
new_seq = existing_seq_str[:position] + str(record.seq) + existing_seq_str[down_position:]
@@ -134,6 +147,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
134147
## Build the new chromosome sequence by append new sequence below
135148
## Using new_chrom as the id of the new chromosome
136149
new_seq = str(record.seq)
150+
new_seq_length = len(new_seq)
137151
new_record = SeqRecord(
138152
Seq(new_seq),
139153
id=in_arg.new_chrom[index], # Use new_chrom
@@ -156,24 +170,24 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
156170
SeqIO.write([new_record], f, "fasta")
157171
## Read in new GFF features from in_gff
158172
## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line
159-
in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=len(new_seq))
173+
in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=new_seq_length)
160174
## Create a temp file for gff, if index is not equal to last iteration
161175
annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
162176
if index < iterations - 1:
163177
temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext)
164178
temp_gff_name = temp_gff.name
165179
temp_gff.close()
166180
if prev_gff_path:
167-
new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index])
181+
new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index], new_seq_length)
168182
os.remove(prev_gff_path)
169183
else:
170-
new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index])
184+
new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index], new_seq_length)
171185
else:
172186
new_gff_name = annotation_name + '_reformed' + annotation_ext
173187
if prev_gff_path:
174-
new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index])
188+
new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index], new_seq_length)
175189
else:
176-
new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index])
190+
new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index], new_seq_length)
177191
prev_gff_path = new_gff_path
178192
return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path
179193

@@ -196,18 +210,13 @@ def read_fasta(in_arg, index, prev_fasta_path):
196210
f"and command line parameter ({in_arg.new_chrom[index]}).")
197211
print(f"Using command line chromosome name: {in_arg.new_chrom[index]}")
198212
# The actual override happens in add_new_chrom_seq where a new SeqRecord is created
199-
elif hasattr(in_arg, 'chrom') and in_arg.chrom is not None:
200-
if record.id != in_arg.chrom:
201-
print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) "
202-
f"and command line parameter ({in_arg.chrom}).")
203-
print(f"Using command line chromosome name: {in_arg.chrom}")
204-
# The actual override happens in modify_existing_chrom_seq where the existing sequence is modified
213+
205214
except IndexError:
206215
raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.")
207216
except Exception as e:
208217
raise ValueError(f"Error parsing FASTA file: {str(e)}")
209218
print(f"Preparing to create new FASTA file")
210-
print(f"Original FASTA: {real_path_fa}")
219+
print(f"Input FASTA: {real_path_fa}")
211220
## Generate index of sequences from ref reference fasta
212221
if prev_fasta_path:
213222
chrom_seqs = index_fasta(prev_fasta_path)
@@ -225,7 +234,7 @@ def check_gff(in_arg, index):
225234
raise FileNotFoundError(f"Error: File {filename_gff} does not exist.")
226235
real_path_gff = os.path.realpath(filename_gff)
227236
print("Preparing to create new annotation file")
228-
print(f"Original Annotation: {real_path_gff}")
237+
print(f"Input Annotation: {real_path_gff}")
229238
print() ### print new line
230239

231240
def index_fasta(fasta_path):
@@ -350,8 +359,8 @@ def get_in_gff_lines(in_gff=None, existing_chrom=None, new_chrom=None, sequence_
350359
line_elements = line.split('\t')
351360
chorme_id = existing_chrom if existing_chrom else new_chrom
352361
if line_elements[0] != chorme_id:
353-
print("** Warning: The chromosome name in the GFF file does not match the new chromosome name.")
354-
print(f"Correct the chromosome name {line_elements[0]} to {chorme_id}")
362+
print(f"** Warning: Mismatch detected between chromosome name in input annotation ({line_elements[0]}) and command line parameter ({chorme_id}).")
363+
print(f"Using command line chromosome name: {chorme_id}")
355364
line_elements[0] = chorme_id
356365
if not valid_gff_line(line_elements):
357366
exit()
@@ -413,36 +422,39 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo
413422
exit()
414423
return {'position': position, 'down_position': down_position}
415424

416-
def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom):
425+
def calculate_new_length_for_in_gff(in_gff_lines, position, sequence_length):
417426
'''
418-
in_gff_lines: a list of lists where each nested list is a list of
419-
columns (in gff format) associated with each new feature to insert
420-
split_features: contains information about features in the original GFF
421-
file that were split due to the insertion of the new sequence.
422-
sequence_length: length of the inserted sequence, used to determine
423-
the new end positions in the GFF file.
427+
Calculate the new length of the chromosome after modification.
428+
This function checks the start and end positions of the features in the GFF file
429+
and adjusts them based on the insertion position and the length of the inserted sequence.
424430
'''
425-
## Replace the chromosome ID from in_gff with the correct chromosome ID
426-
for l in in_gff_lines:
427-
l[0] = chrom
431+
## List to store new gff lines
432+
new_gff_lines = []
428433
# Handling of single-line comments
429434
if len(in_gff_lines) == 1:
430-
l = in_gff_lines[0]
431-
## Check length
432-
## l[3] is start position of fasta in in.gtf and l[4] is end position
433-
seq_id = l[0]
434-
if int(l[4]) - int(l[3]) + 1 != sequence_length:
435-
print(f"** WARNING: Inconsistent length for {seq_id}. Correcting start position to 1 and end position to {sequence_length}.")
436-
## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta
437-
new_gff_line = modify_gff_line(
438-
l, start=1 + position, end=sequence_length + position)
439-
gff_out.write(new_gff_line)
435+
## Check if the line isn't a comment line
436+
if in_gff_lines[0][0].startswith("##sequence-region"):
437+
new_gff_lines.append(in_gff_lines[0])
438+
else:
439+
l = in_gff_lines[0]
440+
## Check length
441+
## l[3] is start position of fasta in in.gtf and l[4] is end position
442+
seq_id = l[0]
443+
if int(l[4]) - int(l[3]) + 1 != sequence_length:
444+
print(f"** WARNING: Annotation start and end positions do not match the sequence length in the FASTA input. Adjusting to match the input sequence: start=1, end={sequence_length}.\n→ Affected annotation: {in_gff_lines}")
445+
## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta
446+
new_gff_line = modify_gff_line(
447+
l, start=1 + position, end=sequence_length + position)
448+
new_gff_lines.append(new_gff_line)
440449
# Handling of multiple-line comments
441450
else:
442451
### Step1: extract all start and end into corresponding set()
443452
start_positions = set()
444453
end_positions = set()
445454
for l in in_gff_lines:
455+
## Check length and ignore comment lines
456+
if l[0].startswith("##sequence-region"):
457+
continue
446458
start_positions.add(int(l[3]))
447459
end_positions.add(int(l[4]))
448460
### Step2: Find min start and max end
@@ -459,11 +471,32 @@ def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence
459471
### Step4: Adjust start and end. Offset will be 0 if no adjust need
460472
offset = min_start - 1
461473
for l in in_gff_lines:
474+
## Update length and ignore comment lines for lenght calculation
475+
if l[0].startswith("##sequence-region"):
476+
new_gff_lines.append(l)
477+
continue
462478
### Step5: Correct start(l[3]) and end(l[4]) by minus offset
463479
new_gff_line = modify_gff_line(
464480
l, start = int(l[3]) - offset + position, end = int(l[4]) - offset + position)
465-
gff_out.write(new_gff_line)
481+
new_gff_lines.append(new_gff_line)
482+
return new_gff_lines
466483

484+
def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom):
485+
'''
486+
in_gff_lines: a list of lists where each nested list is a list of
487+
columns (in gff format) associated with each new feature to insert
488+
split_features: contains information about features in the original GFF
489+
file that were split due to the insertion of the new sequence.
490+
sequence_length: length of the inserted sequence, used to determine
491+
the new end positions in the GFF file.
492+
'''
493+
## Replace the chromosome ID from in_gff with the correct chromosome ID
494+
for l in in_gff_lines:
495+
l[0] = chrom
496+
## Check if the lenght in_gff_lines are valid, and correct if needed
497+
new_gff_lines = calculate_new_length_for_in_gff(in_gff_lines, position, sequence_length)
498+
for gff_line in new_gff_lines:
499+
gff_out.write(gff_line)
467500
## If insertion caused any existing features to be split, add
468501
## the split features now immediately after adding the new features
469502
for sf in split_features:
@@ -678,7 +711,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
678711
os.remove(ref_gff_path)
679712
return new_gff_name
680713

681-
def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_id):
714+
def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_id, sequence_length):
682715
"""
683716
Appends new annotations to an existing GFF file without modifying existing features.
684717
"""
@@ -706,16 +739,19 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
706739
gff_out.write(line)
707740
## Append new annotations if present
708741
if in_gff_lines:
742+
## Call calculate_new_length_for_in_gff to correct the length of the chromosome
743+
## Since we are not modifying existing features, we can set position to 0
744+
new_gff_lines = calculate_new_length_for_in_gff(in_gff_lines, 0, sequence_length)
709745
print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.")
710-
for new_annotation in in_gff_lines:
746+
for new_annotation in new_gff_lines:
711747
if new_annotation[0] == "##sequence-region":
712748
## Use predefined format for sequence-region line
713749
if gff_splitor == '':
714750
gff_splitor = new_annotation[-1]
715751
## Remove format indicater, and add new line
716752
gff_out.write(gff_splitor.join(new_annotation[:-1])+'\n')
717753
elif new_annotation:
718-
gff_out.write("\t".join(new_annotation))
754+
gff_out.write(new_annotation)
719755

720756
## Cleanup temp file if needed
721757
if ref_gff.endswith('.gz') and ref_gff_path != ref_gff:
@@ -791,7 +827,7 @@ def get_input_args():
791827
in_args.in_fasta = in_args.in_fasta.split(',')
792828
in_args.in_gff = in_args.in_gff.split(',')
793829
if (len(in_args.in_fasta) != len(in_args.in_gff)):
794-
print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.")
830+
print("** Error: The number of inserted FASTA files does not match the number of annotation files, or their counts and positions do not align.")
795831
exit()
796832
else:
797833
iterations = len(in_args.in_fasta)
@@ -826,7 +862,7 @@ def get_input_args():
826862
parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.")
827863
exit()
828864
## Convert new_chrom from string to list
829-
in_args.new_chrom = in_args.new_chrom.split(',')
865+
in_args.new_chrom = [x.strip() for x in in_args.new_chrom.split(',')]
830866

831867
return in_args, iterations
832868

test_data/20/gold.fa

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>X
2+
XZZZABBBBBDDDDDCCCCCIIIIIKKKKK
3+
>Y
4+
----------

test_data/20/gold.gtf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
2+
X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
3+
X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
4+
X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
5+
Y new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";

test_data/20/in.fa

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>test 1
2+
----------

test_data/20/in.gtf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
X new exon 1 3 . + 0 gene_id "new"; transcript_id "new.1";

test_data/20/ref.fa

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>X
2+
XZZZABBBBBDDDDDCCCCCIIIIIKKKKK

test_data/20/ref.gtf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
2+
X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
3+
X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
4+
X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";

test_order_explanation.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import unittest
2+
3+
class TestOrderExample(unittest.TestCase):
4+
def test_z_third(self):
5+
print("This runs third despite being defined first")
6+
7+
def test_a_first(self):
8+
print("This runs first despite being defined second")
9+
10+
def test_b_second(self):
11+
print("This runs second despite being defined third")
12+
13+
if __name__ == '__main__':
14+
unittest.main()

0 commit comments

Comments
 (0)