Skip to content

Commit eccc15c

Browse files
committed
Resolve comments
1 parent d7047cd commit eccc15c

File tree

12 files changed

+125
-5
lines changed

12 files changed

+125
-5
lines changed

reform.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
101101
else:
102102
SeqIO.write([chrom_seqs[s]], f, "fasta")
103103
## Read in new GFF features from in_gff, False means modify existing chrom
104-
in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
104+
in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], existing_chrom=in_arg.chrom, new_chrom=None)
105105
## Create a temp file for gff, if index is not equal to last iteration
106106
annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
107107
if index < iterations - 1:
@@ -157,7 +157,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
157157
SeqIO.write([new_record], f, "fasta")
158158
## Read in new GFF features from in_gff
159159
## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line
160-
in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], in_arg.new_chrom[index], len(new_seq))
160+
in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=len(new_seq))
161161
## Create a temp file for gff, if index is not equal to last iteration
162162
annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
163163
if index < iterations - 1:
@@ -190,6 +190,19 @@ def read_fasta(in_arg, index, prev_fasta_path):
190190
raise FileNotFoundError(f"Error: File {filename_fa} does not exist.")
191191
real_path_fa = os.path.realpath(filename_fa)
192192
record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0]
193+
# Check for mismatch between FASTA record ID and command line chromosome name
194+
if hasattr(in_arg, 'new_chrom') and in_arg.new_chrom is not None:
195+
if record.id != in_arg.new_chrom[index]:
196+
print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) "
197+
f"and command line parameter ({in_arg.new_chrom[index]}).")
198+
print(f"Using command line chromosome name: {in_arg.new_chrom[index]}")
199+
# The actual override happens in add_new_chrom_seq where a new SeqRecord is created
200+
elif hasattr(in_arg, 'chrom') and in_arg.chrom is not None:
201+
if record.id != in_arg.chrom:
202+
print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) "
203+
f"and command line parameter ({in_arg.chrom}).")
204+
print(f"Using command line chromosome name: {in_arg.chrom}")
205+
# The actual override happens in modify_existing_chrom_seq where the existing sequence is modified
193206
except IndexError:
194207
raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.")
195208
except Exception as e:
@@ -289,7 +302,7 @@ def valid_gff_line(line_elements):
289302
return False
290303
return True
291304

292-
def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
305+
def get_in_gff_lines(in_gff=None, existing_chrom=None, new_chrom=None, sequence_length=None):
293306
'''
294307
Takes a gff file and returns a list of lists where
295308
each parent list item is a single line of the gff file
@@ -303,7 +316,7 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
303316
continue
304317

305318
# Handle differently based on whether we're adding a new chromosome or modifying existing
306-
if line.startswith("##sequence-region"):
319+
if line.startswith("##sequence-region") and new_chrom is not None:
307320
## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome.
308321
## Select user used delimiter based on content
309322
if '\t' in line:
@@ -336,6 +349,11 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
336349
else:
337350
## Split, check and add feature lines
338351
line_elements = line.split('\t')
352+
chorme_id = existing_chrom if existing_chrom else new_chrom
353+
if line_elements[0] != chorme_id:
354+
print("** Warning: The chromosome name in the GFF file does not match the new chromosome name.")
355+
print(f"Correct the chromosome name {line_elements[0]} to {chorme_id}")
356+
line_elements[0] = chorme_id
339357
if not valid_gff_line(line_elements):
340358
exit()
341359
in_gff_lines.append(line_elements)
@@ -667,6 +685,7 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
667685
"""
668686
Appends new annotations to an existing GFF file without modifying existing features.
669687
"""
688+
gff_splitor = ''
670689
ref_gff_path = ref_gff
671690
## Handle compressed .gz GFF files
672691
if ref_gff.endswith('.gz'):
@@ -681,15 +700,23 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
681700
## Copy all existing annotations to new GFF file
682701
with open(ref_gff_path, "r", encoding="utf-8") as f:
683702
for line in f:
703+
if line.startswith("##sequence-region") and gff_splitor == '':
704+
## Select user used delimiter based on content
705+
if '\t' in line:
706+
gff_splitor = ('\t')
707+
else:
708+
gff_splitor = (' ')
684709
gff_out.write(line)
685710
## Append new annotations if present
686711
if in_gff_lines:
687712
print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.")
688713
for new_annotation in in_gff_lines:
689714
if new_annotation[0] == "##sequence-region":
690715
## Use predefined format for sequence-region line
716+
if gff_splitor == '':
717+
gff_splitor = new_annotation[-1]
691718
## Remove format indicater, and add new line
692-
gff_out.write(new_annotation[-1].join(new_annotation[:-1])+'\n')
719+
gff_out.write(gff_splitor.join(new_annotation[:-1])+'\n')
693720
elif new_annotation:
694721
gff_out.write("\t".join(new_annotation))
695722

test_data/19/gold.fa

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
>X
2+
ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
3+
>Y
4+
AAAATTTTGGGGCCCC
5+
>H
6+
GGGGAATTCCCCGGGG
7+
>M
8+
CCCCGGGGAAAATTTT

test_data/19/gold.gtf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
2+
X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
3+
X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
4+
X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
5+
##sequence-region Y 1 16
6+
Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1";
7+
Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1";
8+
Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1";
9+
Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1";
10+
##sequence-region H 1 16
11+
H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1";
12+
H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1";
13+
H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1";
14+
H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1";
15+
M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1";
16+
M ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1";
17+
M ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1";
18+
M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1";

test_data/19/in1.fa

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>Z
2+
AAAATTTTGGGGCCCC

test_data/19/in1.gtf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
##sequence-region Y.11 1 16
2+
Z ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1";
3+
Z ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1";
4+
Z ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1";
5+
Z ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1";

test_data/19/in2.fa

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>Y
2+
GGGGAATTCCCCGGGG

test_data/19/in2.gtf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
##sequence-region F.11 3 27
2+
H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1";
3+
H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1";
4+
H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1";
5+
H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1";

test_data/19/in3.fa

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>M
2+
CCCCGGGGAAAATTTT

test_data/19/in3.gtf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
##Test Data
2+
M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1";
3+
K ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1";
4+
Z ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1";
5+
M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1";

test_data/19/ref.fa

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>X
2+
ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK

0 commit comments

Comments
 (0)