-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake_ids_unique.py
More file actions
144 lines (110 loc) · 5.01 KB
/
make_ids_unique.py
File metadata and controls
144 lines (110 loc) · 5.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import sys
import os
anno_gff = '/data/prostlocal2/projects/mh_bats_ncrna_annotation/2018/annotations/abbr/efu.gff'
anno_gff_new = '/home/ji57pog/Projects/mh_bats_ncrna_annotation/util/efu.gff'
root_dict = {}
root_id_list = []
child_dict = {}
with open(anno_gff, 'r') as inp:
for line in inp:
if not line.startswith('#'):
# normal record, split line
try:
contig, source, ftype, start, stop, _, strand, _, description = line.strip().split('\t')
except:
print(f'Line {line} does not have 9 fields:\n{line}')
sys.exit(1)
tags = [(f.split('=')) for f in description.split(';')]
# get id
ID = [v for t, v in tags if t == 'ID']
if ID == []:
print(f'No ID in line: {line}')
sys.exit(1)
else:
ID = ID[0]
# get parent
parent = [v for t, v in tags if t == 'Parent']
if parent == []:
# no parnet -> top level
root_id_list.append(ID)
if ID not in root_dict:
root_dict[ID] = [line]
else:
# print(f'Duplicated id {ID} at root level:\n{line}')
root_dict[ID].append(line)
else:
parent = parent[0]
if parent in child_dict:
child_dict[parent].append((ID, line))
else:
child_dict[parent] = [(ID, line)]
def change_id_in_record(old_ID, new_ID, record):
first_part = record.split(old_ID, 1)[0]
second_part = record.split(old_ID, 1)[1]
return old_ID + '_' + str(new_ID), first_part + old_ID + '_' + str(new_ID) + second_part
def change_parent_id_in_record(old_parent_ID, new_parent_ID, record):
first_part = record.split(old_parent_ID, 1)[0]
second_part = record.split(old_parent_ID, 1)[1]
return first_part + str(new_parent_ID) + second_part
def traverse(ID, start, end, out_file, new_parent_id=None):
global counter
if ID in child_dict:
# ID has one or more children
next_rec = child_dict[ID]
next_ids = [i for i, l in next_rec]
# print('-------')
# print(ID, next_ids, start, end)
if len(set(next_ids)) == len(next_ids):
# unique child ids
for child in next_rec:
start_child = int(child[1].split('\t')[3])
end_child = int(child[1].split('\t')[4])
if start <= start_child and end >= end_child:
if new_parent_id:
out_file.write(change_parent_id_in_record(ID, new_parent_id, child[1]))
else:
out_file.write(child[1])
for child in next_rec:
start_child = int(child[1].split('\t')[3])
end_child = int(child[1].split('\t')[4])
if start <= start_child and end >= end_child:
traverse(child[0], int(child[1].split('\t')[3]), int(child[1].split('\t')[4]), out_file)
else:
# one or more ambiguous child ids
for child in next_rec:
start_child = int(child[1].split('\t')[3])
end_child = int(child[1].split('\t')[4])
if start <= start_child and end >= end_child:
if next_ids.count(child[0]) == 1:
# unique
if new_parent_id:
out_file.write(change_parent_id_in_record(ID, new_parent_id, child[1]))
else:
out_file.write(child[1])
traverse(child[0], int(child[1].split('\t')[3]), int(child[1].split('\t')[4]), out_file)
else:
# ambiguous
new_ID, new_rec = change_id_in_record(child[0], counter, child[1])
if new_parent_id:
out_file.write(change_parent_id_in_record(ID, new_parent_id, new_rec))
else:
out_file.write(new_rec)
counter += 1
traverse(child[0], int(child[1].split('\t')[3]), int(child[1].split('\t')[4]), out_file, new_ID)
counter = 1
if os.path.isfile(anno_gff_new):
os.remove(anno_gff_new)
with open(anno_gff_new, 'a') as outf:
for r in root_id_list:
root_rec = root_dict[r]
if len(root_rec) == 1:
# unique id
outf.write(root_rec[0])
traverse(r, int(root_rec[0].split('\t')[3]), int(root_rec[0].split('\t')[4]), outf)
else:
# ambiguous id
outf.write(change_id_in_record(r, counter, root_rec[0])[1])
counter += 1
if r in child_dict:
# TODO get children of ambiguous roots with new parent ids
print('Skipping children of ambiguous roots: ' + root_rec[0].strip())