forked from RimaSGH/Lab_Project
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathContig_class.py
More file actions
154 lines (134 loc) · 4.76 KB
/
Contig_class.py
File metadata and controls
154 lines (134 loc) · 4.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import forgi
import forgi.graph.bulge_graph as fgb
import os as os
from os.path import join, isfile
from shutil import copytree
import shutil
import pickle
import numpy as np
class Contig:
    """Secondary-structure feature summary of one contig.

    Parses a dot-bracket (DBN) file with forgi's BulgeGraph and records
    counts and median lengths of structural elements, plus the free energy
    and the percentage of unpaired bases.
    """

    def __init__(self, DBN_file_path, free_energy):
        # from_fasta returns a list of BulgeGraphs; a single-record DBN
        # file yields exactly one, hence the [0].
        self.bg = fgb.BulgeGraph.from_fasta(DBN_file_path)[0]
        # Drop the 4-character extension (e.g. ".dbn") to get the name.
        self.name = os.path.basename(DBN_file_path)[:-4]
        self.length = self.bg.seq_length
        self.free_energy = float(free_energy)
        # Percentage of positions that are unpaired ('.') in the structure.
        self.unpaired_ratio = 100 * (countUnpaired(DBN_file_path) / self.length)
        self.stems = Contig.countStems(self.bg)
        self.hairpins = Contig.countHairpins(self.bg)
        self.interior_loops = Contig.countInteriorLoops(self.bg)
        self.multiloops = Contig.countMultiloops(self.bg)
        self.stems_median_length = Contig.stem_median(self.bg)
        self.interior_loops_median_length = Contig.interior_loops_median(self.bg)
        self.hairpins_median_length = Contig.hairpins_median(self.bg)

    @staticmethod
    def countStems(bg):
        """Number of stem elements in the bulge graph."""
        return sum(1 for _ in bg.stem_iterator())

    @staticmethod
    def countHairpins(bg):
        """Number of hairpin-loop elements in the bulge graph."""
        return sum(1 for _ in bg.hloop_iterator())

    @staticmethod
    def countInteriorLoops(bg):
        """Number of interior-loop elements in the bulge graph."""
        return sum(1 for _ in bg.iloop_iterator())

    @staticmethod
    def countMultiloops(bg):
        """Number of junctions that forgi classifies as a regular multiloop."""
        return sum(
            1
            for el in bg.junctions
            if bg.describe_multiloop(el) == {'regular_multiloop'}
        )

    @staticmethod
    def stem_median(bg):
        """Median stem length; NaN when the graph has no stems.

        The explicit empty-list guard avoids the RuntimeWarning that
        np.median([]) would emit while returning the same NaN value.
        """
        sizes = [bg.element_length(s) for s in bg.stem_iterator()]
        return np.median(sizes) if sizes else float('nan')

    @staticmethod
    def interior_loops_median(bg):
        """Median interior-loop length; NaN when there are none."""
        sizes = [bg.element_length(i) for i in bg.iloop_iterator()]
        return np.median(sizes) if sizes else float('nan')

    @staticmethod
    def hairpins_median(bg):
        """Median hairpin-loop length; NaN when there are none."""
        sizes = [bg.element_length(h) for h in bg.hloop_iterator()]
        return np.median(sizes) if sizes else float('nan')

    def describe_contig(self):
        """Print a human-readable summary of the recorded features."""
        print('name = ', self.name)
        print('length = ', self.length)
        print('free_energy = ', self.free_energy)
        print('unpaired = ', self.unpaired_ratio)
        print('stems = ', self.stems)
        print('hairpins = ', self.hairpins)
        print('interior_loops = ', self.interior_loops)
        print('multiloops = ', self.multiloops)
def copyFolder(original_folder_path, destination, suffix):
    """Copy a folder tree into *destination* under "<basename><suffix>".

    All '.fasta' files are excluded from the copy; every other file
    (notably the '.dbn' files) is carried over.  Returns the path of
    the newly created folder.
    """
    target = os.path.join(
        destination, os.path.basename(original_folder_path) + suffix
    )

    def _skip_fasta(_dirname, entries):
        # copytree 'ignore' callback: names returned here are NOT copied.
        return [name for name in entries if name.endswith('.fasta')]

    shutil.copytree(original_folder_path, target, ignore=_skip_fasta)
    return target
# Returns the free-energy string and !!!updates!!! the DBN file in place:
# line 3 is rewritten to contain only the dot-bracket string, removing the
# trailing " (energy)" annotation.
def getFreeEnergy(DBN_file_path):
    """Extract the free energy from line 3 of a DBN file and strip it.

    Line 3 is expected to look like "....((..))..  (-12.30)"; everything
    before the first space is kept as the structure, and the text between
    the parentheses is returned as the free-energy string.
    """
    with open(DBN_file_path, 'r') as fh:
        lines = fh.readlines()
    structure, _, annotation = lines[2].partition(" ")
    # annotation looks like "(-12.30)\n": drop the leading "(" and the
    # trailing ")\n".  With no annotation present this yields "".
    free_energy = annotation[1:-2]
    lines[2] = structure
    with open(DBN_file_path, 'w') as fh:
        fh.writelines(lines)
    return free_energy
def countUnpaired(DBN_file_path):
    """Count unpaired positions ('.') in the dot-bracket line of a DBN file.

    The structure is taken from line 3 of the file, matching the layout
    assumed by getFreeEnergy.
    """
    with open(DBN_file_path, 'r') as fh:
        structure_line = fh.readlines()[2]
    return structure_line.count('.')
def read_pickled_contig(file_path, NORMALIZE=True):
    """Load a pickled `Contig` and return its attributes as a plain dict.

    The forgi graph ('bg') is dropped from the result.  When NORMALIZE is
    true, element counts and the free energy are rescaled per 1000 nt
    (see normalize_by_length) and all numeric features are rounded to
    3 decimals.
    """
    # SECURITY NOTE: pickle.load executes arbitrary code on load — only
    # open pickles produced by this pipeline, never untrusted files.
    with open(file_path, 'rb') as handle:
        contig = pickle.load(handle)
    data_dict = vars(contig)
    del data_dict['bg']
    length = data_dict['length']
    if NORMALIZE:
        for feature in ('stems', 'hairpins', 'interior_loops',
                        'multiloops', 'free_energy'):
            data_dict[feature] = normalize_by_length(data_dict[feature], length)
        # Rounding only applies on the normalized path; raw counts are
        # already integers.  (NOTE(review): original indentation was
        # ambiguous in the scrape — confirm rounding belongs inside the if.)
        round_features(data_dict)
    return data_dict
def normalize_by_length(element_count, contig_length, N=1000):
    """Scale *element_count* to a per-N-nucleotides rate (default: per 1000 nt).

    Bug fix: the previous version reassigned N=1000 inside the body,
    silently discarding any caller-supplied N and making the parameter
    dead; the keyword is now honored.
    """
    return (element_count / contig_length) * N
def round_features(data_dict):
    """Round every feature value in *data_dict* in place to 3 decimals.

    Some downstream tests need the element counts/frequencies rounded,
    hence this internal pass over the dictionary.  The 'name' entry is
    a string and is left untouched.
    """
    for key, value in data_dict.items():
        if key == 'name':
            continue
        data_dict[key] = round(value, 3)