forked from RimaSGH/Lab_Project
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathContig_class.py
More file actions
154 lines (134 loc) · 4.76 KB
/
Contig_class.py
File metadata and controls
154 lines (134 loc) · 4.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import forgi
import forgi.graph.bulge_graph as fgb
import os as os
from os.path import join, isfile
from shutil import copytree
import shutil
import pickle
import numpy as np
class Contig:
    """Secondary-structure feature summary of one contig.

    Parses a dot-bracket (DBN) file with forgi's BulgeGraph and records
    counts and median lengths of structural elements, plus the free energy
    and the percentage of unpaired bases.
    """

    def __init__(self, DBN_file_path, free_energy):
        # from_fasta returns a list of BulgeGraphs; a single-record DBN
        # file yields exactly one, hence the [0].
        self.bg = fgb.BulgeGraph.from_fasta(DBN_file_path)[0]
        # Drop the 4-character extension (e.g. ".dbn") to get the name.
        self.name = os.path.basename(DBN_file_path)[:-4]
        self.length = self.bg.seq_length
        self.free_energy = float(free_energy)
        # Percentage of positions that are unpaired ('.') in the structure.
        self.unpaired_ratio = 100 * (countUnpaired(DBN_file_path) / self.length)
        self.stems = Contig.countStems(self.bg)
        self.hairpins = Contig.countHairpins(self.bg)
        self.interior_loops = Contig.countInteriorLoops(self.bg)
        self.multiloops = Contig.countMultiloops(self.bg)
        self.stems_median_length = Contig.stem_median(self.bg)
        self.interior_loops_median_length = Contig.interior_loops_median(self.bg)
        self.hairpins_median_length = Contig.hairpins_median(self.bg)

    @staticmethod
    def countStems(bg):
        """Number of stem elements in the bulge graph."""
        return sum(1 for _ in bg.stem_iterator())

    @staticmethod
    def countHairpins(bg):
        """Number of hairpin-loop elements in the bulge graph."""
        return sum(1 for _ in bg.hloop_iterator())

    @staticmethod
    def countInteriorLoops(bg):
        """Number of interior-loop elements in the bulge graph."""
        return sum(1 for _ in bg.iloop_iterator())

    @staticmethod
    def countMultiloops(bg):
        """Number of junctions that forgi classifies as a regular multiloop."""
        return sum(
            1
            for el in bg.junctions
            if bg.describe_multiloop(el) == {'regular_multiloop'}
        )

    @staticmethod
    def stem_median(bg):
        """Median stem length; NaN when the graph has no stems.

        The explicit empty-list guard avoids the RuntimeWarning that
        np.median([]) would emit while returning the same NaN value.
        """
        sizes = [bg.element_length(s) for s in bg.stem_iterator()]
        return np.median(sizes) if sizes else float('nan')

    @staticmethod
    def interior_loops_median(bg):
        """Median interior-loop length; NaN when there are none."""
        sizes = [bg.element_length(i) for i in bg.iloop_iterator()]
        return np.median(sizes) if sizes else float('nan')

    @staticmethod
    def hairpins_median(bg):
        """Median hairpin-loop length; NaN when there are none."""
        sizes = [bg.element_length(h) for h in bg.hloop_iterator()]
        return np.median(sizes) if sizes else float('nan')

    def describe_contig(self):
        """Print a human-readable summary of the recorded features."""
        print('name = ', self.name)
        print('length = ', self.length)
        print('free_energy = ', self.free_energy)
        print('unpaired = ', self.unpaired_ratio)
        print('stems = ', self.stems)
        print('hairpins = ', self.hairpins)
        print('interior_loops = ', self.interior_loops)
        print('multiloops = ', self.multiloops)
def copyFolder(original_folder_path, destination, suffix):
    """Copy a folder tree into *destination* under "<basename><suffix>".

    All '.fasta' files are excluded from the copy; every other file
    (notably the '.dbn' files) is carried over.  Returns the path of
    the newly created folder.
    """
    target = os.path.join(
        destination, os.path.basename(original_folder_path) + suffix
    )

    def _skip_fasta(_dirname, entries):
        # copytree 'ignore' callback: names returned here are NOT copied.
        return [name for name in entries if name.endswith('.fasta')]

    shutil.copytree(original_folder_path, target, ignore=_skip_fasta)
    return target
# Returns the free-energy string and !!!updates!!! the DBN file in place:
# line 3 is rewritten to contain only the dot-bracket string, removing the
# trailing " (energy)" annotation.
def getFreeEnergy(DBN_file_path):
    """Extract the free energy from line 3 of a DBN file and strip it.

    Line 3 is expected to look like "....((..))..  (-12.30)"; everything
    before the first space is kept as the structure, and the text between
    the parentheses is returned as the free-energy string.
    """
    with open(DBN_file_path, 'r') as fh:
        lines = fh.readlines()
    structure, _, annotation = lines[2].partition(" ")
    # annotation looks like "(-12.30)\n": drop the leading "(" and the
    # trailing ")\n".  With no annotation present this yields "".
    free_energy = annotation[1:-2]
    lines[2] = structure
    with open(DBN_file_path, 'w') as fh:
        fh.writelines(lines)
    return free_energy
def countUnpaired(DBN_file_path):
    """Count unpaired positions ('.') in the dot-bracket line of a DBN file.

    The structure is taken from line 3 of the file, matching the layout
    assumed by getFreeEnergy.
    """
    with open(DBN_file_path, 'r') as fh:
        structure_line = fh.readlines()[2]
    return structure_line.count('.')
def read_pickled_contig(file_path, NORMALIZE=True):
    """Load a pickled `Contig` and return its attributes as a plain dict.

    The forgi graph ('bg') is dropped from the result.  When NORMALIZE is
    true, element counts and the free energy are rescaled per 1000 nt
    (see normalize_by_length) and all numeric features are rounded to
    3 decimals.
    """
    # SECURITY NOTE: pickle.load executes arbitrary code on load — only
    # open pickles produced by this pipeline, never untrusted files.
    with open(file_path, 'rb') as handle:
        contig = pickle.load(handle)
    data_dict = vars(contig)
    del data_dict['bg']
    length = data_dict['length']
    if NORMALIZE:
        for feature in ('stems', 'hairpins', 'interior_loops',
                        'multiloops', 'free_energy'):
            data_dict[feature] = normalize_by_length(data_dict[feature], length)
        # Rounding only applies on the normalized path; raw counts are
        # already integers.  (NOTE(review): original indentation was
        # ambiguous in the scrape — confirm rounding belongs inside the if.)
        round_features(data_dict)
    return data_dict
def normalize_by_length(element_count, contig_length, N=1000):
    """Scale *element_count* to a per-N-nucleotides rate (default: per 1000 nt).

    Bug fix: the previous version reassigned N=1000 inside the body,
    silently discarding any caller-supplied N and making the parameter
    dead; the keyword is now honored.
    """
    return (element_count / contig_length) * N
def round_features(data_dict):
    """Round every feature value in *data_dict* in place to 3 decimals.

    Some downstream tests need the element counts/frequencies rounded,
    hence this internal pass over the dictionary.  The 'name' entry is
    a string and is left untouched.
    """
    for key, value in data_dict.items():
        if key == 'name':
            continue
        data_dict[key] = round(value, 3)