-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_lindvall_run.py
More file actions
149 lines (120 loc) · 4.42 KB
/
test_lindvall_run.py
File metadata and controls
149 lines (120 loc) · 4.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import test_lindvall_columns
#from data_formats import SeqQuery
import numpy as np
import dimod
import neal
import pickle
from dwave.system.samplers import DWaveSampler
from dwave.system.composites import EmbeddingComposite
samples = 1000 # number of samples to run
save_file = "filename"
sequences = ["AT", "T"]
# sequences = ["AT", "T", "T", "A"]
# sequences = ["NVRLMLRL", "NVRLMLRL", "MNVRLMLRL", "NRLMLRL", "NVMLRLNL", "MNVRLRL"] # write your sequences here
simulation = True # flag determining to run simulated or quantum annealer
# rewards (negative) and costs (positive)
match_cost = -1
mismatch_cost = 1
sizes = [len(sequences[i]) for i in range(len(sequences))]
# calculate weights for matching
matchings = np.zeros((len(sequences), max(sizes), len(sequences), max(sizes)))
for s1 in range(len(sequences)):
for s2 in range(len(sequences)):
for n1 in range(sizes[s1]):
for n2 in range(sizes[s2]):
if sequences[s1][n1] == sequences[s2][n2]:
matchings[s1,n1,s2,n2] = match_cost
else:
matchings[s1,n1,s2,n2] = mismatch_cost
inserts = 0
gap_penalty = 0
params = {"gap_pen": gap_penalty, "extra_inserts": inserts}
mat, shift, rev_inds = test_lindvall_columns.get_MSA_qubitmat(sizes, matchings,\
gap_pen=gap_penalty, extra_inserts=inserts)
def mat_to_dimod_format(matrix):
"""interprets the QUBO matrix of interactions into separate format for linear and interaction terms """
n = matrix.shape[0]
linear = {}
interaction = {}
for i in range(n):
linear[i] = matrix[i,i]
for j in range(i+1,n):
interaction[(i,j)] = matrix[i,j]
return linear, interaction
h, J = mat_to_dimod_format(mat)
if not simulation:
cont = input("Continue? y/n ")
if cont != "y":
exit()
bqm = dimod.BinaryQuadraticModel(h,J, shift, dimod.BINARY)
if simulation:
solver = neal.SimulatedAnnealingSampler()
else:
# WARNING! requires setup of D-wave certificate file beforehand
solver = EmbeddingComposite(DWaveSampler())
response = solver.sample(bqm, num_reads=samples)
positions = response.lowest().samples()[0]
# here's the brute force assignment of elements to positions part
# get the number of items per alignment
total_len = 0
all_seq = ""
# there is an error in this
def get_alignment_string(sequence_string_set, gaps, positions):
# group positions based on sequence
string_size = max([len(s) for s in sequence_string_set]) + gaps
# get the positions, based on sequence
organized_positions = get_positions(string_size, sequence_string_set, positions)
# create an empty matrix
align_strings = [["-" for i in range(string_size)] for i in range(len(sequence_string_set))]
# fill in the matrix
for key in organized_positions.keys():
for result_id in range(len(organized_positions[key])):
if organized_positions[key][result_id]:
align_strings[key[0]][result_id] = sequence_string_set[key[0]][key[1]]
return align_strings
def get_positions(string_size, sequence_string_set, positions):
count = 0
# split into positions
organized_positions = dict()
for seq_number in range(len(sequence_string_set)):
for i in range(len(sequence_string_set[seq_number])):
# create list of items
temp = list()
start = count
for j in range(count, count + string_size):
count = count + 1
temp = temp + [positions[j]]
organized_positions[(seq_number, i)] = temp
return organized_positions
total_len = 0
max_len = -1
for s in sequences:
total_len += len(s)
if len(s) > max_len:
max_len = len(s)
max_seq_len = len(positions) // total_len
gaps = max_seq_len - max_len
print(gaps)
print(positions)
output = get_alignment_string(sequences, gaps, positions)
for item in output:
print(item)
"""
BRAINSTORMING
We are working with
G gaps
n total sequences, with L total letters
Every letter has max_seq_len spaces assigned to it
max_seq_len = ...
for every sequence:
seq_out = [] # initialize some max_seq_len length vector with '-' for spaces
length = ni
for letter in sequence:
letter_position = positions[subset]
for i in range(max_seq_len):
if positions[i] == 1:
seq_out[i] = letter
break
else:
continue
"""