msa/test_lindvall_run.py at main · malee4/msa · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import test_lindvall_columns
#from data_formats import SeqQuery
import numpy as np
import dimod
import neal
import pickle

from dwave.system.samplers import DWaveSampler
from dwave.system.composites import EmbeddingComposite

samples = 1000 # number of samples to run
save_file = "filename"

sequences = ["AT", "T"]
# sequences = ["AT", "T", "T", "A"]
# sequences = ["NVRLMLRL", "NVRLMLRL", "MNVRLMLRL", "NRLMLRL", "NVMLRLNL", "MNVRLRL"] # write your sequences here
simulation = True # flag determining to run simulated or quantum annealer

# rewards (negative) and costs (positive)
match_cost = -1
mismatch_cost = 1


sizes = [len(sequences[i]) for i in range(len(sequences))]
# calculate weights for matching
matchings = np.zeros((len(sequences), max(sizes), len(sequences), max(sizes)))
for s1 in range(len(sequences)):
    for s2 in range(len(sequences)):
        for n1 in range(sizes[s1]):
            for n2 in range(sizes[s2]):
                if sequences[s1][n1] == sequences[s2][n2]:
                    matchings[s1,n1,s2,n2] = match_cost
                else:
                    matchings[s1,n1,s2,n2] = mismatch_cost

inserts = 0
gap_penalty = 0
params = {"gap_pen": gap_penalty, "extra_inserts": inserts}
mat, shift, rev_inds = test_lindvall_columns.get_MSA_qubitmat(sizes, matchings,\
gap_pen=gap_penalty, extra_inserts=inserts)

def mat_to_dimod_format(matrix):
    """interprets the QUBO matrix of interactions into separate format for linear and interaction terms """
    n = matrix.shape[0]
    linear = {}
    interaction = {}
    for i in range(n):
        linear[i] = matrix[i,i]
        for j in range(i+1,n):
            interaction[(i,j)] = matrix[i,j]
    return linear, interaction

h, J = mat_to_dimod_format(mat)
if not simulation:
    cont = input("Continue? y/n ")
    if cont != "y":
        exit()

bqm = dimod.BinaryQuadraticModel(h,J, shift, dimod.BINARY)
if simulation:
    solver = neal.SimulatedAnnealingSampler()
else:
    # WARNING! requires setup of D-wave certificate file beforehand
    solver = EmbeddingComposite(DWaveSampler())
response = solver.sample(bqm, num_reads=samples)
positions = response.lowest().samples()[0]

# here's the brute force assignment of elements to positions part

# get the number of items per alignment
total_len = 0
all_seq = ""

# there is an error in this
def get_alignment_string(sequence_string_set, gaps, positions):
    # group positions based on sequence
    string_size = max([len(s) for s in sequence_string_set]) + gaps

    # get the positions, based on sequence
    organized_positions = get_positions(string_size, sequence_string_set, positions)

    # create an empty matrix
    align_strings = [["-" for i in range(string_size)] for i in range(len(sequence_string_set))]

    # fill in the matrix
    for key in organized_positions.keys():
        for result_id in range(len(organized_positions[key])):
            if organized_positions[key][result_id]:
                align_strings[key[0]][result_id] = sequence_string_set[key[0]][key[1]]
    return align_strings

def get_positions(string_size, sequence_string_set, positions):
    count = 0
    # split into positions
    organized_positions = dict()

    for seq_number in range(len(sequence_string_set)):
        for i in range(len(sequence_string_set[seq_number])):
            # create list of items
            temp = list()
            start = count
            for j in range(count, count + string_size):
                count = count + 1
                temp = temp + [positions[j]]
            organized_positions[(seq_number, i)] = temp
    return organized_positions

total_len = 0
max_len = -1
for s in sequences:
    total_len += len(s)
    if len(s) > max_len:
        max_len = len(s)

max_seq_len = len(positions) // total_len
gaps = max_seq_len - max_len

print(gaps)
print(positions)

output = get_alignment_string(sequences, gaps, positions)
for item in output:
    print(item)

"""
BRAINSTORMING
We are working with
    G gaps
    n total sequences, with L total letters
    Every letter has max_seq_len spaces assigned to it

max_seq_len = ...

for every sequence:
    seq_out = [] # initialize some max_seq_len length vector with '-' for spaces
    length = ni
    for letter in sequence:
        letter_position = positions[subset]
        for i in range(max_seq_len):
            if positions[i] == 1:
                seq_out[i] = letter
                break
            else:
                continue


"""