1+ from random import randint
2+
3+ # Constant values to be used for the scoring grid in the Needleman-Wunsch algorithm
4+ IDENTITY = 4
5+ TRANSITION = - 2
6+ TRANSVERSION = - 3
7+ GAP = - 8
8+ SCORES = { # scoring matrix for the Needleman-Wunsch algorithm
9+ "A" : {
10+ "A" : IDENTITY ,
11+ "T" : TRANSVERSION ,
12+ "C" : TRANSVERSION ,
13+ "G" : TRANSITION
14+ },
15+ "T" : {
16+ "A" : TRANSVERSION ,
17+ "T" : IDENTITY ,
18+ "C" : TRANSITION ,
19+ "G" : TRANSVERSION
20+ },
21+ "C" : {
22+ "A" : TRANSVERSION ,
23+ "T" : TRANSITION ,
24+ "C" : IDENTITY ,
25+ "G" : TRANSVERSION
26+ },
27+ "G" : {
28+ "A" : TRANSITION ,
29+ "T" : TRANSVERSION ,
30+ "C" : TRANSVERSION ,
31+ "G" : IDENTITY
32+ }
33+ }
34+
35+ # left, up, diagonal, used to fill direction matrix path for backpropagation
36+ DIRECTIONS = ("DIAG" , "LEFT" , "UP" )
37+
38+ def get_seq () -> tuple [str , str ]:
39+ """Prompt user for sequences to be used in the Needleman-Wunsch algorithm,
40+ then read and format sequences.
41+
42+ Returns a tuple of cleaned DNA sequence strings.
43+ """
44+ seq = ["" , "" ]
45+ path = input ("Please enter a filepath containing two sequences:\n " ).strip ()
46+
47+ try :
48+ with open (path , "r" , encoding = 'utf-8' ) as f :
49+ lines = f .readlines ()
50+ idx_1 , idx_2 = None , None
51+ for i in range (len (lines )):
52+ if lines [i ].startswith (">" ):
53+ if idx_1 == None :
54+ idx_1 = i
55+ elif idx_2 == None :
56+ idx_2 = i
57+ break
58+
59+ # Format txt file input of FASTA seq into a continuous string of nucleotides
60+ try :
61+ seq [0 ] = "" .join (lines [idx_1 + 1 :idx_2 ]).upper ().replace ("\n " , "" )
62+ seq [1 ] = "" .join (lines [idx_2 + 1 :]).upper ().replace ("\n " , "" )
63+ except TypeError as e :
64+ print (e , "\n File must be in the form of\n >SEQUENCE 1 NAME\n AGTC ... GTA\n >SEQUENCE 2 NAME\n TGAT ... CCA" )
65+ except FileNotFoundError as e :
66+ print ("ERROR READING FILEPATH: " , e )
67+
68+
69+ return seq
70+
71+ def make_matrix (seq_1 :str , seq_2 :str , make_dir :bool ) -> list [list ]:
72+ """Create grid of values to be used for the Needleman-Wunsch algorithm
73+ This matrix is in the form of:
74+ [
75+ [_, _, A, T, A, ..., G]
76+ [_, 0, _, _, _, ..., _]
77+ ...
78+ [A, _, _, _, _, ..., _]
79+ ]
80+
81+ if make_dir is True, the [1][1] position contains the string 'DONE' to halt
82+ back propagation, and the first row/col axes have 'LEFT' and 'UP' directions,
83+ respectively
84+
85+ otherwise, fill the valued matrix alongside the first row/col by multiples
86+ of the GAP penalty
87+ """
88+
89+ # initialize empty array
90+ mat = [ [None ]* (len (seq_2 ) + 2 ) for i in range (len (seq_1 ) + 2 ) ]
91+
92+ if make_dir : # fill direction matrix
93+ # make seq 1 exist along the y axis (rows)
94+ for i in range (len (seq_1 )):
95+ mat [i + 2 ][0 ] = seq_1 [i ]
96+ mat [i + 2 ][1 ] = DIRECTIONS [0 ]
97+
98+ # make seq 2 exist along the x axis (cols)
99+ for i in range (len (seq_2 )):
100+ mat [0 ][i + 2 ] = seq_2 [i ]
101+ mat [1 ][i + 2 ] = DIRECTIONS [1 ]
102+ else : # fill value matrix
103+ # make seq 1 exist along the y axis (rows)
104+ for i in range (len (seq_1 )):
105+ mat [i + 2 ][0 ] = seq_1 [i ]
106+ mat [i + 2 ][1 ] = GAP * (i + 1 )
107+
108+ # make seq 2 exist along the x axis (cols)
109+ for i in range (len (seq_2 )):
110+ mat [0 ][i + 2 ] = seq_2 [i ]
111+ mat [1 ][i + 2 ] = GAP * (i + 1 )
112+
113+ mat [1 ][1 ] = "DONE" if make_dir else 0
114+
115+ return mat
116+
117+ def fill_matrix (mat :list [list ], mat_dir :list [list ], x :int , y :int , max_x :int , max_y :int ) -> None :
118+ """Uses the scores matrix to fill values and directions for the Needleman-Wunsch algorithm grid"""
119+ # end if x or y positions are out of bounds or mat not initialized
120+ if mat == None or x > max_x or y > max_y :
121+ return
122+ # end if None is at the current position, the left, or above
123+ if mat [y ][x ] != None or mat [y - 1 ][x ] == None or mat [y ][x - 1 ] == None :
124+ return
125+
126+ # find the score that corresponds to the index/column of current position
127+ score = SCORES [mat [y ][0 ]] [mat [0 ][x ]]
128+
129+ # get list of the 3 possible new values
130+ values = [
131+ mat [y - 1 ][x - 1 ] + score , # value coming from diagonal,
132+ mat [y ][x - 1 ] + GAP , # value coming from left
133+ mat [y - 1 ][x ] + GAP , # value coming from up
134+ ]
135+
136+ max_val = max (values ) # max val
137+ max_idx = values .index (max_val ) # index corresponds to the back-propagation movement, prioritizes first index (diagonal)
138+ if max_idx > 0 and values [1 ] == values [2 ]:
139+ max_idx = randint (1 ,2 ) # if theres a tie between left and up, randomly pick one
140+
141+ mat [y ][x ] = max_val # make position's value the best scoring outcome
142+ mat_dir [y ][x ] = DIRECTIONS [max_idx ] # make positions value the backprop direction
143+
144+ fill_matrix (mat , mat_dir , x + 1 , y , max_x , max_y ) # recusively fill values by moving right
145+ fill_matrix (mat , mat_dir , x , y + 1 , max_x , max_y ) # recursively fill values by moving down
146+
147+ def get_alignment (mat :list [list ], mat_dir :list [list ]) -> tuple [str , str , int ]:
148+ """Uses a filled direction matrix and uses the Needleman-Wunsch algorithm to get the
149+ best scoring alignment via backpropogation. The maximum value is also obtained from
150+ the valued matrix
151+
152+ Function returns a tuple containing an optimal alignment for seq1, seq2, and the best
153+ score associated with the alignment"""
154+ x = len (mat [0 ]) - 1
155+ y = len (mat ) - 1
156+ best_score = mat [y ][x ]
157+
158+ align_1 = "" # vertical seq
159+ align_2 = "" # horizontal seq
160+ while mat_dir [y ][x ] != "DONE" :
161+ direction = mat_dir [y ][x ]
162+
163+ if direction == "DIAG" :
164+ # backpropagation must move diagonally
165+ align_1 = mat_dir [y ][0 ] + align_1
166+ align_2 = mat_dir [0 ][x ] + align_2
167+ x -= 1
168+ y -= 1
169+ elif direction == "LEFT" :
170+ # backpropagation must move left
171+ align_1 = "-" + align_1
172+ align_2 = mat_dir [0 ][x ] + align_2
173+ x -= 1
174+ else :
175+ # backpropagation must move up
176+ align_1 = mat_dir [y ][0 ] + align_1
177+ align_2 = "-" + align_2
178+ y -= 1
179+
180+ return (align_1 , align_2 , best_score )
181+
182+ def print_align_results (align_1 :str , align_2 :str , score :int ) -> None :
183+ """Takes the aligned sequences strings and best score, then prints out the results"""
184+ print (f"SEQUENCE 1:\t { align_1 } " )
185+ print (f"SEQUENCE 2:\t { align_2 } " )
186+ print ("Alignment Score:" , score )
187+
188+ def print_mat (mat :list [list ]) -> None :
189+ """cleanly prints a provided matrix"""
190+ for row in mat :
191+ for i in row :
192+ print ("_" if i == None else i , end = "\t " )
193+ print ("\n " )
194+
195+ if __name__ == "__main__" :
196+ while True :
197+ seq1 , seq2 = get_seq ()
198+ if seq1 == "" or seq2 == "" :
199+ print ("Inputs contain invalid sequences, please check your FASTA files." )
200+ else :
201+ mat = make_matrix (seq1 , seq2 , make_dir = False )
202+ mat_dir = make_matrix (seq1 , seq2 , make_dir = True )
203+
204+ fill_matrix (mat , mat_dir , 2 , 2 , len (mat [0 ])- 1 , len (mat )- 1 )
205+
206+ align_1 , align_2 , score = get_alignment (mat , mat_dir )
207+
208+ print ("############ DYNAMIC PROGRAMMING TABLE ############\n " )
209+ print_mat (mat )
210+ print_mat (mat_dir )
211+
212+ print ("############ ALIGNMENT RESULTS ############\n " )
213+ print_align_results (align_1 , align_2 , score )
214+
215+ if input ("Would you like to align sequences again? (y/n)" ).strip ().lower () == "y" :
216+ continue
217+ else :
218+ break
0 commit comments