1
+ import numpy as np
2
+ import pandas as pd
3
+ import re
4
+ from bs4 import BeautifulSoup
5
+ import argparse
6
+
7
+
8
class AminoAcid:
    """One residue of a peptide plus the styling flags used to render it.

    Attributes:
        nucleotide: The single-character residue code (the attribute name is
            historical; the rest of the file reads it under this name).
        bold: Set for residues covered by the class II peptide.
        color: Set for residues covered by the class I peptide (rendered red).
        underline: Set for residues at the mutated position(s).
        large: Set for residues that should be rendered slightly larger.
        position: Stored as given; callers in this file pass ``-1``.
        open_tag: A styling span boundary starts at this residue.
        close_tag: A previously opened span must be closed before this residue.
    """

    def __init__(self, nucleotide, bold=False, color=False, underline=False,
                 large=False, position=-1, open_tag=False, close_tag=False):
        # Defaults mirror the "unstyled" values every caller passes, so a
        # residue can be created from the character alone.
        self.nucleotide = nucleotide
        self.bold = bold
        self.color = color
        self.underline = underline
        self.large = large
        self.position = position
        self.open_tag = open_tag
        self.close_tag = close_tag

    def __repr__(self):
        return (
            f"{type(self).__name__}("
            f"nucleotide={self.nucleotide!r}, bold={self.bold!r}, "
            f"color={self.color!r}, underline={self.underline!r}, "
            f"large={self.large!r}, position={self.position!r}, "
            f"open_tag={self.open_tag!r}, close_tag={self.close_tag!r})"
        )
19
+
20
# ---- PARSE ARGUMENTS -------------------------------------------------------
def parse_arguments():
    """Parse the command line and return the resulting namespace.

    All four options are required; argparse supplies ``-h/--help`` itself.

    Returns:
        argparse.Namespace with attributes ``p``, ``classI``, ``classII``
        and ``o``.
    """
    # (flag, help text) pairs, registered in this order.
    required_options = (
        ('-p', 'The path to the Peptides 51 mer'),
        ('-classI', 'The path to the classI all_epitopes.aggregated.tsv used in pVACseq'),
        ('-classII', 'The path to the classII all_epitopes.aggregated.tsv used in pVACseq'),
        ('-o', "Output location"),
    )

    cli = argparse.ArgumentParser(description='Color the 51mer peptide')
    for flag, help_text in required_options:
        cli.add_argument(flag, help=help_text, required=True)

    return cli.parse_args()
37
+
38
# Reduce an amino-acid change such as G518D to its position ("518").
def rearrange_string(s):
    """Return only the position part of an amino-acid change label.

    A label is expected to look like letters, then digits/dashes, then
    optional letters (``G518D`` -> ``518``, ``FS123-130`` -> ``123-130``).
    Only the position is kept so that frameshift labels still produce a
    usable lookup key.

    Args:
        s: The amino-acid change label.

    Returns:
        The position substring with no surrounding whitespace, or ``s``
        unchanged when it is not a string (e.g. a missing value) or does not
        match the expected shape.
    """
    if not isinstance(s, str):
        # Missing values reach this function through ``Series.apply``; pass
        # them through instead of failing inside ``re.match``.
        return s

    match = re.match(r'([A-Za-z]+)([\d-]+)([A-Za-z]*)', s)
    if match is None:
        return s

    # Exactly the matched position: any padding here would stop the key from
    # matching the ID derived from the peptide table.
    return match.group(2)
51
+
52
+
53
def _first_occurrence(sequence_text, peptide):
    """Return the index range of the first occurrence of ``peptide``.

    An empty range is returned when ``peptide`` is not a non-empty string or
    does not occur in ``sequence_text``.
    """
    if not isinstance(peptide, str) or not peptide:
        return range(0)
    start = sequence_text.find(peptide)
    if start < 0:
        return range(0)
    return range(start, start + len(peptide))


def annotate_every_nucleotide(sequence, classI_peptide, classII_peptide):
    """Turn a peptide sequence into AminoAcid objects with epitope flags set.

    Every residue becomes an ``AminoAcid``; residues equal to ``'C'`` are
    flagged ``large``. The first occurrence of ``classI_peptide`` is flagged
    ``color`` and the first occurrence of ``classII_peptide`` is flagged
    ``bold``.

    Args:
        sequence: The full peptide, as a string or sequence of characters.
        classI_peptide: Class I peptide to locate; a non-string value
            (e.g. a missing value) means nothing is coloured.
        classII_peptide: Class II peptide to locate; a non-string value
            means nothing is made bold.

    Returns:
        A list of AminoAcid objects, one per residue of ``sequence``.
    """
    peptide_sequence = [
        AminoAcid(residue, False, False, False, residue == 'C', -1, False, False)
        for residue in sequence
    ]

    # Substring search replaces the hand-rolled scan, which indexed past the
    # end of the sequence when a partial match reached the last residue.
    sequence_text = sequence if isinstance(sequence, str) else ''.join(sequence)

    # CLASS I: shown in red
    for residue_index in _first_occurrence(sequence_text, classI_peptide):
        peptide_sequence[residue_index].color = True

    # CLASS II: shown in bold
    for residue_index in _first_occurrence(sequence_text, classII_peptide):
        peptide_sequence[residue_index].bold = True

    return peptide_sequence
112
+
113
def _parse_position(text):
    """Convert one position token to an int, or None when it is missing.

    Accepts integral float spellings such as ``'5.0'``. A NaN token yields
    None. Anything else that is not a whole number raises ValueError.
    """
    number = float(text)  # raises ValueError on malformed text
    if number != number:  # NaN: no position available
        return None
    if not number.is_integer():
        raise ValueError(f"position is not a whole number: {text!r}")
    return int(number)


def set_underline(peptide_sequence, mutant_peptide_pos):
    """Underline the mutated residue(s) inside each class I (coloured) run.

    Positions are one-based offsets counted from the start of a run of
    residues whose ``color`` flag is set; the count restarts after every
    uncoloured residue. Residues are modified in place.

    Args:
        peptide_sequence: Residue objects exposing ``color`` and
            ``underline`` attributes.
        mutant_peptide_pos: Either a single position (``'5'``) or an
            inclusive ``'start-end'`` span (``'3-9'``). A missing value
            (``'nan'``) underlines nothing.

    Raises:
        ValueError: If a position token is neither a whole number nor NaN.
    """
    tokens = str(mutant_peptide_pos).split("-")
    first = _parse_position(tokens[0])
    # Only the first two tokens of a span are used.
    last = _parse_position(tokens[1]) if len(tokens) > 1 else first
    if first is None or last is None:
        return

    offset_in_epitope = 0
    for residue in peptide_sequence:
        if not residue.color:
            # Outside the class I peptide: restart the count, never underline.
            offset_in_epitope = 0
            continue

        offset_in_epitope += 1
        # Inclusive span; the underline stops at ``last`` rather than running
        # to the end of the epitope.
        if first <= offset_in_epitope <= last:
            residue.underline = True
161
+
162
def set_span_tags(peptide_sequence):
    """Flag the residues at which the rendered style changes.

    Walks the residues in order and compares each one's
    (bold, color, underline, large) combination with the previous residue's,
    starting from "no styling". At every change:

    * ``open_tag`` is set to True;
    * ``close_tag`` is set to True only when the run being left had some
      styling, i.e. only when there is a span to close.

    Residues whose style matches the previous residue are left untouched.

    Args:
        peptide_sequence: Residue objects exposing ``bold``, ``color``,
            ``underline``, ``large``, ``open_tag`` and ``close_tag``.

    Returns:
        The same ``peptide_sequence`` object, modified in place.
    """
    previous_style = (False, False, False, False)

    for residue in peptide_sequence:
        style = (residue.bold, residue.color, residue.underline, residue.large)
        if style == previous_style:
            continue

        residue.open_tag = True
        # Close only if inside a span: an unstyled run never opened one, so
        # leaving it must not produce a closing tag.
        residue.close_tag = any(previous_style)
        previous_style = style

    return peptide_sequence
190
+
191
def _span_style(residue):
    """Return the CSS declarations for ``residue``, or '' when it is unstyled."""
    declarations = ''
    if residue.bold:
        declarations += 'font-weight:bold;'
    if residue.color:
        declarations += 'color:#ff0000;'
    if residue.underline:
        declarations += 'text-decoration:underline;'
    if residue.large:
        # A large-only residue keeps the historical markup (no trailing ';').
        if declarations:
            declarations = 'font-size:105%;' + declarations
        else:
            declarations = 'font-size:105%'
    return declarations


def create_stylized_sequence(peptide_sequence):
    """Render residues as text wrapped in ``<span style="...">`` runs.

    A span is opened at a residue flagged ``open_tag`` when that residue has
    any styling. An open span is closed at the next residue flagged
    ``open_tag`` or ``close_tag``, and at the end of the sequence. Every
    residue's character is emitted exactly once, whatever its flags.

    Args:
        peptide_sequence: Residue objects exposing ``nucleotide``, ``bold``,
            ``color``, ``underline``, ``large``, ``open_tag`` and
            ``close_tag``.

    Returns:
        The HTML fragment as a string.
    """
    pieces = []
    span_open = False

    for residue in peptide_sequence:
        # Track the open span here so a boundary flag can never emit an
        # unmatched closing tag.
        if (residue.open_tag or residue.close_tag) and span_open:
            pieces.append('</span>')
            span_open = False

        if residue.open_tag:
            style = _span_style(residue)
            if style:  # a change to "unstyled" opens nothing
                pieces.append(f'<span style="{style}">')
                span_open = True

        pieces.append(residue.nucleotide)

    if span_open:
        pieces.append('</span>')

    return ''.join(pieces)
232
+
233
def _strip_after_last_digit(identifier):
    """Drop everything after the last digit; unchanged if there is no digit."""
    for trailing, char in enumerate(reversed(identifier)):
        if char.isdigit():
            # Slice by absolute end index: a negative slice would blank an
            # identifier that already ends in a digit (trailing == 0).
            return identifier[:len(identifier) - trailing]
    return identifier


def _lookup_id_from_full_id(full_id):
    """Derive the merge key from a peptide table ID.

    Drops the first two dot-separated fields, keeps the next three, skips
    one, keeps the rest, then trims everything after the last digit.
    """
    fields = full_id.split('.')[2:]
    shortened = '.'.join(fields[:3]) + '.' + '.'.join(fields[4:])
    return _strip_after_last_digit(shortened)


def _add_lookup_id(aggregated):
    """Add the 'modified AA Change' and 'ID' columns to an aggregated table."""
    aggregated['modified AA Change'] = aggregated['AA Change'].apply(rearrange_string)
    aggregated['ID'] = (
        aggregated['Gene'] + '.' + aggregated['Best Transcript'] + '.'
        + aggregated['modified AA Change']
    )


def main():
    """Write an HTML table of the 51-mer peptides with epitopes highlighted.

    Reads the peptide spreadsheet and the class I / class II aggregated TSVs
    named on the command line, joins them on a key built from gene,
    transcript and amino-acid position, styles the sequence cell of every
    peptide that has a class II best peptide, and writes the table to the
    output path.
    """
    args = parse_arguments()

    peptides_51mer = pd.read_excel(args.p)
    classI = pd.read_csv(args.classI, sep="\t")
    classII = pd.read_csv(args.classII, sep="\t")

    # Keep the original ID for display and build a shorter key for joining.
    peptides_51mer.rename(columns={'ID': 'full ID'}, inplace=True)
    peptides_51mer['ID'] = peptides_51mer['full ID'].apply(_lookup_id_from_full_id)

    # Build the same gene.transcript.position key on both aggregated tables.
    _add_lookup_id(classII)
    _add_lookup_id(classI)

    # Attach the class II and class I best peptides to the 51-mer table.
    merged_peptide_51mer = pd.merge(
        peptides_51mer, classII[['ID', 'Best Peptide']], on='ID', how='left')
    merged_peptide_51mer.rename(
        columns={"Best Peptide": "Best Peptide Class II"}, inplace=True)

    merged_peptide_51mer = pd.merge(
        merged_peptide_51mer, classI[['ID', 'Best Peptide', 'Pos']], on='ID', how='left')
    merged_peptide_51mer.rename(
        columns={"Best Peptide": "Best Peptide Class I"}, inplace=True)

    # Render the table once, then rewrite individual cells in the parsed tree.
    peptides_51mer_html = peptides_51mer.to_html(index=False)
    peptides_51mer_soup = BeautifulSoup(peptides_51mer_html, 'html.parser')

    for _, row in peptides_51mer.iterrows():
        search_string = row['full ID']

        row_mask = merged_peptide_51mer['full ID'] == search_string
        classII_peptide = merged_peptide_51mer.loc[row_mask, 'Best Peptide Class II'].values[0]
        classI_peptide = merged_peptide_51mer.loc[row_mask, 'Best Peptide Class I'].values[0]
        position_value = merged_peptide_51mer.loc[row_mask, 'Pos'].values[0]

        # A left merge with unmatched rows turns an integer column into
        # floats; undo that so the position reads '5' rather than '5.0'.
        if isinstance(position_value, float) and position_value.is_integer():
            position_value = int(position_value)
        mutant_peptide_pos = str(position_value)

        # Find the cell containing the full ID
        tag_with_search_string = peptides_51mer_soup.find('td', string=search_string)

        if tag_with_search_string is not None and isinstance(classII_peptide, str):
            # The sequence is the third cell of the row holding the full ID.
            parent_tr = tag_with_search_string.find_parent('tr')
            next_td_tags = parent_tr.findChildren('td', limit=3)
            sequence = next_td_tags[2].get_text()

            # With no class I match there is nothing to colour or underline.
            has_class_i = isinstance(classI_peptide, str)
            peptide_sequence = annotate_every_nucleotide(
                sequence, classI_peptide if has_class_i else '', classII_peptide)

            if has_class_i:
                set_underline(peptide_sequence, mutant_peptide_pos)

            set_span_tags(peptide_sequence)  # modifies the residues in place

            new_string = create_stylized_sequence(peptide_sequence)

            print(new_string)

            next_td_tags[2].string = new_string
        else:
            print("Search string not found.")

    # Serialise once, after every row has been handled, so the output is
    # defined even when no row was styled. formatter=None leaves the inserted
    # <span> markup unescaped.
    modified_html = peptides_51mer_soup.prettify(formatter=None)

    with open(args.o, "w", encoding='utf-8') as file:
        file.write(modified_html)


if __name__ == "__main__":
    main()
0 commit comments