4
4
from bs4 import BeautifulSoup
5
5
import argparse
6
6
7
+ '''
8
+ Example Command:
9
+ python3 ../scripts/color_peptides51mer.py -p manual_review/JLF-100-051_Peptides_51-mer.xlsx -samp JLF-100-051 -o manual_review/
10
+ '''
11
+
7
12
8
13
class AminoAcid :
9
14
@@ -35,40 +40,24 @@ def parse_arguments():
35
40
36
41
parser .add_argument ('-p' ,
37
42
help = 'The path to the Peptides 51 mer' , required = True )
38
- parser .add_argument ('-classI' ,
39
- help = 'The path to the classI all_epitopes.aggregated.tsv used in pVACseq' , required = True )
40
- parser .add_argument ('-classII' ,
41
- help = 'The path to the classII all_epitopes.aggregated.tsv used in pVACseq' , required = True )
42
- parser .add_argument ('-WB' ,
43
- help = 'the path to the gcp_immuno folder of the trial you wish to tun script on, defined as WORKING_BASE in envs.txt' )
44
- parser .add_argument ('-samp' , help = 'Name of the sample' )
45
-
43
+ parser .add_argument ('-samp' ,
44
+ help = 'The name of the sample' , required = True )
45
+ parser .add_argument ('-o' ,
46
+ help = 'the path to output folder' )
46
47
47
48
return (parser .parse_args ())
48
49
49
- # Function to rearrange string so that G518D looks like 518G/D
50
- def rearrange_string (s ):
51
- match = re .match (r'([A-Za-z]+)([\d-]+)([A-Za-z]*)' , s )
52
- if match :
53
- letters_before = match .group (1 )
54
- numbers = match .group (2 )
55
- letters_after = match .group (3 )
56
-
57
- #return f"{numbers}{letters_before}/{letters_after}"
58
- # Just use the postion for the key to avoid FS problem
59
- return f"{ numbers } "
60
- else :
61
- return s
62
-
63
50
64
- def annotate_every_nucleotide (sequence , classI_peptide , classII_peptide ):
51
+ def annotate_every_nucleotide (sequence , classI_peptide , classII_peptide ,
52
+ classI_ic50 , classI_percentile , classII_percentile ,
53
+ classI_transcript , classII_transcript ):
65
54
66
- peptide_sequence = []
55
+ peptide_sequence = [] # Create a list to hold all AA of the 51mer
67
56
68
57
# Make the sequence a list of AminoAcid objects
69
58
for i in range (len (sequence )):
70
59
new_AA = AminoAcid (sequence [i ], False , False , False , False , - 1 , False , False )
71
-
60
+
72
61
if sequence [i ] == 'C' :
73
62
new_AA .large = True
74
63
@@ -91,32 +80,39 @@ def annotate_every_nucleotide(sequence, classI_peptide, classII_peptide):
91
80
positions = []
92
81
93
82
# set those positions to red
83
+ # if median affinity < 1000 nm OR percentile < 2%
94
84
j = 0
95
- for i in range (len (peptide_sequence )):
96
- if j < len (positions ) and i == positions [j ]:
97
- peptide_sequence [i ].color = True
98
- j += 1
85
+ if float (classI_ic50 ) < 1000 or float (classI_percentile ) < 2 :
86
+ for i in range (len (peptide_sequence )):
87
+ if j < len (positions ) and i == positions [j ]:
88
+ peptide_sequence [i ].color = True
89
+ j += 1
99
90
100
- # CLASS II
101
- positions = []
102
- for i in range (len (peptide_sequence )):
103
- for j in range (len (classII_peptide )):
104
- if peptide_sequence [i ].nucleotide == classII_peptide [j ]:
105
- positions .append (i )
106
- i += 1
107
- else :
91
+ if classI_transcript == classII_transcript :
92
+ # CLASS II
93
+ positions = []
94
+ for i in range (len (peptide_sequence )):
95
+ for j in range (len (classII_peptide )):
96
+ if peptide_sequence [i ].nucleotide == classII_peptide [j ]:
97
+ positions .append (i )
98
+ i += 1
99
+ else :
100
+ break
101
+
102
+ if len (positions ) == len (classII_peptide ):
108
103
break
109
-
110
- if len (positions ) == len (classII_peptide ):
111
- break
112
- else :
113
- positions = []
114
-
115
- j = 0
116
- for i in range (len (peptide_sequence )):
117
- if j < len (positions ) and i == positions [j ]:
118
- peptide_sequence [i ].bold = True
119
- j += 1
104
+ else :
105
+ positions = []
106
+ # Set class II to bold
107
+ # if percentile < 2%
108
+ j = 0
109
+ if float (classII_percentile ) < 2 :
110
+ for i in range (len (peptide_sequence )):
111
+ if j < len (positions ) and i == positions [j ]:
112
+ peptide_sequence [i ].bold = True
113
+ j += 1
114
+ else :
115
+ print ("Note: ClassII transcript different then ClassI. ClassII peptide not bolded." )
120
116
121
117
122
118
return (peptide_sequence )
@@ -126,21 +122,17 @@ def set_underline(peptide_sequence, mutant_peptide_pos, row_ID):
126
122
frameshift = False
127
123
classI_position = 0
128
124
125
+ # Determine if frameshift mutation by seraching for = '-'
129
126
if '-' in mutant_peptide_pos :
130
127
positions = mutant_peptide_pos .split ("-" )
131
128
start_position = int (positions [0 ])
132
129
end_position = int (positions [1 ])
133
130
134
131
frameshift = True
135
-
136
-
137
132
else :
138
133
mutant_peptide_pos = int (mutant_peptide_pos )
139
134
140
-
141
-
142
135
if frameshift :
143
-
144
136
continue_underline = False
145
137
146
138
for i in range (len (peptide_sequence )):
@@ -232,60 +224,35 @@ def main():
232
224
233
225
# read in classI and class II
234
226
peptides_51mer = pd .read_excel (args .p )
235
- classI = pd .read_csv (args .classI , sep = "\t " )
236
- classII = pd .read_csv (args .classII , sep = "\t " )
237
-
238
- # Create a universal ID by editing the peptide 51mer ID
239
- peptides_51mer .rename (columns = {'ID' : 'full ID' }, inplace = True )
240
- peptides_51mer ['51mer ID' ] = peptides_51mer ['full ID' ]
241
-
242
- peptides_51mer ['51mer ID' ] = peptides_51mer ['51mer ID' ].apply (lambda x : '.' .join (x .split ('.' )[1 :])) # Removing before first period, periods will be removed
243
-
244
- peptides_51mer ['51mer ID' ] = peptides_51mer ['51mer ID' ].apply (lambda x : '.' .join (x .split ('.' )[1 :])) # Removing before second period
245
- peptides_51mer ['51mer ID' ] = peptides_51mer ['51mer ID' ].apply (lambda x : '.' .join (x .split ('.' )[:3 ]) + '.' + '.' .join (x .split ('.' )[4 :]))
246
-
247
-
248
- for index , row in peptides_51mer .iterrows ():
249
- for i , char in enumerate (row ['51mer ID' ][::- 1 ]):
250
- if char .isdigit ():
251
- peptides_51mer .at [index , '51mer ID' ] = row ['51mer ID' ][:- i ]
252
- break
253
- else :
254
- result = row ['51mer ID' ]
255
-
256
- # create a dataframe that contains the classI and classII pepetide sequence
257
- classI .rename (columns = {"Best Peptide" :"Best Peptide Class I" }, inplace = True )
258
- classII .rename (columns = {"Best Peptide" :"Best Peptide Class II" }, inplace = True )
259
-
260
- # create a key that is gene, transcript, AA change for ClassI to join to the peptides order form
261
- classI ['modified AA Change' ] = classI ['AA Change' ]
262
- classI ['modified AA Change' ] = classI ['modified AA Change' ].apply (rearrange_string )
263
- classI ['51mer ID' ] = classI ['Gene' ] + '.' + classI ['Best Transcript' ] + '.' + classI ['modified AA Change' ]
264
-
265
- class_sequences = pd .merge (classI [['ID' , 'Best Peptide Class I' , '51mer ID' , 'Pos' ]], classII [['ID' , 'Best Peptide Class II' ]], on = 'ID' , how = 'left' )
266
-
267
- # Create a dataframe that has the classI and classII sequence
268
- merged_peptide_51mer = pd .merge (peptides_51mer , class_sequences , on = '51mer ID' , how = 'left' )
269
227
270
228
# convert peptide 51mer to HTML
271
229
peptides_51mer_html = peptides_51mer .to_html (index = False ) # convert to html
272
230
273
231
# Creating a BeautifulSoup object and specifying the parser
274
232
peptides_51mer_soup = BeautifulSoup (peptides_51mer_html , 'html.parser' )
275
233
276
-
277
234
for index , row in peptides_51mer .iterrows ():
278
235
279
236
search_string = row ['51mer ID' ]
280
237
281
- #classII_sequence
282
- classII_peptide = merged_peptide_51mer .loc [merged_peptide_51mer ['51mer ID' ] == search_string , 'Best Peptide Class II' ].values [0 ]
283
- #classI_sequence
284
- classI_peptide = merged_peptide_51mer .loc [merged_peptide_51mer ['51mer ID' ] == search_string , 'Best Peptide Class I' ].values [0 ]
285
-
286
-
287
- # mutant pepetide position ---
288
- mutant_peptide_pos = str (merged_peptide_51mer .loc [merged_peptide_51mer ['51mer ID' ] == search_string , 'Pos' ].values [0 ])
238
+ # classII sequence
239
+ classII_peptide = peptides_51mer .loc [peptides_51mer ['51mer ID' ] == search_string , 'Best Peptide Class II' ].values [0 ]
240
+ # classI sequence
241
+ classI_peptide = peptides_51mer .loc [peptides_51mer ['51mer ID' ] == search_string , 'Best Peptide Class I' ].values [0 ]
242
+ # mutant peptide position
243
+ mutant_peptide_pos = str (peptides_51mer .loc [peptides_51mer ['51mer ID' ] == search_string , 'Pos' ].values [0 ])
244
+ # classI IC50
245
+ classI_ic50 = peptides_51mer .loc [peptides_51mer ['51mer ID' ] == search_string , 'Class I IC50 MT' ].values [0 ]
246
+ # classI percentile
247
+ classI_percentile = peptides_51mer .loc [peptides_51mer ['51mer ID' ] == search_string , 'Class I %ile MT' ].values [0 ]
248
+ # classII IC50 -- not used to determine sequence coloring
249
+ # classII percentile
250
+ classII_percentile = peptides_51mer .loc [peptides_51mer ['51mer ID' ] == search_string , 'Class II %ile MT' ].values [0 ]
251
+ # classI transcript
252
+ classI_transcript = peptides_51mer .loc [peptides_51mer ['51mer ID' ] == search_string , 'Class I Best Transcript' ].values [0 ]
253
+ # classI transcript
254
+ classII_transcript = peptides_51mer .loc [peptides_51mer ['51mer ID' ] == search_string , 'Class II Best Transcript' ].values [0 ]
255
+
289
256
290
257
# Find the tag containing the search string
291
258
tag_with_search_string = peptides_51mer_soup .find ('td' , string = search_string )
@@ -300,7 +267,9 @@ def main():
300
267
sequence = next_td_tags [2 ].get_text ()
301
268
302
269
# make sequence the list of objects
303
- peptide_sequence = annotate_every_nucleotide (sequence , classI_peptide , classII_peptide )
270
+ peptide_sequence = annotate_every_nucleotide (sequence , classI_peptide , classII_peptide ,
271
+ classI_ic50 , classI_percentile , classII_percentile ,
272
+ classI_transcript , classII_transcript )
304
273
305
274
# actaully lets break class I and classII into two steps and handle the mutated nucleotide in class I function
306
275
# it should be basically like at that position in the class I set
@@ -331,13 +300,15 @@ def main():
331
300
# Now 'soup' contains the modified HTML with the tag removed
332
301
modified_html = peptides_51mer_soup .prettify (formatter = None )
333
302
334
- if args .WB :
335
- html_file_name = args .WB + '/../manual_review/' + args .samp + ".Colored_Peptides.html"
303
+ print ()
304
+
305
+ if args .o :
306
+ html_file_name = args .o + args .samp + ".Colored_Peptides.html"
336
307
else :
337
- html_file_name = args .samp + ".Colored_Peptides.html"
308
+ html_file_name = args .o + ".Colored_Peptides.html"
338
309
339
310
with open (html_file_name , "w" , encoding = 'utf-8' ) as file :
340
- file .write (modified_html )
311
+ file .write ( modified_html )
341
312
342
313
343
314
if __name__ == "__main__" :
0 commit comments