Skip to content

Commit 49ff8f4

Browse files
author
Evelyn Schmidt
committed
working color peptides 51mer
1 parent 5e26dd4 commit 49ff8f4

File tree

1 file changed

+343
-0
lines changed

1 file changed

+343
-0
lines changed

scripts/color_peptides51mer.py

Lines changed: 343 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,343 @@
1+
import numpy as np
2+
import pandas as pd
3+
import re
4+
from bs4 import BeautifulSoup
5+
import argparse
6+
7+
8+
class AminoAcid:
    """One residue of a peptide sequence together with its HTML styling flags.

    Attributes:
        nucleotide: The single character taken from the peptide sequence
            (the attribute name is historical; the value is a residue).
        bold: True when the residue should be rendered bold.
        color: True when the residue should be rendered red.
        underline: True when the residue should be underlined.
        large: True when the residue should use the larger font size.
        position: Free integer slot; -1 means "not set".
        open_tag: True when a ``<span>`` opens in front of this residue.
        close_tag: True when a ``</span>`` is emitted in front of this residue.
    """

    def __init__(self, nucleotide, bold=False, color=False, underline=False,
                 large=False, position=-1, open_tag=False, close_tag=False):
        # Defaults describe an unstyled residue, so ``AminoAcid('A')`` is
        # enough; the fully positional call form keeps working unchanged.
        self.nucleotide = nucleotide
        self.bold = bold
        self.color = color
        self.underline = underline
        self.large = large
        self.position = position
        self.open_tag = open_tag
        self.close_tag = close_tag

    def __repr__(self):
        return (
            f"AminoAcid({self.nucleotide!r}, bold={self.bold!r}, "
            f"color={self.color!r}, underline={self.underline!r}, "
            f"large={self.large!r}, position={self.position!r}, "
            f"open_tag={self.open_tag!r}, close_tag={self.close_tag!r})"
        )
19+
20+
# ---- PARSE ARGUMENTS -------------------------------------------------------
# Builds the command line interface and parses the process arguments.
# argparse generates the -h/--help text from the entries below.
def parse_arguments():
    """Parse the command line options of the peptide colouring script.

    Returns:
        argparse.Namespace: carries the attributes ``p``, ``classI``,
        ``classII`` and ``o``. All four options are required, so argparse
        exits with a usage message when one is missing.
    """
    # (flag, help text) pairs, registered in this order.
    required_options = (
        ('-p', 'The path to the Peptides 51 mer'),
        ('-classI', 'The path to the classI all_epitopes.aggregated.tsv used in pVACseq'),
        ('-classII', 'The path to the classII all_epitopes.aggregated.tsv used in pVACseq'),
        ('-o', "Output location"),
    )

    cli = argparse.ArgumentParser(description='Color the 51mer peptide')
    for flag, help_text in required_options:
        cli.add_argument(flag, help=help_text, required=True)

    return cli.parse_args()
37+
38+
# Reduce an amino-acid change such as "G518D" to its position part ("518").
def rearrange_string(s):
    """Return only the position portion of an amino-acid change string.

    The value is matched from its start as: one or more letters, a run of
    digits and/or dashes, then optional letters. Only the digit/dash run is
    returned, e.g. ``"G518D"`` -> ``"518"`` and ``"FS22-40"`` -> ``"22-40"``.
    The surrounding letters are dropped on purpose so the result can serve
    as a join key that does not depend on how the residues are written
    (the original author notes this avoids a problem with frameshift
    entries).

    Args:
        s: The amino-acid change text.

    Returns:
        The position text when ``s`` matches the pattern; otherwise ``s``
        itself, unchanged. Non-string values (for example a NaN from an
        empty table cell) are also returned unchanged instead of raising.
    """
    if not isinstance(s, str):
        return s

    match = re.match(r'([A-Za-z]+)([\d-]+)([A-Za-z]*)', s)
    return match.group(2) if match else s
51+
52+
53+
def annotate_every_nucleotide(sequence, classI_peptide, classII_peptide):
    """Wrap every residue of ``sequence`` in an AminoAcid and flag epitope hits.

    Each character becomes one ``AminoAcid``. Residues equal to ``'C'`` get
    ``large = True``. The first occurrence of ``classI_peptide`` inside the
    sequence has ``color`` set on each of its residues, and the first
    occurrence of ``classII_peptide`` has ``bold`` set.

    Args:
        sequence: The peptide sequence, one character per residue.
        classI_peptide: Class I epitope to mark via ``color``. A value that
            is not a non-empty string (e.g. NaN for a missing table entry)
            marks nothing.
        classII_peptide: Class II epitope to mark via ``bold``; same rules.

    Returns:
        list: the ``AminoAcid`` objects, in sequence order.
    """
    peptide_sequence = [
        AminoAcid(residue, False, False, False, residue == 'C', -1, False, False)
        for residue in sequence
    ]

    # str.find replaces the former hand-written scan, which indexed past the
    # end of the sequence (IndexError) when an epitope prefix matched at the
    # tail of the sequence.
    searchable = ''.join(sequence)

    for epitope, flag_name in ((classI_peptide, 'color'), (classII_peptide, 'bold')):
        if not isinstance(epitope, str) or not epitope:
            continue

        start = searchable.find(epitope)
        if start == -1:
            continue

        for residue in peptide_sequence[start:start + len(epitope)]:
            setattr(residue, flag_name, True)

    return peptide_sequence
112+
113+
def set_underline(peptide_sequence, mutant_peptide_pos):
    """Underline the mutant residue(s) inside the red (class I) stretch.

    Positions are 1-based and counted within each contiguous run of residues
    whose ``color`` flag is set; the counter restarts at every uncoloured
    residue. Matching residues get ``underline = True`` in place.

    Args:
        peptide_sequence: Objects exposing ``color`` and ``underline``.
        mutant_peptide_pos: Either a single position (``"5"``) or an
            inclusive ``"start-end"`` range (``"3-9"``, used for
            frameshifts). Float-looking text such as ``"5.0"`` is accepted.
            ``"nan"``, ``"None"`` and the empty string mean "no position"
            and leave the sequence untouched.

    Returns:
        None.

    Raises:
        ValueError: if the position text is present but not numeric.
    """
    span = _parse_mutant_span(mutant_peptide_pos)
    if span is None:
        return

    start_position, end_position = span
    classI_position = 0

    for residue in peptide_sequence:
        classI_position = classI_position + 1 if residue.color else 0

        # Only coloured residues qualify, so a position of 0 can no longer
        # underline everything outside the epitope. The range test also stops
        # at ``end_position``; the previous branch order never reached the
        # end check and underlined through to the end of the red stretch.
        if residue.color and start_position <= classI_position <= end_position:
            residue.underline = True


def _parse_mutant_span(mutant_peptide_pos):
    """Return the inclusive (start, end) positions, or None when absent."""
    text = str(mutant_peptide_pos).strip()
    if text.lower() in ('', 'nan', 'none'):
        return None

    if '-' in text:
        first, second = text.split('-')[:2]
        return int(float(first)), int(float(second))

    position = int(float(text))
    return position, position
161+
162+
def set_span_tags(peptide_sequence):
    """Mark where ``<span>`` tags have to open and close along the sequence.

    The sequence is walked once, comparing each residue's style
    (``bold``, ``color``, ``underline``, ``large``) with the style currently
    in effect, starting from "no styling". At every change:

    * ``close_tag`` is True only when a span is actually open at that point;
    * ``open_tag`` is True only when the new style has at least one flag set,
      i.e. when a new span really begins.

    Residues without a style change get both flags set to False.

    Args:
        peptide_sequence: Objects exposing the four style flags plus
            ``open_tag`` and ``close_tag``; modified in place.

    Returns:
        The same ``peptide_sequence`` object that was passed in.
    """
    active_style = (False, False, False, False)
    inside_span = False

    for residue in peptide_sequence:
        style = (residue.bold, residue.color, residue.underline, residue.large)

        if style == active_style:
            residue.open_tag = False
            residue.close_tag = False
            continue

        residue.close_tag = inside_span
        residue.open_tag = any(style)

        # Returning to plain text leaves no span open. Tracking that avoids a
        # stray '</span>' in front of the next styled run.
        inside_span = residue.open_tag
        active_style = style

    return peptide_sequence
190+
191+
def create_stylized_sequence(peptide_sequence):
    """Render the annotated residues as a string with inline-styled spans.

    A residue whose ``open_tag`` or ``close_tag`` flag is set marks a style
    boundary: any span that is currently open is closed there, and when
    ``open_tag`` is set and the residue carries at least one style flag a new
    ``<span style="...">`` is opened. Every residue character is written
    exactly once, and a span still open at the end of the sequence is closed.

    Args:
        peptide_sequence: Objects exposing ``nucleotide``, ``bold``,
            ``color``, ``underline``, ``large``, ``open_tag`` and
            ``close_tag``.

    Returns:
        str: the HTML fragment.
    """
    pieces = []
    span_open = False

    for residue in peptide_sequence:
        if residue.open_tag or residue.close_tag:
            # Close based on what was really emitted, so a close flag with no
            # open span cannot produce an unbalanced '</span>'.
            if span_open:
                pieces.append('</span>')
                span_open = False

            style = _span_style(residue) if residue.open_tag else ''
            if style:
                pieces.append(f'<span style="{style}">')
                span_open = True

        pieces.append(residue.nucleotide)

    if span_open:
        pieces.append('</span>')

    return ''.join(pieces)


def _span_style(residue):
    """Return the inline CSS for a residue's flags ('' when it has none)."""
    declarations = []
    if residue.bold:
        declarations.append('font-weight:bold;')
    if residue.color:
        declarations.append('color:#ff0000;')
    if residue.underline:
        declarations.append('text-decoration:underline;')
    if residue.large:
        # Kept last and without a trailing semicolon so a large-only residue
        # yields the same 'font-size:105%' attribute as before; combining it
        # here replaces the former second span that duplicated the residue.
        declarations.append('font-size:105%')
    return ''.join(declarations)
232+
233+
def main():
    """Colour the 51-mer peptide table and write it out as an HTML file.

    Steps:
      1. Read the 51-mer spreadsheet (``-p``) and the class I / class II
         tab-separated tables (``-classI`` / ``-classII``).
      2. Derive a shared ``ID`` key in all three tables and left-merge the
         class II ``Best Peptide`` and the class I ``Best Peptide`` / ``Pos``
         columns onto the 51-mer rows.
      3. Convert the 51-mer table to HTML and, for every row that has a
         class II peptide, replace the text of the row's third cell with the
         styled sequence (class I red, class II bold, mutant position
         underlined, cysteines enlarged).
      4. Write the resulting document to ``-o``.

    Each styled sequence is also printed, and "Search string not found." is
    printed for every row that is left unchanged.
    """
    args = parse_arguments()

    peptides_51mer = pd.read_excel(args.p)
    classI = pd.read_csv(args.classI, sep="\t")
    classII = pd.read_csv(args.classII, sep="\t")

    # Keep the original identifier as 'full ID' and derive the join key 'ID'.
    peptides_51mer.rename(columns={'ID': 'full ID'}, inplace=True)
    peptides_51mer['ID'] = peptides_51mer['full ID'].apply(_universal_id)

    # Build the matching key (gene, transcript, AA-change position) in the
    # class II and class I tables.
    for table in (classII, classI):
        table['modified AA Change'] = table['AA Change'].apply(rearrange_string)
        table['ID'] = table['Gene'] + '.' + table['Best Transcript'] + '.' + table['modified AA Change']

    # Merge the sequences from classI and classII with peptide 51mer
    merged_peptide_51mer = pd.merge(peptides_51mer, classII[['ID', 'Best Peptide']], on='ID', how='left')
    merged_peptide_51mer.rename(columns={"Best Peptide": "Best Peptide Class II"}, inplace=True)

    merged_peptide_51mer = pd.merge(merged_peptide_51mer, classI[['ID', 'Best Peptide', 'Pos']], on='ID', how='left')
    merged_peptide_51mer.rename(columns={"Best Peptide": "Best Peptide Class I"}, inplace=True)

    # Convert the 51-mer table to HTML and parse it so cells can be edited.
    peptides_51mer_html = peptides_51mer.to_html(index=False)
    peptides_51mer_soup = BeautifulSoup(peptides_51mer_html, 'html.parser')

    for _, row in peptides_51mer.iterrows():

        search_string = row['full ID']
        row_mask = merged_peptide_51mer['full ID'] == search_string

        classII_peptide = merged_peptide_51mer.loc[row_mask, 'Best Peptide Class II'].values[0]
        classI_peptide = merged_peptide_51mer.loc[row_mask, 'Best Peptide Class I'].values[0]
        mutant_peptide_pos = str(merged_peptide_51mer.loc[row_mask, 'Pos'].values[0])

        # Find the table cell holding this row's full ID.
        tag_with_search_string = peptides_51mer_soup.find('td', string=search_string)

        if tag_with_search_string is None or not isinstance(classII_peptide, str):
            print("Search string not found.")
            continue

        # The sequence is read from (and written back to) the third <td> of
        # the row that contains the ID cell.
        parent_tr = tag_with_search_string.find_parent('tr')
        next_td_tags = parent_tr.findChildren('td', limit=3)
        sequence = next_td_tags[2].get_text()

        # A row without a class I match carries NaN from the left merge;
        # colour nothing and skip the underline instead of crashing on it.
        has_classI = isinstance(classI_peptide, str)

        peptide_sequence = annotate_every_nucleotide(
            sequence, classI_peptide if has_classI else '', classII_peptide)

        if has_classI:
            set_underline(peptide_sequence, mutant_peptide_pos)

        set_span_tags(peptide_sequence)  # flags are set in place

        new_string = create_stylized_sequence(peptide_sequence)
        print(new_string)

        next_td_tags[2].string = new_string

    # Serialise once, after all rows are processed, so the output exists even
    # when no row was styled. formatter=None writes the inserted span markup
    # without escaping it.
    modified_html = peptides_51mer_soup.prettify(formatter=None)

    with open(args.o, "w", encoding='utf-8') as file:
        file.write(modified_html)


def _universal_id(full_id):
    """Reduce a 51-mer 'full ID' to the key shared with the epitope tables.

    Drops the first two dot-separated fields, removes the field at index 3
    of what remains, then cuts everything after the last digit.
    """
    fields = full_id.split('.')[2:]
    key = '.'.join(fields[:3]) + '.' + '.'.join(fields[4:])
    return _trim_after_last_digit(key)


def _trim_after_last_digit(identifier):
    """Cut everything after the last digit; leave digit-free text unchanged."""
    for offset, char in enumerate(reversed(identifier)):
        if char.isdigit():
            # An explicit end index keeps the whole string when it already
            # ends in a digit; the former identifier[:-0] returned ''.
            return identifier[:len(identifier) - offset]
    return identifier
339+
340+
341+
342+
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)