Skip to content

Commit d3170e1

Browse files
Merge pull request #14 from griffithlab/extended-pepetide-sheet
Automtically fill in restrciting HLA alleles
2 parents 24c4e2d + daa91eb commit d3170e1

File tree

6 files changed

+212
-144
lines changed

6 files changed

+212
-144
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
# bsub -G compute-oncology -q oncology-interactive -Is -a 'docker_build(griffithlab/neoang_scripts)' -- --tag griffithlab/neoang_scripts .
22

33
FROM python:3.8-slim-buster
44

scripts/color_peptides51mer.py

Lines changed: 72 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
from bs4 import BeautifulSoup
55
import argparse
66

7+
'''
8+
Example Command:
9+
python3 ../scripts/color_peptides51mer.py -p manual_review/JLF-100-051_Peptides_51-mer.xlsx -samp JLF-100-051 -o manual_review/
10+
'''
11+
712

813
class AminoAcid:
914

@@ -35,40 +40,24 @@ def parse_arguments():
3540

3641
parser.add_argument('-p',
3742
help='The path to the Peptides 51 mer', required=True)
38-
parser.add_argument('-classI',
39-
help='The path to the classI all_epitopes.aggregated.tsv used in pVACseq', required=True)
40-
parser.add_argument('-classII',
41-
help='The path to the classII all_epitopes.aggregated.tsv used in pVACseq', required=True)
42-
parser.add_argument('-WB',
43-
help='the path to the gcp_immuno folder of the trial you wish to tun script on, defined as WORKING_BASE in envs.txt')
44-
parser.add_argument('-samp', help='Name of the sample')
45-
43+
parser.add_argument('-samp',
44+
help='The name of the sample', required=True)
45+
parser.add_argument('-o',
46+
help='the path to output folder')
4647

4748
return(parser.parse_args())
4849

49-
# Function to rearrange string so that G518D looks like 518G/D
50-
def rearrange_string(s):
51-
match = re.match(r'([A-Za-z]+)([\d-]+)([A-Za-z]*)', s)
52-
if match:
53-
letters_before = match.group(1)
54-
numbers = match.group(2)
55-
letters_after = match.group(3)
56-
57-
#return f"{numbers}{letters_before}/{letters_after}"
58-
# Just use the postion for the key to avoid FS problem
59-
return f"{numbers}"
60-
else:
61-
return s
62-
6350

64-
def annotate_every_nucleotide(sequence, classI_peptide, classII_peptide):
51+
def annotate_every_nucleotide(sequence, classI_peptide, classII_peptide,
52+
classI_ic50, classI_percentile, classII_percentile,
53+
classI_transcript, classII_transcript):
6554

66-
peptide_sequence = []
55+
peptide_sequence = [] # Create a list to hold all AA of the 51mer
6756

6857
# Make the sequence a list of AminoAcid objects
6958
for i in range(len(sequence)):
7059
new_AA = AminoAcid(sequence[i], False, False, False, False, -1, False, False)
71-
60+
7261
if sequence[i] == 'C':
7362
new_AA.large = True
7463

@@ -91,32 +80,39 @@ def annotate_every_nucleotide(sequence, classI_peptide, classII_peptide):
9180
positions = []
9281

9382
# set those positions to red
83+
# if median affinity < 1000 nm OR percentile < 2%
9484
j = 0
95-
for i in range(len(peptide_sequence)):
96-
if j < len(positions) and i == positions[j]:
97-
peptide_sequence[i].color = True
98-
j+=1
85+
if float(classI_ic50) < 1000 or float(classI_percentile) < 2:
86+
for i in range(len(peptide_sequence)):
87+
if j < len(positions) and i == positions[j]:
88+
peptide_sequence[i].color = True
89+
j+=1
9990

100-
# CLASS II
101-
positions = []
102-
for i in range(len(peptide_sequence)):
103-
for j in range(len(classII_peptide)):
104-
if peptide_sequence[i].nucleotide == classII_peptide[j]:
105-
positions.append(i)
106-
i+=1
107-
else:
91+
if classI_transcript == classII_transcript:
92+
# CLASS II
93+
positions = []
94+
for i in range(len(peptide_sequence)):
95+
for j in range(len(classII_peptide)):
96+
if peptide_sequence[i].nucleotide == classII_peptide[j]:
97+
positions.append(i)
98+
i+=1
99+
else:
100+
break
101+
102+
if len(positions) == len(classII_peptide):
108103
break
109-
110-
if len(positions) == len(classII_peptide):
111-
break
112-
else:
113-
positions = []
114-
115-
j = 0
116-
for i in range(len(peptide_sequence)):
117-
if j < len(positions) and i == positions[j]:
118-
peptide_sequence[i].bold = True
119-
j+=1
104+
else:
105+
positions = []
106+
# Set class II to bold
107+
# if percentile < 2%
108+
j = 0
109+
if float(classII_percentile) < 2:
110+
for i in range(len(peptide_sequence)):
111+
if j < len(positions) and i == positions[j]:
112+
peptide_sequence[i].bold = True
113+
j+=1
114+
else:
115+
print("Note: ClassII transcript different then ClassI. ClassII peptide not bolded.")
120116

121117

122118
return(peptide_sequence)
@@ -126,21 +122,17 @@ def set_underline(peptide_sequence, mutant_peptide_pos, row_ID):
126122
frameshift = False
127123
classI_position = 0
128124

125+
# Determine if frameshift mutation by seraching for = '-'
129126
if '-' in mutant_peptide_pos:
130127
positions = mutant_peptide_pos.split("-")
131128
start_position = int(positions[0])
132129
end_position = int(positions[1])
133130

134131
frameshift = True
135-
136-
137132
else:
138133
mutant_peptide_pos = int(mutant_peptide_pos)
139134

140-
141-
142135
if frameshift:
143-
144136
continue_underline = False
145137

146138
for i in range(len(peptide_sequence)):
@@ -232,60 +224,35 @@ def main():
232224

233225
# read in classI and class II
234226
peptides_51mer = pd.read_excel(args.p)
235-
classI = pd.read_csv(args.classI, sep="\t")
236-
classII = pd.read_csv(args.classII, sep="\t")
237-
238-
# Create a universal ID by editing the peptide 51mer ID
239-
peptides_51mer.rename(columns={'ID': 'full ID'}, inplace=True)
240-
peptides_51mer['51mer ID'] = peptides_51mer['full ID']
241-
242-
peptides_51mer['51mer ID'] = peptides_51mer['51mer ID'].apply(lambda x: '.'.join(x.split('.')[1:])) # Removing before first period, periods will be removed
243-
244-
peptides_51mer['51mer ID'] = peptides_51mer['51mer ID'].apply(lambda x: '.'.join(x.split('.')[1:])) # Removing before second period
245-
peptides_51mer['51mer ID'] = peptides_51mer['51mer ID'].apply(lambda x: '.'.join(x.split('.')[:3]) + '.' + '.'.join(x.split('.')[4:]))
246-
247-
248-
for index, row in peptides_51mer.iterrows():
249-
for i, char in enumerate(row['51mer ID'][::-1]):
250-
if char.isdigit():
251-
peptides_51mer.at[index, '51mer ID'] = row['51mer ID'][:-i]
252-
break
253-
else:
254-
result = row['51mer ID']
255-
256-
# create a dataframe that contains the classI and classII pepetide sequence
257-
classI.rename(columns = {"Best Peptide":"Best Peptide Class I"}, inplace=True)
258-
classII.rename(columns = {"Best Peptide":"Best Peptide Class II"}, inplace=True)
259-
260-
# create a key that is gene, transcript, AA change for ClassI to join to the peptides order form
261-
classI['modified AA Change'] = classI['AA Change']
262-
classI['modified AA Change'] = classI['modified AA Change'].apply(rearrange_string)
263-
classI['51mer ID'] = classI['Gene'] + '.' + classI['Best Transcript'] + '.' + classI['modified AA Change']
264-
265-
class_sequences = pd.merge(classI[['ID', 'Best Peptide Class I', '51mer ID', 'Pos']], classII[['ID', 'Best Peptide Class II']], on='ID', how='left')
266-
267-
# Create a dataframe that has the classI and classII sequence
268-
merged_peptide_51mer = pd.merge(peptides_51mer, class_sequences, on='51mer ID', how='left')
269227

270228
# convert peptide 51mer to HTML
271229
peptides_51mer_html = peptides_51mer.to_html(index=False) # convert to html
272230

273231
# Creating a BeautifulSoup object and specifying the parser
274232
peptides_51mer_soup = BeautifulSoup(peptides_51mer_html, 'html.parser')
275233

276-
277234
for index, row in peptides_51mer.iterrows():
278235

279236
search_string = row['51mer ID']
280237

281-
#classII_sequence
282-
classII_peptide = merged_peptide_51mer.loc[merged_peptide_51mer['51mer ID'] == search_string, 'Best Peptide Class II'].values[0]
283-
#classI_sequence
284-
classI_peptide = merged_peptide_51mer.loc[merged_peptide_51mer['51mer ID'] == search_string, 'Best Peptide Class I'].values[0]
285-
286-
287-
# mutant pepetide position ---
288-
mutant_peptide_pos = str(merged_peptide_51mer.loc[merged_peptide_51mer['51mer ID'] == search_string, 'Pos'].values[0])
238+
# classII sequence
239+
classII_peptide = peptides_51mer.loc[peptides_51mer['51mer ID'] == search_string, 'Best Peptide Class II'].values[0]
240+
# classI sequence
241+
classI_peptide = peptides_51mer.loc[peptides_51mer['51mer ID'] == search_string, 'Best Peptide Class I'].values[0]
242+
# mutant peptide position
243+
mutant_peptide_pos = str(peptides_51mer.loc[peptides_51mer['51mer ID'] == search_string, 'Pos'].values[0])
244+
# classI IC50
245+
classI_ic50 = peptides_51mer.loc[peptides_51mer['51mer ID'] == search_string, 'Class I IC50 MT'].values[0]
246+
# classI percentile
247+
classI_percentile = peptides_51mer.loc[peptides_51mer['51mer ID'] == search_string, 'Class I %ile MT'].values[0]
248+
# classII IC50 -- not used to determine sequence coloring
249+
# classII percentile
250+
classII_percentile = peptides_51mer.loc[peptides_51mer['51mer ID'] == search_string, 'Class II %ile MT'].values[0]
251+
# classI transcript
252+
classI_transcript = peptides_51mer.loc[peptides_51mer['51mer ID'] == search_string, 'Class I Best Transcript'].values[0]
253+
# classI transcript
254+
classII_transcript = peptides_51mer.loc[peptides_51mer['51mer ID'] == search_string, 'Class II Best Transcript'].values[0]
255+
289256

290257
# Find the tag containing the search string
291258
tag_with_search_string = peptides_51mer_soup.find('td', string=search_string)
@@ -300,7 +267,9 @@ def main():
300267
sequence = next_td_tags[2].get_text()
301268

302269
# make sequence the list of objects
303-
peptide_sequence = annotate_every_nucleotide(sequence, classI_peptide, classII_peptide)
270+
peptide_sequence = annotate_every_nucleotide(sequence, classI_peptide, classII_peptide,
271+
classI_ic50, classI_percentile, classII_percentile,
272+
classI_transcript, classII_transcript)
304273

305274
# actaully lets break class I and classII into two steps and handle the mutated nucleotide in class I function
306275
# it should be basically like at that position in the class I set
@@ -331,13 +300,15 @@ def main():
331300
# Now 'soup' contains the modified HTML with the tag removed
332301
modified_html = peptides_51mer_soup.prettify(formatter=None)
333302

334-
if args.WB:
335-
html_file_name = args.WB + '/../manual_review/' + args.samp + ".Colored_Peptides.html"
303+
print()
304+
305+
if args.o:
306+
html_file_name = args.o + args.samp + ".Colored_Peptides.html"
336307
else:
337-
html_file_name = args.samp + ".Colored_Peptides.html"
308+
html_file_name = args.o + ".Colored_Peptides.html"
338309

339310
with open(html_file_name, "w", encoding = 'utf-8') as file:
340-
file.write(modified_html)
311+
file.write( modified_html)
341312

342313

343314
if __name__ == "__main__":

scripts/fda_quality_thresholds.csv

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
Criteria,Threshold for pass/failure
2-
TOTAL_READS (tumor DNA),>250M,/qc/fda_metrics/unaligned_sample/sample_table1.csv
3-
TOTAL_READS (normal DNA),>100M,/qc/fda_metrics/unaligned_sample/sample_table1.csv
4-
TOTAL_READS (tumor RNA),>200M,/qc/fda_metrics/unaligned_sample/sample_table1.csv
5-
PCT_PF_READS_ALIGNED (tumor/normal DNA),>95%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
6-
PCT_PF_READS_ALIGNED (tumor RNA),>85%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
7-
PCT_USABLE_BASES_ON_TARGET  (tumor/normal DNA),>20%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
8-
PCT_EXC_OFF_TARGET  (tumor/normal DNA),<60%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
9-
PERCENT_DUPLICATION  (tumor/normal DNA),<40%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
10-
MEAN_TARGET_COVERAGE (tumor DNA),>250x,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
11-
MEAN_TARGET_COVERAGE (normal DNA),>100x,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
12-
PCT_TARGET_BASES_20X  (tumor/normal DNA),>95%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
13-
PCT_READS_ALIGNED_IN_PAIRS  (tumor/normal DNA),>95%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
14-
MEAN_INSERT_SIZE (tumor/normal DNA),125 - 300bp,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
15-
PF_MISMATCH_RATE_1 (tumor/normal DNA),<0.75%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
16-
PF_MISMATCH_RATE_2 (tumor/normal DNA),<1.00%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
17-
Genotype Concordance (tumor/normal DNA),>95%,/qc/concordance/concordance.somalier.pairs.tsv
18-
Contamination Estimate (tumor/normal DNA),<7.5%,/qc/sample/normal.VerifyBamId.selfSM
2+
TOTAL_READS (tumor DNA),>250M
3+
TOTAL_READS (normal DNA),>100M
4+
TOTAL_READS (tumor RNA),>200M
5+
PCT_PF_READS_ALIGNED (tumor/normal DNA),>95%
6+
PCT_PF_READS_ALIGNED (tumor RNA),>85%
7+
PCT_USABLE_BASES_ON_TARGET  (tumor/normal DNA),>20%
8+
PCT_EXC_OFF_TARGET  (tumor/normal DNA),<60%
9+
PERCENT_DUPLICATION  (tumor/normal DNA),<40%
10+
MEAN_TARGET_COVERAGE (tumor DNA),>250x
11+
MEAN_TARGET_COVERAGE (normal DNA),>100x
12+
PCT_TARGET_BASES_20X  (tumor/normal DNA),>95%
13+
PCT_READS_ALIGNED_IN_PAIRS  (tumor/normal DNA),>95%
14+
MEAN_INSERT_SIZE (tumor/normal DNA),125 - 300bp
15+
PF_MISMATCH_RATE_1 (tumor/normal DNA),<0.75%
16+
PF_MISMATCH_RATE_2 (tumor/normal DNA),<1.00%
17+
Genotype Concordance (tumor/normal DNA),>95%
18+
Contamination Estimate (tumor/normal DNA),<7.5%
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
Criteria,Threshold for pass/failure,
2+
TOTAL_READS (tumor DNA),>250M,/qc/fda_metrics/unaligned_sample/sample_table1.csv
3+
TOTAL_READS (normal DNA),>100M,/qc/fda_metrics/unaligned_sample/sample_table1.csv
4+
TOTAL_READS (tumor RNA),>200M,/qc/fda_metrics/unaligned_sample/sample_table1.csv
5+
PCT_PF_READS_ALIGNED (tumor/normal DNA),>95%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
6+
PCT_PF_READS_ALIGNED (tumor RNA),>85%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
7+
PCT_USABLE_BASES_ON_TARGET  (tumor/normal DNA),>20%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
8+
PCT_EXC_OFF_TARGET  (tumor/normal DNA),<60%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
9+
PERCENT_DUPLICATION  (tumor/normal DNA),<40%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
10+
MEAN_TARGET_COVERAGE (tumor DNA),>250x,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
11+
MEAN_TARGET_COVERAGE (normal DNA),>100x,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
12+
PCT_TARGET_BASES_20X  (tumor/normal DNA),>95%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
13+
PCT_READS_ALIGNED_IN_PAIRS  (tumor/normal DNA),>95%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
14+
MEAN_INSERT_SIZE (tumor/normal DNA),125 - 300bp,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
15+
PF_MISMATCH_RATE_1 (tumor/normal DNA),<0.75%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
16+
PF_MISMATCH_RATE_2 (tumor/normal DNA),<1.00%,/qc/fda_metrics/aligned_sample/aligned_sample_table2.csv
17+
Genotype Concordance (tumor/normal DNA),>95%,/qc/concordance/concordance.somalier.pairs.tsv
18+
Contamination Estimate (tumor/normal DNA),<7.5%,/qc/sample/normal.VerifyBamId.selfSM

0 commit comments

Comments
 (0)