Skip to content

Commit 5b31655

Browse files
final modifications
1 parent 710c37d commit 5b31655

File tree

2 files changed

+167
-4
lines changed

2 files changed

+167
-4
lines changed

scripts/generate_reviews_files.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def main():
9898
# create sorting ID that is gene and transcript to sort in the same order as peptide
9999
reviewed_candidates['sorting id'] = reviewed_candidates['Gene'] + '.' + reviewed_candidates['Best Transcript']
100100
# make sure the sorting id column is unique
101-
reviewed_canidates = make_column_unique(reviewed_candidates, 'sorting id')
101+
reviewed_candidates = make_column_unique(reviewed_candidates, 'sorting id')
102102

103103
peptides = pd.read_csv(args.c, sep="\t")
104104
peptides = peptides.drop(['cterm_7mer_gravy_score', 'cysteine_count', 'n_terminal_asparagine', 'asparagine_proline_bond_count',
@@ -141,7 +141,7 @@ def main():
141141
else:
142142
Neoantigen_Canidates_file_name = args.samp + ".Annotated.Neoantigen_Candidates.xlsx"
143143

144-
reviewed_canidates.to_excel(Neoantigen_Canidates_file_name, index=False)
144+
reviewed_candidates.to_excel(Neoantigen_Canidates_file_name, index=False)
145145

146146

147147
if __name__ == "__main__":

scripts/modify_peptides.py

Lines changed: 165 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
import argparse
22
from itertools import product, chain
33
import pandas as pd
4+
import sys
5+
import os
6+
import re
7+
import subprocess
8+
49

510

611
# ---- PARSE ARGUMENTS -------------------------------------------------------
@@ -14,6 +19,13 @@ def parse_arguments():
1419
help='The maximum number of mofifying peptides to add to the begining or end')
1520
parser.add_argument('-m',
1621
help='A csv file containing the name/indentifer for the sequence which does NOT have to be unique')
22+
parser.add_argument('-samp',
23+
help='sample name')
24+
parser.add_argument('-HLA',
25+
help='a list of the HLA alleles in the format: HLA-A*02:01,HLA-A*24:02,HLA-B*07:02,HLA-B*35:02,HLA-C*04:01,HLA-C*07:02')
26+
parser.add_argument('-WD',
27+
help='The directory in which you would like to run pVACbind')
28+
1729
return(parser.parse_args())
1830

1931
def generate_modifed_peptides(n, name, base_sequence):
@@ -80,9 +92,160 @@ def main():
8092

8193
list = list + sequences_list
8294

83-
df = pd.DataFrame(list)
95+
peptide_table = pd.DataFrame(list)
96+
97+
peptide_table.to_csv('peptide_table.tsv', sep="\t", index=False, header=None)
98+
99+
100+
# Perform checks
101+
num_entries = len(peptide_table)
102+
103+
# Check if all entries in the sequence_name column are unique
104+
are_all_unique = not peptide_table['sequence_name'].duplicated().any()
105+
106+
if are_all_unique:
107+
print("All entries in the sequence_name column are unique.")
108+
else:
109+
print("There are duplicate entries in the sequence_name column.")
110+
sys.exit(1)
111+
112+
with open('modified_peptides.fa', 'w') as fasta_file:
113+
for index, row in peptide_table.iterrows():
114+
sequence_name = ">" + row['sequence_name']
115+
sequence = row['parsed_sequence']
116+
fasta_file.write(f"{sequence_name}\n{sequence}\n")
117+
118+
119+
# Create dirs for processing the N-term and C-term sequences separately
120+
os.makedirs("n-term", exist_ok=True)
121+
os.makedirs("c-term", exist_ok=True)
122+
123+
with open('modified_peptides.fa', 'r') as input_file, \
124+
open('n-term/modified_peptides_n-term.fa', 'w') as n_term_file, \
125+
open('c-term/modified_peptides_c-term.fa', 'w') as c_term_file:
126+
127+
for line in input_file:
128+
line = line.strip()
129+
130+
# Check for the pattern 'n-term' in the line
131+
if 'n-term' in line:
132+
n_term_file.write(line + '\n')
133+
134+
# Ensure there are at least two more lines in the file
135+
try:
136+
next_line = next(input_file).strip()
137+
n_term_file.write(next_line + '\n')
138+
except StopIteration:
139+
# Handle the case where there are not enough lines left
140+
pass
141+
142+
# Check for the pattern 'c-term' in the line
143+
if 'c-term' in line:
144+
c_term_file.write(line + '\n')
145+
146+
# Ensure there are at least two more lines in the file
147+
try:
148+
next_line = next(input_file).strip()
149+
c_term_file.write(next_line + '\n')
150+
except StopIteration:
151+
# Handle the case where there are not enough lines left
152+
pass
153+
154+
def get_line_count(file_path):
155+
with open(file_path, 'r') as file:
156+
return sum(1 for line in file)
157+
158+
modified_peptides_count = get_line_count('modified_peptides.fa')
159+
n_term_count = get_line_count('n-term/modified_peptides_n-term.fa')
160+
c_term_count = get_line_count('c-term/modified_peptides_c-term.fa')
161+
162+
if n_term_count+ c_term_count == modified_peptides_count:
163+
print("Sucessfully split fasta into n-term and c-term")
164+
else:
165+
print("N-term and c-term fasta lines to not equal modified fasta lines")
166+
167+
168+
def create_subpeptide_fastas_n_term(input_dir, results_dir, infile_path):
169+
# Define the lengths to iterate over
170+
lengths = [8, 9, 10, 11]
171+
172+
# Iterate over each test length
173+
for LENGTH in lengths:
174+
# Create input files for each test length
175+
print(f"Creating input fasta to test peptides of length: {LENGTH}")
176+
177+
# Set the file paths
178+
LENGTH_FASTA = os.path.join(input_dir , f"{LENGTH}-mer-test.fa")
179+
LENGTH_RESULT_DIR = os.path.join(results_dir, f"{LENGTH}-mer-test")
180+
181+
# Create the result directory if it doesn't exist
182+
os.makedirs(LENGTH_RESULT_DIR, exist_ok=True)
183+
184+
with open(infile_path, 'r') as infile, open(LENGTH_FASTA, 'w') as length_fasta:
185+
for line in infile:
186+
line = line.strip()
187+
if line.startswith('>'):
188+
length_fasta.write(f"{line}\n")
189+
elif match := re.match(r'(\w+)\|(\w+)', line):
190+
before, after = match.groups()
191+
sub = after[:LENGTH - 1]
192+
length_fasta.write(f"{before}{sub}\n")
193+
infile.close()
194+
195+
def create_subpeptide_fastas_c_term(input_dir, results_dir, infile_path):
196+
# Define the lengths to iterate over
197+
lengths = [8, 9, 10, 11]
198+
199+
for LENGTH in lengths:
200+
# Create input files for each test length so that each test sequence will contain at least one modified base
201+
print(f"Creating input fasta to test peptides of length: {LENGTH}")
202+
203+
LENGTH_FASTA = os.path.join(input_dir, f"{LENGTH}-mer-test.fa")
204+
LENGTH_RESULT_DIR = os.path.join(results_dir, f"{LENGTH}-mer-test")
205+
206+
os.makedirs(LENGTH_RESULT_DIR, exist_ok=True)
207+
208+
with open(infile_path, 'r') as infile, open(LENGTH_FASTA, 'w') as length_fasta:
209+
for line in infile:
210+
line = line.strip()
211+
if line.startswith('>'):
212+
length_fasta.write(f"{line}\n")
213+
elif match := re.match(r'(\w+)\|(\w+)', line):
214+
before, after = match.groups()
215+
sub = before[-(LENGTH - 1):]
216+
length_fasta.write(f"{sub}{after}\n")
217+
infile.close()
218+
219+
# Set up sub-peptide fasta sequences for each target class I prediction length
220+
working_dir = args.WD
221+
222+
input_dir = os.path.join(working_dir + "/n-term/pvacbind_inputs")
223+
results_dir = os.path.join(working_dir + "/n-term/pvacbind_results")
224+
infile_path = os.path.join(working_dir + "/n-term/modified_peptides_n-term.fa")
225+
226+
os.makedirs(input_dir, exist_ok=True)
227+
os.makedirs(results_dir, exist_ok=True)
228+
229+
print("Creating sub-pepetide fastas for n-term")
230+
create_subpeptide_fastas_n_term(input_dir, results_dir, infile_path)
231+
232+
input_dir = os.path.join(working_dir + "/c-term/pvacbind_inputs")
233+
results_dir = os.path.join(working_dir + "/c-term/pvacbind_results")
234+
infile_path = os.path.join(working_dir + "/c-term/modified_peptides_c-term.fa")
235+
236+
os.makedirs(input_dir, exist_ok=True)
237+
os.makedirs(results_dir, exist_ok=True)
238+
239+
print("Creating sub-pepetide fastas for c-term")
240+
create_subpeptide_fastas_c_term(input_dir, results_dir, infile_path)
241+
242+
# Run pVACtools
243+
244+
245+
246+
247+
84248

85-
df.to_csv('peptide_table.tsv', sep="\t", index=False, header=None)
86249

87250
if __name__ == "__main__":
88251
main()

0 commit comments

Comments
 (0)