1
1
import argparse
2
2
from itertools import product , chain
3
3
import pandas as pd
4
+ import sys
5
+ import os
6
+ import re
7
+ import subprocess
8
+
4
9
5
10
6
11
# ---- PARSE ARGUMENTS -------------------------------------------------------
@@ -14,6 +19,13 @@ def parse_arguments():
14
19
help = 'The maximum number of mofifying peptides to add to the begining or end' )
15
20
parser .add_argument ('-m' ,
16
21
help = 'A csv file containing the name/indentifer for the sequence which does NOT have to be unique' )
22
+ parser .add_argument ('-samp' ,
23
+ help = 'sample name' )
24
+ parser .add_argument ('-HLA' ,
25
+ help = 'a list of the HLA alleles in the format: HLA-A*02:01,HLA-A*24:02,HLA-B*07:02,HLA-B*35:02,HLA-C*04:01,HLA-C*07:02' )
26
+ parser .add_argument ('-WD' ,
27
+ help = 'The directory in which you would like to run pVACbind' )
28
+
17
29
return (parser .parse_args ())
18
30
19
31
def generate_modifed_peptides (n , name , base_sequence ):
@@ -80,9 +92,160 @@ def main():
80
92
81
93
list = list + sequences_list
82
94
83
- df = pd .DataFrame (list )
95
+ peptide_table = pd .DataFrame (list )
96
+
97
+ peptide_table .to_csv ('peptide_table.tsv' , sep = "\t " , index = False , header = None )
98
+
99
+
100
+ # Perform checks
101
+ num_entries = len (peptide_table )
102
+
103
+ # Check if all entries in the sequence_name column are unique
104
+ are_all_unique = not peptide_table ['sequence_name' ].duplicated ().any ()
105
+
106
+ if are_all_unique :
107
+ print ("All entries in the sequence_name column are unique." )
108
+ else :
109
+ print ("There are duplicate entries in the sequence_name column." )
110
+ sys .exit (1 )
111
+
112
+ with open ('modified_peptides.fa' , 'w' ) as fasta_file :
113
+ for index , row in peptide_table .iterrows ():
114
+ sequence_name = ">" + row ['sequence_name' ]
115
+ sequence = row ['parsed_sequence' ]
116
+ fasta_file .write (f"{ sequence_name } \n { sequence } \n " )
117
+
118
+
119
+ # Create dirs for processing the N-term and C-term sequences separately
120
+ os .makedirs ("n-term" , exist_ok = True )
121
+ os .makedirs ("c-term" , exist_ok = True )
122
+
123
+ with open ('modified_peptides.fa' , 'r' ) as input_file , \
124
+ open ('n-term/modified_peptides_n-term.fa' , 'w' ) as n_term_file , \
125
+ open ('c-term/modified_peptides_c-term.fa' , 'w' ) as c_term_file :
126
+
127
+ for line in input_file :
128
+ line = line .strip ()
129
+
130
+ # Check for the pattern 'n-term' in the line
131
+ if 'n-term' in line :
132
+ n_term_file .write (line + '\n ' )
133
+
134
+ # Ensure there are at least two more lines in the file
135
+ try :
136
+ next_line = next (input_file ).strip ()
137
+ n_term_file .write (next_line + '\n ' )
138
+ except StopIteration :
139
+ # Handle the case where there are not enough lines left
140
+ pass
141
+
142
+ # Check for the pattern 'c-term' in the line
143
+ if 'c-term' in line :
144
+ c_term_file .write (line + '\n ' )
145
+
146
+ # Ensure there are at least two more lines in the file
147
+ try :
148
+ next_line = next (input_file ).strip ()
149
+ c_term_file .write (next_line + '\n ' )
150
+ except StopIteration :
151
+ # Handle the case where there are not enough lines left
152
+ pass
153
+
154
+ def get_line_count (file_path ):
155
+ with open (file_path , 'r' ) as file :
156
+ return sum (1 for line in file )
157
+
158
+ modified_peptides_count = get_line_count ('modified_peptides.fa' )
159
+ n_term_count = get_line_count ('n-term/modified_peptides_n-term.fa' )
160
+ c_term_count = get_line_count ('c-term/modified_peptides_c-term.fa' )
161
+
162
+ if n_term_count + c_term_count == modified_peptides_count :
163
+ print ("Sucessfully split fasta into n-term and c-term" )
164
+ else :
165
+ print ("N-term and c-term fasta lines to not equal modified fasta lines" )
166
+
167
+
168
+ def create_subpeptide_fastas_n_term (input_dir , results_dir , infile_path ):
169
+ # Define the lengths to iterate over
170
+ lengths = [8 , 9 , 10 , 11 ]
171
+
172
+ # Iterate over each test length
173
+ for LENGTH in lengths :
174
+ # Create input files for each test length
175
+ print (f"Creating input fasta to test peptides of length: { LENGTH } " )
176
+
177
+ # Set the file paths
178
+ LENGTH_FASTA = os .path .join (input_dir , f"{ LENGTH } -mer-test.fa" )
179
+ LENGTH_RESULT_DIR = os .path .join (results_dir , f"{ LENGTH } -mer-test" )
180
+
181
+ # Create the result directory if it doesn't exist
182
+ os .makedirs (LENGTH_RESULT_DIR , exist_ok = True )
183
+
184
+ with open (infile_path , 'r' ) as infile , open (LENGTH_FASTA , 'w' ) as length_fasta :
185
+ for line in infile :
186
+ line = line .strip ()
187
+ if line .startswith ('>' ):
188
+ length_fasta .write (f"{ line } \n " )
189
+ elif match := re .match (r'(\w+)\|(\w+)' , line ):
190
+ before , after = match .groups ()
191
+ sub = after [:LENGTH - 1 ]
192
+ length_fasta .write (f"{ before } { sub } \n " )
193
+ infile .close ()
194
+
195
+ def create_subpeptide_fastas_c_term (input_dir , results_dir , infile_path ):
196
+ # Define the lengths to iterate over
197
+ lengths = [8 , 9 , 10 , 11 ]
198
+
199
+ for LENGTH in lengths :
200
+ # Create input files for each test length so that each test sequence will contain at least one modified base
201
+ print (f"Creating input fasta to test peptides of length: { LENGTH } " )
202
+
203
+ LENGTH_FASTA = os .path .join (input_dir , f"{ LENGTH } -mer-test.fa" )
204
+ LENGTH_RESULT_DIR = os .path .join (results_dir , f"{ LENGTH } -mer-test" )
205
+
206
+ os .makedirs (LENGTH_RESULT_DIR , exist_ok = True )
207
+
208
+ with open (infile_path , 'r' ) as infile , open (LENGTH_FASTA , 'w' ) as length_fasta :
209
+ for line in infile :
210
+ line = line .strip ()
211
+ if line .startswith ('>' ):
212
+ length_fasta .write (f"{ line } \n " )
213
+ elif match := re .match (r'(\w+)\|(\w+)' , line ):
214
+ before , after = match .groups ()
215
+ sub = before [- (LENGTH - 1 ):]
216
+ length_fasta .write (f"{ sub } { after } \n " )
217
+ infile .close ()
218
+
219
+ # Set up sub-peptide fasta sequences for each target class I prediction length
220
+ working_dir = args .WD
221
+
222
+ input_dir = os .path .join (working_dir + "/n-term/pvacbind_inputs" )
223
+ results_dir = os .path .join (working_dir + "/n-term/pvacbind_results" )
224
+ infile_path = os .path .join (working_dir + "/n-term/modified_peptides_n-term.fa" )
225
+
226
+ os .makedirs (input_dir , exist_ok = True )
227
+ os .makedirs (results_dir , exist_ok = True )
228
+
229
+ print ("Creating sub-pepetide fastas for n-term" )
230
+ create_subpeptide_fastas_n_term (input_dir , results_dir , infile_path )
231
+
232
+ input_dir = os .path .join (working_dir + "/c-term/pvacbind_inputs" )
233
+ results_dir = os .path .join (working_dir + "/c-term/pvacbind_results" )
234
+ infile_path = os .path .join (working_dir + "/c-term/modified_peptides_c-term.fa" )
235
+
236
+ os .makedirs (input_dir , exist_ok = True )
237
+ os .makedirs (results_dir , exist_ok = True )
238
+
239
+ print ("Creating sub-pepetide fastas for c-term" )
240
+ create_subpeptide_fastas_c_term (input_dir , results_dir , infile_path )
241
+
242
+ # Run pVACtools
243
+
244
+
245
+
246
+
247
+
84
248
85
- df .to_csv ('peptide_table.tsv' , sep = "\t " , index = False , header = None )
86
249
87
250
if __name__ == "__main__" :
88
251
main ()
0 commit comments