1
+ import argparse
2
+ from itertools import product , chain
3
+ import pandas as pd
4
+
5
+
6
+ # ---- PARSE ARGUMENTS -------------------------------------------------------
7
+ # Parses command line arguments
8
+ # Enables user help
9
def parse_arguments():
    """Parse the command line arguments of the peptide-table generator.

    Returns:
        argparse.Namespace: has the attributes ``n`` (maximum number of
        residues to add to either end of a sequence, as an ``int``; 3 when
        ``-n`` is omitted) and ``m`` (path of the input CSV file).
    """
    parser = argparse.ArgumentParser(
        description='Create peptide_table.tsv needed for modifying peptide sequences for pVACbind'
    )

    parser.add_argument(
        '-n',
        # Without an explicit type argparse hands back a string, which
        # cannot be used as a length in arithmetic such as range(1, n + 1).
        type=int,
        default=3,
        help='The maximum number of modifying peptides to add to the beginning or end (default: %(default)s)',
    )
    parser.add_argument(
        '-m',
        # The script has nothing to do without an input file, so let argparse
        # report the omission instead of failing later while reading the CSV.
        required=True,
        help='A csv file containing the name/identifier for the sequence which does NOT have to be unique',
    )
    return parser.parse_args()
18
+
19
def generate_modifed_peptides(n, name, base_sequence):
    """Build every 'K'/'R'-extended variant of a single peptide.

    For each string of 'K' and 'R' characters of length 1 to ``n``, two
    entries are produced: one with the string prepended to ``base_sequence``
    (n-term) and one with it appended (c-term).

    Args:
        n: Maximum length of the added string; values below 1 give no entries.
        name: Identifier of the base sequence, used as the entry name prefix.
        base_sequence: The unmodified peptide sequence.

    Returns:
        A list of dicts with the keys ``sequence_name``, ``sequence`` and
        ``parsed_sequence`` (the sequence with a '|' between the base
        sequence and the added string). Entries are ordered by increasing
        modification length, then in ``itertools.product`` order, with the
        n-term entry directly before the c-term entry.
    """
    characters = ['K', 'R']

    # product() yields every combination exactly once, so no set is needed
    # for de-duplication. Keeping a list makes the output order the same on
    # every run instead of depending on string hash randomisation.
    all_combinations = chain.from_iterable(
        product(characters, repeat=length) for length in range(1, n + 1)
    )
    possible_modifications = [''.join(combination) for combination in all_combinations]

    peptide_table = []
    for modification in possible_modifications:
        peptide_table.append({
            'sequence_name': name + "." + "n-term" + "-" + modification,
            'sequence': modification + base_sequence,
            'parsed_sequence': modification + '|' + base_sequence,
        })
        peptide_table.append({
            'sequence_name': name + "." + "c-term" + "-" + modification,
            'sequence': base_sequence + modification,
            'parsed_sequence': base_sequence + '|' + modification,
        })

    return peptide_table
49
+
50
+
51
def assign_unique_numbers(df, column_name):
    """Number repeated values of a column so that they become distinct.

    Every value that occurs more than once in ``df[column_name]`` is replaced
    by ``"<value>.<k>"``, where ``k`` counts its occurrences from 1 in row
    order. Values that occur once, and missing values, are left unchanged.

    Args:
        df: The pandas DataFrame to update; its column is modified in place.
        column_name: Name of the column whose duplicates are numbered.

    Returns:
        The same ``df`` object that was passed in.
    """
    column = df[column_name]
    counts = column.value_counts()
    duplicated_values = set(counts[counts > 1].index)

    if not duplicated_values:
        return df

    # Walk the column once by position rather than looking rows up by index
    # label, so frames with repeated or non-default index labels work too.
    occurrences = {}
    renamed = []
    for value in column:
        if value in duplicated_values:
            occurrences[value] = occurrences.get(value, 0) + 1
            # No padding around the dot: the result is used as an identifier.
            renamed.append(f"{value}.{occurrences[value]}")
        else:
            renamed.append(value)

    df[column_name] = renamed
    return df
61
+
62
def main():
    """Read the input CSV and write all modified peptides to peptide_table.tsv.

    The CSV named by ``-m`` is read as two columns (name, sequence), repeated
    names are numbered so they become distinct, and for every row all
    modified sequences up to the requested length are generated. The result
    is written without a header row as a tab-separated file named
    ``peptide_table.tsv`` in the current working directory.
    """
    args = parse_arguments()

    # Use the length requested with -n instead of always overriding it.
    # int() covers the value arriving as a string from the command line;
    # 3 remains the fallback when -n is not given.
    max_length = int(args.n) if args.n is not None else 3

    peptides = pd.read_csv(args.m, names=["Name", "Sequence"], header=None)
    # Drop the first row (presumably the file's own header line) and copy, so
    # the renaming below works on an independent frame rather than a slice.
    peptides = peptides.iloc[1:].copy()
    peptides = assign_unique_numbers(peptides, "Name")

    peptide_rows = []
    for name, base_sequence in zip(peptides["Name"], peptides["Sequence"]):
        peptide_rows.extend(generate_modifed_peptides(max_length, name, base_sequence))

    df = pd.DataFrame(peptide_rows)

    # The separator must be exactly one character; a tab makes this a TSV.
    df.to_csv('peptide_table.tsv', sep="\t", index=False, header=False)


# Run the command line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments