1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import random
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
# One-letter residue codes (20 in total). Used as the alphabet for random
# peptides and as the whitelist when cleaning ProtGPT2 output.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
9
+
10
def generate_random_peptides(length: int, count: int) -> List[str]:
    """Build ``count`` random peptides, each ``length`` residues long.

    Residues are drawn with replacement from ``AMINO_ACIDS`` through the
    module-level ``random`` generator, so seeding ``random`` beforehand
    makes the result reproducible.

    Args:
        length: Number of residues in every peptide.
        count: Number of peptides to produce.

    Returns:
        The generated peptides, in generation order.
    """
    peptides: List[str] = []
    for _ in range(count):
        residues = random.choices(AMINO_ACIDS, k=length)
        peptides.append("".join(residues))
    return peptides
12
+
13
def parse_fasta_sequences(fasta_path: Path) -> List[str]:
    """Read every sequence from a FASTA file, discarding the header lines.

    All sequence lines between two headers are concatenated into a single
    string. Blank lines are ignored, surrounding whitespace is stripped from
    each line, and a header that is not followed by any sequence line
    contributes nothing to the result.

    Args:
        fasta_path: Location of the FASTA file to read.

    Returns:
        The sequences in file order, one string per record.
    """
    records: List[str] = []
    chunks: List[str] = []

    def flush() -> None:
        # Emit the record collected so far, but only if it has any residues.
        if chunks:
            records.append("".join(chunks))
            chunks.clear()

    with open(fasta_path, "r") as handle:
        for raw in handle:
            text = raw.strip()
            if text.startswith(">"):
                # A header closes the previous record.
                flush()
            elif text:
                chunks.append(text)

    # The last record has no header after it to trigger the flush.
    flush()
    return records
30
+
31
def sample_peptides_from_fasta(fasta_path: Path, length: int, count: int) -> List[str]:
    """Sample ``count`` peptides of ``length`` residues from a FASTA file.

    Every window of ``length`` consecutive residues in every sequence is a
    candidate. Candidates are drawn without replacement first; if the file
    offers fewer than ``count`` windows, the remainder is filled with further
    random draws, so duplicates can appear in that case.

    Args:
        fasta_path: FASTA file to take sequences from.
        length: Peptide length in residues; must be at least 1.
        count: Number of peptides to return; must not be negative.

    Returns:
        A list of exactly ``count`` peptides.

    Raises:
        ValueError: If ``length`` or ``count`` is out of range, or if no
            sequence in the file is at least ``length`` residues long.
    """
    # Reject bad sizes before touching the file: a non-positive length would
    # otherwise slice out empty or ragged strings instead of failing.
    if length < 1:
        raise ValueError(f"length must be a positive integer, got {length}")
    if count < 0:
        raise ValueError(f"count must be non-negative, got {count}")

    # Sequences shorter than ``length`` yield an empty range and so
    # contribute no windows.
    windows = [
        seq[start:start + length]
        for seq in parse_fasta_sequences(fasta_path)
        for start in range(len(seq) - length + 1)
    ]
    if not windows:
        raise ValueError(f"No subsequences of length {length} found in {fasta_path} ")

    peptides = random.sample(windows, k=min(count, len(windows)))
    # Fewer distinct windows than requested: top up with replacement.
    shortfall = count - len(peptides)
    peptides.extend(random.choice(windows) for _ in range(shortfall))
    return peptides
44
+
45
def generate_protgpt2_peptides(length: int, count: int) -> List[str]:
    """Generate peptides of an exact length with the ProtGPT2 model.

    Text is sampled in batches of at most 10 sequences. Each sample is
    reduced to the characters found in ``AMINO_ACIDS`` and kept only if the
    result is exactly ``length`` residues long. Sampling stops once ``count``
    peptides are collected or after ``count * 10`` batches, whichever comes
    first.

    If ``transformers`` cannot be imported, an error is printed to stderr and
    the process exits with status 1.

    Args:
        length: Required peptide length in residues.
        count: Number of peptides wanted.

    Returns:
        Up to ``count`` peptides. Fewer may be returned, in which case a
        warning is printed to stderr.
    """
    try:
        from transformers import pipeline
    except ImportError:
        print("Error: transformers package is required for ProtGPT2 generation. Please install with 'pip install transformers torch'", file=sys.stderr)
        sys.exit(1)

    # Token budget per sample: ceil(length / 4), but never below 5.
    # NOTE(review): the divisor reflects the original author's estimate of
    # roughly four residues per token -- confirm against the model card.
    token_budget = max(5, (length + 3) // 4)
    generator = pipeline('text-generation', model="nferruz/ProtGPT2", framework="pt")

    accepted: List[str] = []
    # Each pass is one batch; the range bounds the number of batches.
    for _ in range(count * 10):
        if len(accepted) >= count:
            break
        remaining = count - len(accepted)
        batch = generator(
            "<|endoftext|>",
            max_length=token_budget,
            do_sample=True,
            top_k=950,
            repetition_penalty=1.2,
            num_return_sequences=min(remaining, 10),
            eos_token_id=0,
        )
        # An empty or non-iterable result still counts as a used batch.
        if batch and hasattr(batch, '__iter__'):
            for item in batch:
                text = item.get('generated_text', '') if isinstance(item, dict) else None
                if not isinstance(text, str):
                    continue
                # Drop whitespace, newlines and anything else that is not a
                # residue code.
                candidate = ''.join(ch for ch in text if ch in AMINO_ACIDS)
                if len(candidate) == length:
                    accepted.append(candidate)

    if len(accepted) < count:
        print(f"Warning: Only generated {len(accepted)} peptides of requested {count} with exact length {length} .", file=sys.stderr)
    return accepted[:count]
75
+
76
def write_fasta(peptides: List[str], output_path: Path, prefix: str = "peptide") -> None:
    """Write peptides to ``output_path`` as FASTA records.

    Record ``i`` (counting from 1) gets the header ``>{prefix}_{i}`` followed
    by its sequence on the next line. An existing file is overwritten.

    Args:
        peptides: Sequences to write, in output order.
        output_path: Destination file.
        prefix: Identifier prefix used in every header.
    """
    with open(output_path, "w") as f:
        # No padding around the fields: a space inside the identifier or in
        # front of the residues would become part of the record itself.
        f.writelines(
            f">{prefix}_{i}\n{pep}\n" for i, pep in enumerate(peptides, 1)
        )
80
+
81
def main(argv=None):
    """Command-line entry point: build control peptides and write them as FASTA.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) reads the real command line.

    Exits with status 2 on invalid arguments (reported through argparse) and
    with status 1 when ``--fasta_file`` is missing for the ``fasta`` source or
    the FASTA input cannot be read or sampled.
    """
    parser = argparse.ArgumentParser(description="Generate control peptides for neoantigen analysis.")
    parser.add_argument('--length', type=int, required=True, help='Peptide length (e.g., 8, 9, 10)')
    parser.add_argument('--count', type=int, required=True, help='Number of peptides to generate')
    parser.add_argument('--source', choices=['random', 'fasta', 'protgpt2'], required=True, help='Source of peptides: random, fasta, or protgpt2')
    parser.add_argument('--fasta_file', type=Path, help='Path to reference FASTA file (required if source is fasta)')
    parser.add_argument('--output', type=Path, default=Path('control_peptides.fasta'), help='Output FASTA file')
    parser.add_argument('--seed', type=int, help='Random seed for reproducibility (not used for protgpt2)')
    args = parser.parse_args(argv)

    # Catch nonsensical sizes here, before any generator runs: a zero or
    # negative length would otherwise yield empty or malformed peptides.
    if args.length < 1:
        parser.error('--length must be a positive integer')
    if args.count < 0:
        parser.error('--count must not be negative')

    if args.seed is not None and args.source != 'protgpt2':
        random.seed(args.seed)

    if args.source == 'random':
        peptides = generate_random_peptides(args.length, args.count)
    elif args.source == 'fasta':
        if not args.fasta_file:
            print('Error: --fasta_file is required when source is fasta', file=sys.stderr)
            sys.exit(1)
        try:
            peptides = sample_peptides_from_fasta(args.fasta_file, args.length, args.count)
        except (OSError, ValueError) as exc:
            # Unreadable file or no usable sequences: report it like the
            # other user errors instead of dumping a traceback.
            print(f"Error: {exc}", file=sys.stderr)
            sys.exit(1)
    elif args.source == 'protgpt2':
        peptides = generate_protgpt2_peptides(args.length, args.count)
    else:
        # argparse ``choices`` already rejects other values; this only guards
        # against the choices list and this dispatch drifting apart.
        print(f"Unknown source: {args.source} ", file=sys.stderr)
        sys.exit(1)

    write_fasta(peptides, args.output)
    print(f"Wrote {len(peptides)} peptides to {args.output} ")
109
+
110
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments