@@ -40,15 +40,23 @@ def parse_fasta_sequences(fasta_path: Path) -> List[str]:
40
40
return sequences
41
41
42
42
def sample_peptides_from_fasta (fasta_path : Path , length : int , count : int ) -> List [str ]:
43
+ print (f"Parsing FASTA file: { fasta_path } " )
43
44
sequences = parse_fasta_sequences (fasta_path )
45
+
46
+ print (f"Extracting { length } -mer peptides from { len (sequences )} proteins..." )
44
47
all_subseqs = set () # Use set to automatically collapse duplicates
45
- for seq in sequences :
48
+
49
+ # Use progress bar for subsequence extraction
50
+ for seq in tqdm (sequences , desc = "Processing proteins" , unit = "protein" ):
46
51
if len (seq ) >= length :
47
52
for i in range (len (seq ) - length + 1 ):
48
53
all_subseqs .add (seq [i :i + length ])
54
+
49
55
if not all_subseqs :
50
56
raise ValueError (f"No subsequences of length { length } found in { fasta_path } " )
51
57
58
+ print (f"Found { len (all_subseqs )} unique { length } -mer peptides" )
59
+
52
60
# Convert set back to list for sampling
53
61
unique_subseqs = list (all_subseqs )
54
62
@@ -58,6 +66,7 @@ def sample_peptides_from_fasta(fasta_path: Path, length: int, count: int) -> Lis
58
66
return unique_subseqs
59
67
60
68
# Sample without replacement
69
+ print (f"Sampling { count } peptides without replacement..." )
61
70
return random .sample (unique_subseqs , k = count )
62
71
63
72
def generate_llm_peptides (length : int , count : int , model_name : str = "protgpt2" , top_k : int = 950 , top_p : float = 0.9 , repetition_penalty : float = 1.2 ) -> List [str ]:
@@ -332,9 +341,11 @@ def generate_fake_proteome(num_proteins: int, target_lengths: List[int], model_n
332
341
sys .exit (1 )
333
342
334
343
def write_fasta(peptides: List[str], output_path: Path, prefix: str = "peptide") -> None:
    """Write sequences to a FASTA file, one numbered record per sequence.

    Each record is a header line ``>{prefix}_{n}`` (``n`` counts from 1)
    followed by the sequence on its own line. Progress is reported on the
    console with a tqdm progress bar plus a start and a completion message.

    Args:
        peptides: Sequences to write, in output order.
        output_path: Destination file; an existing file is overwritten.
        prefix: Text placed before the record number in each header.

    Raises:
        OSError: If ``output_path`` cannot be opened for writing.
    """
    print(f"Writing {len(peptides)} peptides to {output_path}...")
    # Pin the encoding so the output does not depend on the platform's
    # locale default.
    with open(output_path, 'w', encoding="utf-8") as f:
        # start=1 gives 1-based record numbers without per-record arithmetic.
        records = enumerate(
            tqdm(peptides, desc="Writing peptides", unit="peptide"),
            start=1,
        )
        for i, pep in records:
            f.write(f">{prefix}_{i}\n{pep}\n")
    print(f"✅ Successfully wrote {len(peptides)} peptides to {output_path}")
338
349
339
350
def main ():
340
351
parser = argparse .ArgumentParser (description = "Generate control peptides for neoantigen analysis." )
@@ -381,39 +392,50 @@ def main():
381
392
sys .exit (1 )
382
393
peptides = sample_peptides_from_fasta (args .fasta_file , args .length , args .count )
383
394
elif args .source == 'llm' :
384
- # Interactive workflow for LLM-based peptide generation
385
- print (f"\n Generating { args .count } peptides of length { args .length } using LLM approach..." )
386
- print ("This approach generates a fake proteome first, then samples peptides from it." )
387
-
388
- # Check if user has existing proteome
389
- has_existing = get_user_input ("\n Do you have an existing ProtGPT2-generated proteome file? (y/n): " ).lower ().startswith ('y' )
390
-
391
- if has_existing :
392
- proteome_path = get_existing_proteome_path ()
393
- print (f"Using existing proteome: { proteome_path } " )
395
+ # Choose workflow based on the LLM model
396
+ if args .llm_model .lower () == 'protgpt2' :
397
+ # Interactive proteome workflow for ProtGPT2 (to avoid M bias)
398
+ print (f"\n Generating { args .count } peptides of length { args .length } using ProtGPT2 proteome approach..." )
399
+ print ("This approach generates a fake proteome first, then samples peptides from it." )
400
+ elif args .llm_model .lower () == 'esm2' :
401
+ # Direct generation for ESM2 (no M bias issue)
402
+ print (f"\n Generating { args .count } peptides of length { args .length } using ESM2 direct generation..." )
403
+ peptides = generate_llm_peptides (args .length , args .count , args .llm_model , args .top_k , args .top_p , args .repetition_penalty )
394
404
else :
395
- # Ask if user wants to generate new proteome
396
- generate_new = get_user_input ("Would you like to generate a new fake proteome? (y/n): " ).lower ().startswith ('y' )
397
-
398
- if not generate_new :
399
- print ("Cannot proceed without a proteome. Exiting." )
400
- sys .exit (1 )
401
-
402
- # Configure proteome generation (no reference needed)
403
- num_proteins , target_lengths = configure_proteome_generation ()
405
+ print (f"Error: Unsupported LLM model '{ args .llm_model } '" , file = sys .stderr )
406
+ sys .exit (1 )
407
+
408
+ # Only run interactive proteome workflow for ProtGPT2
409
+ if args .llm_model .lower () == 'protgpt2' :
410
+ # Check if user has existing proteome
411
+ has_existing = get_user_input ("\n Do you have an existing ProtGPT2-generated proteome file? (y/n): " ).lower ().startswith ('y' )
404
412
405
- # Generate the fake proteome
406
- fake_proteins = generate_fake_proteome (num_proteins , target_lengths , args .llm_model )
413
+ if has_existing :
414
+ proteome_path = get_existing_proteome_path ()
415
+ print (f"Using existing proteome: { proteome_path } " )
416
+ else :
417
+ # Ask if user wants to generate new proteome
418
+ generate_new = get_user_input ("Would you like to generate a new fake proteome? (y/n): " ).lower ().startswith ('y' )
419
+
420
+ if not generate_new :
421
+ print ("Cannot proceed without a proteome. Exiting." )
422
+ sys .exit (1 )
423
+
424
+ # Configure proteome generation (no reference needed)
425
+ num_proteins , target_lengths = configure_proteome_generation ()
426
+
427
+ # Generate the fake proteome
428
+ fake_proteins = generate_fake_proteome (num_proteins , target_lengths , args .llm_model )
429
+
430
+ # Save the generated proteome
431
+ proteome_output = Path (f'fake_proteome_{ num_proteins } proteins.fasta' )
432
+ write_fasta (fake_proteins , proteome_output , prefix = "protein" )
433
+ print (f"\n Generated fake proteome saved to: { proteome_output } " )
434
+ proteome_path = proteome_output
407
435
408
- # Save the generated proteome
409
- proteome_output = Path (f'fake_proteome_{ num_proteins } proteins.fasta' )
410
- write_fasta (fake_proteins , proteome_output , prefix = "protein" )
411
- print (f"\n Generated fake proteome saved to: { proteome_output } " )
412
- proteome_path = proteome_output
413
-
414
- # Now sample peptides from the proteome using FASTA method
415
- print (f"\n Sampling { args .count } peptides from the proteome..." )
416
- peptides = sample_peptides_from_fasta (proteome_path , args .length , args .count )
436
+ # Now sample peptides from the proteome using FASTA method
437
+ print (f"\n Sampling { args .count } peptides from the proteome..." )
438
+ peptides = sample_peptides_from_fasta (proteome_path , args .length , args .count )
417
439
else :
418
440
print (f"Unknown source: { args .source } " , file = sys .stderr )
419
441
sys .exit (1 )
0 commit comments