#!/usr/bin/env python3
"""
Generate 1M peptide datasets using ESM2 for algorithm benchmarking.
Creates 4 datasets using the ESM2 masked language model: 8mer, 9mer, 10mer, 11mer.
Much faster than ProtGPT2 and doesn't require fake proteome generation.
"""

import torch
import time
import random
import numpy as np
from pathlib import Path
from typing import List, Optional, Tuple
from tqdm import tqdm
from transformers import EsmTokenizer, EsmForMaskedLM

def setup_esm2_model(model_name: str = "facebook/esm2_t6_8M_UR50D") -> Tuple[EsmTokenizer, EsmForMaskedLM]:
    """
    Set up ESM2 model and tokenizer.
    Using smallest model (8M) for speed - can upgrade to larger models if needed.
    """
    print(f"🔄 Loading ESM2 model: {model_name}")

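    # Note: the default above is the smallest public ESM2 checkpoint. Larger checkpoints
    # on the HuggingFace Hub (e.g. "facebook/esm2_t12_35M_UR50D" or
    # "facebook/esm2_t33_650M_UR50D") can be passed as model_name for stronger
    # predictions at the cost of speed and memory.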
    tokenizer = EsmTokenizer.from_pretrained(model_name)
    model = EsmForMaskedLM.from_pretrained(model_name)

    # Move to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    print(f"✅ ESM2 model loaded on {device}")
    return tokenizer, model

def generate_peptide_with_esm2(tokenizer, model, length: int, seed_sequence: Optional[str] = None) -> str:
    """
    Generate a single peptide of specified length using ESM2.
    Uses iterative masking and prediction.
    """
    device = next(model.parameters()).device

    # Start with a random seed or provided sequence
    if seed_sequence is None:
        # Create random starting sequence
        amino_acids = "ACDEFGHIKLMNPQRSTVWY"
        sequence = ''.join(random.choices(amino_acids, k=length))
    else:
        sequence = seed_sequence[:length].ljust(length, 'A')  # Pad or truncate to desired length

    # Iteratively improve the sequence using ESM2
    num_iterations = 3  # Number of refinement passes

    for iteration in range(num_iterations):
        # Randomly pick positions to mask (always two for the 8-11mer lengths used here)
        masked_sequence = list(sequence)
        mask_positions = random.sample(range(length), min(2, length // 3))

        for pos in mask_positions:
            masked_sequence[pos] = tokenizer.mask_token

        masked_text = ''.join(masked_sequence)

        # Tokenize and predict
        inputs = tokenizer(masked_text, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            predictions = outputs.logits

        # Read off the prediction at each masked position. The ESM tokenizer emits one
        # token per residue and prepends <cls>, so sequence index pos maps to token
        # index pos + 1 in the logits.
        new_sequence = list(sequence)
        for pos in mask_positions:
            predicted_token_id = torch.argmax(predictions[0, pos + 1]).item()
            predicted_token = tokenizer.decode([predicted_token_id])

            # Only accept the 20 standard amino acids (skip special/non-canonical tokens)
            if predicted_token in "ACDEFGHIKLMNPQRSTVWY":
                new_sequence[pos] = predicted_token

        sequence = ''.join(new_sequence)

    return sequence

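# Illustrative usage of the single-peptide generator above (assumes the model has been
# loaded with setup_esm2_model; output differs per run because masking is random):
#
#   tokenizer, model = setup_esm2_model()
#   peptide = generate_peptide_with_esm2(tokenizer, model, length=9)
#   print(peptide)  # a 9-residue string over the 20 canonical amino acids
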
def generate_esm2_peptides(tokenizer, model, length: int, count: int, batch_size: int = 100) -> List[str]:
    """Generate multiple peptides of specified length using ESM2."""
    peptides = set()  # Use set to avoid duplicates

    print(f"🔄 Generating {count:,} unique {length}-mer peptides...")

    with tqdm(total=count, desc=f"ESM2 {length}mers") as pbar:
        while len(peptides) < count:
            batch_peptides = []

            # Generate batch
            for _ in range(min(batch_size, count - len(peptides))):
                peptide = generate_peptide_with_esm2(tokenizer, model, length)
                batch_peptides.append(peptide)

            # Add unique peptides
            initial_size = len(peptides)
            peptides.update(batch_peptides)
            new_peptides = len(peptides) - initial_size

            pbar.update(new_peptides)

    return list(peptides)[:count]

def write_fasta(sequences: List[str], output_file: Path, prefix: str = "peptide"):
    """Write peptide sequences to FASTA format."""
    with open(output_file, 'w') as f:
        for i, seq in enumerate(sequences, 1):
            f.write(f">{prefix}_{i:07d}\n{seq}\n")

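# For reference, each record written by write_fasta (with the prefix passed in
# generate_esm2_datasets below) looks like the following; the 9-mer shown is just a
# placeholder, not real output:
#
#   >esm2_9mer_0000001
#   ACDEFGHIK
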
def generate_esm2_datasets():
    """Generate all 4 ESM2-based peptide datasets."""

    # Configuration
    base_dir = Path("/Users/chris/Desktop/Griffith Lab/Peptide Sequence Synthesis")
    output_dir = base_dir / "data" / "ESM2_1M_Peptides"

    lengths = [8, 9, 10, 11]
    count = 1_000_000  # 1 million peptides per dataset

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    print("🧬 Generating ESM2-based 1M peptide datasets")
    print(f"📁 Output directory: {output_dir}")
    print(f"🔢 Lengths: {lengths}")
    print(f"📈 Count per dataset: {count:,}")
    print("=" * 60)

    # Set up ESM2 model
    tokenizer, model = setup_esm2_model()

    total_start_time = time.time()

    # Generate datasets for each length
    for length in lengths:
        print(f"\n🧪 Generating ESM2 {length}-mer dataset...")
        start_time = time.time()

        peptides = generate_esm2_peptides(tokenizer, model, length, count)

        # Save to file
        output_file = output_dir / f"esm2_{length}mer_1M.fasta"
        write_fasta(peptides, output_file, prefix=f"esm2_{length}mer")

        elapsed = time.time() - start_time
        print(f"✅ Saved {len(peptides):,} unique {length}-mer peptides to {output_file.name}")
        print(f"⏱️ Time: {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")

        # Optional: Save memory by clearing cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    total_elapsed = time.time() - total_start_time

    print("\n" + "=" * 60)
    print("🎉 ALL ESM2 DATASETS GENERATED SUCCESSFULLY!")
    print(f"⏱️ Total time: {total_elapsed:.1f} seconds ({total_elapsed/60:.1f} minutes)")
    print(f"📁 Output directory: {output_dir}")
    print("\n📋 Generated files:")

    # List all generated files
    for length in lengths:
        esm2_file = output_dir / f"esm2_{length}mer_1M.fasta"
        print(f"  • {esm2_file.name}")

if __name__ == "__main__":
    # Set random seeds for reproducibility
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    generate_esm2_datasets()
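
# To run end-to-end (the filename below is an assumption; use whatever this script is
# saved as), which writes four FASTA files (esm2_8mer_1M.fasta through
# esm2_11mer_1M.fasta) into data/ESM2_1M_Peptides under the configured base_dir:
#
#   python generate_esm2_datasets.py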