Skip to content

Commit 1dbca04

Browse files
committed
Implement interactive ProtGPT2 proteome workflow with improvements
- Add interactive workflow for LLM peptide generation:
- Ask user if they have existing ProtGPT2 proteome or want to generate new one
- Remove reference proteome dependency for ProtGPT2 workflow
- Generate synthetic proteins using ProtGPT2 only
- Remove recommended values from user prompts for cleaner interface
- Add progress bar (tqdm) for proteome generation status
- Fix sampling to be strictly without replacement
- Fix path handling to strip quotes from user input
- Add tqdm>=4.64.0 to requirements.txt
- Ensure peptides match initial request specifications after proteome creation
1 parent f880c7b commit 1dbca04

File tree

2 files changed

+173
-6
lines changed

2 files changed

+173
-6
lines changed

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ numpy>=1.23.0
77
matplotlib>=3.5.0
88
seaborn>=0.12.0
99
psutil>=5.9.0
10-
sentencepiece>=0.1.99
10+
sentencepiece>=0.1.99
11+
tqdm>=4.64.0

scripts/generation/generate_control_peptides.py

Lines changed: 171 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import sys
66
from pathlib import Path
77
from typing import List
8+
import numpy as np
9+
from tqdm import tqdm
810

911
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
1012

@@ -49,10 +51,14 @@ def sample_peptides_from_fasta(fasta_path: Path, length: int, count: int) -> Lis
4951

5052
# Convert set back to list for sampling
5153
unique_subseqs = list(all_subseqs)
52-
peptides = random.sample(unique_subseqs, k=min(count, len(unique_subseqs)))
53-
while len(peptides) < count:
54-
peptides.append(random.choice(unique_subseqs))
55-
return peptides[:count]
54+
55+
# Ensure sampling without replacement - if not enough unique peptides, inform user
56+
if count > len(unique_subseqs):
57+
print(f"Warning: Requested {count} peptides, but only {len(unique_subseqs)} unique peptides available. Returning all {len(unique_subseqs)} unique peptides.", file=sys.stderr)
58+
return unique_subseqs
59+
60+
# Sample without replacement
61+
return random.sample(unique_subseqs, k=count)
5662

5763
def generate_llm_peptides(length: int, count: int, model_name: str = "protgpt2", top_k: int = 950, top_p: float = 0.9, repetition_penalty: float = 1.2) -> List[str]:
5864
try:
@@ -197,6 +203,134 @@ def generate_llm_peptides(length: int, count: int, model_name: str = "protgpt2",
197203
print(f"Warning: Only generated {len(peptides)} peptides of requested {count} with exact length {length}.", file=sys.stderr)
198204
return peptides[:count]
199205

206+
def generate_fake_proteome_lengths(num_proteins: int, reference_fasta_path: Path) -> List[int]:
    """Sample protein lengths from the length distribution of a reference proteome.

    Args:
        num_proteins: Number of lengths to draw.
        reference_fasta_path: FASTA file whose sequence lengths form the
            empirical distribution to sample from.

    Returns:
        ``num_proteins`` lengths drawn with replacement from the lengths of
        the sequences parsed from ``reference_fasta_path``.

    Raises:
        ValueError: If lengths are requested but the reference file yields no
            sequences.
    """
    reference_lengths = [len(seq) for seq in parse_fasta_sequences(reference_fasta_path)]

    if num_proteins and not reference_lengths:
        raise ValueError(
            f"No sequences found in reference proteome '{reference_fasta_path}'; "
            "cannot derive a length distribution."
        )

    # Draw with replacement from the empirical distribution. The stdlib RNG is
    # used (rather than NumPy's) so that every random draw in this module comes
    # from the same generator and a single random.seed() call covers all of them.
    return random.choices(reference_lengths, k=num_proteins)
215+
216+
def get_user_input(prompt: str) -> str:
    """Display ``prompt``, read one line from stdin and return it without surrounding whitespace."""
    reply = input(prompt)
    return reply.strip()
219+
220+
def get_existing_proteome_path() -> Path:
    """Prompt repeatedly until the user supplies the path of an existing file.

    The reply is cleaned before it is checked: surrounding quote characters
    and whitespace are removed and a leading ``~`` is expanded to the user's
    home directory.

    Returns:
        Path to an existing regular file.
    """
    while True:
        raw_reply = get_user_input("Please provide the path to your ProtGPT2-generated proteome file: ")
        # Strip quotes that users might add, then any whitespace that was
        # enclosed by those quotes.
        cleaned = raw_reply.strip('"\'').strip()
        if not cleaned:
            # Path("") resolves to the current directory, which would produce
            # the confusing message "File '.' not found".
            print("Error: No path entered. Please try again.")
            continue

        path = Path(cleaned).expanduser()
        # is_file() is already False for paths that do not exist.
        if path.is_file():
            return path
        print(f"Error: File '{path}' not found. Please try again.")
231+
232+
def configure_proteome_generation():
    """Ask the user how many proteins to generate and in which length range.

    Each question is repeated until the answer is valid: the protein count
    must be a positive integer, and the length bounds must be integers with
    ``0 < minimum <= maximum``.

    Returns:
        A ``(num_proteins, target_lengths)`` tuple, where ``target_lengths``
        holds one uniformly drawn length within the chosen range per protein.
    """
    print("\nConfiguring new ProtGPT2 proteome generation...")
    print("This will generate completely synthetic proteins using ProtGPT2.")

    # Number of proteins: keep asking until a positive integer is entered.
    while True:
        try:
            num_proteins = int(get_user_input("How many proteins should be generated? "))
        except ValueError:
            print("Please enter a valid integer.")
            continue
        if num_proteins > 0:
            break
        print("Please enter a positive number.")

    # Length range: both bounds are asked again whenever either one is rejected.
    print("\nSpecify the protein length range:")
    while True:
        try:
            min_len = int(get_user_input("Minimum protein length: "))
            max_len = int(get_user_input("Maximum protein length: "))
        except ValueError:
            print("Please enter valid integers.")
            continue
        if 0 < min_len <= max_len:
            break
        print("Please ensure minimum > 0 and maximum >= minimum.")

    # One uniform draw (bounds inclusive) per requested protein.
    target_lengths = []
    for _ in range(num_proteins):
        target_lengths.append(random.randint(min_len, max_len))

    print(f"\nWill generate {num_proteins} proteins with lengths between {min_len}-{max_len} amino acids.")
    return num_proteins, target_lengths
266+
267+
def generate_fake_proteome(num_proteins: int, target_lengths: List[int], model_name: str = "protgpt2") -> List[str]:
    """Generate a synthetic proteome, one protein per entry of ``target_lengths``.

    Each protein is sampled from the language model with an empty prompt and
    reduced to the characters found in ``AMINO_ACIDS``. A sample is accepted
    when it is long enough; it is cut to the target length only when it
    exceeds the target by more than 50%. If no acceptable sample is obtained
    after several attempts, a uniformly random sequence of exactly the target
    length is used instead and a warning is printed at the end.

    Args:
        num_proteins: Protein count shown in the progress message. The number
            of proteins actually produced is ``len(target_lengths)``.
        target_lengths: Approximate desired length of each protein.
        model_name: Model to use; only ``"protgpt2"`` (case-insensitive) is
            supported.

    Returns:
        The generated protein sequences, in the order of ``target_lengths``.

    Raises:
        SystemExit: With code 1 if ``model_name`` is unsupported or the
            ``transformers`` package cannot be imported.
    """
    # Validate the model name first so an unsupported model is reported as
    # such, without first requiring the heavy transformers import.
    if model_name.lower() != "protgpt2":
        print(f"Error: Unsupported model '{model_name}' for proteome generation. Only protgpt2 is supported.", file=sys.stderr)
        sys.exit(1)

    try:
        from transformers import pipeline
    except ImportError:
        print("Error: transformers package is required for fake proteome generation. Please install with 'pip install transformers torch'", file=sys.stderr)
        sys.exit(1)

    print(f"Generating {num_proteins} proteins using {model_name}...")

    model_id = "nferruz/ProtGPT2"
    llm_pipeline = pipeline('text-generation', model=model_id, framework="pt")

    max_attempts = 5  # Try up to 5 times to get a reasonable length
    proteins = []
    fallback_count = 0

    # Use progress bar for protein generation
    for i, target_length in enumerate(tqdm(target_lengths, desc="Generating proteins", unit="protein")):
        # Generate protein of approximately target length
        max_tokens = max(10, target_length // 4)  # ProtGPT2 tokens are ~4 amino acids
        # Minimum acceptable length. A fixed floor of 50 can never be met when
        # the target itself is well below 50, which would force the random
        # fallback for every short protein, so cap the floor at the target.
        min_acceptable = max(1, min(50, target_length))
        protein = None

        for _ in range(max_attempts):
            try:
                # Start with empty prompt to get natural protein start
                sequences = llm_pipeline(
                    "",
                    max_length=max_tokens,
                    do_sample=True,
                    top_k=950,
                    top_p=0.9,
                    temperature=1.0,
                    repetition_penalty=1.2,
                    num_return_sequences=1,
                    eos_token_id=0
                )

                if sequences:
                    gen_text = sequences[0].get('generated_text', '')
                    # Clean the sequence - keep only valid amino acids
                    clean_seq = ''.join(c for c in gen_text.upper() if c in AMINO_ACIDS)

                    if len(clean_seq) >= min_acceptable:
                        # Only trim clear overshoots; shorter samples are kept as generated.
                        if len(clean_seq) > target_length * 1.5:
                            clean_seq = clean_seq[:target_length]
                        protein = clean_seq
                        break

            except Exception as e:
                # Best effort: report the failure and use the next attempt.
                print(f"Warning: Error generating protein {i+1}: {e}", file=sys.stderr)

        # Fallback to random sequence if generation failed
        if protein is None:
            fallback_count += 1
            protein = ''.join(random.choices(AMINO_ACIDS, k=target_length))

        proteins.append(protein)

    if fallback_count:
        # Make the substitution visible: these sequences did not come from the model.
        print(f"Warning: {fallback_count} of {len(proteins)} proteins could not be generated by {model_name} and were replaced with random sequences.", file=sys.stderr)

    return proteins
333+
200334
def write_fasta(peptides: List[str], output_path: Path, prefix: str = "peptide"):
201335
with open(output_path, 'w') as f:
202336
for i, pep in enumerate(peptides, 1):
@@ -247,7 +381,39 @@ def main():
247381
sys.exit(1)
248382
peptides = sample_peptides_from_fasta(args.fasta_file, args.length, args.count)
249383
elif args.source == 'llm':
250-
peptides = generate_llm_peptides(args.length, args.count, args.llm_model, args.top_k, args.top_p, args.repetition_penalty)
384+
# Interactive workflow for LLM-based peptide generation
385+
print(f"\nGenerating {args.count} peptides of length {args.length} using LLM approach...")
386+
print("This approach generates a fake proteome first, then samples peptides from it.")
387+
388+
# Check if user has existing proteome
389+
has_existing = get_user_input("\nDo you have an existing ProtGPT2-generated proteome file? (y/n): ").lower().startswith('y')
390+
391+
if has_existing:
392+
proteome_path = get_existing_proteome_path()
393+
print(f"Using existing proteome: {proteome_path}")
394+
else:
395+
# Ask if user wants to generate new proteome
396+
generate_new = get_user_input("Would you like to generate a new fake proteome? (y/n): ").lower().startswith('y')
397+
398+
if not generate_new:
399+
print("Cannot proceed without a proteome. Exiting.")
400+
sys.exit(1)
401+
402+
# Configure proteome generation (no reference needed)
403+
num_proteins, target_lengths = configure_proteome_generation()
404+
405+
# Generate the fake proteome
406+
fake_proteins = generate_fake_proteome(num_proteins, target_lengths, args.llm_model)
407+
408+
# Save the generated proteome
409+
proteome_output = Path(f'fake_proteome_{num_proteins}proteins.fasta')
410+
write_fasta(fake_proteins, proteome_output, prefix="protein")
411+
print(f"\nGenerated fake proteome saved to: {proteome_output}")
412+
proteome_path = proteome_output
413+
414+
# Now sample peptides from the proteome using FASTA method
415+
print(f"\nSampling {args.count} peptides from the proteome...")
416+
peptides = sample_peptides_from_fasta(proteome_path, args.length, args.count)
251417
else:
252418
print(f"Unknown source: {args.source}", file=sys.stderr)
253419
sys.exit(1)

0 commit comments

Comments
 (0)