 import sys
 from pathlib import Path
 from typing import List
+import numpy as np
+from tqdm import tqdm

 AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

@@ -49,10 +51,14 @@ def sample_peptides_from_fasta(fasta_path: Path, length: int, count: int) -> Lis

     # Convert set back to list for sampling
     unique_subseqs = list(all_subseqs)
-    peptides = random.sample(unique_subseqs, k=min(count, len(unique_subseqs)))
-    while len(peptides) < count:
-        peptides.append(random.choice(unique_subseqs))
-    return peptides[:count]
+
+    # Ensure sampling without replacement - if not enough unique peptides, inform user
+    if count > len(unique_subseqs):
+        print(f"Warning: Requested {count} peptides, but only {len(unique_subseqs)} unique peptides available. Returning all {len(unique_subseqs)} unique peptides.", file=sys.stderr)
+        return unique_subseqs
+
+    # Sample without replacement
+    return random.sample(unique_subseqs, k=count)

 def generate_llm_peptides(length: int, count: int, model_name: str = "protgpt2", top_k: int = 950, top_p: float = 0.9, repetition_penalty: float = 1.2) -> List[str]:
     try:
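Note on the change above: random.sample draws without replacement and raises ValueError when k exceeds the population size, which is why the new code checks count against the number of unique peptides first; the old top-up loop with random.choice could return duplicate peptides. A minimal illustration in plain Python (a sketch, not part of the diff):

import random

pool = ["ACDEFGHIK", "LMNPQRSTV"]
print(random.sample(pool, k=2))   # two distinct peptides, order randomized
# random.sample(pool, k=3)        # would raise ValueError (sample larger than population)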
@@ -197,6 +203,134 @@ def generate_llm_peptides(length: int, count: int, model_name: str = "protgpt2",
         print(f"Warning: Only generated {len(peptides)} peptides of requested {count} with exact length {length}.", file=sys.stderr)
     return peptides[:count]

+def generate_fake_proteome_lengths(num_proteins: int, reference_fasta_path: Path) -> List[int]:
+    """Generate protein lengths matching the distribution from a reference proteome."""
+    # Parse reference proteome to get length distribution
+    sequences = parse_fasta_sequences(reference_fasta_path)
+    reference_lengths = [len(seq) for seq in sequences]
+
+    # Sample from the empirical distribution
+    sampled_lengths = np.random.choice(reference_lengths, size=num_proteins, replace=True)
+    return sampled_lengths.tolist()
+
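The length bootstrap above does not appear to be called in the interactive flow added further down (which asks for a uniform min/max range), but it could supply more realistic lengths. A small sketch under assumptions: the script is importable as generate_peptides (module name assumed) and a reference proteome FASTA exists locally as reference_proteome.fasta (path assumed):

from pathlib import Path
from generate_peptides import generate_fake_proteome_lengths, generate_fake_proteome  # module name assumed

lengths = generate_fake_proteome_lengths(num_proteins=50, reference_fasta_path=Path("reference_proteome.fasta"))
proteins = generate_fake_proteome(50, lengths, model_name="protgpt2")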
+def get_user_input(prompt: str) -> str:
+    """Get user input with a prompt."""
+    return input(prompt).strip()
+
+def get_existing_proteome_path() -> Path:
+    """Interactive prompt to get existing proteome file path."""
+    while True:
+        path_str = get_user_input("Please provide the path to your ProtGPT2-generated proteome file: ")
+        # Strip quotes that users might add
+        path_str = path_str.strip('"\'')
+        path = Path(path_str)
+        if path.exists() and path.is_file():
+            return path
+        else:
+            print(f"Error: File '{path}' not found. Please try again.")
+
+def configure_proteome_generation():
+    """Interactive configuration for new proteome generation using ProtGPT2 only."""
+    print("\nConfiguring new ProtGPT2 proteome generation...")
+    print("This will generate completely synthetic proteins using ProtGPT2.")
+
+    # Get number of proteins
+    while True:
+        try:
+            num_proteins = int(get_user_input("How many proteins should be generated? "))
+            if num_proteins > 0:
+                break
+            else:
+                print("Please enter a positive number.")
+        except ValueError:
+            print("Please enter a valid integer.")
+
+    # Get protein length range
+    print("\nSpecify the protein length range:")
+    while True:
+        try:
+            min_len = int(get_user_input("Minimum protein length: "))
+            max_len = int(get_user_input("Maximum protein length: "))
+            if min_len > 0 and max_len >= min_len:
+                break
+            else:
+                print("Please ensure minimum > 0 and maximum >= minimum.")
+        except ValueError:
+            print("Please enter valid integers.")
+
+    # Generate uniform random lengths in the specified range
+    target_lengths = [random.randint(min_len, max_len) for _ in range(num_proteins)]
+
+    print(f"\nWill generate {num_proteins} proteins with lengths between {min_len} and {max_len} amino acids.")
+    return num_proteins, target_lengths
+
+def generate_fake_proteome(num_proteins: int, target_lengths: List[int], model_name: str = "protgpt2") -> List[str]:
+    """Generate a fake proteome with specified protein lengths using LLM."""
+    try:
+        from transformers import pipeline
+    except ImportError:
+        print("Error: transformers package is required for fake proteome generation. Please install with 'pip install transformers torch'", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Generating {num_proteins} proteins using {model_name}...")
+
+    if model_name.lower() == "protgpt2":
+        model_id = "nferruz/ProtGPT2"
+        llm_pipeline = pipeline('text-generation', model=model_id, framework="pt")
+
+        proteins = []
+        # Use progress bar for protein generation
+        for i, target_length in enumerate(tqdm(target_lengths, desc="Generating proteins", unit="protein")):
+
+            # Generate protein of approximately target length
+            max_tokens = max(10, target_length // 4)  # ProtGPT2 tokens are ~4 amino acids
+            tries = 0
+            protein = None
+
+            while tries < 5:  # Try up to 5 times to get reasonable length
+                try:
+                    # Start with empty prompt to get natural protein start
+                    sequences = llm_pipeline(
+                        "",
+                        max_length=max_tokens,
+                        do_sample=True,
+                        top_k=950,
+                        top_p=0.9,
+                        temperature=1.0,
+                        repetition_penalty=1.2,
+                        num_return_sequences=1,
+                        eos_token_id=0
+                    )
+
+                    if sequences and len(sequences) > 0:
+                        gen_text = sequences[0].get('generated_text', '')
+                        # Clean the sequence - keep only valid amino acids
+                        clean_seq = ''.join([c for c in gen_text.upper() if c in AMINO_ACIDS])
+
+                        if len(clean_seq) >= 50:  # Minimum reasonable protein length
+                            # Trim if the sequence overshoots the target length by a wide margin
+                            if len(clean_seq) > target_length * 1.5:
+                                clean_seq = clean_seq[:target_length]
+                            protein = clean_seq
+                            break
+
+                except Exception as e:
+                    print(f"Warning: Error generating protein {i+1}: {e}", file=sys.stderr)
+
+                tries += 1
+
+            # Fallback to random sequence if generation failed
+            if protein is None:
+                protein = ''.join(random.choices(AMINO_ACIDS, k=target_length))
+
+            proteins.append(protein)
+
+        return proteins
+
+    else:
+        print(f"Error: Unsupported model '{model_name}' for proteome generation. Only protgpt2 is supported.", file=sys.stderr)
+        sys.exit(1)
+
 def write_fasta(peptides: List[str], output_path: Path, prefix: str = "peptide"):
     with open(output_path, 'w') as f:
         for i, pep in enumerate(peptides, 1):
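The max_tokens heuristic in generate_fake_proteome budgets generation in tokens while target_length is in residues: max_length counts tokens for the text-generation pipeline, and at roughly four residues per ProtGPT2 token, target_length // 4 tokens corresponds to on the order of target_length residues (e.g. target_length = 400 gives max_tokens = 100). A quick, optional check of that residues-per-token assumption (a sketch, not committed code; requires transformers and a one-time tokenizer download; the example sequence is arbitrary):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKR"  # arbitrary test sequence
print(len(seq) / len(tok.tokenize(seq)))  # average residues per token for this sequence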
@@ -247,7 +381,39 @@ def main():
             sys.exit(1)
         peptides = sample_peptides_from_fasta(args.fasta_file, args.length, args.count)
     elif args.source == 'llm':
-        peptides = generate_llm_peptides(args.length, args.count, args.llm_model, args.top_k, args.top_p, args.repetition_penalty)
+        # Interactive workflow for LLM-based peptide generation
+        print(f"\nGenerating {args.count} peptides of length {args.length} using LLM approach...")
+        print("This approach generates a fake proteome first, then samples peptides from it.")
+
+        # Check if user has existing proteome
+        has_existing = get_user_input("\nDo you have an existing ProtGPT2-generated proteome file? (y/n): ").lower().startswith('y')
+
+        if has_existing:
+            proteome_path = get_existing_proteome_path()
+            print(f"Using existing proteome: {proteome_path}")
+        else:
+            # Ask if user wants to generate new proteome
+            generate_new = get_user_input("Would you like to generate a new fake proteome? (y/n): ").lower().startswith('y')
+
+            if not generate_new:
+                print("Cannot proceed without a proteome. Exiting.")
+                sys.exit(1)
+
+            # Configure proteome generation (no reference needed)
+            num_proteins, target_lengths = configure_proteome_generation()
+
+            # Generate the fake proteome
+            fake_proteins = generate_fake_proteome(num_proteins, target_lengths, args.llm_model)
+
+            # Save the generated proteome
+            proteome_output = Path(f'fake_proteome_{num_proteins}proteins.fasta')
+            write_fasta(fake_proteins, proteome_output, prefix="protein")
+            print(f"\nGenerated fake proteome saved to: {proteome_output}")
+            proteome_path = proteome_output
+
+        # Now sample peptides from the proteome using FASTA method
+        print(f"\nSampling {args.count} peptides from the proteome...")
+        peptides = sample_peptides_from_fasta(proteome_path, args.length, args.count)
     else:
         print(f"Unknown source: {args.source}", file=sys.stderr)
         sys.exit(1)
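For reference, the new interactive branch boils down to: generate a synthetic proteome, write it to FASTA, then reuse the FASTA sampling path. A minimal non-interactive sketch under assumptions: the script is importable as generate_peptides (module name assumed) and ProtGPT2 weights download on first use:

import random
from pathlib import Path

from generate_peptides import generate_fake_proteome, write_fasta, sample_peptides_from_fasta  # module name assumed

target_lengths = [random.randint(100, 400) for _ in range(10)]         # 10 proteins, uniform lengths
proteins = generate_fake_proteome(10, target_lengths, model_name="protgpt2")
proteome = Path("fake_proteome_10proteins.fasta")
write_fasta(proteins, proteome, prefix="protein")                      # save the synthetic proteome
peptides = sample_peptides_from_fasta(proteome, length=9, count=100)   # 9-mers, sampled without replacement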