1+ #!/usr/bin/env python3
2+ """
3+ Prepare sequences for Boltz2 refolding.
4+
5+ This script combines two operations:
6+ 1. Extract target sequence from original Boltzgen structure (for Boltz2 input)
7+ 2. Split ProteinMPNN multi-sequence FASTA into individual files
8+
9+ All sequences are included (original Boltzgen sequence + MPNN-designed sequences).
10+ """
11+
12+ import os
13+ import sys
14+ import json
15+ from pathlib import Path
16+ import argparse
17+ import Bio
18+ from Bio import PDB
19+ from Bio .PDB import PDBIO , MMCIFParser , PDBParser
20+ from Bio .SeqUtils import seq1
21+
22+
23+ def extract_target_sequence (structure_file , design_id ):
24+ """Extract target sequence from Boltzgen structure."""
25+ print ("=" * 80 )
26+ print ("PART 1: Extracting target sequence from Boltzgen structure" )
27+ print ("=" * 80 )
28+
29+ structure_path = Path (structure_file )
30+ print (f"Structure file: { structure_path } " )
31+
32+ try :
33+ if structure_path .suffix .lower () == '.cif' :
34+ parser = MMCIFParser (QUIET = True )
35+ else :
36+ parser = PDBParser (QUIET = True )
37+
38+ structure = parser .get_structure ('structure' , str (structure_path ))
39+
40+ # Extract sequences from all chains
41+ sequences = {}
42+ for model in structure :
43+ for chain in model :
44+ chain_id = chain .id
45+ residues = []
46+ for residue in chain :
47+ if PDB .is_aa (residue ):
48+ resname = residue .get_resname ()
49+ try :
50+ # seq1 converts 3-letter to 1-letter code
51+ one_letter = seq1 (resname )
52+ residues .append (one_letter )
53+ except (KeyError , ValueError ):
54+ # Handle non-standard amino acids
55+ residues .append ('X' )
56+
57+ if residues :
58+ sequences [chain_id ] = '' .join (residues )
59+
60+ if not sequences :
61+ print ("ERROR: No amino acid sequences found in structure" , file = sys .stderr )
62+ sys .exit (1 )
63+
64+ # Identify target chain (longest chain)
65+ target_chain_id = max (sequences .items (), key = lambda x : len (x [1 ]))[0 ]
66+ target_sequence = sequences [target_chain_id ]
67+
68+ # Write target sequence to file
69+ target_seq_file = f"{ design_id } _target_sequence.txt"
70+ with open (target_seq_file , 'w' ) as f :
71+ f .write (target_sequence + "\n " )
72+
73+ print (f"✓ Target chain { target_chain_id } extracted ({ len (target_sequence )} residues)" )
74+
75+ # Create info JSON
76+ info = {
77+ "design_id" : design_id ,
78+ "source_structure" : structure_path .name ,
79+ "target_chain" : target_chain_id ,
80+ "target_length" : len (target_sequence ),
81+ "all_chains" : {cid : len (seq ) for cid , seq in sequences .items ()}
82+ }
83+
84+ target_info_file = f"{ design_id } _target_info.json"
85+ with open (target_info_file , 'w' ) as f :
86+ json .dump (info , f , indent = 2 )
87+
88+ return target_seq_file , target_info_file
89+
90+ except Exception as e :
91+ print (f"ERROR: Failed to extract target sequence: { e } " , file = sys .stderr )
92+ import traceback
93+ traceback .print_exc ()
94+ sys .exit (1 )
95+
96+
97+ def split_fasta (input_file , output_dir = "sequences" ):
98+ """Split multi-sequence FASTA into individual files."""
99+ print ()
100+ print ("=" * 80 )
101+ print ("PART 2: Splitting ProteinMPNN sequences" )
102+ print ("=" * 80 )
103+
104+ print (f"FASTA file: { input_file } " )
105+
106+ sequences = []
107+ current_seq = []
108+ current_header = None
109+
110+ # Read FASTA
111+ try :
112+ with open (input_file , 'r' ) as f :
113+ for line in f :
114+ line = line .strip ()
115+ if not line :
116+ continue
117+ if line .startswith ('>' ):
118+ if current_header and current_seq :
119+ sequences .append ((current_header , '' .join (current_seq )))
120+ current_header = line
121+ current_seq = []
122+ else :
123+ current_seq .append (line )
124+
125+ # Add last sequence
126+ if current_header and current_seq :
127+ sequences .append ((current_header , '' .join (current_seq )))
128+ except Exception as e :
129+ print (f"ERROR: Failed to read FASTA file { input_file } : { e } " , file = sys .stderr )
130+ sys .exit (1 )
131+
132+ print (f"Found { len (sequences )} sequences in { input_file } " )
133+
134+ # Include ALL sequences (including the original first sequence)
135+ sequences_to_process = sequences
136+
137+ if not sequences_to_process :
138+ print ("ERROR: No sequences found in FASTA file" , file = sys .stderr )
139+ sys .exit (1 )
140+
141+ print (f"Splitting { len (sequences_to_process )} sequences (including original)" )
142+
143+ # Create output directory
144+ os .makedirs (output_dir , exist_ok = True )
145+
146+ # Write each sequence to a separate file
147+ base_name = os .path .splitext (os .path .basename (input_file ))[0 ]
148+
149+ output_files = []
150+ for idx , (header , seq ) in enumerate (sequences_to_process ):
151+ # Use 0-based indexing: seq_0 is original, seq_1+ are MPNN designs
152+ seq_num = idx
153+ output_file = os .path .join (output_dir , f"{ base_name } _seq_{ seq_num } .fa" )
154+
155+ with open (output_file , 'w' ) as out :
156+ out .write (f"{ header } \n { seq } \n " )
157+
158+ output_files .append (output_file )
159+ seq_type = "original" if idx == 0 else f"MPNN design { idx } "
160+ print (f"✓ Created { output_file } ({ seq_type } )" )
161+
162+ return output_files
163+
164+
165+ def main ():
166+ parser = argparse .ArgumentParser (description = "Prepare sequences for Boltz2 refolding" )
167+ parser .add_argument ("mpnn_fasta" , help = "ProteinMPNN multi-sequence FASTA file" )
168+ parser .add_argument ("boltzgen_structure" , help = "Boltzgen structure file (CIF or PDB)" )
169+ parser .add_argument ("design_id" , help = "Design ID for output files" )
170+
171+ args = parser .parse_args ()
172+
173+ # Extract target sequence
174+ target_seq_file , target_info_file = extract_target_sequence (
175+ args .boltzgen_structure ,
176+ args .design_id
177+ )
178+
179+ # Split MPNN sequences
180+ sequence_files = split_fasta (args .mpnn_fasta )
181+
182+ # Generate version information
183+ with open ("versions.yml" , "w" ) as f :
184+ f .write ('"NFPROTEINDESIGN:PROTEIN_DESIGN:PREPARE_BOLTZ2_SEQUENCES":\n ' )
185+ f .write (f' python: { sys .version .split ()[0 ]} \n ' )
186+ f .write (f' biopython: { Bio .__version__ } \n ' )
187+
188+ print ()
189+ print ("=" * 80 )
190+ print ("✓ Sequence preparation complete" )
191+ print ("=" * 80 )
192+
193+
194+ if __name__ == "__main__" :
195+ main ()
0 commit comments