@@ -14,6 +14,7 @@ def parse_args():
1414 parser .add_argument ('--parent_id' , required = True , help = 'Parent ID' )
1515 parser .add_argument ('--predict_affinity' , action = 'store_true' , help = 'Enable affinity prediction' )
1616 parser .add_argument ('--output_dir' , default = 'yaml_inputs' , help = 'Directory to save YAML files' )
17+ parser .add_argument ('--treat_as_designed' , action = 'store_true' , help = 'Treat the first sequence as a designed sequence (do not skip)' )
1718 return parser .parse_args ()
1819
1920def main ():
@@ -65,81 +66,57 @@ def main():
6566
6667 print (f"Found { len (sequences )} sequences in { fasta_file } " )
6768
68- # Skip the first sequence (it's always the original sequence from Boltzgen)
69- # We only want to refold the NEW sequences generated by ProteinMPNN
70- sequences_to_process = sequences [1 :] if len (sequences ) > 1 else []
69+ # Determine which sequences to process
70+ if args .treat_as_designed :
71+ # If treating as designed, process ALL sequences (including the first one)
72+ sequences_to_process = sequences
73+ print (f"Processing all { len (sequences_to_process )} sequences (treating first as designed)" )
74+ else :
75+ # Default behavior: Skip the first sequence (original from Boltzgen)
76+ sequences_to_process = sequences [1 :] if len (sequences ) > 1 else []
77+ print (f"Processing { len (sequences_to_process )} new MPNN sequences (skipping original)" )
7178
7279 if not sequences_to_process :
73- print (f"⚠ Warning: Only found 1 sequence (original), no new MPNN sequences to refold " )
80+ print (f"⚠ Warning: No sequences to process in { fasta_file } " )
7481 continue
7582
76- print (f"Processing { len (sequences_to_process )} new MPNN sequences (skipping original)" )
77-
78- # Create Boltz-2 YAML for each NEW sequence (skip first one)
83+ # Create Boltz-2 YAML for each sequence
7984 for idx , (header , binder_seq ) in enumerate (sequences_to_process ):
8085 # Create YAML input for Boltz-2
8186 # Format: binder (designed sequence) + target (original protein)
8287 # Note: Only target gets MSA; Boltz-2 will infer missing MSA info for binder
88+ # Keep only the first chain of the binder sequence: everything after the first '/' separator is discarded
89+ binder_seq_clean = binder_seq .split ('/' )[0 ] if '/' in binder_seq else binder_seq
8390 binder_entry = {
8491 'protein' : {
85- 'id' : 'BINDER ' ,
86- 'sequence' : binder_seq ,
92+ 'id' : 'A ' ,
93+ 'sequence' : binder_seq_clean ,
8794 'msa' : 'empty'
8895 }
8996 }
9097
9198 target_entry = {
9299 'protein' : {
93- 'id' : 'TARGET' ,
100+ 'id' : 'B' ,
94101 'sequence' : target_seq
95102 }
96103 }
97-
98- # Add target MSA if available (binder MSA will be inferred by Boltz-2)
99104 if has_target_msa and target_msa_path :
100105 target_entry ['protein' ]['msa' ] = os .path .abspath (target_msa_path )
101106 print (f" Adding target MSA: { target_msa_path } " )
102-
103- # Check for multi-chain sequence (ProteinMPNN uses / separator)
104- if '/' in binder_seq :
105- # Multi-chain case: ProteinMPNN output includes all chains
106- # We split them and create separate entities
107- parts = binder_seq .split ('/' )
108- seq_list = []
109- for i , part in enumerate (parts ):
110- # Use simple IDs: A, B, C...
111- chain_id = chr (65 + i )
112- seq_list .append ({
113- 'protein' : {
114- 'id' : chain_id ,
115- 'sequence' : part ,
116- 'msa' : 'empty'
117- }
118- })
119- # Add the target entry (with MSA if available) to the sequences list
120- seq_list .append (target_entry )
121-
122- boltz2_input = {
123- 'version' : 1 ,
124- 'sequences' : seq_list
125- }
126- print (f" Detected multi-chain sequence ({ len (parts )} chains)" )
127-
128- else :
129- # Single chain case: Binder + Target
130- boltz2_input = {
131- 'version' : 1 ,
132- 'sequences' : [binder_entry , target_entry ]
133- }
134-
135- # Add affinity prediction property (only for single binder case)
136- # Note: Boltz-2 currently only supports affinity for ligands, so this might fail for proteins
137- if args .predict_affinity :
138- boltz2_input ['properties' ] = [
139- {'affinity' : {'binder' : 'BINDER' }}
140- ]
107+ # Build final YAML input with exactly two entries
108+ boltz2_input = {
109+ 'version' : 1 ,
110+ 'sequences' : [binder_entry , target_entry ]
111+ }
112+ # Add affinity prediction property, naming chain 'A' (the binder entry) as the affinity binder
113+ if args .predict_affinity :
114+ boltz2_input ['properties' ] = [
115+ {'affinity' : {'binder' : 'A' }}
116+ ]
141117
142118 # Write YAML file
119+ # Suffix the filename with yaml_count (not the enumerate index idx) to avoid overwriting
143120 yaml_file = f"{ args .output_dir } /{ output_base } _seq_{ yaml_count } .yaml"
144121 with open (yaml_file , 'w' ) as yf :
145122 yaml .dump (boltz2_input , yf , default_flow_style = False )
0 commit comments