Skip to content

Commit 49d2db2

Browse files
committed
wip: Applied fixes to make pipeline functional end to end.
1 parent a018641 commit 49d2db2

12 files changed

+962
-1382
lines changed

assets/consolidate_design_metrics.py

Lines changed: 469 additions & 1072 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
sample_id,design_yaml,structure_files,protocol,num_designs,budget,reuse,target_msa
2-
2vsm_protein_binder,assets/test_data/2VSM_protein_design.yaml,assets/test_data/2VSM.cif,protein-anything,3,2,,assets/test_data/2VSM_seq.Uniref30_2302.a3m
2+
2vsm,assets/test_data/2VSM_protein_design.yaml,assets/test_data/2VSM.cif,protein-anything,3,2,,assets/test_data/2VSM_seq.Uniref30_2302.a3m

bin/prepare_boltz2_sequences.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Prepare sequences for Boltz2 refolding.
4+
5+
This script combines two operations:
6+
1. Extract target sequence from original Boltzgen structure (for Boltz2 input)
7+
2. Split ProteinMPNN multi-sequence FASTA into individual files
8+
9+
All sequences are included (original Boltzgen sequence + MPNN-designed sequences).
10+
"""
11+
12+
import os
13+
import sys
14+
import json
15+
from pathlib import Path
16+
import argparse
17+
import Bio
18+
from Bio import PDB
19+
from Bio.PDB import PDBIO, MMCIFParser, PDBParser
20+
from Bio.SeqUtils import seq1
21+
22+
23+
def extract_target_sequence(structure_file, design_id):
24+
"""Extract target sequence from Boltzgen structure."""
25+
print("=" * 80)
26+
print("PART 1: Extracting target sequence from Boltzgen structure")
27+
print("=" * 80)
28+
29+
structure_path = Path(structure_file)
30+
print(f"Structure file: {structure_path}")
31+
32+
try:
33+
if structure_path.suffix.lower() == '.cif':
34+
parser = MMCIFParser(QUIET=True)
35+
else:
36+
parser = PDBParser(QUIET=True)
37+
38+
structure = parser.get_structure('structure', str(structure_path))
39+
40+
# Extract sequences from all chains
41+
sequences = {}
42+
for model in structure:
43+
for chain in model:
44+
chain_id = chain.id
45+
residues = []
46+
for residue in chain:
47+
if PDB.is_aa(residue):
48+
resname = residue.get_resname()
49+
try:
50+
# seq1 converts 3-letter to 1-letter code
51+
one_letter = seq1(resname)
52+
residues.append(one_letter)
53+
except (KeyError, ValueError):
54+
# Handle non-standard amino acids
55+
residues.append('X')
56+
57+
if residues:
58+
sequences[chain_id] = ''.join(residues)
59+
60+
if not sequences:
61+
print("ERROR: No amino acid sequences found in structure", file=sys.stderr)
62+
sys.exit(1)
63+
64+
# Identify target chain (longest chain)
65+
target_chain_id = max(sequences.items(), key=lambda x: len(x[1]))[0]
66+
target_sequence = sequences[target_chain_id]
67+
68+
# Write target sequence to file
69+
target_seq_file = f"{design_id}_target_sequence.txt"
70+
with open(target_seq_file, 'w') as f:
71+
f.write(target_sequence + "\n")
72+
73+
print(f"✓ Target chain {target_chain_id} extracted ({len(target_sequence)} residues)")
74+
75+
# Create info JSON
76+
info = {
77+
"design_id": design_id,
78+
"source_structure": structure_path.name,
79+
"target_chain": target_chain_id,
80+
"target_length": len(target_sequence),
81+
"all_chains": {cid: len(seq) for cid, seq in sequences.items()}
82+
}
83+
84+
target_info_file = f"{design_id}_target_info.json"
85+
with open(target_info_file, 'w') as f:
86+
json.dump(info, f, indent=2)
87+
88+
return target_seq_file, target_info_file
89+
90+
except Exception as e:
91+
print(f"ERROR: Failed to extract target sequence: {e}", file=sys.stderr)
92+
import traceback
93+
traceback.print_exc()
94+
sys.exit(1)
95+
96+
97+
def split_fasta(input_file, output_dir="sequences"):
98+
"""Split multi-sequence FASTA into individual files."""
99+
print()
100+
print("=" * 80)
101+
print("PART 2: Splitting ProteinMPNN sequences")
102+
print("=" * 80)
103+
104+
print(f"FASTA file: {input_file}")
105+
106+
sequences = []
107+
current_seq = []
108+
current_header = None
109+
110+
# Read FASTA
111+
try:
112+
with open(input_file, 'r') as f:
113+
for line in f:
114+
line = line.strip()
115+
if not line:
116+
continue
117+
if line.startswith('>'):
118+
if current_header and current_seq:
119+
sequences.append((current_header, ''.join(current_seq)))
120+
current_header = line
121+
current_seq = []
122+
else:
123+
current_seq.append(line)
124+
125+
# Add last sequence
126+
if current_header and current_seq:
127+
sequences.append((current_header, ''.join(current_seq)))
128+
except Exception as e:
129+
print(f"ERROR: Failed to read FASTA file {input_file}: {e}", file=sys.stderr)
130+
sys.exit(1)
131+
132+
print(f"Found {len(sequences)} sequences in {input_file}")
133+
134+
# Include ALL sequences (including the original first sequence)
135+
sequences_to_process = sequences
136+
137+
if not sequences_to_process:
138+
print("ERROR: No sequences found in FASTA file", file=sys.stderr)
139+
sys.exit(1)
140+
141+
print(f"Splitting {len(sequences_to_process)} sequences (including original)")
142+
143+
# Create output directory
144+
os.makedirs(output_dir, exist_ok=True)
145+
146+
# Write each sequence to a separate file
147+
base_name = os.path.splitext(os.path.basename(input_file))[0]
148+
149+
output_files = []
150+
for idx, (header, seq) in enumerate(sequences_to_process):
151+
# Use 0-based indexing: seq_0 is original, seq_1+ are MPNN designs
152+
seq_num = idx
153+
output_file = os.path.join(output_dir, f"{base_name}_seq_{seq_num}.fa")
154+
155+
with open(output_file, 'w') as out:
156+
out.write(f"{header}\n{seq}\n")
157+
158+
output_files.append(output_file)
159+
seq_type = "original" if idx == 0 else f"MPNN design {idx}"
160+
print(f"✓ Created {output_file} ({seq_type})")
161+
162+
return output_files
163+
164+
165+
def main():
166+
parser = argparse.ArgumentParser(description="Prepare sequences for Boltz2 refolding")
167+
parser.add_argument("mpnn_fasta", help="ProteinMPNN multi-sequence FASTA file")
168+
parser.add_argument("boltzgen_structure", help="Boltzgen structure file (CIF or PDB)")
169+
parser.add_argument("design_id", help="Design ID for output files")
170+
171+
args = parser.parse_args()
172+
173+
# Extract target sequence
174+
target_seq_file, target_info_file = extract_target_sequence(
175+
args.boltzgen_structure,
176+
args.design_id
177+
)
178+
179+
# Split MPNN sequences
180+
sequence_files = split_fasta(args.mpnn_fasta)
181+
182+
# Generate version information
183+
with open("versions.yml", "w") as f:
184+
f.write('"NFPROTEINDESIGN:PROTEIN_DESIGN:PREPARE_BOLTZ2_SEQUENCES":\n')
185+
f.write(f' python: {sys.version.split()[0]}\n')
186+
f.write(f' biopython: {Bio.__version__}\n')
187+
188+
print()
189+
print("=" * 80)
190+
print("✓ Sequence preparation complete")
191+
print("=" * 80)
192+
193+
194+
if __name__ == "__main__":
195+
main()

conf/base.config

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,12 @@ process {
106106
cpus = { check_max( 8 * task.attempt, 'cpus' ) }
107107
memory = { check_max( 32.GB * task.attempt, 'memory' ) }
108108
time = { check_max( 4.h * task.attempt, 'time' ) }
109-
109+
110110
// Request 1 GPU for accelerated searches
111111
accelerator = { check_max( 1, 'gpus' ) }
112-
113-
// Container GPU access for Docker
114-
containerOptions = '--gpus all'
112+
113+
// Container GPU access for Docker + override entrypoint to allow bash scripts
114+
containerOptions = '--gpus all --entrypoint ""'
115115
}
116116

117117
withName:BOLTZ2_REFOLD {

conf/test_design_protein.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ params {
3232
boltz2_num_diffusion = 2
3333
run_ipsae = true
3434
run_prodigy = true
35-
run_foldseek = false
35+
run_foldseek = true
3636
run_consolidation = true
3737

3838
// Output

modules/local/boltz2_refold.nf

Lines changed: 47 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ process BOLTZ2_REFOLD {
3131

3232
output:
3333
tuple val(meta), path("${meta.id}_boltz2_output"), emit: predictions
34-
tuple val(meta), path("${meta.id}_boltz2_output/**/*.cif"), optional: true, emit: structures
35-
tuple val(meta), path("${meta.id}_boltz2_output/**/*confidence*.json"), optional: true, emit: confidence
36-
tuple val(meta), path("${meta.id}_boltz2_output/**/*pae*.npz"), optional: true, emit: pae_npz
37-
tuple val(meta), path("${meta.id}_boltz2_output/**/*affinity*.json"), optional: true, emit: affinity
34+
tuple val(meta), path("${meta.id}_boltz2_output/*.cif"), optional: true, emit: structures
35+
tuple val(meta), path("${meta.id}_boltz2_output/*confidence*.json"), optional: true, emit: confidence
36+
tuple val(meta), path("${meta.id}_boltz2_output/*pae*.npz"), optional: true, emit: pae_npz
37+
tuple val(meta), path("${meta.id}_boltz2_output/*affinity*.json"), optional: true, emit: affinity
3838
path "versions.yml", emit: versions
3939

4040
script:
@@ -126,33 +126,51 @@ process BOLTZ2_REFOLD {
126126
# Organize outputs
127127
echo ""
128128
echo "Organizing outputs..."
129-
130-
# Move all results to output directory with sequence-specific naming
129+
130+
# Move all results to output directory
131+
# Boltz2 output structure: boltz2_results/boltz_results_<name>/predictions/<name>/<files>
131132
if [ -d "boltz2_results" ]; then
132133
mkdir -p ${meta.id}_boltz2_output
133-
134-
# Process each prediction directory
135-
for pred_dir in boltz2_results/predictions/*_seq_*; do
136-
if [ -d "\${pred_dir}" ]; then
137-
# Extract sequence number from directory name
138-
seq_num=\$(basename "\${pred_dir}" | grep -oP '_seq_\\K[0-9]+')
139-
140-
echo " Processing sequence \${seq_num}..."
141-
142-
# Rename files to include sequence suffix
143-
find "\${pred_dir}" -type f \\( -name "*.cif" -o -name "*.json" -o -name "*.npz" \\) | while read file; do
144-
filename=\$(basename "\${file}")
145-
extension="\${filename##*.}"
146-
basename_without_ext="\${filename%.*}"
147-
148-
# Add sequence suffix before extension
149-
new_filename="\${basename_without_ext}_seq\${seq_num}.\${extension}"
150-
151-
# Copy to output directory
152-
cp "\${file}" "${meta.id}_boltz2_output/\${new_filename}"
153-
echo " Saved: \${new_filename}"
154-
done
155-
fi
134+
135+
echo " Searching for Boltz2 output files..."
136+
137+
# Find all prediction directories (handles nested structure)
138+
find boltz2_results -type d -name "predictions" | while read pred_parent; do
139+
# Get the actual prediction subdirectories
140+
for pred_dir in "\${pred_parent}"/*/; do
141+
if [ -d "\${pred_dir}" ]; then
142+
dir_name=\$(basename "\${pred_dir}")
143+
echo " Processing prediction: \${dir_name}"
144+
145+
# Copy CIF files (format: <name>_model_0.cif)
146+
find "\${pred_dir}" -name "*.cif" -type f | while read file; do
147+
filename=\$(basename "\${file}")
148+
cp "\${file}" "${meta.id}_boltz2_output/\${filename}"
149+
echo " Saved CIF: \${filename}"
150+
done
151+
152+
# Copy PAE NPZ files (format: pae_<name>_model_0.npz)
153+
find "\${pred_dir}" -name "pae*.npz" -type f | while read file; do
154+
filename=\$(basename "\${file}")
155+
cp "\${file}" "${meta.id}_boltz2_output/\${filename}"
156+
echo " Saved PAE: \${filename}"
157+
done
158+
159+
# Copy confidence JSON files
160+
find "\${pred_dir}" -name "*confidence*.json" -type f | while read file; do
161+
filename=\$(basename "\${file}")
162+
cp "\${file}" "${meta.id}_boltz2_output/\${filename}"
163+
echo " Saved confidence: \${filename}"
164+
done
165+
166+
# Copy affinity JSON files
167+
find "\${pred_dir}" -name "*affinity*.json" -type f | while read file; do
168+
filename=\$(basename "\${file}")
169+
cp "\${file}" "${meta.id}_boltz2_output/\${filename}"
170+
echo " Saved affinity: \${filename}"
171+
done
172+
fi
173+
done
156174
done
157175
fi
158176

modules/local/boltzgen_run.nf

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ process BOLTZGEN_RUN {
2828
tuple val(meta), path("${meta.id}_output/intermediate_designs/*.npz"), optional: true, emit: intermediate_npz
2929

3030
// Intermediate inverse folded designs (all budget designs - this is what we want for IPSAE/PRODIGY)
31-
// Collect both CIF files from final_ranked_designs subdirectories AND from intermediate_designs_inverse_folded
32-
tuple val(meta), path("${meta.id}_output/intermediate_designs_inverse_folded/*.cif"), optional: true, emit: budget_design_cifs
31+
tuple val(meta), path("${meta.id}_output/final_ranked_designs/final_*_designs/*.cif"), optional: true, emit: budget_design_cifs
3332
tuple val(meta), path("${meta.id}_output/intermediate_designs_inverse_folded/*.npz"), optional: true, emit: budget_design_npz
3433

3534
// Specific intermediate outputs: binder by itself and refolded complex

0 commit comments

Comments
 (0)