Skip to content

Commit 82bacd9

Browse files
authored
Merge pull request #46 from seqeralabs/seqera-ai/20251121-212340-fix-proteinmpnn-parallelization
Fix ProteinMPNN parallelization and enable missing modules in tests
2 parents a607fd4 + ec3703e commit 82bacd9

File tree

5 files changed

+48
-8
lines changed

5 files changed

+48
-8
lines changed

conf/test_design_nanobody.config

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,10 @@ params {
3131
// Enable all metrics modules for comprehensive testing
3232
run_proteinmpnn = true
3333
mpnn_num_seq_per_target = 2 // Reduced from default 8 for faster testing
34+
run_protenix_refold = true
3435
run_ipsae = true
3536
run_prodigy = true
37+
run_foldseek = true
3638
run_consolidation = true
3739

3840
// Output

conf/test_design_peptide.config

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,10 @@ params {
3131
// Enable all metrics modules for comprehensive testing
3232
run_proteinmpnn = true
3333
mpnn_num_seq_per_target = 2 // Reduced from default 8 for faster testing
34+
run_protenix_refold = true
3435
run_ipsae = true
3536
run_prodigy = true
37+
run_foldseek = true
3638
run_consolidation = true
3739

3840
// Output

conf/test_design_protein.config

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,10 @@ params {
3131
// Enable all metrics modules for comprehensive testing
3232
run_proteinmpnn = true
3333
mpnn_num_seq_per_target = 2 // Reduced from default 8 for faster testing
34+
run_protenix_refold = true
3435
run_ipsae = true
3536
run_prodigy = true
37+
run_foldseek = true
3638
run_consolidation = true
3739

3840
// Output

modules/local/convert_cif_to_pdb.nf

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@ process CONVERT_CIF_TO_PDB {
88
tuple val(meta), path(structures)
99

1010
output:
11-
tuple val(meta), path("${meta.id}_pdb_structures/*.pdb"), emit: pdb_files
11+
tuple val(meta), path("${meta.id}_pdb_structures/*.pdb"), emit: pdb_files_all
12+
tuple val(meta), path("${meta.id}_pdb_structures"), emit: pdb_dir
1213
path "versions.yml", emit: versions
1314

1415
script:

workflows/protein_design.nf

Lines changed: 40 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -47,8 +47,26 @@ workflow PROTEIN_DESIGN {
4747

4848
CONVERT_CIF_TO_PDB(ch_structures_for_conversion)
4949

50-
// Step 2: Run ProteinMPNN on converted PDB structures
51-
PROTEINMPNN_OPTIMIZE(CONVERT_CIF_TO_PDB.out.pdb_files)
50+
// Step 2: Parallelize ProteinMPNN - run separately for each budget design
51+
// Use flatMap to emit one channel item per PDB file, so each design (one per budget iteration) is processed as its own task
52+
ch_pdb_per_design = CONVERT_CIF_TO_PDB.out.pdb_files_all
53+
.flatMap { meta, pdb_files ->
54+
// Wrap in a list when a single file (not a List) is received, so both cases are handled uniformly
55+
def pdb_list = pdb_files instanceof List ? pdb_files : [pdb_files]
56+
57+
// Create a separate channel entry for each PDB file
58+
pdb_list.collect { pdb_file ->
59+
def design_meta = [:]
60+
design_meta.id = "${meta.id}_${pdb_file.baseName}"
61+
design_meta.parent_id = meta.id
62+
design_meta.design_name = pdb_file.baseName
63+
64+
[design_meta, pdb_file]
65+
}
66+
}
67+
68+
// Run ProteinMPNN on each design individually (parallel execution per budget design)
69+
PROTEINMPNN_OPTIMIZE(ch_pdb_per_design)
5270

5371
// Use ProteinMPNN optimized structures for downstream analyses
5472
ch_final_designs_for_analysis = PROTEINMPNN_OPTIMIZE.out.optimized_designs
@@ -61,9 +79,24 @@ workflow PROTEIN_DESIGN {
6179
ch_boltzgen_structures = BOLTZGEN_RUN.out.final_cifs
6280
EXTRACT_TARGET_SEQUENCES(ch_boltzgen_structures)
6381

64-
// Combine ProteinMPNN FASTA outputs with target sequence
65-
// Join based on parent_id (meta.parent_id from MPNN matches meta.id from Boltzgen)
66-
ch_protenix_input = PROTEINMPNN_OPTIMIZE.out.sequences
82+
// Parallelize Protenix per FASTA file (one per ProteinMPNN sequence)
83+
// Each ProteinMPNN run generates multiple FASTA files (mpnn_num_seq_per_target)
84+
ch_protenix_per_sequence = PROTEINMPNN_OPTIMIZE.out.sequences
85+
.flatMap { meta, fasta_files ->
86+
// Wrap in a list when a single file (not a List) is received, so both cases are handled uniformly
87+
def fasta_list = fasta_files instanceof List ? fasta_files : [fasta_files]
88+
89+
// Create a separate entry for each FASTA file
90+
fasta_list.collect { fasta_file ->
91+
def seq_meta = [:]
92+
seq_meta.id = "${meta.id}_${fasta_file.baseName}"
93+
seq_meta.parent_id = meta.parent_id
94+
seq_meta.mpnn_parent_id = meta.id
95+
seq_meta.sequence_name = fasta_file.baseName
96+
97+
[seq_meta, fasta_file]
98+
}
99+
}
67100
.map { meta, fasta ->
68101
[meta.parent_id, meta, fasta]
69102
}
@@ -76,8 +109,8 @@ workflow PROTEIN_DESIGN {
76109
[meta, fasta, target_seq]
77110
}
78111

79-
// Run Protenix structure prediction on combined sequences
80-
PROTENIX_REFOLD(ch_protenix_input)
112+
// Run Protenix structure prediction on each sequence individually
113+
PROTENIX_REFOLD(ch_protenix_per_sequence)
81114

82115
// ================================================================
83116
// Step 4: Convert Protenix confidence JSON to NPZ for ipSAE

0 commit comments

Comments
 (0)