wip: Fix ipSAE and prodigy not running on cloud.

FloWuenne · FloWuenne · commit 3137fedd153d · 2025-12-01T17:17:10.000Z
diff --git a/conf/test_design_nanobody.config b/conf/test_design_nanobody.config
@@ -21,16 +21,13 @@ params {
     // Input data - design mode with 2VSM nanobody
     input       = "${projectDir}/assets/test_data/samplesheet_design_nanobody.csv"
     mode        = 'design'
-    
-    // Enable all metrics modules for comprehensive testing
-    run_proteinmpnn            = true
-    mpnn_num_seq_per_target    = 2     // Reduced from default 8 for faster testing
-    run_boltz2_refold          = true
-    run_ipsae                  = true
-    run_prodigy                = true
-    run_foldseek               = true
-    run_consolidation          = true
-    
+
+    // Test-specific parameter overrides (all analysis modules are enabled by default)
+    mpnn_num_seq_per_target    = 2      // Reduced from default 8 for faster testing
+    boltz2_num_recycling       = 1      // Reduced from default 3 for faster testing
+    boltz2_num_diffusion       = 1      // Reduced from default 200 for faster testing
+    run_foldseek               = false  // Disabled - requires an external database (foldseek_database)
+
     // Output
     outdir      = './results_test_design_nanobody'
 }
diff --git a/conf/test_design_peptide.config b/conf/test_design_peptide.config
@@ -21,16 +21,13 @@ params {
     // Input data - design mode with 2VSM peptide
     input       = "${projectDir}/assets/test_data/samplesheet_design_peptide.csv"
     mode        = 'design'
-    
-    // Enable all metrics modules for comprehensive testing
-    run_proteinmpnn            = true
-    mpnn_num_seq_per_target    = 2     // Reduced from default 8 for faster testing
-    run_boltz2_refold          = true
-    run_ipsae                  = true
-    run_prodigy                = true
-    run_foldseek               = true
-    run_consolidation          = true
-    
+
+    // Test-specific parameter overrides (all analysis modules are enabled by default)
+    mpnn_num_seq_per_target    = 2      // Reduced from default 8 for faster testing
+    boltz2_num_recycling       = 1      // Reduced from default 3 for faster testing
+    boltz2_num_diffusion       = 1      // Reduced from default 200 for faster testing
+    run_foldseek               = false  // Disabled - requires an external database (foldseek_database)
+
     // Output
     outdir      = './results_test_design_peptide'
 }
diff --git a/conf/test_design_protein.config b/conf/test_design_protein.config
@@ -21,20 +21,15 @@ params {
     // Input data - design mode with 2VSM protein
     input       = "${projectDir}/assets/test_data/samplesheet_design_protein.csv"
     mode        = 'design'
-    
-    // Enable all metrics modules for comprehensive testing
-    run_proteinmpnn            = true
-    mpnn_num_seq_per_target    = 2     // Reduced from default 8 for faster testing
-    run_boltz2_refold          = true
-    boltz2_predict_affinity    = false // Affinity only supported for ligands, not proteins
+
+    // Test-specific parameter overrides (all analysis modules are enabled by default)
+    mpnn_num_seq_per_target    = 2      // Reduced from default 8 for faster testing
+    boltz2_predict_affinity    = false  // Affinity only supported for ligands, not proteins
     boltz2_use_msa             = false  // Required when input YAML has no MSAs
-    boltz2_num_recycling       = 1
-    boltz2_num_diffusion       = 1
-    run_ipsae                  = true
-    run_prodigy                = true
-    run_foldseek               = true
-    run_consolidation          = true
-    
+    boltz2_num_recycling       = 1      // Reduced from default 3 for faster testing
+    boltz2_num_diffusion       = 1      // Reduced from default 200 for faster testing
+    run_foldseek               = false  // Disabled - requires an external database (foldseek_database)
+
     // Output
     outdir      = './results_test_design_protein'
 }
diff --git a/modules/local/boltz2_refold.nf b/modules/local/boltz2_refold.nf
@@ -242,10 +242,11 @@ SUMMARY
     stub:
     """
     mkdir -p ${meta.id}_boltz2_output
-    touch ${meta.id}_boltz2_output/placeholder.cif
-    touch ${meta.id}_boltz2_output/placeholder_confidence.json
-    touch ${meta.id}_boltz2_output/placeholder_pae.npz
-    touch ${meta.id}_boltz2_output/placeholder_affinity.json
+    # Create stub files with realistic names that match downstream filtering patterns
+    touch ${meta.id}_boltz2_output/${meta.id}_model_0.cif
+    touch ${meta.id}_boltz2_output/${meta.id}_model_0_confidence.json
+    touch ${meta.id}_boltz2_output/pae_${meta.id}_model_0.npz
+    touch ${meta.id}_boltz2_output/${meta.id}_model_0_affinity.json
     touch versions.yml
     """
 }
diff --git a/modules/local/boltzgen_run.nf b/modules/local/boltzgen_run.nf
@@ -82,10 +82,12 @@ process BOLTZGEN_RUN {
 
     stub:
     """
-    mkdir -p ${meta.id}_output/final_ranked_designs
+    mkdir -p ${meta.id}_output/final_ranked_designs/final_${meta.budget}_designs
     mkdir -p ${meta.id}_output/intermediate_designs
     mkdir -p ${meta.id}_output/intermediate_designs_inverse_folded
-    touch ${meta.id}_output/final_ranked_designs/placeholder.cif
+    # Create stub files with realistic names that match downstream patterns
+    touch ${meta.id}_output/final_ranked_designs/final_${meta.budget}_designs/rank1_${meta.id}_design.cif
+    touch ${meta.id}_output/final_ranked_designs/final_${meta.budget}_designs/rank2_${meta.id}_design.cif
     touch versions.yml
     """
 }
diff --git a/modules/local/convert_cif_to_pdb.nf b/modules/local/convert_cif_to_pdb.nf
@@ -146,7 +146,9 @@ process CONVERT_CIF_TO_PDB {
     stub:
     """
     mkdir -p ${meta.id}_pdb_structures
-    touch ${meta.id}_pdb_structures/placeholder.pdb
+    # Create stub files with realistic names that match downstream rank extraction patterns
+    touch ${meta.id}_pdb_structures/rank1_${meta.id}_design.pdb
+    touch ${meta.id}_pdb_structures/rank2_${meta.id}_design.pdb
     touch versions.yml
     """
 }
diff --git a/modules/local/ipsae_calculate.nf b/modules/local/ipsae_calculate.nf
@@ -45,9 +45,10 @@ process IPSAE_CALCULATE {
     def pae_cutoff = params.ipsae_pae_cutoff ?: 10
     def dist_cutoff = params.ipsae_dist_cutoff ?: 10
     """
-    touch stub_output_${pae_cutoff}_${dist_cutoff}.txt
-    touch stub_output_${pae_cutoff}_${dist_cutoff}_byres.txt
-    touch stub_output.pml
+    # Create stub files with unique names using meta.id
+    touch ${meta.id}_${pae_cutoff}_${dist_cutoff}.txt
+    touch ${meta.id}_${pae_cutoff}_${dist_cutoff}_byres.txt
+    touch ${meta.id}.pml
     touch versions.yml
     """
 }
diff --git a/modules/local/prepare_boltz2_sequences.nf b/modules/local/prepare_boltz2_sequences.nf
@@ -120,7 +120,8 @@ process PREPARE_BOLTZ2_SEQUENCES {
     """
     mkdir -p sequences
     echo "MOCKSEQUENCE" > ${meta.id}_target_sequence.txt
-    touch sequences/placeholder_seq_0.fa
+    # Create stub files with unique names matching the expected pattern
+    touch sequences/${meta.id}_s0.fa
     touch versions.yml
     """
 }
diff --git a/nextflow.config b/nextflow.config
@@ -35,7 +35,7 @@ params {
     steps                      = null                // Optional: Comma-separated list of steps to run (e.g., 'filtering' to rerun only filtering)
     
     // ProteinMPNN sequence optimization options
-    run_proteinmpnn            = false               // Enable ProteinMPNN sequence optimization of Boltzgen designs
+    run_proteinmpnn            = true                // Enable ProteinMPNN sequence optimization of Boltzgen designs (set to false to disable)
     mpnn_sampling_temp         = 0.1                 // Sampling temperature (0.1-0.3 recommended, lower = more conservative)
     mpnn_num_seq_per_target    = 8                   // Number of sequence variants to generate per structure
     mpnn_batch_size            = 1                   // Batch size for ProteinMPNN inference
@@ -47,7 +47,7 @@ params {
     mpnn_designed_chains       = null                // Chains to design (e.g., 'C' - typically the binder chain)
     
     // Boltz-2 structure prediction options (for refolding ProteinMPNN sequences)
-    run_boltz2_refold          = false               // Enable Boltz-2 structure prediction for ProteinMPNN sequences
+    run_boltz2_refold          = true                // Enable Boltz-2 structure prediction for ProteinMPNN sequences (set to false to disable)
     boltz2_cache               = null                // Cache directory for Boltz-2 model weights (~6GB), defaults to ~/.boltz
     boltz2_num_recycling       = 3                   // Number of recycling steps (3-5 recommended)
     boltz2_num_diffusion       = 200                 // Number of diffusion samples (higher = slower but more accurate)
@@ -56,16 +56,16 @@ params {
     boltz2_torch_precision     = 'medium'            // Torch float32 matmul precision: 'medium', 'high', or 'highest' (for Tensor Cores)
 
     // IPSAE scoring options
-    run_ipsae                  = false               // Enable IPSAE scoring of Boltzgen predictions (evaluates protein-protein interactions)
+    run_ipsae                  = true                // Enable IPSAE scoring of Boltz-2 structures (set to false to disable)
     ipsae_pae_cutoff           = 10                  // PAE cutoff for IPSAE calculation (Angstroms, default: 10)
     ipsae_dist_cutoff          = 10                  // Distance cutoff for CA-CA contacts (Angstroms, default: 10)
     
     // PRODIGY binding affinity prediction options
-    run_prodigy                = false               // Enable PRODIGY binding affinity prediction on final designs
+    run_prodigy                = true                // Enable PRODIGY binding affinity prediction on final designs (set to false to disable)
     prodigy_selection          = null                // Chain selection for PRODIGY (e.g., 'A,B'). If null, auto-detects from structure
     
     // Foldseek structural search options
-    run_foldseek               = false               // Enable Foldseek structural similarity search for budget designs and Boltz-2 structures
+    run_foldseek               = true                // Enable Foldseek structural similarity search (requires foldseek_database, which defaults to null; set to false to disable)
     foldseek_database          = null                // Path to Foldseek database (e.g., AlphaFold/Swiss-Model, required if run_foldseek is true)
     foldseek_evalue            = 0.001               // E-value threshold for reporting matches (lower = more stringent)
     foldseek_max_seqs          = 100                 // Maximum number of target sequences to report
@@ -74,7 +74,7 @@ params {
     foldseek_alignment_type    = 2                   // Alignment type: 0=3Di only, 1=TMalign (global), 2=3Di+AA (local, default)
     
     // Metrics consolidation and reporting options
-    run_consolidation          = false               // Enable consolidated metrics report generation
+    run_consolidation          = true                // Enable consolidated metrics report generation (set to false to disable)
     report_top_n               = 10                  // Number of top designs to highlight in report
     
     // Output options
diff --git a/workflows/protein_design.nf b/workflows/protein_design.nf
@@ -55,14 +55,14 @@ workflow PROTEIN_DESIGN {
     
     // Extract budget_design_cifs from both sources for downstream processing
     ch_budget_cifs_new = BOLTZGEN_RUN.out.budget_design_cifs
-    
+
+    // For precomputed results, collect the budget design CIF files from the precomputed Boltzgen directory
     ch_budget_cifs_precomputed = ch_branched.with_precomputed
         .map { meta, boltzgen_dir ->
-            // Extract budget design CIF files from pre-computed directory
             def budget_cifs = file("${boltzgen_dir}/final_ranked_designs/final_*_designs/*.cif")
             [meta, budget_cifs]
         }
-    
+
     ch_budget_design_cifs = ch_budget_cifs_new
         .mix(ch_budget_cifs_precomputed)