Merge pull request #20 from OpenMined/madhava/fix_failures

madhavajay · web-flow · commit 03c5e2af6a0f · 2025-11-13T15:09:46.000+10:00
Hardened bioscript's handling of input files during processing and added a `combine` command
diff --git a/examples/apol1/apol1-classifier/pipeline.yaml b/examples/apol1/apol1-classifier/pipeline.yaml
@@ -1,5 +1,5 @@
 name: apol1-classifier
-description: Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment.
+version: 0.1.1
 inputs:
   samplesheet: List[GenotypeRecord]
 steps:
diff --git a/examples/apol1/apol1-classifier/project.yaml b/examples/apol1/apol1-classifier/project.yaml
@@ -5,6 +5,7 @@ template: dynamic-nextflow
 version: 0.1.1
 assets:
 - classify_apol1.py
+description: Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment.
 inputs:
 - name: participants
   type: List[GenotypeRecord]
diff --git a/examples/apol1/apol1-classifier/workflow.nf b/examples/apol1/apol1-classifier/workflow.nf
@@ -1,3 +1,5 @@
+// BioVault workflow export v0.1.1
+
 nextflow.enable.dsl=2
 
 workflow USER {
@@ -39,6 +41,8 @@ process apol1_classifier {
     container 'ghcr.io/openmined/bioscript:0.1.4'
     publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: 'result_APOL1_*.tsv'
     tag { participant_id }
+    errorStrategy { params.nextflow.error_strategy }
+    maxRetries { params.nextflow.max_retries }
 
     input:
         tuple path(assets_dir), val(participant_id), path(genotype_file)
@@ -47,9 +51,10 @@ process apol1_classifier {
         path "result_APOL1_${participant_id}.tsv"
 
     script:
-    def filename = genotype_file.name
+    def genoFileName = genotype_file.getName()
     """
-    bioscript classify "${{assets_dir}}/classify_apol1.py" --file "${filename}" --participant_id "${{participant_id}}"
+    GENO_FILE=\$(printf '%q' "${genoFileName}")
+    bioscript classify "${assets_dir}/classify_apol1.py" --file \$GENO_FILE --participant_id "${participant_id}"
     """
 }
 
@@ -64,13 +69,9 @@ process aggregate_results {
         path "result_APOL1.tsv"
 
     script:
+    def manifestContent = individual_results.collect { it.toString() }.join('\n') + '\n'
     """
-    # Extract header from first file
-    head -n 1 ${individual_results[0]} > result_APOL1.tsv
-
-    # Append all data rows (skip headers)
-    for file in ${individual_results}; do
-        tail -n +2 "\$file" >> result_APOL1.tsv
-    done
+    cat <<'EOF' > results.list\n${manifestContent}EOF
+    bioscript combine --list results.list --output result_APOL1.tsv
     """
 }
diff --git a/examples/apol1/apol1_dev.ipynb b/examples/apol1/apol1_dev.ipynb
@@ -318,6 +318,8 @@
     "            'path': 'result_APOL1.tsv',\n",
     "        },\n",
     "    ],\n",
+    "    version=\"0.1.1\",\n",
+    "    description=\"Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment.\",\n",
     ")\n",
     "project\n"
    ]
@@ -356,6 +358,7 @@
     "            },\n",
     "        ),\n",
     "    ],\n",
+    "    version=\"0.1.1\",\n",
     ")\n",
     "pipeline\n"
    ]
@@ -544,27 +547,6 @@
     "!bioscript classify classify_apol1.py --file apol1_headerless.txt --participant_id=\"HEADERLESS\"\n",
     "!cat result_APOL1_HEADERLESS.tsv\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/examples/brca/brca-classifier/pipeline.yaml b/examples/brca/brca-classifier/pipeline.yaml
@@ -1,5 +1,5 @@
 name: brca-classifier
-description: Classification of BRCA1 and BRCA2 variants from ClinVar database.
+version: 0.1.1
 inputs:
   samplesheet: List[GenotypeRecord]
 steps:
diff --git a/examples/brca/brca-classifier/project.yaml b/examples/brca/brca-classifier/project.yaml
@@ -7,6 +7,7 @@ assets:
 - classify_brca.py
 - brca2_clinvar.tsv
 - brca1_clinvar.tsv
+description: Classification of BRCA variants using ClinVar reference data for hereditary cancer risk.
 inputs:
 - name: participants
   type: List[GenotypeRecord]
diff --git a/examples/brca/brca-classifier/workflow.nf b/examples/brca/brca-classifier/workflow.nf
@@ -1,3 +1,5 @@
+// BioVault workflow export v0.1.1
+
 nextflow.enable.dsl=2
 
 workflow USER {
@@ -39,6 +41,8 @@ process brca_classifier {
     container 'ghcr.io/openmined/bioscript:0.1.4'
     publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: 'result_BRCA_*.tsv'
     tag { participant_id }
+    errorStrategy { params.nextflow.error_strategy }
+    maxRetries { params.nextflow.max_retries }
 
     input:
         tuple path(assets_dir), val(participant_id), path(genotype_file)
@@ -47,9 +51,10 @@ process brca_classifier {
         path "result_BRCA_${participant_id}.tsv"
 
     script:
-    def filename = genotype_file.name
+    def genoFileName = genotype_file.getName()
     """
-    bioscript classify "${{assets_dir}}/classify_brca.py" --file "${filename}" --participant_id "${{participant_id}}"
+    GENO_FILE=\$(printf '%q' "${genoFileName}")
+    bioscript classify "${assets_dir}/classify_brca.py" --file \$GENO_FILE --participant_id "${participant_id}"
     """
 }
 
@@ -64,13 +69,9 @@ process aggregate_results {
         path "result_BRCA.tsv"
 
     script:
+    def manifestContent = individual_results.collect { it.toString() }.join('\n') + '\n'
     """
-    # Extract header from first file
-    head -n 1 ${individual_results[0]} > result_BRCA.tsv
-
-    # Append all data rows (skip headers)
-    for file in ${individual_results}; do
-        tail -n +2 "\$file" >> result_BRCA.tsv
-    done
+    cat <<'EOF' > results.list\n${manifestContent}EOF
+    bioscript combine --list results.list --output result_BRCA.tsv
     """
 }
diff --git a/examples/brca/brca_dev.ipynb b/examples/brca/brca_dev.ipynb
@@ -345,6 +345,8 @@
     "            'path': 'result_BRCA.tsv',\n",
     "        },\n",
     "    ],\n",
+    "    version=\"0.1.1\",\n",
+    "    description=\"Classification of BRCA variants using ClinVar reference data for hereditary cancer risk.\",\n",
     ")\n",
     "project\n"
    ]
@@ -383,6 +385,7 @@
     "            },\n",
     "        ),\n",
     "    ],\n",
+    "    version=\"0.1.1\",\n",
     ")\n",
     "pipeline\n"
    ]
diff --git a/examples/herc2/herc2-classifier/pipeline.yaml b/examples/herc2/herc2-classifier/pipeline.yaml
@@ -1,5 +1,5 @@
 name: herc2-classifier
-description: Classification of HERC2 gene variant (rs12913832) for eye color prediction.
+version: 0.1.1
 inputs:
   samplesheet: List[GenotypeRecord]
 steps:
diff --git a/examples/herc2/herc2-classifier/project.yaml b/examples/herc2/herc2-classifier/project.yaml
@@ -5,6 +5,7 @@ template: dynamic-nextflow
 version: 0.1.1
 assets:
 - classify_herc2.py
+description: Classification of HERC2 genotypes for eye color prediction.
 inputs:
 - name: participants
   type: List[GenotypeRecord]
diff --git a/examples/herc2/herc2-classifier/workflow.nf b/examples/herc2/herc2-classifier/workflow.nf
@@ -1,3 +1,5 @@
+// BioVault workflow export v0.1.1
+
 nextflow.enable.dsl=2
 
 workflow USER {
@@ -39,6 +41,8 @@ process herc2_classifier {
     container 'ghcr.io/openmined/bioscript:0.1.4'
     publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: 'result_HERC2_*.tsv'
     tag { participant_id }
+    errorStrategy { params.nextflow.error_strategy }
+    maxRetries { params.nextflow.max_retries }
 
     input:
         tuple path(assets_dir), val(participant_id), path(genotype_file)
@@ -47,9 +51,10 @@ process herc2_classifier {
         path "result_HERC2_${participant_id}.tsv"
 
     script:
-    def filename = genotype_file.name
+    def genoFileName = genotype_file.getName()
     """
-    bioscript classify "${{assets_dir}}/classify_herc2.py" --file "${filename}" --participant_id "${{participant_id}}"
+    GENO_FILE=\$(printf '%q' "${genoFileName}")
+    bioscript classify "${assets_dir}/classify_herc2.py" --file \$GENO_FILE --participant_id "${participant_id}"
     """
 }
 
@@ -64,13 +69,9 @@ process aggregate_results {
         path "result_HERC2.tsv"
 
     script:
+    def manifestContent = individual_results.collect { it.toString() }.join('\n') + '\n'
     """
-    # Extract header from first file
-    head -n 1 ${individual_results[0]} > result_HERC2.tsv
-
-    # Append all data rows (skip headers)
-    for file in ${individual_results}; do
-        tail -n +2 "\$file" >> result_HERC2.tsv
-    done
+    cat <<'EOF' > results.list\n${manifestContent}EOF
+    bioscript combine --list results.list --output result_HERC2.tsv
     """
 }
diff --git a/examples/herc2/herc2_dev.ipynb b/examples/herc2/herc2_dev.ipynb
@@ -335,6 +335,8 @@
     "            'path': 'result_HERC2.tsv',\n",
     "        },\n",
     "    ],\n",
+    "    version=\"0.1.1\",\n",
+    "    description=\"Classification of HERC2 genotypes for eye color prediction.\",\n",
     ")\n",
     "project"
    ]
@@ -374,25 +376,10 @@
     "            },\n",
     "        ),\n",
     "    ],\n",
+    "    version=\"0.1.1\",\n",
     ")\n",
     "pipeline\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e5047888-0dca-4ba7-b0eb-d6e9c5bcbf76",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8dccac3e-2274-488c-b130-c3403828ecd3",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/python/src/bioscript/biovault.py b/python/src/bioscript/biovault.py
diff --git a/python/src/bioscript/cli.py b/python/src/bioscript/cli.py