Commit 0a54d3c

Merge pull request #7 from OpenMined/madhava/tweaks
Madhava/tweaks
2 parents fe16267 + 65a6937 commit 0a54d3c

File tree

8 files changed (+203 -67 lines)


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -64,4 +64,7 @@ notebooks/downloads/*
 notebooks/work/*
 **/*.ipynb_checkpoints
 examples/herc2/herc2_*.tsv
-examples/herc2/classify_herc2_*.py
+examples/herc2/classify_herc2_*.py
+
+examples/**/work/*
+examples/**/.nextflow*

docker/build.sh

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ echo "Building bioscript:${VERSION} for ${PLATFORMS}..."
 
 REMOTE_TAGS=( )
 if [ -n "$REMOTE_IMAGE" ]; then
-  REMOTE_TAGS=("${REMOTE_IMAGE}:${VERSION}" "${REMOTE_IMAGE}:latest")
+  REMOTE_TAGS=("${REMOTE_IMAGE}:${VERSION}" "${REMOTE_IMAGE}:latest" "${REMOTE_IMAGE}:0.1.1")
 fi
 
 BUILD_CMD=(docker buildx build
@@ -96,7 +96,7 @@ else
   LOAD_PLATFORM_RESOLVED="$LOAD_PLATFORM"
 fi
 
-LOCAL_TAGS=("bioscript:${VERSION}" "bioscript:latest")
+LOCAL_TAGS=("bioscript:${VERSION}" "bioscript:latest" "${REMOTE_IMAGE}:0.1.1")
 LOCAL_LOADED=0
 if [ -n "$LOAD_PLATFORM_RESOLVED" ]; then
   LOAD_CMD=(docker buildx build

examples/herc2/herc2-classifier/project.yaml

Lines changed: 9 additions & 11 deletions
@@ -3,21 +3,19 @@ author: [email protected]
 workflow: workflow.nf
 template: dynamic-nextflow
 version: 0.1.0
-docker_image: ghcr.io/openmined/bioscript:0.1.1
-docker_platform: linux/amd64
 assets:
   - classify_herc2.py
 inputs:
-  - name: genotype_file
-    type: File
-    description: Participant genotype TSV
-    format: tsv
-  - name: data_dir
-    type: Directory
-    description: Base directory containing sample files
+  - name: participants
+    type: List[GenotypeRecord]
+    description: CSV/TSV with participant_id and genotype_file columns
+    format: csv
+    mapping:
+      participant_id: participant_id
+      genotype_file: genotype_file
 outputs:
   - name: classification_result
     type: File
-    description: HERC2 eye color classification
+    description: HERC2 eye color classification (aggregated)
     format: tsv
-    path: result_HERC2_{participant_id}.tsv
+    path: result_HERC2.tsv
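
For reference, the new participants input expects a CSV (or TSV) manifest with participant_id and genotype_file columns, matching the mapping above. A minimal Python sketch of writing such a manifest (the participant IDs and file paths below are invented for illustration):

import csv
from pathlib import Path

# Hypothetical genotype files; real paths depend on your data layout.
records = [
    {"participant_id": "P001", "genotype_file": "data/P001_genotype.tsv"},
    {"participant_id": "P002", "genotype_file": "data/P002_genotype.tsv"},
]

manifest = Path("participants.csv")
with manifest.open("w", newline="") as fh:
    # Column names must match the project.yaml mapping
    writer = csv.DictWriter(fh, fieldnames=["participant_id", "genotype_file"])
    writer.writeheader()
    writer.writerows(records)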

examples/herc2/herc2-classifier/workflow.nf

Lines changed: 47 additions & 15 deletions
@@ -3,39 +3,71 @@ nextflow.enable.dsl=2
 workflow USER {
     take:
         context
-        genotype_file
-        data_dir
+        participants // Channel emitting GenotypeRecord maps
 
     main:
     def assetsDir = file(context.params.assets_dir)
     def workflowScript = file("${assetsDir}/classify_herc2.py")
-    def script_ch = Channel.value(workflowScript)
-    def classification_result_ch = herc2_classifier(
-        script_ch,
-        genotype_file,
-        data_dir
+
+    // Extract (participant_id, genotype_file) tuples from the records channel
+    def participant_tuples = participants.map { record ->
+        tuple(
+            record.participant_id,
+            file(record.genotype_file)
+        )
+    }
+
+    // Process each participant
+    def per_participant_results = herc2_classifier(
+        workflowScript,
+        participant_tuples
+    )
+
+    // Aggregate all results into single file
+    def aggregated = aggregate_results(
+        per_participant_results.collect()
     )
 
     emit:
-    classification_result = classification_result_ch
+    classification_result = aggregated
 }
 
 process herc2_classifier {
     container 'ghcr.io/openmined/bioscript:0.1.1'
-    publishDir params.results_dir, mode: 'copy', overwrite: true
+    publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: 'result_HERC2_*.tsv'
+    tag { participant_id }
 
     input:
     path script
-    path genotype_file
-    path data_dir
+    tuple val(participant_id), path(genotype_file)
+
+    output:
+    path "result_HERC2_${participant_id}.tsv"
+
+    script:
+    """
+    bioscript classify "${script}" --file "${genotype_file}" --participant_id "${participant_id}"
+    """
+}
+
+process aggregate_results {
+    container 'ghcr.io/openmined/bioscript:0.1.1'
+    publishDir params.results_dir, mode: 'copy', overwrite: true
+
+    input:
+    path individual_results
 
     output:
-    path 'result_HERC2_{participant_id}.tsv', emit: classification_result
+    path "result_HERC2.tsv"
 
     script:
     """
-    python3 ${script} \n --input "${genotype_file}"
-        --data-dir "${data_dir}"
-        --output "result_HERC2_{participant_id}.tsv"
+    # Extract header from first file
+    head -n 1 ${individual_results[0]} > result_HERC2.tsv
+
+    # Append all data rows (skip headers)
+    for file in ${individual_results}; do
+        tail -n +2 "\$file" >> result_HERC2.tsv
+    done
     """
 }
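
The aggregate_results process merges the per-participant TSVs by taking the header from the first file and appending the data rows of every file. A rough Python equivalent of that shell logic, included only to clarify what the process does (the file names in the usage comment are hypothetical):

from pathlib import Path

def aggregate_tsvs(individual_results: list[Path], output: Path) -> None:
    """Concatenate per-participant TSVs: one header line, then all data rows."""
    with output.open("w") as out:
        # Header comes from the first result file
        first_lines = individual_results[0].read_text().splitlines(keepends=True)
        out.write(first_lines[0])
        for tsv in individual_results:
            # Skip each file's own header, append its data rows
            out.writelines(tsv.read_text().splitlines(keepends=True)[1:])

# Example (hypothetical file names):
# aggregate_tsvs([Path("result_HERC2_P001.tsv"), Path("result_HERC2_P002.tsv")],
#                Path("result_HERC2.tsv"))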

examples/herc2/herc2_dev.ipynb

Lines changed: 12 additions & 33 deletions
@@ -20,26 +20,6 @@
     "# !uv pip install -e ../../python"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "743e7ef5-9338-4a8e-83f5-e80e08c37f2c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from bioscript import __version__"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7abe09bc-a6bf-4de0-85c9-082253f29500",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(__version__)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -327,7 +307,7 @@
     "from pathlib import Path\n",
     "from bioscript import export_bioscript_workflow\n",
     "\n",
-    "# workflow_root = Path('examples/herc2')\n",
+    "# Export with List[GenotypeRecord] for multi-participant processing\n",
     "project = export_bioscript_workflow(\n",
     "    script_path='./classify_herc2.py',\n",
     "    workflow_name='herc2-classifier',\n",
@@ -336,24 +316,23 @@
     "    assets={},\n",
     "    inputs=[\n",
     "        {\n",
-    "            'name': 'genotype_file',\n",
-    "            'type': 'File',\n",
-    "            'description': 'Participant genotype TSV',\n",
-    "            'format': 'tsv',\n",
-    "        },\n",
-    "        {\n",
-    "            'name': 'data_dir',\n",
-    "            'type': 'Directory',\n",
-    "            'description': 'Base directory containing sample files',\n",
+    "            'name': 'participants',\n",
+    "            'type': 'List[GenotypeRecord]',\n",
+    "            'description': 'CSV/TSV with participant_id and genotype_file columns',\n",
+    "            'format': 'csv',\n",
+    "            'mapping': {\n",
+    "                'participant_id': 'participant_id',\n",
+    "                'genotype_file': 'genotype_file',\n",
+    "            }\n",
     "        }\n",
     "    ],\n",
     "    outputs=[\n",
     "        {\n",
     "            'name': 'classification_result',\n",
     "            'type': 'File',\n",
-    "            'description': 'HERC2 eye color classification',\n",
+    "            'description': 'HERC2 eye color classification (aggregated)',\n",
     "            'format': 'tsv',\n",
-    "            'path': 'result_HERC2_{participant_id}.tsv',\n",
+    "            'path': 'result_HERC2.tsv',\n",
     "        },\n",
     "    ],\n",
     ")\n",
@@ -363,7 +342,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8591d722-8b83-4c9d-9cf1-51b6d26ae05b",
+   "id": "e5047888-0dca-4ba7-b0eb-d6e9c5bcbf76",
    "metadata": {},
    "outputs": [],
    "source": []

lint.sh

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,11 @@
 #!/bin/bash
 set -e
 
+export UV_VENV_CLEAR=1
+uv venv
+uv pip install -e ./python
+uv pip install pytest ruff mypy vulture
+
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/python"

python/src/bioscript/biovault.py

Lines changed: 120 additions & 4 deletions
@@ -304,10 +304,11 @@ def to_yaml(self) -> str:
             "version": self.version,
         }
 
-        if self.docker_image:
-            data["docker_image"] = self.docker_image
-        if self.docker_platform:
-            data["docker_platform"] = self.docker_platform
+        # Docker image and platform are hardcoded in workflow generation, not exposed in YAML
+        # if self.docker_image:
+        #     data["docker_image"] = self.docker_image
+        # if self.docker_platform:
+        #     data["docker_platform"] = self.docker_platform
 
         if self.assets:
             data["assets"] = self.assets
@@ -440,6 +441,113 @@ def set_docker_image(
         self.docker_platform = platform
         return self
 
+    def _generate_participant_workflow_nf(self, entrypoint: Optional[str] = None) -> str:
+        """Generate workflow for List[GenotypeRecord] with participant iteration and aggregation."""
+
+        primary_process = self.processes[0]
+        container_image = primary_process.container or self.docker_image or _default_docker_image()
+        workflow_script_asset = entrypoint or self._entrypoint or primary_process.script
+
+        # Determine output pattern from outputs
+        individual_pattern = None
+        aggregated_path = None
+        classifier_name = None
+
+        for output_spec in self.outputs:
+            if output_spec.path:
+                if "{participant_id}" in output_spec.path:
+                    individual_pattern = output_spec.path.replace("{participant_id}", "*")
+                else:
+                    aggregated_path = output_spec.path
+                    # Extract classifier name from aggregated path (e.g., result_HERC2.tsv -> HERC2)
+                    if aggregated_path.startswith("result_") and aggregated_path.endswith(".tsv"):
+                        classifier_name = aggregated_path[7:-4]  # Remove "result_" and ".tsv"
+
+        if not classifier_name:
+            classifier_name = self.name.upper().replace("-", "_").replace(" ", "_")
+
+        if not individual_pattern:
+            individual_pattern = f"result_{classifier_name}_*.tsv"
+        if not aggregated_path:
+            aggregated_path = f"result_{classifier_name}.tsv"
+
+        # Generate workflow
+        workflow = f'''nextflow.enable.dsl=2
+
+workflow USER {{
+    take:
+        context
+        participants // Channel emitting GenotypeRecord maps
+
+    main:
+    def assetsDir = file(context.params.assets_dir)
+    def workflowScript = file("${{assetsDir}}/{workflow_script_asset}")
+
+    // Extract (participant_id, genotype_file) tuples from the records channel
+    def participant_tuples = participants.map {{ record ->
+        tuple(
+            record.participant_id,
+            file(record.genotype_file)
+        )
+    }}
+
+    // Process each participant
+    def per_participant_results = {primary_process.name}(
+        workflowScript,
+        participant_tuples
+    )
+
+    // Aggregate all results into single file
+    def aggregated = aggregate_results(
+        per_participant_results.collect()
+    )
+
+    emit:
+    {self.outputs[0].name if self.outputs else "classification_result"} = aggregated
+}}
+
+process {primary_process.name} {{
+    container '{container_image}'
+    publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: '{individual_pattern}'
+    tag {{ participant_id }}
+
+    input:
+    path script
+    tuple val(participant_id), path(genotype_file)
+
+    output:
+    path "result_{classifier_name}_${{participant_id}}.tsv"
+
+    script:
+    """
+    bioscript classify "${{script}}" --file "${{genotype_file}}" --participant_id "${{participant_id}}"
+    """
+}}
+
+process aggregate_results {{
+    container '{container_image}'
+    publishDir params.results_dir, mode: 'copy', overwrite: true
+
+    input:
+    path individual_results
+
+    output:
+    path "{aggregated_path}"
+
+    script:
+    """
+    # Extract header from first file
+    head -n 1 ${{individual_results[0]}} > {aggregated_path}
+
+    # Append all data rows (skip headers)
+    for file in ${{individual_results}}; do
+        tail -n +2 "\\$file" >> {aggregated_path}
+    done
+    """
+}}
+'''
+        return workflow
+
     def generate_workflow_nf(self, entrypoint: Optional[str] = None) -> str:
         """Generate a Nextflow workflow file for this workflow."""
 
@@ -463,6 +571,14 @@ def generate_workflow_nf(self, entrypoint: Optional[str] = None) -> str:
         if not self._entrypoint:
             self._entrypoint = script_candidate
 
+        # Check if using List[GenotypeRecord] - requires different workflow pattern
+        uses_genotype_list = any(
+            inp.type.startswith("List[GenotypeRecord") for inp in self.inputs
+        )
+
+        if uses_genotype_list:
+            return self._generate_participant_workflow_nf(entrypoint)
+
        if len(self.processes) > 1:
            raise NotImplementedError("Multiple processes per workflow are not supported yet")
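
To make the naming convention concrete: _generate_participant_workflow_nf derives the classifier name and the default output patterns from the declared output paths. Below is a simplified, standalone restatement of that logic in Python (not the actual bioscript API; the function name and signature here are invented):

def derive_output_names(output_paths, workflow_name):
    """Mirror the pattern/defaulting logic used by _generate_participant_workflow_nf."""
    individual_pattern = None
    aggregated_path = None
    classifier_name = None

    for path in output_paths:
        if "{participant_id}" in path:
            individual_pattern = path.replace("{participant_id}", "*")
        else:
            aggregated_path = path
            if path.startswith("result_") and path.endswith(".tsv"):
                classifier_name = path[7:-4]  # strip "result_" and ".tsv"

    if not classifier_name:
        classifier_name = workflow_name.upper().replace("-", "_").replace(" ", "_")
    if not individual_pattern:
        individual_pattern = f"result_{classifier_name}_*.tsv"
    if not aggregated_path:
        aggregated_path = f"result_{classifier_name}.tsv"
    return classifier_name, individual_pattern, aggregated_path

# e.g. derive_output_names(["result_HERC2.tsv"], "herc2-classifier")
#   -> ("HERC2", "result_HERC2_*.tsv", "result_HERC2.tsv")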

test.sh

Lines changed: 4 additions & 1 deletion
@@ -1,6 +1,9 @@
 #!/bin/bash
 set -e
-
+export UV_VENV_CLEAR=1
+uv venv
+uv pip install -e ./python
+uv pip install pytest
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/python"
 uv run pytest
