Commit 0a54d3c

Merge pull request #7 from OpenMined/madhava/tweaks
Madhava/tweaks
2 parents fe16267 + 65a6937 commit 0a54d3c

File tree

8 files changed (+203 -67 lines)


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -64,4 +64,7 @@ notebooks/downloads/*
 notebooks/work/*
 **/*.ipynb_checkpoints
 examples/herc2/herc2_*.tsv
-examples/herc2/classify_herc2_*.py
+examples/herc2/classify_herc2_*.py
+
+examples/**/work/*
+examples/**/.nextflow*

docker/build.sh

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ echo "Building bioscript:${VERSION} for ${PLATFORMS}..."
 
 REMOTE_TAGS=( )
 if [ -n "$REMOTE_IMAGE" ]; then
-  REMOTE_TAGS=("${REMOTE_IMAGE}:${VERSION}" "${REMOTE_IMAGE}:latest")
+  REMOTE_TAGS=("${REMOTE_IMAGE}:${VERSION}" "${REMOTE_IMAGE}:latest" "${REMOTE_IMAGE}:0.1.1")
 fi
 
 BUILD_CMD=(docker buildx build
@@ -96,7 +96,7 @@ else
   LOAD_PLATFORM_RESOLVED="$LOAD_PLATFORM"
 fi
 
-LOCAL_TAGS=("bioscript:${VERSION}" "bioscript:latest")
+LOCAL_TAGS=("bioscript:${VERSION}" "bioscript:latest" "${REMOTE_IMAGE}:0.1.1")
 LOCAL_LOADED=0
 if [ -n "$LOAD_PLATFORM_RESOLVED" ]; then
   LOAD_CMD=(docker buildx build

examples/herc2/herc2-classifier/project.yaml

Lines changed: 9 additions & 11 deletions
@@ -3,21 +3,19 @@ author: [email protected]
 workflow: workflow.nf
 template: dynamic-nextflow
 version: 0.1.0
-docker_image: ghcr.io/openmined/bioscript:0.1.1
-docker_platform: linux/amd64
 assets:
   - classify_herc2.py
 inputs:
-  - name: genotype_file
-    type: File
-    description: Participant genotype TSV
-    format: tsv
-  - name: data_dir
-    type: Directory
-    description: Base directory containing sample files
+  - name: participants
+    type: List[GenotypeRecord]
+    description: CSV/TSV with participant_id and genotype_file columns
+    format: csv
+    mapping:
+      participant_id: participant_id
+      genotype_file: genotype_file
 outputs:
   - name: classification_result
     type: File
-    description: HERC2 eye color classification
+    description: HERC2 eye color classification (aggregated)
     format: tsv
-    path: result_HERC2_{participant_id}.tsv
+    path: result_HERC2.tsv
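
For reference, the new participants input expects a CSV (or TSV) manifest with participant_id and genotype_file columns, matching the mapping above. A minimal Python sketch of writing such a manifest (the participant IDs and file paths below are invented for illustration):

import csv
from pathlib import Path

# Hypothetical genotype files; real paths depend on your data layout.
records = [
    {"participant_id": "P001", "genotype_file": "data/P001_genotype.tsv"},
    {"participant_id": "P002", "genotype_file": "data/P002_genotype.tsv"},
]

manifest = Path("participants.csv")
with manifest.open("w", newline="") as fh:
    # Column names must match the project.yaml mapping
    writer = csv.DictWriter(fh, fieldnames=["participant_id", "genotype_file"])
    writer.writeheader()
    writer.writerows(records)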

examples/herc2/herc2-classifier/workflow.nf

Lines changed: 47 additions & 15 deletions
@@ -3,39 +3,71 @@ nextflow.enable.dsl=2
 workflow USER {
     take:
         context
-        genotype_file
-        data_dir
+        participants // Channel emitting GenotypeRecord maps
 
     main:
     def assetsDir = file(context.params.assets_dir)
     def workflowScript = file("${assetsDir}/classify_herc2.py")
-    def script_ch = Channel.value(workflowScript)
-    def classification_result_ch = herc2_classifier(
-        script_ch,
-        genotype_file,
-        data_dir
+
+    // Extract (participant_id, genotype_file) tuples from the records channel
+    def participant_tuples = participants.map { record ->
+        tuple(
+            record.participant_id,
+            file(record.genotype_file)
+        )
+    }
+
+    // Process each participant
+    def per_participant_results = herc2_classifier(
+        workflowScript,
+        participant_tuples
+    )
+
+    // Aggregate all results into single file
+    def aggregated = aggregate_results(
+        per_participant_results.collect()
     )
 
     emit:
-    classification_result = classification_result_ch
+    classification_result = aggregated
 }
 
 process herc2_classifier {
     container 'ghcr.io/openmined/bioscript:0.1.1'
-    publishDir params.results_dir, mode: 'copy', overwrite: true
+    publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: 'result_HERC2_*.tsv'
+    tag { participant_id }
 
     input:
     path script
-    path genotype_file
-    path data_dir
+    tuple val(participant_id), path(genotype_file)
+
+    output:
+    path "result_HERC2_${participant_id}.tsv"
+
+    script:
+    """
+    bioscript classify "${script}" --file "${genotype_file}" --participant_id "${participant_id}"
+    """
+}
+
+process aggregate_results {
+    container 'ghcr.io/openmined/bioscript:0.1.1'
+    publishDir params.results_dir, mode: 'copy', overwrite: true
+
+    input:
+    path individual_results
 
     output:
-    path 'result_HERC2_{participant_id}.tsv', emit: classification_result
+    path "result_HERC2.tsv"
 
     script:
     """
-    python3 ${script} \n --input "${genotype_file}"
-        --data-dir "${data_dir}"
-        --output "result_HERC2_{participant_id}.tsv"
+    # Extract header from first file
+    head -n 1 ${individual_results[0]} > result_HERC2.tsv
+
+    # Append all data rows (skip headers)
+    for file in ${individual_results}; do
+        tail -n +2 "\$file" >> result_HERC2.tsv
+    done
     """
 }
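
The aggregate_results process merges the per-participant TSVs by taking the header from the first file and appending the data rows of every file. A rough Python equivalent of that shell logic, included only to clarify what the process does (the file names in the usage comment are hypothetical):

from pathlib import Path

def aggregate_tsvs(individual_results: list[Path], output: Path) -> None:
    """Concatenate per-participant TSVs: one header line, then all data rows."""
    with output.open("w") as out:
        # Header comes from the first result file
        first_lines = individual_results[0].read_text().splitlines(keepends=True)
        out.write(first_lines[0])
        for tsv in individual_results:
            # Skip each file's own header, append its data rows
            out.writelines(tsv.read_text().splitlines(keepends=True)[1:])

# Example (hypothetical file names):
# aggregate_tsvs([Path("result_HERC2_P001.tsv"), Path("result_HERC2_P002.tsv")],
#                Path("result_HERC2.tsv"))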

examples/herc2/herc2_dev.ipynb

Lines changed: 12 additions & 33 deletions
@@ -20,26 +20,6 @@
     "# !uv pip install -e ../../python"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "743e7ef5-9338-4a8e-83f5-e80e08c37f2c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from bioscript import __version__"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7abe09bc-a6bf-4de0-85c9-082253f29500",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(__version__)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -327,7 +307,7 @@
     "from pathlib import Path\n",
     "from bioscript import export_bioscript_workflow\n",
     "\n",
-    "# workflow_root = Path('examples/herc2')\n",
+    "# Export with List[GenotypeRecord] for multi-participant processing\n",
     "project = export_bioscript_workflow(\n",
     "    script_path='./classify_herc2.py',\n",
     "    workflow_name='herc2-classifier',\n",
@@ -336,24 +316,23 @@
     "    assets={},\n",
     "    inputs=[\n",
     "        {\n",
-    "            'name': 'genotype_file',\n",
-    "            'type': 'File',\n",
-    "            'description': 'Participant genotype TSV',\n",
-    "            'format': 'tsv',\n",
-    "        },\n",
-    "        {\n",
-    "            'name': 'data_dir',\n",
-    "            'type': 'Directory',\n",
-    "            'description': 'Base directory containing sample files',\n",
+    "            'name': 'participants',\n",
+    "            'type': 'List[GenotypeRecord]',\n",
+    "            'description': 'CSV/TSV with participant_id and genotype_file columns',\n",
+    "            'format': 'csv',\n",
+    "            'mapping': {\n",
+    "                'participant_id': 'participant_id',\n",
+    "                'genotype_file': 'genotype_file',\n",
+    "            }\n",
     "        }\n",
     "    ],\n",
     "    outputs=[\n",
     "        {\n",
     "            'name': 'classification_result',\n",
     "            'type': 'File',\n",
-    "            'description': 'HERC2 eye color classification',\n",
+    "            'description': 'HERC2 eye color classification (aggregated)',\n",
     "            'format': 'tsv',\n",
-    "            'path': 'result_HERC2_{participant_id}.tsv',\n",
+    "            'path': 'result_HERC2.tsv',\n",
     "        },\n",
     "    ],\n",
     ")\n",
@@ -363,7 +342,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8591d722-8b83-4c9d-9cf1-51b6d26ae05b",
+   "id": "e5047888-0dca-4ba7-b0eb-d6e9c5bcbf76",
    "metadata": {},
    "outputs": [],
    "source": []

lint.sh

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,11 @@
 #!/bin/bash
 set -e
 
+export UV_VENV_CLEAR=1
+uv venv
+uv pip install -e ./python
+uv pip install pytest ruff mypy vulture
+
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/python"

python/src/bioscript/biovault.py

Lines changed: 120 additions & 4 deletions
@@ -304,10 +304,11 @@ def to_yaml(self) -> str:
             "version": self.version,
         }
 
-        if self.docker_image:
-            data["docker_image"] = self.docker_image
-        if self.docker_platform:
-            data["docker_platform"] = self.docker_platform
+        # Docker image and platform are hardcoded in workflow generation, not exposed in YAML
+        # if self.docker_image:
+        #     data["docker_image"] = self.docker_image
+        # if self.docker_platform:
+        #     data["docker_platform"] = self.docker_platform
 
         if self.assets:
             data["assets"] = self.assets
@@ -440,6 +441,113 @@ def set_docker_image(
         self.docker_platform = platform
         return self
 
+    def _generate_participant_workflow_nf(self, entrypoint: Optional[str] = None) -> str:
+        """Generate workflow for List[GenotypeRecord] with participant iteration and aggregation."""
+
+        primary_process = self.processes[0]
+        container_image = primary_process.container or self.docker_image or _default_docker_image()
+        workflow_script_asset = entrypoint or self._entrypoint or primary_process.script
+
+        # Determine output pattern from outputs
+        individual_pattern = None
+        aggregated_path = None
+        classifier_name = None
+
+        for output_spec in self.outputs:
+            if output_spec.path:
+                if "{participant_id}" in output_spec.path:
+                    individual_pattern = output_spec.path.replace("{participant_id}", "*")
+                else:
+                    aggregated_path = output_spec.path
+                    # Extract classifier name from aggregated path (e.g., result_HERC2.tsv -> HERC2)
+                    if aggregated_path.startswith("result_") and aggregated_path.endswith(".tsv"):
+                        classifier_name = aggregated_path[7:-4]  # Remove "result_" and ".tsv"
+
+        if not classifier_name:
+            classifier_name = self.name.upper().replace("-", "_").replace(" ", "_")
+
+        if not individual_pattern:
+            individual_pattern = f"result_{classifier_name}_*.tsv"
+        if not aggregated_path:
+            aggregated_path = f"result_{classifier_name}.tsv"
+
+        # Generate workflow
+        workflow = f'''nextflow.enable.dsl=2
+
+workflow USER {{
+    take:
+        context
+        participants // Channel emitting GenotypeRecord maps
+
+    main:
+    def assetsDir = file(context.params.assets_dir)
+    def workflowScript = file("${{assetsDir}}/{workflow_script_asset}")
+
+    // Extract (participant_id, genotype_file) tuples from the records channel
+    def participant_tuples = participants.map {{ record ->
+        tuple(
+            record.participant_id,
+            file(record.genotype_file)
+        )
+    }}
+
+    // Process each participant
+    def per_participant_results = {primary_process.name}(
+        workflowScript,
+        participant_tuples
+    )
+
+    // Aggregate all results into single file
+    def aggregated = aggregate_results(
+        per_participant_results.collect()
+    )
+
+    emit:
+    {self.outputs[0].name if self.outputs else "classification_result"} = aggregated
+}}
+
+process {primary_process.name} {{
+    container '{container_image}'
+    publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: '{individual_pattern}'
+    tag {{ participant_id }}
+
+    input:
+    path script
+    tuple val(participant_id), path(genotype_file)
+
+    output:
+    path "result_{classifier_name}_${{participant_id}}.tsv"
+
+    script:
+    """
+    bioscript classify "${{script}}" --file "${{genotype_file}}" --participant_id "${{participant_id}}"
+    """
+}}
+
+process aggregate_results {{
+    container '{container_image}'
+    publishDir params.results_dir, mode: 'copy', overwrite: true
+
+    input:
+    path individual_results
+
+    output:
+    path "{aggregated_path}"
+
+    script:
+    """
+    # Extract header from first file
+    head -n 1 ${{individual_results[0]}} > {aggregated_path}
+
+    # Append all data rows (skip headers)
+    for file in ${{individual_results}}; do
+        tail -n +2 "\\$file" >> {aggregated_path}
+    done
+    """
+}}
+'''
+        return workflow
+
     def generate_workflow_nf(self, entrypoint: Optional[str] = None) -> str:
         """Generate a Nextflow workflow file for this workflow."""
 
@@ -463,6 +571,14 @@ def generate_workflow_nf(self, entrypoint: Optional[str] = None) -> str:
         if not self._entrypoint:
             self._entrypoint = script_candidate
 
+        # Check if using List[GenotypeRecord] - requires different workflow pattern
+        uses_genotype_list = any(
+            inp.type.startswith("List[GenotypeRecord") for inp in self.inputs
+        )
+
+        if uses_genotype_list:
+            return self._generate_participant_workflow_nf(entrypoint)
+
        if len(self.processes) > 1:
            raise NotImplementedError("Multiple processes per workflow are not supported yet")
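
To make the naming convention concrete: _generate_participant_workflow_nf derives the classifier name and the default output patterns from the declared output paths. Below is a simplified, standalone restatement of that logic in Python (not the actual bioscript API; the function name and signature here are invented):

def derive_output_names(output_paths, workflow_name):
    """Mirror the pattern/defaulting logic used by _generate_participant_workflow_nf."""
    individual_pattern = None
    aggregated_path = None
    classifier_name = None

    for path in output_paths:
        if "{participant_id}" in path:
            individual_pattern = path.replace("{participant_id}", "*")
        else:
            aggregated_path = path
            if path.startswith("result_") and path.endswith(".tsv"):
                classifier_name = path[7:-4]  # strip "result_" and ".tsv"

    if not classifier_name:
        classifier_name = workflow_name.upper().replace("-", "_").replace(" ", "_")
    if not individual_pattern:
        individual_pattern = f"result_{classifier_name}_*.tsv"
    if not aggregated_path:
        aggregated_path = f"result_{classifier_name}.tsv"
    return classifier_name, individual_pattern, aggregated_path

# e.g. derive_output_names(["result_HERC2.tsv"], "herc2-classifier")
#   -> ("HERC2", "result_HERC2_*.tsv", "result_HERC2.tsv")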

test.sh

Lines changed: 4 additions & 1 deletion
@@ -1,6 +1,9 @@
 #!/bin/bash
 set -e
-
+export UV_VENV_CLEAR=1
+uv venv
+uv pip install -e ./python
+uv pip install pytest
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/python"
 uv run pytest
