Skip to content

Commit 3339eb7

Browse files
authored
Merge pull request #9 from OpenMined/madhava/pipelines
Assets dir and participant column fixes
2 parents ba9ce66 + 0597d98 commit 3339eb7

File tree

17 files changed

+245
-207
lines changed

17 files changed

+245
-207
lines changed

examples/apol1/apol1-classifier/pipeline.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ steps:
1414
destination: SQL()
1515
source: classification_result
1616
table_name: apol1_{run_id}
17-
key_column: participant_id
17+
participant_column: participant_id

examples/apol1/apol1-classifier/workflow.nf

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,24 @@ workflow USER {
66
participants // Channel emitting GenotypeRecord maps
77

88
main:
9-
def assetsDir = file(context.params.assets_dir)
10-
def workflowScript = file("${assetsDir}/classify_apol1.py")
9+
def assetsDir = context.assets_dir
10+
if (!assetsDir) {
11+
throw new IllegalStateException("Missing assets directory in context")
12+
}
13+
def assetsDirPath = file(assetsDir)
1114

12-
// Extract (participant_id, genotype_file) tuples from the records channel
13-
def participant_tuples = participants.map { record ->
15+
// Pair the assets directory with each (participant_id, genotype_file) tuple
16+
def participant_work_items = participants.map { record ->
1417
tuple(
18+
assetsDirPath,
1519
record.participant_id,
1620
file(record.genotype_file)
1721
)
1822
}
1923

2024
// Process each participant
2125
def per_participant_results = apol1_classifier(
22-
workflowScript,
23-
participant_tuples
26+
participant_work_items
2427
)
2528

2629
// Aggregate all results into single file
@@ -38,15 +41,14 @@ process apol1_classifier {
3841
tag { participant_id }
3942

4043
input:
41-
path script
42-
tuple val(participant_id), path(genotype_file)
44+
tuple path(assets_dir), val(participant_id), path(genotype_file)
4345

4446
output:
4547
path "result_APOL1_${participant_id}.tsv"
4648

4749
script:
4850
"""
49-
bioscript classify "${script}" --file "${genotype_file}" --participant_id "${participant_id}"
51+
bioscript classify "${assets_dir}/classify_apol1.py" --file "${genotype_file}" --participant_id "${participant_id}"
5052
"""
5153
}
5254

examples/apol1/apol1_dev.ipynb

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@
261261
"# Run tests\n",
262262
"test_g0_homozygous()\n",
263263
"test_g1_homozygous()\n",
264-
"print(\"\u2713 All tests passed!\")"
264+
"print(\" All tests passed!\")"
265265
]
266266
},
267267
{
@@ -382,7 +382,7 @@
382382
" return test_file\n",
383383
"\n",
384384
"test_file = create_apol1_test_file()\n",
385-
"print('\u2713 Test file ready!')\n"
385+
"print(' Test file ready!')\n"
386386
]
387387
},
388388
{
@@ -405,7 +405,6 @@
405405
{
406406
"cell_type": "code",
407407
"execution_count": null,
408-
"id": "6515fe1b",
409408
"metadata": {},
410409
"outputs": [],
411410
"source": [
@@ -436,13 +435,12 @@
436435
" return csv_path\n",
437436
"\n",
438437
"decodeme_file = create_apol1_decodeme_file()\n",
439-
"print('\u2713 DecodeME sample ready!')\n"
438+
"print(' DecodeME sample ready!')\n"
440439
]
441440
},
442441
{
443442
"cell_type": "code",
444443
"execution_count": null,
445-
"id": "695eabd8",
446444
"metadata": {},
447445
"outputs": [],
448446
"source": [
@@ -453,7 +451,6 @@
453451
{
454452
"cell_type": "code",
455453
"execution_count": null,
456-
"id": "101625f9",
457454
"metadata": {},
458455
"outputs": [],
459456
"source": [
@@ -487,13 +484,12 @@
487484
" return csv_path\n",
488485
"\n",
489486
"myheritage_file = create_apol1_myheritage_file()\n",
490-
"print('\u2713 MyHeritage sample ready!')\n"
487+
"print(' MyHeritage sample ready!')\n"
491488
]
492489
},
493490
{
494491
"cell_type": "code",
495492
"execution_count": null,
496-
"id": "b808ac94",
497493
"metadata": {},
498494
"outputs": [],
499495
"source": [
@@ -504,7 +500,6 @@
504500
{
505501
"cell_type": "code",
506502
"execution_count": null,
507-
"id": "328c4426",
508503
"metadata": {},
509504
"outputs": [],
510505
"source": [
@@ -537,19 +532,39 @@
537532
" return tsv_path\n",
538533
"\n",
539534
"headerless_file = create_apol1_headerless_file()\n",
540-
"print('\u2713 Headerless sample ready!')\n"
535+
"print(' Headerless sample ready!')\n"
541536
]
542537
},
543538
{
544539
"cell_type": "code",
545540
"execution_count": null,
546-
"id": "159f07aa",
547541
"metadata": {},
548542
"outputs": [],
549543
"source": [
550544
"!bioscript classify classify_apol1.py --file apol1_headerless.txt --participant_id=\"HEADERLESS\"\n",
551545
"!cat result_APOL1_HEADERLESS.tsv\n"
552546
]
547+
},
548+
{
549+
"cell_type": "code",
550+
"execution_count": null,
551+
"metadata": {},
552+
"outputs": [],
553+
"source": []
554+
},
555+
{
556+
"cell_type": "code",
557+
"execution_count": null,
558+
"metadata": {},
559+
"outputs": [],
560+
"source": []
561+
},
562+
{
563+
"cell_type": "code",
564+
"execution_count": null,
565+
"metadata": {},
566+
"outputs": [],
567+
"source": []
553568
}
554569
],
555570
"metadata": {
@@ -573,4 +588,4 @@
573588
},
574589
"nbformat": 4,
575590
"nbformat_minor": 4
576-
}
591+
}

examples/brca/brca-classifier/assets/classify_brca.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,22 @@
22
from bioscript import optional_int, optional_str, write_tsv
33
from bioscript.classifier import GenotypeClassifier
44
from bioscript.types import VariantCall
5+
from bioscript import assets_dir
6+
7+
ASSETS_DIR = assets_dir()
8+
RESULT_HEADERS = [
9+
"participant_id",
10+
"filename",
11+
"gene",
12+
"rsid",
13+
"chromosome",
14+
"position",
15+
"genotype",
16+
"ref",
17+
"alt",
18+
"variant_type",
19+
"match_type",
20+
]
521

622
def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
723
"""Generate VariantCall objects from ClinVar DataFrame."""
@@ -21,7 +37,8 @@ def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
2137

2238
def get_vcs() -> list[VariantCall]:
2339
"""Load BRCA1 and BRCA2 variant calls from ClinVar TSV files."""
24-
dfs = [pd.read_csv(f, sep="\t") for f in ["brca1_clinvar.tsv", "brca2_clinvar.tsv"]]
40+
data_files = [ASSETS_DIR / name for name in ["brca1_clinvar.tsv", "brca2_clinvar.tsv"]]
41+
dfs = [pd.read_csv(f, sep="\t") for f in data_files]
2542
df = pd.concat(dfs, ignore_index=True)
2643
print(f"Loaded {len(df)} variants from BRCA1 and BRCA2")
2744
return generate_variant_calls(df)
@@ -41,7 +58,7 @@ def classify(self, matches):
4158
write_tsv(f"{self.output_basename}_ref.tsv", ref_rows)
4259
write_tsv(f"{self.output_basename}_no.tsv", no_rows)
4360

44-
write_tsv(f"{self.output_basename}.tsv", var_rows)
61+
write_tsv(f"{self.output_basename}.tsv", var_rows, headers=RESULT_HEADERS)
4562

4663
# Return variant rows for testing
4764
return var_rows

examples/brca/brca-classifier/pipeline.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ steps:
1414
destination: SQL()
1515
source: classification_result
1616
table_name: brca_{run_id}
17-
key_column: participant_id
17+
participant_column: participant_id

examples/brca/brca-classifier/workflow.nf

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,24 @@ workflow USER {
66
participants // Channel emitting GenotypeRecord maps
77

88
main:
9-
def assetsDir = file(context.params.assets_dir)
10-
def workflowScript = file("${assetsDir}/classify_brca.py")
9+
def assetsDir = context.assets_dir
10+
if (!assetsDir) {
11+
throw new IllegalStateException("Missing assets directory in context")
12+
}
13+
def assetsDirPath = file(assetsDir)
1114

12-
// Extract (participant_id, genotype_file) tuples from the records channel
13-
def participant_tuples = participants.map { record ->
15+
// Pair the assets directory with each (participant_id, genotype_file) tuple
16+
def participant_work_items = participants.map { record ->
1417
tuple(
18+
assetsDirPath,
1519
record.participant_id,
1620
file(record.genotype_file)
1721
)
1822
}
1923

2024
// Process each participant
2125
def per_participant_results = brca_classifier(
22-
workflowScript,
23-
participant_tuples
26+
participant_work_items
2427
)
2528

2629
// Aggregate all results into single file
@@ -38,15 +41,14 @@ process brca_classifier {
3841
tag { participant_id }
3942

4043
input:
41-
path script
42-
tuple val(participant_id), path(genotype_file)
44+
tuple path(assets_dir), val(participant_id), path(genotype_file)
4345

4446
output:
4547
path "result_BRCA_${participant_id}.tsv"
4648

4749
script:
4850
"""
49-
bioscript classify "${script}" --file "${genotype_file}" --participant_id "${participant_id}"
51+
bioscript classify "${assets_dir}/classify_brca.py" --file "${genotype_file}" --participant_id "${participant_id}"
5052
"""
5153
}
5254

examples/brca/brca_dev.ipynb

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,30 @@
2727
"import pandas as pd\n",
2828
"from bioscript import optional_int, optional_str, write_tsv\n",
2929
"from bioscript.classifier import GenotypeClassifier\n",
30-
"from bioscript.types import VariantCall"
30+
"from bioscript.types import VariantCall\n",
31+
"from bioscript import assets_dir"
32+
]
33+
},
34+
{
35+
"cell_type": "code",
36+
"execution_count": null,
37+
"metadata": {},
38+
"outputs": [],
39+
"source": [
40+
"ASSETS_DIR = assets_dir()\n",
41+
"RESULT_HEADERS = [\n",
42+
" \"participant_id\",\n",
43+
" \"filename\",\n",
44+
" \"gene\",\n",
45+
" \"rsid\",\n",
46+
" \"chromosome\",\n",
47+
" \"position\",\n",
48+
" \"genotype\",\n",
49+
" \"ref\",\n",
50+
" \"alt\",\n",
51+
" \"variant_type\",\n",
52+
" \"match_type\",\n",
53+
"]"
3154
]
3255
},
3356
{
@@ -61,7 +84,8 @@
6184
"source": [
6285
"def get_vcs() -> list[VariantCall]:\n",
6386
" \"\"\"Load BRCA1 and BRCA2 variant calls from ClinVar TSV files.\"\"\"\n",
64-
" dfs = [pd.read_csv(f, sep=\"\\t\") for f in [\"brca1_clinvar.tsv\", \"brca2_clinvar.tsv\"]]\n",
87+
" data_files = [ASSETS_DIR / name for name in [\"brca1_clinvar.tsv\", \"brca2_clinvar.tsv\"]]\n",
88+
" dfs = [pd.read_csv(f, sep=\"\\t\") for f in data_files]\n",
6589
" df = pd.concat(dfs, ignore_index=True)\n",
6690
" print(f\"Loaded {len(df)} variants from BRCA1 and BRCA2\")\n",
6791
" return generate_variant_calls(df)"
@@ -88,7 +112,7 @@
88112
" write_tsv(f\"{self.output_basename}_ref.tsv\", ref_rows)\n",
89113
" write_tsv(f\"{self.output_basename}_no.tsv\", no_rows)\n",
90114
"\n",
91-
" write_tsv(f\"{self.output_basename}.tsv\", var_rows)\n",
115+
" write_tsv(f\"{self.output_basename}.tsv\", var_rows, headers=RESULT_HEADERS)\n",
92116
" \n",
93117
" # Return variant rows for testing\n",
94118
" return var_rows"
@@ -363,20 +387,6 @@
363387
"pipeline\n"
364388
]
365389
},
366-
{
367-
"cell_type": "code",
368-
"execution_count": null,
369-
"metadata": {},
370-
"outputs": [],
371-
"source": []
372-
},
373-
{
374-
"cell_type": "code",
375-
"execution_count": null,
376-
"metadata": {},
377-
"outputs": [],
378-
"source": []
379-
},
380390
{
381391
"cell_type": "code",
382392
"execution_count": null,

examples/brca/classify_brca.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,22 @@
22
from bioscript import optional_int, optional_str, write_tsv
33
from bioscript.classifier import GenotypeClassifier
44
from bioscript.types import VariantCall
5+
from bioscript import assets_dir
6+
7+
ASSETS_DIR = assets_dir()
8+
RESULT_HEADERS = [
9+
"participant_id",
10+
"filename",
11+
"gene",
12+
"rsid",
13+
"chromosome",
14+
"position",
15+
"genotype",
16+
"ref",
17+
"alt",
18+
"variant_type",
19+
"match_type",
20+
]
521

622
def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
723
"""Generate VariantCall objects from ClinVar DataFrame."""
@@ -21,7 +37,8 @@ def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
2137

2238
def get_vcs() -> list[VariantCall]:
2339
"""Load BRCA1 and BRCA2 variant calls from ClinVar TSV files."""
24-
dfs = [pd.read_csv(f, sep="\t") for f in ["brca1_clinvar.tsv", "brca2_clinvar.tsv"]]
40+
data_files = [ASSETS_DIR / name for name in ["brca1_clinvar.tsv", "brca2_clinvar.tsv"]]
41+
dfs = [pd.read_csv(f, sep="\t") for f in data_files]
2542
df = pd.concat(dfs, ignore_index=True)
2643
print(f"Loaded {len(df)} variants from BRCA1 and BRCA2")
2744
return generate_variant_calls(df)
@@ -41,7 +58,7 @@ def classify(self, matches):
4158
write_tsv(f"{self.output_basename}_ref.tsv", ref_rows)
4259
write_tsv(f"{self.output_basename}_no.tsv", no_rows)
4360

44-
write_tsv(f"{self.output_basename}.tsv", var_rows)
61+
write_tsv(f"{self.output_basename}.tsv", var_rows, headers=RESULT_HEADERS)
4562

4663
# Return variant rows for testing
4764
return var_rows

examples/herc2/herc2-classifier/pipeline.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ steps:
1414
destination: SQL()
1515
source: classification_result
1616
table_name: herc2_{run_id}
17-
key_column: participant_id
17+
participant_column: participant_id

0 commit comments

Comments
 (0)