
Commit ba9ce66

Merge pull request #8 from OpenMined/madhava/pipeline_export
Added pipeline export and updated support for different genotype files
2 parents 0a54d3c + f06ec4f commit ba9ce66

18 files changed: +2815 -272 lines
Lines changed: 168 additions & 0 deletions
from bioscript.classifier import DiploidResult, GenotypeClassifier, GenotypeEnum
from bioscript.types import Alleles, VariantCall

# Define APOL1 variant calls
rs73885319 = VariantCall(rsid="rs73885319", ref=Alleles.A, alt=Alleles.NOT_A)
rs60910145 = VariantCall(rsid="rs60910145", ref=Alleles.T, alt=Alleles.NOT_T)
rs71785313 = VariantCall(
    rsid=["rs71785313", "rs1317778148", "rs143830837"], ref=Alleles.I, alt=Alleles.D
)

class APOL1Genotypes(GenotypeEnum):
    G2 = "G2"
    G1 = "G1"
    G0 = "G0"
    MISSING = "G-"

from bioscript import write_tsv
from bioscript.types import MatchType

class APOL1Classifier(GenotypeClassifier):
    def classify(self, matches) -> list[dict[str, object]]:
        g2_match = matches.get(rs71785313)
        site1_match = matches.get(rs73885319)
        site2_match = matches.get(rs60910145)

        variant_matches = [
            ("rs71785313", rs71785313, g2_match),
            ("rs73885319", rs73885319, site1_match),
            ("rs60910145", rs60910145, site2_match),
        ]

        if not any(match is not None for _, _, match in variant_matches):
            diploid_result = DiploidResult(APOL1Genotypes.MISSING, APOL1Genotypes.MISSING)
        else:
            d_count = g2_match.alt_count if g2_match else 0
            site1_variants = site1_match.alt_count if site1_match else 0
            site2_variants = site2_match.alt_count if site2_match else 0

            # G1 counts only when both G1 sites carry an alternate allele
            has_g1 = site1_variants > 0 and site2_variants > 0
            g1_total = site1_variants + site2_variants if has_g1 else 0

            # Assign the two alleles: G2 deletions take priority, then G1, otherwise G0
            if d_count == 2:
                diploid_result = DiploidResult(APOL1Genotypes.G2, APOL1Genotypes.G2)
            elif d_count == 1:
                if g1_total >= 2:
                    diploid_result = DiploidResult(APOL1Genotypes.G2, APOL1Genotypes.G1)
                else:
                    diploid_result = DiploidResult(APOL1Genotypes.G2, APOL1Genotypes.G0)
            else:
                if g1_total == 4:
                    diploid_result = DiploidResult(APOL1Genotypes.G1, APOL1Genotypes.G1)
                elif g1_total >= 2:
                    diploid_result = DiploidResult(APOL1Genotypes.G1, APOL1Genotypes.G0)
                else:
                    diploid_result = DiploidResult(APOL1Genotypes.G0, APOL1Genotypes.G0)

        apol1_status = str(diploid_result.sorted())

        # Build one report row per variant site
        report_rows = []
        for fallback_rsid, variant_call, match in variant_matches:
            if match and match.source_row:
                rsid = match.source_row.rsid
                chromosome = match.source_row.chromosome
                position = match.source_row.position
            else:
                aliases = getattr(getattr(variant_call, "rsid", None), "aliases", None)
                rsid = sorted(aliases)[0] if aliases else fallback_rsid
                chromosome = getattr(variant_call, "chromosome", None)
                position = getattr(variant_call, "position", None)

            if match:
                genotype = match.genotype_sorted
                match_type = (
                    match.match_type.value if not match.has_missing else MatchType.NO_CALL.value
                )
            else:
                genotype = None
                match_type = MatchType.NO_CALL.value

            report_rows.append(
                {
                    "participant_id": self.participant_id,
                    "filename": self.filename,
                    "rsid": rsid,
                    "chromosome": chromosome,
                    "position": position,
                    "genotype": genotype,
                    "match_type": match_type,
                    "apol1_status": apol1_status,
                }
            )

        write_tsv(f"{self.output_basename}.tsv", report_rows)
        return report_rows

__bioscript__ = {
    "variant_calls": [rs73885319, rs60910145, rs71785313],
    "classifier": APOL1Classifier,
    "name": "APOL1",
}

from bioscript import VariantFixture
from bioscript.types import MatchList

fixture = VariantFixture(
    [
        {"rsid": "rs73885319", "chromosome": "22", "position": 36265860},
        {"rsid": "rs60910145", "chromosome": "22", "position": 36265988},
        {"rsid": "rs71785313", "chromosome": "22", "position": 36266000},
    ],
    assembly="GRCh38",
)

import os
from bioscript.types import MatchType

OUTPUT_FILE = "result_APOL1_TEST_ID.tsv"

def cleanup_output():
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

def classify_fixture(genotypes):
    cleanup_output()
    variants = fixture(genotypes)
    matches = MatchList([rs73885319, rs60910145, rs71785313]).match_rows(variants)
    classifier = APOL1Classifier(participant_id="TEST_ID", name="APOL1", filename="test.txt")
    result = classifier(matches)
    assert isinstance(result, list)
    assert len(result) == 3
    rows = {row["rsid"]: row for row in result}
    assert set(rows.keys()) == {"rs73885319", "rs60910145", "rs71785313"}
    for row in rows.values():
        assert row["participant_id"] == "TEST_ID"
        assert row["filename"] == "test.txt"
    return rows

def test_g0_homozygous():
    rows = classify_fixture(["AA", "TT", "II"])
    assert rows["rs71785313"]["apol1_status"] == "G0/G0"
    assert rows["rs71785313"]["genotype"] == "II"
    assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value

    assert rows["rs73885319"]["apol1_status"] == "G0/G0"
    assert rows["rs73885319"]["genotype"] == "AA"
    assert rows["rs73885319"]["match_type"] == MatchType.REFERENCE_CALL.value

    assert rows["rs60910145"]["apol1_status"] == "G0/G0"
    assert rows["rs60910145"]["genotype"] == "TT"
    assert rows["rs60910145"]["match_type"] == MatchType.REFERENCE_CALL.value

    cleanup_output()

def test_g1_homozygous():
    rows = classify_fixture(["GG", "CC", "II"])
    assert all(row["apol1_status"] == "G1/G1" for row in rows.values())

    assert rows["rs73885319"]["genotype"] == "GG"
    assert rows["rs73885319"]["match_type"] == MatchType.VARIANT_CALL.value

    assert rows["rs60910145"]["genotype"] == "CC"
    assert rows["rs60910145"]["match_type"] == MatchType.VARIANT_CALL.value

    assert rows["rs71785313"]["genotype"] == "II"
    assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value

    cleanup_output()
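
The tests above cover only the homozygous G0 and G1 fixtures. A sketch of one more check in the same pattern, exercising the branch where only one G1 site carries an alternate allele (this assumes a heterozygous "AG" call yields alt_count == 1; the test name and genotype strings are illustrative, not part of this commit):

def test_g1_single_site_is_not_risk():
    # Alternate allele at rs73885319 only: has_g1 stays False, g1_total is 0,
    # and with no G2 deletion the classifier falls through to G0/G0.
    rows = classify_fixture(["AG", "TT", "II"])
    assert all(row["apol1_status"] == "G0/G0" for row in rows.values())
    cleanup_output()
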
Lines changed: 17 additions & 0 deletions
name: apol1-classifier
inputs:
  samplesheet: List[GenotypeRecord]
steps:
  - id: apol1
    uses: ./
    with:
      participants: inputs.samplesheet
publish:
  classification_result: File(result_APOL1.tsv)
store:
  counts_sql:
    kind: sql
    destination: SQL()
    source: classification_result
    table_name: apol1_{run_id}
    key_column: participant_id
Lines changed: 21 additions & 0 deletions
name: apol1-classifier

workflow: workflow.nf
template: dynamic-nextflow
version: 0.1.0
assets:
  - classify_apol1.py
inputs:
  - name: participants
    type: List[GenotypeRecord]
    description: CSV/TSV with participant_id and genotype_file columns
    format: csv
    mapping:
      participant_id: participant_id
      genotype_file: genotype_file
outputs:
  - name: classification_result
    type: File
    description: APOL1 genotype classification
    format: tsv
    path: result_APOL1.tsv
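
The participants input described above is a samplesheet whose columns follow the mapping keys. A minimal hypothetical example (the path is a placeholder, not a file from this commit):

participant_id,genotype_file
P001,/data/genotypes/P001_genome.txt
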
Lines changed: 73 additions & 0 deletions
nextflow.enable.dsl=2

workflow USER {
    take:
    context
    participants // Channel emitting GenotypeRecord maps

    main:
    def assetsDir = file(context.params.assets_dir)
    def workflowScript = file("${assetsDir}/classify_apol1.py")

    // Extract (participant_id, genotype_file) tuples from the records channel
    def participant_tuples = participants.map { record ->
        tuple(
            record.participant_id,
            file(record.genotype_file)
        )
    }

    // Process each participant
    def per_participant_results = apol1_classifier(
        workflowScript,
        participant_tuples
    )

    // Aggregate all results into a single file
    def aggregated = aggregate_results(
        per_participant_results.collect()
    )

    emit:
    classification_result = aggregated
}

process apol1_classifier {
    container 'ghcr.io/openmined/bioscript:0.1.1'
    publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: 'result_APOL1_*.tsv'
    tag { participant_id }

    input:
    path script
    tuple val(participant_id), path(genotype_file)

    output:
    path "result_APOL1_${participant_id}.tsv"

    script:
    """
    bioscript classify "${script}" --file "${genotype_file}" --participant_id "${participant_id}"
    """
}

process aggregate_results {
    container 'ghcr.io/openmined/bioscript:0.1.1'
    publishDir params.results_dir, mode: 'copy', overwrite: true

    input:
    path individual_results

    output:
    path "result_APOL1.tsv"

    script:
    """
    # Extract header from first file
    head -n 1 ${individual_results[0]} > result_APOL1.tsv

    # Append all data rows (skip headers)
    for file in ${individual_results}; do
        tail -n +2 "\$file" >> result_APOL1.tsv
    done
    """
}
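
Once aggregate_results has published result_APOL1.tsv, the columns written by the classifier (participant_id, filename, rsid, chromosome, position, genotype, match_type, apol1_status) can be consumed downstream, much as the counts_sql store step makes them queryable. A minimal Python sketch of a per-status tally, assuming write_tsv emits the dict keys as a tab-separated header row and the aggregated file is available locally (the snippet is illustrative, not part of this commit):

import csv
from collections import Counter

# Tally participants per APOL1 diploid status from the aggregated report.
# Each participant contributes three rows (one per variant site), so count
# unique (participant_id, apol1_status) pairs rather than raw rows.
with open("result_APOL1.tsv", newline="") as handle:
    reader = csv.DictReader(handle, delimiter="\t")
    pairs = {(row["participant_id"], row["apol1_status"]) for row in reader}

status_counts = Counter(status for _, status in pairs)
for status, count in sorted(status_counts.items()):
    print(f"{status}\t{count}")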
