Skip to content

Commit d1ec9f6

Browse files
authored
Merge pull request #22 from OpenMined/madhava/thalassemia
adding thalassemia
2 parents 607cd74 + d3c427e commit d1ec9f6

File tree

8 files changed

+1348
-0
lines changed

8 files changed

+1348
-0
lines changed
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
import pandas as pd
2+
from bioscript import optional_int, optional_str, write_tsv
3+
from bioscript.classifier import GenotypeClassifier
4+
from bioscript.types import VariantCall
5+
from bioscript import assets_dir
6+
7+
ASSETS_DIR = assets_dir()
8+
CLINVAR_TSV = 'thalassemia_clinvar.tsv'
9+
RESULT_HEADERS = [
10+
'participant_id',
11+
'filename',
12+
'gene',
13+
'rsid',
14+
'chromosome',
15+
'position',
16+
'genotype',
17+
'ref',
18+
'alt',
19+
'variant_type',
20+
'match_type',
21+
]
22+
23+
def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
24+
"""Generate VariantCall objects from ClinVar DataFrame."""
25+
vcs: list[VariantCall] = []
26+
for _, row in df.iterrows():
27+
vcs.append(
28+
VariantCall(
29+
rsid=optional_str(row["rsid"]),
30+
ref=optional_str(row["ref"]),
31+
alt=optional_str(row["alt"]),
32+
chromosome=optional_str(row["chromosome"]),
33+
position=optional_int(row["position"]),
34+
gene=optional_str(row.get("gene"), upper=True),
35+
)
36+
)
37+
return vcs
38+
39+
def get_vcs() -> list[VariantCall]:
40+
"""Load thalassemia-associated variant calls from a ClinVar TSV file."""
41+
df = pd.read_csv(ASSETS_DIR / CLINVAR_TSV, sep=' ')
42+
print(f'Loaded {len(df)} variants from {CLINVAR_TSV}')
43+
return generate_variant_calls(df)
44+
45+
class ThalassemiaClassifier(GenotypeClassifier):
46+
def classify(self, matches):
47+
"""Classify thalassemia-associated variants and write results to TSV files."""
48+
if not matches.all_matches:
49+
print('No variant matches were found.', flush=True)
50+
51+
# Get categorized matches as report rows
52+
ref_rows, var_rows, no_rows = matches.categorize_report_rows(
53+
self.participant_id, self.filename
54+
)
55+
56+
if self.debug:
57+
write_tsv(f'{self.output_basename}_ref.tsv', ref_rows)
58+
write_tsv(f'{self.output_basename}_no.tsv', no_rows)
59+
60+
write_tsv(f'{self.output_basename}.tsv', var_rows, headers=RESULT_HEADERS)
61+
62+
# Return variant rows for testing
63+
return var_rows
64+
65+
__bioscript__ = {
66+
'variant_calls': get_vcs,
67+
'classifier': ThalassemiaClassifier,
68+
'name': 'THALASSEMIA',
69+
}
70+
71+
from bioscript import VariantFixture
72+
from bioscript.types import MatchList
73+
import os
74+
75+
# Create test fixtures for thalassemia-associated HBB variants (subset from thalassemia_clinvar.tsv)
76+
fixture = VariantFixture(
77+
[
78+
{'rsid': 'rs33985472', 'chromosome': '11', 'position': 5225485},
79+
{'rsid': 'rs63751128', 'chromosome': '11', 'position': 5225487},
80+
{'rsid': 'rs33978907', 'chromosome': '11', 'position': 5225488},
81+
{'rsid': 'rs34809925', 'chromosome': '11', 'position': 5225592},
82+
{'rsid': 'rs35117167', 'chromosome': '11', 'position': 5225605},
83+
{'rsid': 'rs33971634', 'chromosome': '11', 'position': 5225660},
84+
],
85+
assembly='GRCh38',
86+
)
87+
88+
def test_thalassemia_heterozygous_variants():
89+
"""Test detection of heterozygous thalassemia-associated variants."""
90+
variants = fixture(['TC', 'TC', 'AG', 'GG', 'TT', 'GG'])
91+
92+
# Create mini variant call list for testing
93+
test_vcs = [
94+
VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
95+
VariantCall(rsid='rs63751128', ref='T', alt='C', chromosome='11', position=5225487, gene='HBB'),
96+
VariantCall(rsid='rs33978907', ref='A', alt='G', chromosome='11', position=5225488, gene='HBB'),
97+
]
98+
99+
matches = MatchList(variant_calls=test_vcs).match_rows(variants)
100+
classifier = ThalassemiaClassifier(participant_id='TEST_HET', name='THALASSEMIA', filename='test.txt')
101+
result = classifier(matches)
102+
103+
assert len(result) == 3, f'Expected 3 variant rows, got {len(result)}'
104+
assert all(row['gene'] == 'HBB' for row in result), 'All variants should be HBB'
105+
assert all(row['match_type'] == 'VARIANT_CALL' for row in result), 'All should be variant calls'
106+
107+
# Cleanup output file
108+
os.remove('result_THALASSEMIA_TEST_HET.tsv')
109+
110+
def test_thalassemia_homozygous_variant():
111+
"""Test detection of a homozygous thalassemia-associated variant."""
112+
variants = fixture(['TT', 'TT', 'AA', 'CC', 'TT', 'GG'])
113+
114+
test_vcs = [
115+
VariantCall(rsid='rs34809925', ref='G', alt='C', chromosome='11', position=5225592, gene='HBB'),
116+
]
117+
118+
matches = MatchList(variant_calls=test_vcs).match_rows(variants)
119+
classifier = ThalassemiaClassifier(participant_id='TEST_HOM', name='THALASSEMIA', filename='test.txt')
120+
result = classifier(matches)
121+
122+
assert len(result) == 1, f'Expected 1 variant row, got {len(result)}'
123+
assert result[0]['gene'] == 'HBB', 'Variant should be HBB'
124+
assert result[0]['genotype'] == 'CC', 'Should be homozygous CC'
125+
126+
# Cleanup output file
127+
os.remove('result_THALASSEMIA_TEST_HOM.tsv')
128+
129+
def test_no_variants():
130+
"""Test classifier with no matching variants."""
131+
variants = fixture(['TT', 'TT', 'AA', 'GG', 'TT', 'GG'])
132+
133+
test_vcs = [
134+
VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
135+
]
136+
137+
matches = MatchList(variant_calls=test_vcs).match_rows(variants)
138+
classifier = ThalassemiaClassifier(participant_id='TEST_REF', name='THALASSEMIA', filename='test.txt')
139+
result = classifier(matches)
140+
141+
assert len(result) == 0, f'Expected 0 variant rows, got {len(result)}'
142+
143+
# Cleanup output file
144+
os.remove('result_THALASSEMIA_TEST_REF.tsv')
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
import pandas as pd
2+
from bioscript import optional_int, optional_str, write_tsv
3+
from bioscript.classifier import GenotypeClassifier
4+
from bioscript.types import VariantCall
5+
from bioscript import assets_dir
6+
7+
ASSETS_DIR = assets_dir()
8+
CLINVAR_TSV = 'thalassemia_clinvar.tsv'
9+
RESULT_HEADERS = [
10+
'participant_id',
11+
'filename',
12+
'gene',
13+
'rsid',
14+
'chromosome',
15+
'position',
16+
'genotype',
17+
'ref',
18+
'alt',
19+
'variant_type',
20+
'match_type',
21+
]
22+
23+
def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
24+
"""Generate VariantCall objects from ClinVar DataFrame."""
25+
vcs: list[VariantCall] = []
26+
for _, row in df.iterrows():
27+
vcs.append(
28+
VariantCall(
29+
rsid=optional_str(row["rsid"]),
30+
ref=optional_str(row["ref"]),
31+
alt=optional_str(row["alt"]),
32+
chromosome=optional_str(row["chromosome"]),
33+
position=optional_int(row["position"]),
34+
gene=optional_str(row.get("gene"), upper=True),
35+
)
36+
)
37+
return vcs
38+
39+
def get_vcs() -> list[VariantCall]:
40+
"""Load thalassemia-associated variant calls from a ClinVar TSV file."""
41+
df = pd.read_csv(ASSETS_DIR / CLINVAR_TSV, sep=' ')
42+
print(f'Loaded {len(df)} variants from {CLINVAR_TSV}')
43+
return generate_variant_calls(df)
44+
45+
class ThalassemiaClassifier(GenotypeClassifier):
46+
def classify(self, matches):
47+
"""Classify thalassemia-associated variants and write results to TSV files."""
48+
if not matches.all_matches:
49+
print('No variant matches were found.', flush=True)
50+
51+
# Get categorized matches as report rows
52+
ref_rows, var_rows, no_rows = matches.categorize_report_rows(
53+
self.participant_id, self.filename
54+
)
55+
56+
if self.debug:
57+
write_tsv(f'{self.output_basename}_ref.tsv', ref_rows)
58+
write_tsv(f'{self.output_basename}_no.tsv', no_rows)
59+
60+
write_tsv(f'{self.output_basename}.tsv', var_rows, headers=RESULT_HEADERS)
61+
62+
# Return variant rows for testing
63+
return var_rows
64+
65+
__bioscript__ = {
66+
'variant_calls': get_vcs,
67+
'classifier': ThalassemiaClassifier,
68+
'name': 'THALASSEMIA',
69+
}
70+
71+
from bioscript import VariantFixture
72+
from bioscript.types import MatchList
73+
import os
74+
75+
# Create test fixtures for thalassemia-associated HBB variants (subset from thalassemia_clinvar.tsv)
76+
fixture = VariantFixture(
77+
[
78+
{'rsid': 'rs33985472', 'chromosome': '11', 'position': 5225485},
79+
{'rsid': 'rs63751128', 'chromosome': '11', 'position': 5225487},
80+
{'rsid': 'rs33978907', 'chromosome': '11', 'position': 5225488},
81+
{'rsid': 'rs34809925', 'chromosome': '11', 'position': 5225592},
82+
{'rsid': 'rs35117167', 'chromosome': '11', 'position': 5225605},
83+
{'rsid': 'rs33971634', 'chromosome': '11', 'position': 5225660},
84+
],
85+
assembly='GRCh38',
86+
)
87+
88+
def test_thalassemia_heterozygous_variants():
89+
"""Test detection of heterozygous thalassemia-associated variants."""
90+
variants = fixture(['TC', 'TC', 'AG', 'GG', 'TT', 'GG'])
91+
92+
# Create mini variant call list for testing
93+
test_vcs = [
94+
VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
95+
VariantCall(rsid='rs63751128', ref='T', alt='C', chromosome='11', position=5225487, gene='HBB'),
96+
VariantCall(rsid='rs33978907', ref='A', alt='G', chromosome='11', position=5225488, gene='HBB'),
97+
]
98+
99+
matches = MatchList(variant_calls=test_vcs).match_rows(variants)
100+
classifier = ThalassemiaClassifier(participant_id='TEST_HET', name='THALASSEMIA', filename='test.txt')
101+
result = classifier(matches)
102+
103+
assert len(result) == 3, f'Expected 3 variant rows, got {len(result)}'
104+
assert all(row['gene'] == 'HBB' for row in result), 'All variants should be HBB'
105+
assert all(row['match_type'] == 'VARIANT_CALL' for row in result), 'All should be variant calls'
106+
107+
# Cleanup output file
108+
os.remove('result_THALASSEMIA_TEST_HET.tsv')
109+
110+
def test_thalassemia_homozygous_variant():
111+
"""Test detection of a homozygous thalassemia-associated variant."""
112+
variants = fixture(['TT', 'TT', 'AA', 'CC', 'TT', 'GG'])
113+
114+
test_vcs = [
115+
VariantCall(rsid='rs34809925', ref='G', alt='C', chromosome='11', position=5225592, gene='HBB'),
116+
]
117+
118+
matches = MatchList(variant_calls=test_vcs).match_rows(variants)
119+
classifier = ThalassemiaClassifier(participant_id='TEST_HOM', name='THALASSEMIA', filename='test.txt')
120+
result = classifier(matches)
121+
122+
assert len(result) == 1, f'Expected 1 variant row, got {len(result)}'
123+
assert result[0]['gene'] == 'HBB', 'Variant should be HBB'
124+
assert result[0]['genotype'] == 'CC', 'Should be homozygous CC'
125+
126+
# Cleanup output file
127+
os.remove('result_THALASSEMIA_TEST_HOM.tsv')
128+
129+
def test_no_variants():
130+
"""Test classifier with no matching variants."""
131+
variants = fixture(['TT', 'TT', 'AA', 'GG', 'TT', 'GG'])
132+
133+
test_vcs = [
134+
VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
135+
]
136+
137+
matches = MatchList(variant_calls=test_vcs).match_rows(variants)
138+
classifier = ThalassemiaClassifier(participant_id='TEST_REF', name='THALASSEMIA', filename='test.txt')
139+
result = classifier(matches)
140+
141+
assert len(result) == 0, f'Expected 0 variant rows, got {len(result)}'
142+
143+
# Cleanup output file
144+
os.remove('result_THALASSEMIA_TEST_REF.tsv')

0 commit comments

Comments
 (0)