|
| 1 | +import pandas as pd |
| 2 | +from bioscript import optional_int, optional_str, write_tsv |
| 3 | +from bioscript.classifier import GenotypeClassifier |
| 4 | +from bioscript.types import VariantCall |
| 5 | +from bioscript import assets_dir |
| 6 | + |
| 7 | +ASSETS_DIR = assets_dir() |
| 8 | +CLINVAR_TSV = 'thalassemia_clinvar.tsv' |
| 9 | +RESULT_HEADERS = [ |
| 10 | + 'participant_id', |
| 11 | + 'filename', |
| 12 | + 'gene', |
| 13 | + 'rsid', |
| 14 | + 'chromosome', |
| 15 | + 'position', |
| 16 | + 'genotype', |
| 17 | + 'ref', |
| 18 | + 'alt', |
| 19 | + 'variant_type', |
| 20 | + 'match_type', |
| 21 | +] |
| 22 | + |
| 23 | +def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]: |
| 24 | + """Generate VariantCall objects from ClinVar DataFrame.""" |
| 25 | + vcs: list[VariantCall] = [] |
| 26 | + for _, row in df.iterrows(): |
| 27 | + vcs.append( |
| 28 | + VariantCall( |
| 29 | + rsid=optional_str(row["rsid"]), |
| 30 | + ref=optional_str(row["ref"]), |
| 31 | + alt=optional_str(row["alt"]), |
| 32 | + chromosome=optional_str(row["chromosome"]), |
| 33 | + position=optional_int(row["position"]), |
| 34 | + gene=optional_str(row.get("gene"), upper=True), |
| 35 | + ) |
| 36 | + ) |
| 37 | + return vcs |
| 38 | + |
| 39 | +def get_vcs() -> list[VariantCall]: |
| 40 | + """Load thalassemia-associated variant calls from a ClinVar TSV file.""" |
| 41 | + df = pd.read_csv(ASSETS_DIR / CLINVAR_TSV, sep=' ') |
| 42 | + print(f'Loaded {len(df)} variants from {CLINVAR_TSV}') |
| 43 | + return generate_variant_calls(df) |
| 44 | + |
| 45 | +class ThalassemiaClassifier(GenotypeClassifier): |
| 46 | + def classify(self, matches): |
| 47 | + """Classify thalassemia-associated variants and write results to TSV files.""" |
| 48 | + if not matches.all_matches: |
| 49 | + print('No variant matches were found.', flush=True) |
| 50 | + |
| 51 | + # Get categorized matches as report rows |
| 52 | + ref_rows, var_rows, no_rows = matches.categorize_report_rows( |
| 53 | + self.participant_id, self.filename |
| 54 | + ) |
| 55 | + |
| 56 | + if self.debug: |
| 57 | + write_tsv(f'{self.output_basename}_ref.tsv', ref_rows) |
| 58 | + write_tsv(f'{self.output_basename}_no.tsv', no_rows) |
| 59 | + |
| 60 | + write_tsv(f'{self.output_basename}.tsv', var_rows, headers=RESULT_HEADERS) |
| 61 | + |
| 62 | + # Return variant rows for testing |
| 63 | + return var_rows |
| 64 | + |
| 65 | +__bioscript__ = { |
| 66 | + 'variant_calls': get_vcs, |
| 67 | + 'classifier': ThalassemiaClassifier, |
| 68 | + 'name': 'THALASSEMIA', |
| 69 | +} |
| 70 | + |
| 71 | +from bioscript import VariantFixture |
| 72 | +from bioscript.types import MatchList |
| 73 | +import os |
| 74 | + |
| 75 | +# Create test fixtures for thalassemia-associated HBB variants (subset from thalassemia_clinvar.tsv) |
| 76 | +fixture = VariantFixture( |
| 77 | + [ |
| 78 | + {'rsid': 'rs33985472', 'chromosome': '11', 'position': 5225485}, |
| 79 | + {'rsid': 'rs63751128', 'chromosome': '11', 'position': 5225487}, |
| 80 | + {'rsid': 'rs33978907', 'chromosome': '11', 'position': 5225488}, |
| 81 | + {'rsid': 'rs34809925', 'chromosome': '11', 'position': 5225592}, |
| 82 | + {'rsid': 'rs35117167', 'chromosome': '11', 'position': 5225605}, |
| 83 | + {'rsid': 'rs33971634', 'chromosome': '11', 'position': 5225660}, |
| 84 | + ], |
| 85 | + assembly='GRCh38', |
| 86 | +) |
| 87 | + |
| 88 | +def test_thalassemia_heterozygous_variants(): |
| 89 | + """Test detection of heterozygous thalassemia-associated variants.""" |
| 90 | + variants = fixture(['TC', 'TC', 'AG', 'GG', 'TT', 'GG']) |
| 91 | + |
| 92 | + # Create mini variant call list for testing |
| 93 | + test_vcs = [ |
| 94 | + VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'), |
| 95 | + VariantCall(rsid='rs63751128', ref='T', alt='C', chromosome='11', position=5225487, gene='HBB'), |
| 96 | + VariantCall(rsid='rs33978907', ref='A', alt='G', chromosome='11', position=5225488, gene='HBB'), |
| 97 | + ] |
| 98 | + |
| 99 | + matches = MatchList(variant_calls=test_vcs).match_rows(variants) |
| 100 | + classifier = ThalassemiaClassifier(participant_id='TEST_HET', name='THALASSEMIA', filename='test.txt') |
| 101 | + result = classifier(matches) |
| 102 | + |
| 103 | + assert len(result) == 3, f'Expected 3 variant rows, got {len(result)}' |
| 104 | + assert all(row['gene'] == 'HBB' for row in result), 'All variants should be HBB' |
| 105 | + assert all(row['match_type'] == 'VARIANT_CALL' for row in result), 'All should be variant calls' |
| 106 | + |
| 107 | + # Cleanup output file |
| 108 | + os.remove('result_THALASSEMIA_TEST_HET.tsv') |
| 109 | + |
| 110 | +def test_thalassemia_homozygous_variant(): |
| 111 | + """Test detection of a homozygous thalassemia-associated variant.""" |
| 112 | + variants = fixture(['TT', 'TT', 'AA', 'CC', 'TT', 'GG']) |
| 113 | + |
| 114 | + test_vcs = [ |
| 115 | + VariantCall(rsid='rs34809925', ref='G', alt='C', chromosome='11', position=5225592, gene='HBB'), |
| 116 | + ] |
| 117 | + |
| 118 | + matches = MatchList(variant_calls=test_vcs).match_rows(variants) |
| 119 | + classifier = ThalassemiaClassifier(participant_id='TEST_HOM', name='THALASSEMIA', filename='test.txt') |
| 120 | + result = classifier(matches) |
| 121 | + |
| 122 | + assert len(result) == 1, f'Expected 1 variant row, got {len(result)}' |
| 123 | + assert result[0]['gene'] == 'HBB', 'Variant should be HBB' |
| 124 | + assert result[0]['genotype'] == 'CC', 'Should be homozygous CC' |
| 125 | + |
| 126 | + # Cleanup output file |
| 127 | + os.remove('result_THALASSEMIA_TEST_HOM.tsv') |
| 128 | + |
| 129 | +def test_no_variants(): |
| 130 | + """Test classifier with no matching variants.""" |
| 131 | + variants = fixture(['TT', 'TT', 'AA', 'GG', 'TT', 'GG']) |
| 132 | + |
| 133 | + test_vcs = [ |
| 134 | + VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'), |
| 135 | + ] |
| 136 | + |
| 137 | + matches = MatchList(variant_calls=test_vcs).match_rows(variants) |
| 138 | + classifier = ThalassemiaClassifier(participant_id='TEST_REF', name='THALASSEMIA', filename='test.txt') |
| 139 | + result = classifier(matches) |
| 140 | + |
| 141 | + assert len(result) == 0, f'Expected 0 variant rows, got {len(result)}' |
| 142 | + |
| 143 | + # Cleanup output file |
| 144 | + os.remove('result_THALASSEMIA_TEST_REF.tsv') |
0 commit comments