diff --git a/examples/thalassemia/classify_thalassemia.py b/examples/thalassemia/classify_thalassemia.py
new file mode 100644
index 0000000..d40a8db
--- /dev/null
+++ b/examples/thalassemia/classify_thalassemia.py
@@ -0,0 +1,156 @@
+import pandas as pd
+from bioscript import optional_int, optional_str, write_tsv
+from bioscript.classifier import GenotypeClassifier
+from bioscript.types import VariantCall
+from bioscript import assets_dir
+
+ASSETS_DIR = assets_dir()
+CLINVAR_TSV = 'thalassemia_clinvar.tsv'
+# Column order of the main per-participant report written by classify().
+RESULT_HEADERS = [
+    'participant_id',
+    'filename',
+    'gene',
+    'rsid',
+    'chromosome',
+    'position',
+    'genotype',
+    'ref',
+    'alt',
+    'variant_type',
+    'match_type',
+]
+
+def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
+    """Build one VariantCall per row of a ClinVar DataFrame.
+
+    The rsid, ref, alt, chromosome and position columns are required;
+    gene is read with row.get() and may therefore be absent.
+    """
+    return [
+        VariantCall(
+            rsid=optional_str(row["rsid"]),
+            ref=optional_str(row["ref"]),
+            alt=optional_str(row["alt"]),
+            chromosome=optional_str(row["chromosome"]),
+            position=optional_int(row["position"]),
+            gene=optional_str(row.get("gene"), upper=True),
+        )
+        for _, row in df.iterrows()
+    ]
+
+def get_vcs() -> list[VariantCall]:
+    """Load thalassemia-associated variant calls from a ClinVar TSV file."""
+    # NOTE(review): sep must match the delimiter actually used in the asset;
+    # for a .tsv file that is presumably a tab ('\t') - confirm.
+    df = pd.read_csv(ASSETS_DIR / CLINVAR_TSV, sep=' ')
+    print(f'Loaded {len(df)} variants from {CLINVAR_TSV}')
+    return generate_variant_calls(df)
+
+class ThalassemiaClassifier(GenotypeClassifier):
+    def classify(self, matches):
+        """Write the variant rows of matches to a TSV report and return them.
+
+        The *_ref.tsv and *_no.tsv files are only written when self.debug is set.
+        """
+        if not matches.all_matches:
+            print('No variant matches were found.', flush=True)
+
+        # Get categorized matches as report rows
+        ref_rows, var_rows, no_rows = matches.categorize_report_rows(
+            self.participant_id, self.filename
+        )
+
+        if self.debug:
+            write_tsv(f'{self.output_basename}_ref.tsv', ref_rows)
+            write_tsv(f'{self.output_basename}_no.tsv', no_rows)
+
+        # The main report is written unconditionally, even when var_rows is empty.
+        write_tsv(f'{self.output_basename}.tsv', var_rows, headers=RESULT_HEADERS)
+
+        # Return variant rows for testing
+        return var_rows
+
+__bioscript__ = {
+    'variant_calls': get_vcs,
+    'classifier': ThalassemiaClassifier,
+    'name': 'THALASSEMIA',
+}
+
+from bioscript import VariantFixture
+from bioscript.types import MatchList
+import os
+
+# Create test fixtures for thalassemia-associated HBB variants (subset from thalassemia_clinvar.tsv)
+fixture = VariantFixture(
+    [
+        {'rsid': 'rs33985472', 'chromosome': '11', 'position': 5225485},
+        {'rsid': 'rs63751128', 'chromosome': '11', 'position': 5225487},
+        {'rsid': 'rs33978907', 'chromosome': '11', 'position': 5225488},
+        {'rsid': 'rs34809925', 'chromosome': '11', 'position': 5225592},
+        {'rsid': 'rs35117167', 'chromosome': '11', 'position': 5225605},
+        {'rsid': 'rs33971634', 'chromosome': '11', 'position': 5225660},
+    ],
+    assembly='GRCh38',
+)
+
+def test_thalassemia_heterozygous_variants():
+    """Test detection of heterozygous thalassemia-associated variants."""
+    variants = fixture(['TC', 'TC', 'AG', 'GG', 'TT', 'GG'])
+
+    # Create mini variant call list for testing
+    test_vcs = [
+        VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
+        VariantCall(rsid='rs63751128', ref='T', alt='C', chromosome='11', position=5225487, gene='HBB'),
+        VariantCall(rsid='rs33978907', ref='A', alt='G', chromosome='11', position=5225488, gene='HBB'),
+    ]
+
+    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
+    classifier = ThalassemiaClassifier(participant_id='TEST_HET', name='THALASSEMIA', filename='test.txt')
+    result = classifier(matches)
+
+    try:
+        assert len(result) == 3, f'Expected 3 variant rows, got {len(result)}'
+        assert all(row['gene'] == 'HBB' for row in result), 'All variants should be HBB'
+        assert all(row['match_type'] == 'VARIANT_CALL' for row in result), 'All should be variant calls'
+    finally:
+        # Cleanup output file even when an assertion fails
+        os.remove('result_THALASSEMIA_TEST_HET.tsv')
+
+def test_thalassemia_homozygous_variant():
+    """Test detection of a homozygous thalassemia-associated variant."""
+    variants = fixture(['TT', 'TT', 'AA', 'CC', 'TT', 'GG'])
+
+    test_vcs = [
+        VariantCall(rsid='rs34809925', ref='G', alt='C', chromosome='11', position=5225592, gene='HBB'),
+    ]
+
+    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
+    classifier = ThalassemiaClassifier(participant_id='TEST_HOM', name='THALASSEMIA', filename='test.txt')
+    result = classifier(matches)
+
+    try:
+        assert len(result) == 1, f'Expected 1 variant row, got {len(result)}'
+        assert result[0]['gene'] == 'HBB', 'Variant should be HBB'
+        assert result[0]['genotype'] == 'CC', 'Should be homozygous CC'
+    finally:
+        # Cleanup output file even when an assertion fails
+        os.remove('result_THALASSEMIA_TEST_HOM.tsv')
+
+def test_no_variants():
+    """Test classifier with no matching variants."""
+    variants = fixture(['TT', 'TT', 'AA', 'GG', 'TT', 'GG'])
+
+    test_vcs = [
+        VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
+    ]
+
+    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
+    classifier = ThalassemiaClassifier(participant_id='TEST_REF', name='THALASSEMIA', filename='test.txt')
+    result = classifier(matches)
+
+    try:
+        assert len(result) == 0, f'Expected 0 variant rows, got {len(result)}'
+    finally:
+        # Cleanup output file even when an assertion fails
+        os.remove('result_THALASSEMIA_TEST_REF.tsv')
diff --git a/examples/thalassemia/thalassemia-classifier/assets/classify_thalassemia.py b/examples/thalassemia/thalassemia-classifier/assets/classify_thalassemia.py
new file mode 100644
index 0000000..d40a8db
--- /dev/null
+++ b/examples/thalassemia/thalassemia-classifier/assets/classify_thalassemia.py
@@ -0,0 +1,156 @@
+import pandas as pd
+from bioscript import optional_int, optional_str, write_tsv
+from bioscript.classifier import GenotypeClassifier
+from bioscript.types import VariantCall
+from bioscript import assets_dir
+
+ASSETS_DIR = assets_dir()
+CLINVAR_TSV = 'thalassemia_clinvar.tsv'
+# Column order of the main per-participant report written by classify().
+RESULT_HEADERS = [
+    'participant_id',
+    'filename',
+    'gene',
+    'rsid',
+    'chromosome',
+    'position',
+    'genotype',
+    'ref',
+    'alt',
+    'variant_type',
+    'match_type',
+]
+
+def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
+    """Build one VariantCall per row of a ClinVar DataFrame.
+
+    The rsid, ref, alt, chromosome and position columns are required;
+    gene is read with row.get() and may therefore be absent.
+    """
+    return [
+        VariantCall(
+            rsid=optional_str(row["rsid"]),
+            ref=optional_str(row["ref"]),
+            alt=optional_str(row["alt"]),
+            chromosome=optional_str(row["chromosome"]),
+            position=optional_int(row["position"]),
+            gene=optional_str(row.get("gene"), upper=True),
+        )
+        for _, row in df.iterrows()
+    ]
+
+def get_vcs() -> list[VariantCall]:
+    """Load thalassemia-associated variant calls from a ClinVar TSV file."""
+    # NOTE(review): sep must match the delimiter actually used in the asset;
+    # for a .tsv file that is presumably a tab ('\t') - confirm.
+    df = pd.read_csv(ASSETS_DIR / CLINVAR_TSV, sep=' ')
+    print(f'Loaded {len(df)} variants from {CLINVAR_TSV}')
+    return generate_variant_calls(df)
+
+class ThalassemiaClassifier(GenotypeClassifier):
+    def classify(self, matches):
+        """Write the variant rows of matches to a TSV report and return them.
+
+        The *_ref.tsv and *_no.tsv files are only written when self.debug is set.
+        """
+        if not matches.all_matches:
+            print('No variant matches were found.', flush=True)
+
+        # Get categorized matches as report rows
+        ref_rows, var_rows, no_rows = matches.categorize_report_rows(
+            self.participant_id, self.filename
+        )
+
+        if self.debug:
+            write_tsv(f'{self.output_basename}_ref.tsv', ref_rows)
+            write_tsv(f'{self.output_basename}_no.tsv', no_rows)
+
+        # The main report is written unconditionally, even when var_rows is empty.
+        write_tsv(f'{self.output_basename}.tsv', var_rows, headers=RESULT_HEADERS)
+
+        # Return variant rows for testing
+        return var_rows
+
+__bioscript__ = {
+    'variant_calls': get_vcs,
+    'classifier': ThalassemiaClassifier,
+    'name': 'THALASSEMIA',
+}
+
+from bioscript import VariantFixture
+from bioscript.types import MatchList
+import os
+
+# Create test fixtures for thalassemia-associated HBB variants (subset from thalassemia_clinvar.tsv)
+fixture = VariantFixture(
+    [
+        {'rsid': 'rs33985472', 'chromosome': '11', 'position': 5225485},
+        {'rsid': 'rs63751128', 'chromosome': '11', 'position': 5225487},
+        {'rsid': 'rs33978907', 'chromosome': '11', 'position': 5225488},
+        {'rsid': 'rs34809925', 'chromosome': '11', 'position': 5225592},
+        {'rsid': 'rs35117167', 'chromosome': '11', 'position': 5225605},
+        {'rsid': 'rs33971634', 'chromosome': '11', 'position': 5225660},
+    ],
+    assembly='GRCh38',
+)
+
+def test_thalassemia_heterozygous_variants():
+    """Test detection of heterozygous thalassemia-associated variants."""
+    variants = fixture(['TC', 'TC', 'AG', 'GG', 'TT', 'GG'])
+
+    # Create mini variant call list for testing
+    test_vcs = [
+        VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
+        VariantCall(rsid='rs63751128', ref='T', alt='C', chromosome='11', position=5225487, gene='HBB'),
+        VariantCall(rsid='rs33978907', ref='A', alt='G', chromosome='11', position=5225488, gene='HBB'),
+    ]
+
+    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
+    classifier = ThalassemiaClassifier(participant_id='TEST_HET', name='THALASSEMIA', filename='test.txt')
+    result = classifier(matches)
+
+    try:
+        assert len(result) == 3, f'Expected 3 variant rows, got {len(result)}'
+        assert all(row['gene'] == 'HBB' for row in result), 'All variants should be HBB'
+        assert all(row['match_type'] == 'VARIANT_CALL' for row in result), 'All should be variant calls'
+    finally:
+        # Cleanup output file even when an assertion fails
+        os.remove('result_THALASSEMIA_TEST_HET.tsv')
+
+def test_thalassemia_homozygous_variant():
+    """Test detection of a homozygous thalassemia-associated variant."""
+    variants = fixture(['TT', 'TT', 'AA', 'CC', 'TT', 'GG'])
+
+    test_vcs = [
+        VariantCall(rsid='rs34809925', ref='G', alt='C', chromosome='11', position=5225592, gene='HBB'),
+    ]
+
+    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
+    classifier = ThalassemiaClassifier(participant_id='TEST_HOM', name='THALASSEMIA', filename='test.txt')
+    result = classifier(matches)
+
+    try:
+        assert len(result) == 1, f'Expected 1 variant row, got {len(result)}'
+        assert result[0]['gene'] == 'HBB', 'Variant should be HBB'
+        assert result[0]['genotype'] == 'CC', 'Should be homozygous CC'
+    finally:
+        # Cleanup output file even when an assertion fails
+        os.remove('result_THALASSEMIA_TEST_HOM.tsv')
+
+def test_no_variants():
+    """Test classifier with no matching variants."""
+    variants = fixture(['TT', 'TT', 'AA', 'GG', 'TT', 'GG'])
+
+    test_vcs = [
+        VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
+    ]
+
+    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
+    classifier = ThalassemiaClassifier(participant_id='TEST_REF', name='THALASSEMIA', filename='test.txt')
+    result = classifier(matches)
+
+    try:
+        assert len(result) == 0, f'Expected 0 variant rows, got {len(result)}'
+    finally:
+        # Cleanup output file even when an assertion fails
+        os.remove('result_THALASSEMIA_TEST_REF.tsv')
diff --git a/examples/thalassemia/thalassemia-classifier/assets/thalassemia_clinvar.tsv b/examples/thalassemia/thalassemia-classifier/assets/thalassemia_clinvar.tsv
new file mode 100644
index 0000000..74446a2
--- /dev/null
+++ b/examples/thalassemia/thalassemia-classifier/assets/thalassemia_clinvar.tsv
@@ -0,0 +1,230 @@
+rsid gene chromosome position ref alt clnrevstat clnsig clnvc
+rs33985472 HBB 11 5225485 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant
+rs63751128 HBB 11 5225487 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant
+rs33978907 HBB 11 5225488 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant
+rs34809925 HBB 11 5225592 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant
+rs33954264 HBB 11 5225602 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant
+rs35117167 HBB 11 5225605 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant
+rs33949869 HBB 11 5225606 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant
+rs33949869 HBB 11 5225606 A
T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35020585 HBB 11 5225607 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33918338 HBB 11 5225611 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33918338 HBB 11 5225611 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33927093 HBB 11 5225620 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs281864530 HBB 11 5225649 A T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33910569 HBB 11 5225659 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33971634 HBB 11 5225660 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33925391 HBB 11 5225662 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33946267 HBB 11 5225678 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33946267 HBB 11 5225678 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33946267 HBB 11 5225678 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35485099 HBB 11 5225695 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34945623 HBB 11 5225696 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs36015961 HBB 11 5225698 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35256489 HBB 11 5225710 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33969677 HBB 11 5225714 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant 
+rs33969677 HBB 11 5225714 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBB 11 5225715 G C criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs35519485 HBB 11 5225719 C T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33941844 HBB 11 5225722 A T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33952266 HBB 11 5225727 C A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs33952266 HBB 11 5225727 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33952266 HBB 11 5225727 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33914668 HBB 11 5225728 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33914668 HBB 11 5225728 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33913413 HBB 11 5225729 G C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33913413 HBB 11 5225729 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35703285 HBB 11 5225740 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34690599 HBB 11 5225832 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35328027 HBB 11 5225872 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34451549 HBB 11 5225923 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35099082 HBB 11 5226572 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750283 HBB 11 5226575 A C 
criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs33945777 HBB 11 5226576 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33945777 HBB 11 5226576 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33945777 HBB 11 5226576 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33954595 HBB 11 5226594 C A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33933298 HBB 11 5226597 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34515413 HBB 11 5226598 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBB 11 5226598 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33951978 HBB 11 5226599 T A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant + HBB 11 5226602 A C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs34083951 HBB 11 5226613 G C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs34083951 HBB 11 5226613 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33924775 HBB 11 5226615 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33917785 HBB 11 5226617 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35002698 HBB 11 5226619 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33913712 HBB 11 5226621 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33940204 HBB 11 5226626 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33993568 HBB 11 5226629 G A criteria_provided,_single_submitter Likely_pathogenic 
single_nucleotide_variant +rs35693898 HBB 11 5226635 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33991993 HBB 11 5226643 C G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33987903 HBB 11 5226644 T A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33972593 HBB 11 5226686 A G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33961459 HBB 11 5226687 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs36008922 HBB 11 5226690 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35939489 HBB 11 5226692 T G criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs35353749 HBB 11 5226696 T A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33922873 HBB 11 5226702 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33922873 HBB 11 5226702 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33995148 HBB 11 5226708 T A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBB 11 5226732 C T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33978338 HBB 11 5226755 A C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33922842 HBB 11 5226762 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34378160 HBB 11 5226764 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs11549407 HBB 11 5226774 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33974936 HBB 11 5226778 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant 
+rs33991059 HBB 11 5226779 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33982568 HBB 11 5226784 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs1141387 HBB 11 5226789 C A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs33948578 HBB 11 5226794 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33948578 HBB 11 5226794 A T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33943001 HBB 11 5226800 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33943001 HBB 11 5226800 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750513 HBB 11 5226801 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750513 HBB 11 5226801 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34527846 HBB 11 5226802 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35456885 HBB 11 5226814 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35004220 HBB 11 5226820 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35724775 HBB 11 5226924 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33915217 HBB 11 5226925 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33915217 HBB 11 5226925 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33915217 HBB 11 5226925 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33956879 HBB 11 5226928 A C 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33956879 HBB 11 5226928 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33956879 HBB 11 5226928 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33971440 HBB 11 5226929 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33971440 HBB 11 5226929 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBB 11 5226930 C A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33960103 HBB 11 5226930 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33960103 HBB 11 5226930 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35684407 HBB 11 5226931 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35684407 HBB 11 5226931 T G criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs35685286 HBB 11 5226933 C T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33916412 HBB 11 5226936 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33954632 HBB 11 5226939 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35424040 HBB 11 5226940 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33950507 HBB 11 5226943 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33950507 HBB 11 5226943 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33951465 HBB 11 5226947 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33959855 
HBB 11 5226955 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35890959 HBB 11 5226961 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33972047 HBB 11 5226963 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33986703 HBB 11 5226970 T A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs370075492 HBB 11 5226971 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34716011 HBB 11 5226974 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750783 HBB 11 5226975 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33926764 HBB 11 5226997 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs34948328 HBB 11 5227000 C A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs334 HBB 11 5227002 T A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33930165 HBB 11 5227003 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33930702 HBB 11 5227019 C A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33930702 HBB 11 5227019 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33941849 HBB 11 5227020 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33941849 HBB 11 5227020 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33941849 HBB 11 5227020 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34563000 HBB 11 5227021 T C 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34563000 HBB 11 5227021 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34135787 HBB 11 5227039 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34704828 HBB 11 5227050 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34305195 HBB 11 5227071 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33931746 HBB 11 5227099 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33931746 HBB 11 5227099 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34598529 HBB 11 5227100 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34598529 HBB 11 5227100 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33980857 HBB 11 5227101 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33980857 HBB 11 5227101 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33981098 HBB 11 5227102 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs281864518 HBB 11 5227142 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33994806 HBB 11 5227157 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33994806 HBB 11 5227157 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33994806 HBB 11 5227157 G T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33941377 
HBB 11 5227158 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33941377 HBB 11 5227158 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33941377 HBB 11 5227158 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33944208 HBB 11 5227159 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33944208 HBB 11 5227159 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34999973 HBB 11 5227161 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34883338 HBB 11 5227163 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs63751208 HBB 11 5227172 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63751208 HBB 11 5227172 G C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41469945 HBA1 16 173581 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41464951 HBA1 16 173598 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34220980 HBA1 16 176717 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs1316527998 HBA1 16 176718 T A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant + HBA1 16 176718 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs1316527998 HBA1 16 176718 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33964317 HBA1 16 176759 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic 
single_nucleotide_variant +rs63750090 HBA1 16 176760 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs1596573335 HBA1 16 176761 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBA1 16 176811 G A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs1201093320 HBA1 16 176812 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBA1 16 176813 T C criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs1298836193 HBA1 16 176927 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs34883113 HBA1 16 176928 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs1455943416 HBA1 16 176932 G A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs33978134 HBA1 16 176967 C T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864895 HBA1 16 177011 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs28928878 HBA1 16 177012 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBA1 16 177017 A T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs28928876 HBA1 16 177095 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs1318210119 HBA1 16 177097 C G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs758093235 HBA1 16 177134 G A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs758093235 HBA1 16 177134 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs756810015 HBA1 16 177308 C A criteria_provided,_single_submitter 
Likely_pathogenic single_nucleotide_variant + HBA1 16 177311 T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs63749948 HBA1 16 177314 C A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs63750751 HBA1 16 177340 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35993655 HBA1 16 177371 T C criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs121909803 HBA2 16 172913 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs111033603 HBA2 16 172914 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs111033603 HBA2 16 172914 T G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs63750367 HBA2 16 172957 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs63751457 HBA2 16 172981 C T criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs281864819 HBA2 16 172982 G T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864821 HBA2 16 172985 T G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864550 HBA2 16 172987 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41341344 HBA2 16 173001 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750158 HBA2 16 173008 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750158 HBA2 16 173008 G C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41457746 HBA2 16 173123 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBA2 16 173127 T C criteria_provided,_single_submitter 
Likely_pathogenic single_nucleotide_variant +rs41515552 HBA2 16 173128 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750776 HBA2 16 173135 T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864829 HBA2 16 173159 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs281864834 HBA2 16 173171 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs281864840 HBA2 16 173192 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41328049 HBA2 16 173207 G C criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs281864846 HBA2 16 173208 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs41323248 HBA2 16 173229 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBA2 16 173271 T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864863 HBA2 16 173271 T G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant + HBA2 16 173304 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs281864878 HBA2 16 173312 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs281864878 HBA2 16 173312 G T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs780091398 HBA2 16 173330 G A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs587776827 HBA2 16 173471 G A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs1263969213 HBA2 16 173484 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41417548 HBA2 16 173485 G A 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs1262943621 HBA2 16 173529 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBA2 16 173544 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41397847 HBA2 16 173548 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs41397847 HBA2 16 173548 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33933481 HBA2 16 173550 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs281864889 HBA2 16 173560 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41469945 HBA1 16 173581 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs55870409 HBA2 16 173592 T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs41464951 HBA2 16 173598 T A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs41464951 HBA1 16 173598 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41464951 HBA2 16 173598 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41321345 HBA2 16 173599 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBA2 16 173599 A T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41412046 HBA2 16 173600 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750067 HBA2 16 173692 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs63751269 HBA2 16 173694 A G 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35849348 HBD 11 5232998 C T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs36023765 HBD 11 5233094 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35103459 HBG2 11 5254330 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs34474104 HBG2 11 5254417 G A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs1438114920 HBG2 11 5254643 A T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs1278163109 HBG2 11 5254644 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant diff --git a/examples/thalassemia/thalassemia-classifier/pipeline.yaml b/examples/thalassemia/thalassemia-classifier/pipeline.yaml new file mode 100644 index 0000000..5a66a61 --- /dev/null +++ b/examples/thalassemia/thalassemia-classifier/pipeline.yaml @@ -0,0 +1,18 @@ +name: thalassemia-classifier +version: 0.1.1 +inputs: + samplesheet: List[GenotypeRecord] +steps: +- id: thalassemia + uses: ./ + with: + participants: inputs.samplesheet + publish: + classification_result: File(result_THALASSEMIA.tsv) + store: + counts_sql: + kind: sql + destination: SQL() + source: classification_result + table_name: thalassemia_{run_id} + participant_column: participant_id diff --git a/examples/thalassemia/thalassemia-classifier/project.yaml b/examples/thalassemia/thalassemia-classifier/project.yaml new file mode 100644 index 0000000..aa7cb97 --- /dev/null +++ b/examples/thalassemia/thalassemia-classifier/project.yaml @@ -0,0 +1,23 @@ +name: thalassemia-classifier +author: madhava@openmined.org +workflow: workflow.nf +template: dynamic-nextflow +version: 0.1.1 +assets: +- classify_thalassemia.py +- thalassemia_clinvar.tsv +description: Classification of thalassemia-associated variants using ClinVar reference data. 
+inputs: +- name: participants + type: List[GenotypeRecord] + description: CSV/TSV with participant_id and genotype_file columns + format: csv + mapping: + participant_id: participant_id + genotype_file: genotype_file +outputs: +- name: classification_result + type: File + description: Thalassemia variant classification (aggregated) + format: tsv + path: result_THALASSEMIA.tsv diff --git a/examples/thalassemia/thalassemia-classifier/workflow.nf b/examples/thalassemia/thalassemia-classifier/workflow.nf new file mode 100644 index 0000000..180259b --- /dev/null +++ b/examples/thalassemia/thalassemia-classifier/workflow.nf @@ -0,0 +1,77 @@ +// BioVault workflow export v0.1.1 + +nextflow.enable.dsl=2 + +workflow USER { + take: + context + participants // Channel emitting GenotypeRecord maps + + main: + def assetsDir = context.assets_dir + if (!assetsDir) { + throw new IllegalStateException("Missing assets directory in context") + } + def assetsDirPath = file(assetsDir) + + // Pair the assets directory with each (participant_id, genotype_file) tuple + def participant_work_items = participants.map { record -> + tuple( + assetsDirPath, + record.participant_id, + file(record.genotype_file) + ) + } + + // Process each participant + def per_participant_results = thalassemia_classifier( + participant_work_items + ) + + // Aggregate all results into single file + def aggregated = aggregate_results( + per_participant_results.collect() + ) + + emit: + classification_result = aggregated +} + +process thalassemia_classifier { + container 'ghcr.io/openmined/bioscript:0.1.5' + publishDir params.results_dir, mode: 'copy', overwrite: true, pattern: 'result_THALASSEMIA_*.tsv' + tag { participant_id } + errorStrategy { params.nextflow.error_strategy } + maxRetries { params.nextflow.max_retries } + + input: + tuple path(assets_dir), val(participant_id), path(genotype_file) + + output: + path "result_THALASSEMIA_${participant_id}.tsv" + + script: + def genoFileName = 
genotype_file.getName() + """ + GENO_FILE=\$(printf '%q' "${genoFileName}") + bioscript classify "${assets_dir}/classify_thalassemia.py" --file \$GENO_FILE --participant_id "${participant_id}" + """ +} + +process aggregate_results { + container 'ghcr.io/openmined/bioscript:0.1.5' + publishDir params.results_dir, mode: 'copy', overwrite: true + + input: + path individual_results + + output: + path "result_THALASSEMIA.tsv" + + script: + def manifestContent = individual_results.collect { it.toString() }.join('\n') + '\n' + """ + cat <<'EOF' > results.list\n${manifestContent}EOF + bioscript combine --list results.list --output result_THALASSEMIA.tsv + """ +} diff --git a/examples/thalassemia/thalassemia_clinvar.tsv b/examples/thalassemia/thalassemia_clinvar.tsv new file mode 100644 index 0000000..74446a2 --- /dev/null +++ b/examples/thalassemia/thalassemia_clinvar.tsv @@ -0,0 +1,230 @@ +rsid gene chromosome position ref alt clnrevstat clnsig clnvc +rs33985472 HBB 11 5225485 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63751128 HBB 11 5225487 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33978907 HBB 11 5225488 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34809925 HBB 11 5225592 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33954264 HBB 11 5225602 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35117167 HBB 11 5225605 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33949869 HBB 11 5225606 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33949869 HBB 11 5225606 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35020585 HBB 11 5225607 C G criteria_provided,_single_submitter Pathogenic 
single_nucleotide_variant +rs33918338 HBB 11 5225611 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33918338 HBB 11 5225611 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33927093 HBB 11 5225620 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs281864530 HBB 11 5225649 A T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33910569 HBB 11 5225659 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33971634 HBB 11 5225660 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33925391 HBB 11 5225662 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33946267 HBB 11 5225678 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33946267 HBB 11 5225678 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33946267 HBB 11 5225678 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35485099 HBB 11 5225695 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34945623 HBB 11 5225696 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs36015961 HBB 11 5225698 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35256489 HBB 11 5225710 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33969677 HBB 11 5225714 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33969677 HBB 11 5225714 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBB 11 5225715 G C 
criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs35519485 HBB 11 5225719 C T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33941844 HBB 11 5225722 A T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33952266 HBB 11 5225727 C A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs33952266 HBB 11 5225727 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33952266 HBB 11 5225727 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33914668 HBB 11 5225728 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33914668 HBB 11 5225728 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33913413 HBB 11 5225729 G C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33913413 HBB 11 5225729 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35703285 HBB 11 5225740 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34690599 HBB 11 5225832 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35328027 HBB 11 5225872 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34451549 HBB 11 5225923 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35099082 HBB 11 5226572 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750283 HBB 11 5226575 A C criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs33945777 HBB 11 5226576 C A 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33945777 HBB 11 5226576 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33945777 HBB 11 5226576 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33954595 HBB 11 5226594 C A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33933298 HBB 11 5226597 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34515413 HBB 11 5226598 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBB 11 5226598 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33951978 HBB 11 5226599 T A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant + HBB 11 5226602 A C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs34083951 HBB 11 5226613 G C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs34083951 HBB 11 5226613 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33924775 HBB 11 5226615 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33917785 HBB 11 5226617 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35002698 HBB 11 5226619 C G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33913712 HBB 11 5226621 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33940204 HBB 11 5226626 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33993568 HBB 11 5226629 G A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs35693898 HBB 11 5226635 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33991993 HBB 11 
5226643 C G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33987903 HBB 11 5226644 T A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33972593 HBB 11 5226686 A G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33961459 HBB 11 5226687 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs36008922 HBB 11 5226690 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35939489 HBB 11 5226692 T G criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs35353749 HBB 11 5226696 T A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33922873 HBB 11 5226702 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33922873 HBB 11 5226702 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33995148 HBB 11 5226708 T A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBB 11 5226732 C T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33978338 HBB 11 5226755 A C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33922842 HBB 11 5226762 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34378160 HBB 11 5226764 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs11549407 HBB 11 5226774 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33974936 HBB 11 5226778 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33991059 HBB 11 5226779 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33982568 HBB 11 5226784 G T 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs1141387 HBB 11 5226789 C A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs33948578 HBB 11 5226794 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33948578 HBB 11 5226794 A T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33943001 HBB 11 5226800 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33943001 HBB 11 5226800 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750513 HBB 11 5226801 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750513 HBB 11 5226801 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34527846 HBB 11 5226802 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35456885 HBB 11 5226814 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35004220 HBB 11 5226820 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35724775 HBB 11 5226924 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33915217 HBB 11 5226925 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33915217 HBB 11 5226925 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33915217 HBB 11 5226925 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33956879 HBB 11 5226928 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33956879 HBB 11 5226928 A G 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33956879 HBB 11 5226928 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33971440 HBB 11 5226929 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33971440 HBB 11 5226929 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBB 11 5226930 C A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33960103 HBB 11 5226930 C G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33960103 HBB 11 5226930 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs35684407 HBB 11 5226931 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35684407 HBB 11 5226931 T G criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs35685286 HBB 11 5226933 C T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33916412 HBB 11 5226936 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33954632 HBB 11 5226939 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35424040 HBB 11 5226940 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33950507 HBB 11 5226943 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33950507 HBB 11 5226943 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33951465 HBB 11 5226947 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33959855 HBB 11 5226955 C A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35890959 
HBB 11 5226961 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33972047 HBB 11 5226963 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33986703 HBB 11 5226970 T A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs370075492 HBB 11 5226971 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34716011 HBB 11 5226974 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750783 HBB 11 5226975 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33926764 HBB 11 5226997 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs34948328 HBB 11 5227000 C A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs334 HBB 11 5227002 T A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33930165 HBB 11 5227003 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33930702 HBB 11 5227019 C A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33930702 HBB 11 5227019 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33941849 HBB 11 5227020 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33941849 HBB 11 5227020 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33941849 HBB 11 5227020 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34563000 HBB 11 5227021 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34563000 HBB 11 5227021 T G 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34135787 HBB 11 5227039 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34704828 HBB 11 5227050 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34305195 HBB 11 5227071 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33931746 HBB 11 5227099 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33931746 HBB 11 5227099 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34598529 HBB 11 5227100 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs34598529 HBB 11 5227100 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33980857 HBB 11 5227101 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33980857 HBB 11 5227101 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33981098 HBB 11 5227102 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs281864518 HBB 11 5227142 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33994806 HBB 11 5227157 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33994806 HBB 11 5227157 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33994806 HBB 11 5227157 G T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs33941377 HBB 11 5227158 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic 
single_nucleotide_variant +rs33941377 HBB 11 5227158 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33941377 HBB 11 5227158 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs33944208 HBB 11 5227159 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33944208 HBB 11 5227159 G T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34999973 HBB 11 5227161 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34883338 HBB 11 5227163 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs63751208 HBB 11 5227172 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63751208 HBB 11 5227172 G C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41469945 HBA1 16 173581 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41464951 HBA1 16 173598 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs34220980 HBA1 16 176717 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs1316527998 HBA1 16 176718 T A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant + HBA1 16 176718 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs1316527998 HBA1 16 176718 T G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs33964317 HBA1 16 176759 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs63750090 HBA1 16 176760 G A 
criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs1596573335 HBA1 16 176761 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBA1 16 176811 G A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs1201093320 HBA1 16 176812 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBA1 16 176813 T C criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs1298836193 HBA1 16 176927 A G criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs34883113 HBA1 16 176928 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs1455943416 HBA1 16 176932 G A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs33978134 HBA1 16 176967 C T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864895 HBA1 16 177011 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs28928878 HBA1 16 177012 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBA1 16 177017 A T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs28928876 HBA1 16 177095 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs1318210119 HBA1 16 177097 C G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs758093235 HBA1 16 177134 G A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs758093235 HBA1 16 177134 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs756810015 HBA1 16 177308 C A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant + HBA1 16 177311 
T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs63749948 HBA1 16 177314 C A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs63750751 HBA1 16 177340 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35993655 HBA1 16 177371 T C criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs121909803 HBA2 16 172913 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs111033603 HBA2 16 172914 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs111033603 HBA2 16 172914 T G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs63750367 HBA2 16 172957 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs63751457 HBA2 16 172981 C T criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs281864819 HBA2 16 172982 G T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864821 HBA2 16 172985 T G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864550 HBA2 16 172987 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41341344 HBA2 16 173001 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750158 HBA2 16 173008 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750158 HBA2 16 173008 G C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41457746 HBA2 16 173123 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant + HBA2 16 173127 T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs41515552 HBA2 
16 173128 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750776 HBA2 16 173135 T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864829 HBA2 16 173159 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs281864834 HBA2 16 173171 G C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs281864840 HBA2 16 173192 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41328049 HBA2 16 173207 G C criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs281864846 HBA2 16 173208 G A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs41323248 HBA2 16 173229 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBA2 16 173271 T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs281864863 HBA2 16 173271 T G criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant + HBA2 16 173304 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs281864878 HBA2 16 173312 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs281864878 HBA2 16 173312 G T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs780091398 HBA2 16 173330 G A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs587776827 HBA2 16 173471 G A criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs1263969213 HBA2 16 173484 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41417548 HBA2 16 173485 G A criteria_provided,_multiple_submitters,_no_conflicts 
Pathogenic/Likely_pathogenic single_nucleotide_variant +rs1262943621 HBA2 16 173529 C T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBA2 16 173544 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41397847 HBA2 16 173548 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs41397847 HBA2 16 173548 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs33933481 HBA2 16 173550 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs281864889 HBA2 16 173560 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41469945 HBA1 16 173581 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs55870409 HBA2 16 173592 T C criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs41464951 HBA2 16 173598 T A criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs41464951 HBA1 16 173598 T C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41464951 HBA2 16 173598 T G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs41321345 HBA2 16 173599 A C criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant + HBA2 16 173599 A T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs41412046 HBA2 16 173600 A T criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs63750067 HBA2 16 173692 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic/Likely_pathogenic single_nucleotide_variant +rs63751269 HBA2 16 173694 A G criteria_provided,_multiple_submitters,_no_conflicts Pathogenic single_nucleotide_variant +rs35849348 
HBD 11 5232998 C T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs36023765 HBD 11 5233094 T C criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs35103459 HBG2 11 5254330 G A criteria_provided,_single_submitter Pathogenic single_nucleotide_variant +rs34474104 HBG2 11 5254417 G A criteria_provided,_multiple_submitters,_no_conflicts Likely_pathogenic single_nucleotide_variant +rs1438114920 HBG2 11 5254643 A T criteria_provided,_single_submitter Likely_pathogenic single_nucleotide_variant +rs1278163109 HBG2 11 5254644 G T criteria_provided,_single_submitter Pathogenic single_nucleotide_variant diff --git a/examples/thalassemia/thalassemia_dev.ipynb b/examples/thalassemia/thalassemia_dev.ipynb new file mode 100644 index 0000000..af9cd84 --- /dev/null +++ b/examples/thalassemia/thalassemia_dev.ipynb @@ -0,0 +1,482 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Thalassemia (HBB)\n", + "\n", + "This notebook shows how to develop a classifier with embedded tests in Jupyter using thalassemia-associated ClinVar variants.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !uv pip install -e ../../python" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from bioscript import optional_int, optional_str, write_tsv\n", + "from bioscript.classifier import GenotypeClassifier\n", + "from bioscript.types import VariantCall\n", + "from bioscript import assets_dir" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "ASSETS_DIR = assets_dir()\n", + "CLINVAR_TSV = 'thalassemia_clinvar.tsv'\n", + "RESULT_HEADERS = [\n", + " 'participant_id',\n", + " 'filename',\n", + " 'gene',\n", + " 'rsid',\n", + " 'chromosome',\n", + " 'position',\n", + " 'genotype',\n", + " 'ref',\n", + " 
'alt',\n", + " 'variant_type',\n", + " 'match_type',\n", + "]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:\n", + " \"\"\"Generate VariantCall objects from ClinVar DataFrame.\"\"\"\n", + " vcs: list[VariantCall] = []\n", + " for _, row in df.iterrows():\n", + " vcs.append(\n", + " VariantCall(\n", + " rsid=optional_str(row[\"rsid\"]),\n", + " ref=optional_str(row[\"ref\"]),\n", + " alt=optional_str(row[\"alt\"]),\n", + " chromosome=optional_str(row[\"chromosome\"]),\n", + " position=optional_int(row[\"position\"]),\n", + " gene=optional_str(row.get(\"gene\"), upper=True),\n", + " )\n", + " )\n", + " return vcs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_vcs() -> list[VariantCall]:\n", + " \"\"\"Load thalassemia-associated variant calls from a ClinVar TSV file.\"\"\"\n", + " df = pd.read_csv(ASSETS_DIR / CLINVAR_TSV, sep='\\t')\n", + " print(f'Loaded {len(df)} variants from {CLINVAR_TSV}')\n", + " return generate_variant_calls(df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "class ThalassemiaClassifier(GenotypeClassifier):\n", + " def classify(self, matches):\n", + " \"\"\"Classify thalassemia-associated variants and write results to TSV files.\"\"\"\n", + " if not matches.all_matches:\n", + " print('No variant matches were found.', flush=True)\n", + "\n", + " # Get categorized matches as report rows\n", + " ref_rows, var_rows, no_rows = matches.categorize_report_rows(\n", + " self.participant_id, self.filename\n", + " )\n", + "\n", + " if self.debug:\n", + " write_tsv(f'{self.output_basename}_ref.tsv', ref_rows)\n", + " write_tsv(f'{self.output_basename}_no.tsv', no_rows)\n", + "\n", + " write_tsv(f'{self.output_basename}.tsv', var_rows, headers=RESULT_HEADERS)\n", + "\n", + " # Return
variant rows for testing\n", + " return var_rows\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "__bioscript__ = {\n", + " 'variant_calls': get_vcs,\n", + " 'classifier': ThalassemiaClassifier,\n", + " 'name': 'THALASSEMIA',\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tests\n", + "\n", + "Write tests using the test_* function convention:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from bioscript import VariantFixture\n", + "from bioscript.types import MatchList\n", + "import os\n", + "\n", + "# Create test fixtures for thalassemia-associated HBB variants (subset from thalassemia_clinvar.tsv)\n", + "fixture = VariantFixture(\n", + " [\n", + " {'rsid': 'rs33985472', 'chromosome': '11', 'position': 5225485},\n", + " {'rsid': 'rs63751128', 'chromosome': '11', 'position': 5225487},\n", + " {'rsid': 'rs33978907', 'chromosome': '11', 'position': 5225488},\n", + " {'rsid': 'rs34809925', 'chromosome': '11', 'position': 5225592},\n", + " {'rsid': 'rs35117167', 'chromosome': '11', 'position': 5225605},\n", + " {'rsid': 'rs33971634', 'chromosome': '11', 'position': 5225660},\n", + " ],\n", + " assembly='GRCh38',\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def test_thalassemia_heterozygous_variants():\n", + " \"\"\"Test detection of heterozygous thalassemia-associated variants.\"\"\"\n", + " variants = fixture(['TC', 'TC', 'AG', 'GG', 'TT', 'GG'])\n", + "\n", + " # Create mini variant call list for testing\n", + " test_vcs = [\n", + " VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),\n", + " VariantCall(rsid='rs63751128', ref='T', alt='C', chromosome='11', position=5225487, gene='HBB'),\n", + " VariantCall(rsid='rs33978907', ref='A', alt='G', chromosome='11', 
position=5225488, gene='HBB'),\n", + " ]\n", + "\n", + " matches = MatchList(variant_calls=test_vcs).match_rows(variants)\n", + " classifier = ThalassemiaClassifier(participant_id='TEST_HET', name='THALASSEMIA', filename='test.txt')\n", + " result = classifier(matches)\n", + "\n", + " assert len(result) == 3, f'Expected 3 variant rows, got {len(result)}'\n", + " assert all(row['gene'] == 'HBB' for row in result), 'All variants should be HBB'\n", + " assert all(row['match_type'] == 'VARIANT_CALL' for row in result), 'All should be variant calls'\n", + "\n", + " # Cleanup output file\n", + " os.remove('result_THALASSEMIA_TEST_HET.tsv')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def test_thalassemia_homozygous_variant():\n", + " \"\"\"Test detection of a homozygous thalassemia-associated variant.\"\"\"\n", + " variants = fixture(['TT', 'TT', 'AA', 'CC', 'TT', 'GG'])\n", + "\n", + " test_vcs = [\n", + " VariantCall(rsid='rs34809925', ref='G', alt='C', chromosome='11', position=5225592, gene='HBB'),\n", + " ]\n", + "\n", + " matches = MatchList(variant_calls=test_vcs).match_rows(variants)\n", + " classifier = ThalassemiaClassifier(participant_id='TEST_HOM', name='THALASSEMIA', filename='test.txt')\n", + " result = classifier(matches)\n", + "\n", + " assert len(result) == 1, f'Expected 1 variant row, got {len(result)}'\n", + " assert result[0]['gene'] == 'HBB', 'Variant should be HBB'\n", + " assert result[0]['genotype'] == 'CC', 'Should be homozygous CC'\n", + "\n", + " # Cleanup output file\n", + " os.remove('result_THALASSEMIA_TEST_HOM.tsv')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def test_no_variants():\n", + " \"\"\"Test classifier with no matching variants.\"\"\"\n", + " variants = fixture(['TT', 'TT', 'AA', 'GG', 'TT', 'GG'])\n", + "\n", + " test_vcs = [\n", + " VariantCall(rsid='rs33985472', ref='T', alt='C', 
chromosome='11', position=5225485, gene='HBB'),\n", + " ]\n", + "\n", + " matches = MatchList(variant_calls=test_vcs).match_rows(variants)\n", + " classifier = ThalassemiaClassifier(participant_id='TEST_REF', name='THALASSEMIA', filename='test.txt')\n", + " result = classifier(matches)\n", + "\n", + " assert len(result) == 0, f'Expected 0 variant rows, got {len(result)}'\n", + "\n", + " # Cleanup output file\n", + " os.remove('result_THALASSEMIA_TEST_REF.tsv')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Tests in Jupyter\n", + "\n", + "You can run tests directly in the notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ All tests passed!\n" + ] + } + ], + "source": [ + "# Run tests\n", + "test_thalassemia_heterozygous_variants()\n", + "test_thalassemia_homozygous_variant()\n", + "test_no_variants()\n", + "print('✓ All tests passed!')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export to Python Module\n", + "\n", + "Export this notebook to a Python file:\n", + "\n", + "```bash\n", + "bioscript export thalassemia_dev.ipynb -o classify_thalassemia.py\n", + "```\n", + "\n", + "Or in Python:\n", + "\n", + "```python\n", + "from bioscript import export_from_notebook\n", + "export_from_notebook('thalassemia_dev.ipynb', 'classify_thalassemia.py')\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('classify_thalassemia.py')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bioscript import export_from_notebook\n", + "export_from_notebook('thalassemia_dev.ipynb', 'classify_thalassemia.py')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "Testing: classify_thalassemia.py\n", + "============================================================\n", + "Running tests with pytest: classify_thalassemia.py\n", + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform darwin -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- /Users/madhavajay/dev/bioscript/workspace2/.venv/bin/python3\n", + "cachedir: .pytest_cache\n", + "rootdir: /Users/madhavajay/dev/bioscript/workspace2/examples/thalassemia\n", + "plugins: anyio-4.12.0\n", + "collected 3 items \u001b[0m\n", + "\n", + "classify_thalassemia.py::test_thalassemia_heterozygous_variants \u001b[32mPASSED\u001b[0m\u001b[32m [ 33%]\u001b[0m\n", + "classify_thalassemia.py::test_thalassemia_homozygous_variant \u001b[32mPASSED\u001b[0m\u001b[32m [ 66%]\u001b[0m\n", + "classify_thalassemia.py::test_no_variants \u001b[32mPASSED\u001b[0m\u001b[32m [100%]\u001b[0m\n", + "\n", + "\u001b[32m============================== \u001b[32m\u001b[1m3 passed\u001b[0m\u001b[32m in 0.03s\u001b[0m\u001b[32m ===============================\u001b[0m\n" + ] + } + ], + "source": [ + "!bioscript test classify_thalassemia.py\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BioVaultProject(name='thalassemia-classifier', author='madhava@openmined.org', workflow='workflow.nf', description='Classification of thalassemia-associated variants using ClinVar reference data.', template=, version='0.1.1', assets=['classify_thalassemia.py', 'thalassemia_clinvar.tsv'], parameters=[], inputs=[Input(name='participants', type='List[GenotypeRecord]', description='CSV/TSV with participant_id and genotype_file columns', format='csv', path=None, mapping={'participant_id': 'participant_id', 'genotype_file': 'genotype_file'}, cli_flag=None)], 
outputs=[Output(name='classification_result', type='File', description='Thalassemia variant classification (aggregated)', format='tsv', path='result_THALASSEMIA.tsv', cli_flag=None)], processes=[ProcessDefinition(name='thalassemia_classifier', script='classify_thalassemia.py', container='ghcr.io/openmined/bioscript:0.1.5', kind='bioscript')], docker_image='ghcr.io/openmined/bioscript:0.1.5', docker_platform='linux/amd64')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bioscript import export_bioscript_workflow\n", + "\n", + "project = export_bioscript_workflow(\n", + " script_path='./classify_thalassemia.py',\n", + " workflow_name='thalassemia-classifier',\n", + " author='madhava@openmined.org',\n", + " target_dir='./',\n", + " assets={\n", + " 'thalassemia_clinvar.tsv',\n", + " },\n", + " inputs=[\n", + " {\n", + " 'name': 'participants',\n", + " 'type': 'List[GenotypeRecord]',\n", + " 'description': 'CSV/TSV with participant_id and genotype_file columns',\n", + " 'format': 'csv',\n", + " 'mapping': {\n", + " 'participant_id': 'participant_id',\n", + " 'genotype_file': 'genotype_file',\n", + " },\n", + " }\n", + " ],\n", + " outputs=[\n", + " {\n", + " 'name': 'classification_result',\n", + " 'type': 'File',\n", + " 'description': 'Thalassemia variant classification (aggregated)',\n", + " 'format': 'tsv',\n", + " 'path': 'result_THALASSEMIA.tsv',\n", + " },\n", + " ],\n", + " version='0.1.1',\n", + " description='Classification of thalassemia-associated variants using ClinVar reference data.',\n", + ")\n", + "project\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BioVaultPipeline(name='thalassemia-classifier', inputs={'samplesheet': 'List[GenotypeRecord]'}, steps=[PipelineStep(step_id='thalassemia', uses='./', with_args={'participants': 'inputs.samplesheet'}, publish={'classification_result': 
'File(result_THALASSEMIA.tsv)'}, store={'counts_sql': SQLStore(source='classification_result', table_name='thalassemia_{run_id}', destination='SQL()', participant_column='participant_id', key_column='participant_id')})], version='0.1.1')" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bioscript import export_bioscript_pipeline, PipelineStep, SQLStore\n", + "\n", + "pipeline = export_bioscript_pipeline(\n", + " pipeline_name='thalassemia-classifier',\n", + " target_dir='./thalassemia-classifier',\n", + " inputs={\n", + " 'samplesheet': 'List[GenotypeRecord]',\n", + " },\n", + " steps=[\n", + " PipelineStep(\n", + " step_id='thalassemia',\n", + " uses='./',\n", + " with_args={\n", + " 'participants': 'inputs.samplesheet',\n", + " },\n", + " publish={\n", + " 'classification_result': 'File(result_THALASSEMIA.tsv)',\n", + " },\n", + " store={\n", + " 'counts_sql': SQLStore(\n", + " source='classification_result',\n", + " table_name='thalassemia_{run_id}',\n", + " destination='SQL()',\n", + " key_column='participant_id',\n", + " ),\n", + " },\n", + " ),\n", + " ],\n", + " version='0.1.1',\n", + ")\n", + "pipeline\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}