Skip to content

Commit 269479e

Browse files
authored
Merge pull request #443 from apriltuesday/issue-435_2
Issue 435: Filter gene-related disorder submission from curation and evidence generation
2 parents 86cc604 + facb45d commit 269479e

File tree

12 files changed

+63
-32
lines changed

12 files changed

+63
-32
lines changed

bin/trait_mapping/parse_traits.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55

66
if __name__ == '__main__':
77
parser = argparse.ArgumentParser(description="Parse traits from ClinVar XML")
8-
parser.add_argument("-i", dest="input_filepath", required=True,
9-
help="ClinVar XML dump file. One record per line.")
8+
parser.add_argument("-i", dest="input_filepath", required=True, help="ClinVar XML dump file.")
109
parser.add_argument("-o", dest="output_traits_filepath", required=True,
1110
help="path to output file for all traits for downstream processing")
1211
parser.add_argument("-u", dest="output_for_platform", required=False,

cmat/clinvar_xml_io/clinvar_dataset.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from datetime import date
55

66
from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
7-
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes
7+
from cmat.clinvar_xml_io.clinvar_set import ClinVarSet
8+
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes, iterate_cvs_from_xml
89

910
logger = logging.getLogger(__name__)
1011
logger.setLevel(logging.INFO)
@@ -22,6 +23,10 @@ def __iter__(self):
2223
for rcv in iterate_rcv_from_xml(self.clinvar_xml):
2324
yield ClinVarReferenceRecord(rcv, self.xsd_version)
2425

26+
def iter_cvs(self):
    """Yield one ClinVarSet per element produced by iterate_cvs_from_xml for this dataset's XML.

    Every yielded set is built with the dataset's ``xsd_version``.
    """
    for cvs in iterate_cvs_from_xml(self.clinvar_xml):
        yield ClinVarSet(cvs, self.xsd_version)
29+
2530
def get_xsd_version(self):
2631
# For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md
2732
if 'xsi:noNamespaceSchemaLocation' in self.header_attr:

cmat/clinvar_xml_io/clinvar_reference_record.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from functools import cached_property
33

44
from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification
5-
65
from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
76
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements
87

cmat/clinvar_xml_io/clinvar_set.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from cmat.clinvar_xml_io import ClinVarRecord
1+
from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
22
from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
33
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements
44

@@ -12,7 +12,7 @@ def __init__(self, cvs_xml, xsd_version):
1212
self.cvs_xml = cvs_xml
1313

1414
rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
15-
self.rcv = ClinVarRecord(rcv_elem, xsd_version)
15+
self.rcv = ClinVarReferenceRecord(rcv_elem, xsd_version)
1616

1717
scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True)
1818
self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems]

cmat/clinvar_xml_io/clinvar_submitted_record.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
from functools import cached_property
33

4-
from cmat.clinvar_xml_io import ClinVarRecord
4+
from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
55
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
66

77
logger = logging.getLogger(__name__)

cmat/clinvar_xml_io/filtering.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Filtering functions that can be used in multiple pipelines.

# Identified as problematic submissions, e.g. too many unmappable trait names.
submission_names_to_exclude = ['SUB14299258']


def filter_by_submission_name(clinvar_set):
    """Return False (i.e. filter out) if every submitted record in the set has submission_name in the exclusion list.

    :param clinvar_set: object exposing ``scvs``, an iterable of submitted records that each carry a
        ``submission_name`` attribute
    :return: True (keep the set) as soon as one submitted record has a submission name that is not in
        ``submission_names_to_exclude``; False (filter out) otherwise. A set without any submitted
        records also yields False.
    """
    # The exclusion list is looked up at call time, so callers may extend it after import.
    return any(
        submitted_record.submission_name not in submission_names_to_exclude
        for submitted_record in clinvar_set.scvs
    )

cmat/output_generation/clinvar_to_evidence_strings.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from cmat.clinvar_xml_io import ClinVarDataset
1212
from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
13+
from cmat.clinvar_xml_io.filtering import filter_by_submission_name
1314
from cmat.output_generation import consequence_type as CT
1415
from cmat.output_generation.report import Report
1516

@@ -64,7 +65,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
6465

6566
logger.info('Processing ClinVar records')
6667
i = -1
67-
for clinvar_record in ClinVarDataset(clinvar_xml):
68+
dataset = ClinVarDataset(clinvar_xml)
69+
for clinvar_set in dataset.iter_cvs():
6870
# If start & end provided, only process records in the range [start, end)
6971
i += 1
7072
if start and i < start:
@@ -78,7 +80,13 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
7880

7981
# Catch any exceptions for this record so we can continue processing.
8082
try:
81-
# Failure mode 0 (skip). Contains multiple clinical classification annotations.
83+
# Failure mode 1 (fatal). Record is only supported by submissions deemed to be unusable.
84+
if not filter_by_submission_name(clinvar_set):
85+
report.clinvar_fatal_excluded_submission += 1
86+
continue
87+
clinvar_record = clinvar_set.rcv
88+
89+
# Failure mode 2 (skip). Contains multiple clinical classification annotations.
8290
# This is new as of V2 of the ClinVar XSD and should definitely be supported at some point,
8391
# but as it can cause parsing complications we catch these cases first.
8492
# See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396
@@ -87,18 +95,18 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
8795
report.clinvar_skip_multiple_clinical_classifications += 1
8896
continue
8997

90-
# Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
98+
# Failure mode 3 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
9199
# potentially mappable name).
92100
if not clinvar_record.traits_with_valid_names:
93101
report.clinvar_fatal_no_valid_traits += 1
94102
continue
95-
# Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
103+
# Failure mode 4 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
96104
# submissions being flagged.
97105
if not clinvar_record.valid_clinical_significances:
98106
report.clinvar_fatal_no_clinical_significance += 1
99107
continue
100108

101-
# Failure mode 3 (skip). A ClinVar record contains an unsupported variation type.
109+
# Failure mode 5 (skip). A ClinVar record contains an unsupported variation type.
102110
if clinvar_record.measure is None:
103111
report.clinvar_skip_unsupported_variation += 1
104112
continue
@@ -110,7 +118,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
110118
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
111119
string_to_efo_mappings)
112120

113-
# Failure mode 4 (skip). No functional consequences are available.
121+
# Failure mode 6 (skip). No functional consequences are available.
114122
if not consequence_types:
115123
report.clinvar_skip_no_functional_consequences += 1
116124
continue
@@ -121,7 +129,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
121129
if is_structural_variant(clinvar_record.measure):
122130
report.structural_variants += len(consequence_types)
123131

124-
# Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no
132+
# Failure mode 7 (skip). A ClinVar record has at least one trait with at least one valid name, but no
125133
# suitable EFO mappings were found in the database. This will still generate an evidence string, but is
126134
# tracked as a failure so we can continue to measure mapping coverage.
127135
if not contains_mapping(grouped_diseases):
@@ -175,8 +183,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
175183
except Exception as e:
176184
# We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all
177185
# records and printing the report.
178-
logger.error(f'Problem generating evidence for {clinvar_record.accession}')
179-
logger.error(f'Error: {e}')
186+
logger.error(f'Problem generating evidence for {clinvar_set.rcv.accession}')
187+
logger.error(f'Error: {repr(e)}')
180188
exception_raised = True
181189
continue
182190

cmat/output_generation/report.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None):
2727
self.clinvar_total = 0
2828
self.clinvar_fatal_no_valid_traits = 0
2929
self.clinvar_fatal_no_clinical_significance = 0
30+
self.clinvar_fatal_excluded_submission = 0
3031
self.clinvar_skip_unsupported_variation = 0
3132
self.clinvar_skip_no_functional_consequences = 0
3233
self.clinvar_skip_missing_efo_mapping = 0
@@ -88,7 +89,8 @@ def load_from_file(self, filename):
8889

8990
def compute_record_tallies(self):
9091
"""Compute tallies of records fatal/skipped/done based on the more granular counts."""
91-
self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance
92+
self.clinvar_fatal = (self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance +
93+
self.clinvar_fatal_excluded_submission)
9294
self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
9395
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string +
9496
self.clinvar_skip_multiple_clinical_classifications)
@@ -115,6 +117,7 @@ def print_report(self):
115117
Fatal: Cannot produce evidence\t{self.clinvar_fatal}
116118
No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
117119
No clinical significance\t{self.clinvar_fatal_no_clinical_significance}
120+
Excluded submissions\t{self.clinvar_fatal_excluded_submission}
118121
Skipped: Can be rescued by future improvements\t{self.clinvar_skipped}
119122
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
120123
No functional consequences\t{self.clinvar_skip_no_functional_consequences}

cmat/trait_mapping/trait_names_parsing.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from collections import Counter
22

3-
from cmat import clinvar_xml_io
3+
from cmat.clinvar_xml_io import ClinVarDataset
4+
from cmat.clinvar_xml_io.filtering import filter_by_submission_name
45
from cmat.trait_mapping.trait import Trait
56

67

@@ -27,7 +28,11 @@ def parse_trait_names(filepath: str) -> list:
2728
# Their curation is of highest importance regardless of how many records they are actually associated with.
2829
nt_expansion_traits = set()
2930

30-
for clinvar_record in clinvar_xml_io.ClinVarDataset(filepath):
31+
dataset = ClinVarDataset(filepath)
32+
for clinvar_set in dataset.iter_cvs():
33+
if not filter_by_submission_name(clinvar_set):
34+
continue
35+
clinvar_record = clinvar_set.rcv
3136
trait_names_and_ids = set((trait.preferred_or_other_valid_name.lower(), trait.identifier)
3237
for trait in clinvar_record.traits_with_valid_names)
3338
for trait_tuple in trait_names_and_ids:

tests/pipelines/resources/expected/automated_trait_mappings.tsv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ chédiak-higashi syndrome http://www.orpha.net/ORDO/Orphanet_167 chédiak-higash
128128
cobalamin c disease http://purl.obolibrary.org/obo/MONDO_0010184 methylmalonic aciduria and homocystinuria type cblC
129129
cobalamin c disease http://www.orpha.net/ORDO/Orphanet_26 Methylmalonic acidemia with homocystinuria
130130
cobalamin c disease http://www.orpha.net/ORDO/Orphanet_79282 Methylmalonic acidemia with homocystinuria, type cblC
131-
coffin-siris syndrome 1 http://purl.obolibrary.org/obo/MONDO_0015452 Coffin-Siris syndrome
131+
coffin-siris syndrome 1 http://purl.obolibrary.org/obo/MONDO_0007617 coffin-siris syndrome 1
132132
cog1 congenital disorder of glycosylation http://purl.obolibrary.org/obo/MONDO_0012637 COG1-congenital disorder of glycosylation
133133
cog7 congenital disorder of glycosylation http://purl.obolibrary.org/obo/MONDO_0012118 COG7-congenital disorder of glycosylation
134134
cohen syndrome http://purl.obolibrary.org/obo/MONDO_0008999 cohen syndrome
@@ -278,7 +278,7 @@ hepatoencephalopathy due to combined oxidative phosphorylation defect type 1 htt
278278
hereditary breast ovarian cancer syndrome http://purl.obolibrary.org/obo/MONDO_0003582 hereditary breast ovarian cancer syndrome
279279
hereditary cancer-predisposing syndrome http://purl.obolibrary.org/obo/MONDO_0015356 hereditary neoplastic syndrome
280280
hereditary diffuse gastric adenocarcinoma http://purl.obolibrary.org/obo/MONDO_0007648 hereditary diffuse gastric adenocarcinoma
281-
hereditary diffuse leukoencephalopathy with spheroids http://www.orpha.net/ORDO/Orphanet_313808 Hereditary diffuse leukoencephalopathy with axonal spheroids and pigmented glia
281+
hereditary diffuse leukoencephalopathy with spheroids http://www.orpha.net/ORDO/Orphanet_313808 Adult-onset leukoencephalopathy with axonal spheroids and pigmented glia
282282
hereditary hemorrhagic telangiectasia http://purl.obolibrary.org/obo/MONDO_0019180 hereditary hemorrhagic telangiectasia
283283
hereditary insensitivity to pain with anhidrosis http://purl.obolibrary.org/obo/MONDO_0009746 hereditary sensory and autonomic neuropathy type 4
284284
hereditary nonpolyposis colorectal neoplasms http://www.ebi.ac.uk/efo/EFO_0009911 hereditary nonpolyposis colorectal carcinoma
@@ -338,7 +338,7 @@ inflammatory skin and bowel disease, neonatal, 1 http://purl.obolibrary.org/obo/
338338
intellectual developmental disorder, autosomal dominant 64 http://purl.obolibrary.org/obo/MONDO_0030934 intellectual developmental disorder, autosomal dominant 64
339339
intellectual disability http://purl.obolibrary.org/obo/HP_0001249 intellectual disability
340340
intellectual disability, autosomal dominant 1 http://purl.obolibrary.org/obo/MONDO_0016459 2q23.1 microdeletion syndrome
341-
intellectual disability, autosomal dominant 20 http://purl.obolibrary.org/obo/MONDO_0016456 5q14.3 microdeletion syndrome
341+
intellectual disability, autosomal dominant 20 http://purl.obolibrary.org/obo/MONDO_0013266 intellectual disability, autosomal dominant 20
342342
intellectual disability, autosomal dominant 5 http://purl.obolibrary.org/obo/MONDO_0012960 intellectual disability, autosomal dominant 5
343343
intellectual disability, autosomal dominant 6 http://purl.obolibrary.org/obo/MONDO_0100172 intellectual disability, autosomal dominant
344344
intellectual disability, autosomal dominant 9 http://purl.obolibrary.org/obo/MONDO_0013656 intellectual disability, autosomal dominant 9
@@ -508,7 +508,7 @@ retinitis pigmentosa-deafness syndrome http://purl.obolibrary.org/obo/MONDO_0019
508508
retinoblastoma http://purl.obolibrary.org/obo/MONDO_0008380 retinoblastoma
509509
rett syndrome http://purl.obolibrary.org/obo/MONDO_0010726 rett syndrome
510510
rett syndrome, congenital variant http://purl.obolibrary.org/obo/MONDO_0010726 Rett syndrome
511-
rhabdoid tumor predisposition syndrome 2 http://purl.obolibrary.org/obo/MONDO_0016473 familial rhabdoid tumor
511+
rhabdoid tumor predisposition syndrome 2 http://purl.obolibrary.org/obo/MONDO_0013224 rhabdoid tumor predisposition syndrome 2
512512
rod-cone dystrophy http://www.orpha.net/ORDO/Orphanet_1872 Cone rod dystrophy
513513
rubinstein-taybi syndrome http://purl.obolibrary.org/obo/MONDO_0019188 rubinstein-taybi syndrome
514514
ryr1-related disorders http://www.ebi.ac.uk/efo/EFO_0009143 ryr1-related disorders

0 commit comments

Comments
 (0)