Skip to content

Commit 0febe78

Browse files
authored
Merge pull request #435 from VariantEffect/jstone-dev/clingen-linkage
ClinGen & ClinVar script enhancements
2 parents a16e7b3 + 80a9246 commit 0febe78

File tree

6 files changed

+119
-44
lines changed

6 files changed

+119
-44
lines changed

src/mavedb/lib/clingen/content_constructors.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from datetime import datetime
2+
from typing import Optional
23
from uuid import uuid4
34
from urllib.parse import quote_plus
45

@@ -15,7 +16,7 @@ def construct_ldh_submission_event(sbj: LdhContentSubject) -> LdhEvent:
1516
"type": LDH_SUBMISSION_TYPE,
1617
"name": LDH_ENTITY_NAME,
1718
"uuid": str(uuid4()),
18-
"sbj": {"id": sbj["Variant"]["hgvs"], "type": "Variant", "format": "hgvs", "add": True},
19+
"sbj": {"id": sbj["Variant"]["hgvs"], "type": "Variant", "format": "hgvs", "add": True, "iri": None},
1920
"triggered": {
2021
"by": {
2122
"host": MAVEDB_BASE_GIT,
@@ -31,26 +32,29 @@ def construct_ldh_submission_subject(hgvs: str) -> LdhContentSubject:
3132
return {"Variant": {"hgvs": hgvs}}
3233

3334

34-
def construct_ldh_submission_entity(variant: Variant, mapped_variant: MappedVariant) -> LdhContentLinkedData:
35-
return {
35+
def construct_ldh_submission_entity(variant: Variant, mapped_variant: Optional[MappedVariant]) -> LdhContentLinkedData:
36+
entity: LdhContentLinkedData = {
3637
# TODO#372: We try to make all possible fields that are non-nullable represented that way.
3738
"MaveDBMapping": [
3839
{
3940
"entContent": {
4041
"mavedb_id": variant.urn, # type: ignore
41-
"pre_mapped": mapped_variant.pre_mapped, # type: ignore
42-
"post_mapped": mapped_variant.post_mapped, # type: ignore
43-
"mapping_api_version": mapped_variant.mapping_api_version, # type: ignore
4442
"score": variant.data["score_data"]["score"], # type: ignore
43+
"score_set_description": variant.score_set.short_description, # type: ignore
4544
},
4645
"entId": variant.urn, # type: ignore
4746
"entIri": f"{MAVEDB_FRONTEND_URL}/score-sets/{quote_plus(variant.score_set.urn)}?variant={quote_plus(variant.urn)}", # type: ignore
4847
}
4948
]
5049
}
50+
if mapped_variant is not None:
51+
entity["MaveDBMapping"][0]["entContent"]["pre_mapped"] = mapped_variant.pre_mapped
52+
entity["MaveDBMapping"][0]["entContent"]["post_mapped"] = mapped_variant.post_mapped
53+
entity["MaveDBMapping"][0]["entContent"]["mapping_api_version"] = mapped_variant.mapping_api_version
54+
return entity
5155

5256

53-
def construct_ldh_submission(variant_content: list[tuple[str, Variant, MappedVariant]]) -> list[LdhSubmission]:
57+
def construct_ldh_submission(variant_content: list[tuple[str, Variant, Optional[MappedVariant]]]) -> list[LdhSubmission]:
5458
content_submission: list[LdhSubmission] = []
5559
for hgvs, variant, mapped_variant in variant_content:
5660
subject = construct_ldh_submission_subject(hgvs)

src/mavedb/lib/types/clingen.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import TypedDict, Literal
1+
from typing import Optional, TypedDict, Literal
22
from typing_extensions import NotRequired
33

44

@@ -14,6 +14,7 @@ class EventSbj(TypedDict):
1414
type: str
1515
format: Literal["hgvs", "alleleRegistryID", "clinvarID", "geneSymbol"]
1616
add: bool
17+
iri: Optional[str]
1718

1819

1920
# Who/what triggered the event
@@ -52,9 +53,9 @@ class LdhContentSubject(TypedDict):
5253
# The entities we are submitting
5354
class LdhMapping(TypedDict):
5455
mavedb_id: str
55-
pre_mapped: str
56-
post_mapped: str
57-
mapping_api_version: str
56+
pre_mapped: Optional[str]
57+
post_mapped: Optional[str]
58+
mapping_api_version: Optional[str]
5859
score: float
5960

6061

src/mavedb/scripts/clingen_ldh_submission.py

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import click
22
import logging
3-
from typing import Sequence
3+
import re
4+
from typing import Optional, Sequence
45

5-
from sqlalchemy import select
6+
from sqlalchemy import and_, select
67
from sqlalchemy.orm import Session
78

89
from mavedb.models.score_set import ScoreSet
@@ -16,8 +17,10 @@
1617

1718
logger = logging.getLogger(__name__)
1819

20+
intronic_variant_with_reference_regex = re.compile(r":c\..*[+-]")
21+
variant_with_reference_regex = re.compile(r":")
1922

20-
def submit_urns_to_clingen(db: Session, urns: Sequence[str], debug: bool) -> list[str]:
23+
def submit_urns_to_clingen(db: Session, urns: Sequence[str], unlinked_only: bool, prefer_unmapped_hgvs: bool, debug: bool) -> list[str]:
2124
ldh_service = ClinGenLdhService(url=LDH_SUBMISSION_ENDPOINT)
2225
ldh_service.authenticate()
2326

@@ -37,13 +40,12 @@ def submit_urns_to_clingen(db: Session, urns: Sequence[str], debug: bool) -> lis
3740
continue
3841

3942
logger.info(f"Submitting mapped variants to LDH service for score set with URN: {urn}")
43+
mapped_variant_join_clause = and_(MappedVariant.variant_id == Variant.id, MappedVariant.post_mapped.is_not(None), MappedVariant.current.is_(True))
4044
variant_objects = db.execute(
4145
select(Variant, MappedVariant)
42-
.join(MappedVariant)
46+
.join(MappedVariant, mapped_variant_join_clause, isouter=True)
4347
.join(ScoreSet)
4448
.where(ScoreSet.urn == urn)
45-
.where(MappedVariant.post_mapped.is_not(None))
46-
.where(MappedVariant.current.is_(True))
4749
).all()
4850

4951
if not variant_objects:
@@ -52,12 +54,48 @@ def submit_urns_to_clingen(db: Session, urns: Sequence[str], debug: bool) -> lis
5254

5355
logger.debug(f"Preparing {len(variant_objects)} mapped variants for submission")
5456

55-
variant_content: list[tuple[str, Variant, MappedVariant]] = []
57+
variant_content: list[tuple[str, Variant, Optional[MappedVariant]]] = []
5658
for variant, mapped_variant in variant_objects:
57-
variation = hgvs_from_mapped_variant(mapped_variant)
59+
if mapped_variant is None:
60+
if variant.hgvs_nt is not None and intronic_variant_with_reference_regex.search(variant.hgvs_nt):
61+
# Use the hgvs_nt string for unmapped intronic variants. This is because our mapper does not yet
62+
# support mapping intronic variants.
63+
variation = [variant.hgvs_nt]
64+
if variation:
65+
logger.info(f"Using hgvs_nt for unmapped intronic variant {variant.urn}: {variation}")
66+
elif variant.hgvs_nt is not None and variant_with_reference_regex.search(variant.hgvs_nt):
67+
# Use the hgvs_nt string for other unmapped NT variants in accession-based score sets.
68+
variation = [variant.hgvs_nt]
69+
if variation:
70+
logger.info(f"Using hgvs_nt for unmapped non-intronic variant {variant.urn}: {variation}")
71+
elif variant.hgvs_pro is not None and variant_with_reference_regex.search(variant.hgvs_pro):
72+
# Use the hgvs_pro string for unmapped PRO variants in accession-based score sets.
73+
variation = [variant.hgvs_pro]
74+
if variation:
75+
logger.info(f"Using hgvs_pro for unmapped non-intronic variant {variant.urn}: {variation}")
76+
else:
77+
logger.warning(f"No variation found for unmapped variant {variant.urn} (nt: {variant.hgvs_nt}, aa: {variant.hgvs_pro}, splice: {variant.hgvs_splice}).")
78+
continue
79+
else:
80+
if unlinked_only and mapped_variant.clingen_allele_id:
81+
continue
82+
# If the script was run with the --prefer-unmapped-hgvs flag, use the hgvs_nt string rather than the
83+
# mapped variant, as long as the variant is accession-based.
84+
if prefer_unmapped_hgvs and variant.hgvs_nt is not None and variant_with_reference_regex.search(variant.hgvs_nt):
85+
variation = [variant.hgvs_nt]
86+
if variation:
87+
logger.info(f"Using hgvs_nt for mapped variant {variant.urn}: {variation}")
88+
elif prefer_unmapped_hgvs and variant.hgvs_pro is not None and variant_with_reference_regex.search(variant.hgvs_pro):
89+
variation = [variant.hgvs_pro]
90+
if variation:
91+
logger.info(f"Using hgvs_pro for mapped variant {variant.urn}: {variation}") # continue # TEMPORARY. Only submit unmapped variants.
92+
else:
93+
variation = hgvs_from_mapped_variant(mapped_variant)
94+
if variation:
95+
logger.info(f"Using mapped variant for {variant.urn}: {variation}")
5896

5997
if not variation:
60-
logger.warning(f"No variation found for variant {variant.urn}.")
98+
logger.warning(f"No variation found for mapped variant {variant.urn} (nt: {variant.hgvs_nt}, aa: {variant.hgvs_pro}, splice: {variant.hgvs_splice}).")
6199
continue
62100

63101
for allele in variation:
@@ -90,11 +128,13 @@ def submit_urns_to_clingen(db: Session, urns: Sequence[str], debug: bool) -> lis
90128
@click.command()
91129
@with_database_session
92130
@click.argument("urns", nargs=-1)
93-
@click.option("--all", help="Submit mapped variants for every score set in MaveDB.", is_flag=True)
131+
@click.option("--all", help="Submit variants for every score set in MaveDB.", is_flag=True)
132+
@click.option("--unlinked", default=False, help="Only submit variants that have not already been linked to ClinGen alleles.", is_flag=True)
133+
@click.option("--prefer-unmapped-hgvs", default=False, help="If the unmapped HGVS string is accession-based, use it in the submission instead of the mapped variant.", is_flag=True)
94134
@click.option("--suppress-output", help="Suppress final print output to the console.", is_flag=True)
95135
@click.option("--debug", help="Enable debug mode. This will send only one request at most to ClinGen", is_flag=True)
96136
def submit_clingen_urns_command(
97-
db: Session, urns: Sequence[str], all: bool, suppress_output: bool, debug: bool
137+
db: Session, urns: Sequence[str], all: bool, unlinked: bool, prefer_unmapped_hgvs: bool, suppress_output: bool, debug: bool
98138
) -> None:
99139
"""
100140
Submit data to ClinGen for mapped variant allele ID generation for the given URNs.
@@ -111,7 +151,7 @@ def submit_clingen_urns_command(
111151
logger.error("No URNs provided. Please provide at least one URN.")
112152
return
113153

114-
submitted_variant_urns = submit_urns_to_clingen(db, urns, debug)
154+
submitted_variant_urns = submit_urns_to_clingen(db, urns, unlinked, prefer_unmapped_hgvs, debug)
115155

116156
if not suppress_output:
117157
print(", ".join(submitted_variant_urns))

src/mavedb/scripts/link_clingen_variants.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
from typing import Sequence
44

5-
from sqlalchemy import select
5+
from sqlalchemy import and_, select
66
from sqlalchemy.orm import Session
77

88
from mavedb.lib.clingen.linked_data_hub import get_clingen_variation, clingen_allele_id_from_ldh_variation
@@ -18,7 +18,7 @@
1818
@with_database_session
1919
@click.argument("urns", nargs=-1)
2020
@click.option("--score-sets/--variants", default=False)
21-
@click.option("--unlinked", default=False)
21+
@click.option("--unlinked", default=False, is_flag=True)
2222
def link_clingen_variants(db: Session, urns: Sequence[str], score_sets: bool, unlinked: bool) -> None:
2323
"""
2424
Submit data to ClinGen for mapped variant allele ID generation for the given URNs.
@@ -51,7 +51,7 @@ def link_clingen_variants(db: Session, urns: Sequence[str], score_sets: bool, un
5151
failed_urns.append(urn)
5252
continue
5353

54-
mapped_variant = db.scalar(select(MappedVariant).join(Variant).where(Variant.urn == urn))
54+
mapped_variant = db.scalar(select(MappedVariant).join(Variant).where(and_(Variant.urn == urn, MappedVariant.current.is_(True))))
5555

5656
if not mapped_variant:
5757
logger.warning(f"No mapped variant found for URN {urn}.")

src/mavedb/scripts/refresh_clinvar_variant_data.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import click
2+
from mavedb.models.score_set import ScoreSet
3+
from mavedb.models.variant import Variant
24
import requests
35
import csv
46
import time
@@ -7,10 +9,10 @@
79
import random
810
import io
911

10-
from typing import Dict, Any, Optional
12+
from typing import Dict, Any, Optional, Sequence
1113
from datetime import date
1214

13-
from sqlalchemy import select, distinct, func
15+
from sqlalchemy import and_, select, distinct
1416
from sqlalchemy.orm import Session
1517

1618
from mavedb.models.mapped_variant import MappedVariant
@@ -62,20 +64,33 @@ def query_clingen_allele_api(allele_id: str) -> Dict[str, Any]:
6264
return response.json()
6365

6466

65-
def refresh_clinvar_variants(db: Session, month: Optional[str], year: Optional[str]) -> None:
67+
def refresh_clinvar_variants(db: Session, month: Optional[str], year: Optional[str], urns: Sequence[str]) -> None:
6668
tsv_content = fetch_clinvar_variant_summary_tsv(month, year)
6769
tsv_data = parse_tsv(tsv_content)
6870
version = f"{month}_{year}" if month and year else f"{date.today().month}_{date.today().year}"
6971
logger.info(f"Fetched TSV variant data for ClinVar for {version}.")
7072

71-
total_variants_with_clingen_ids = db.scalar(func.count(distinct(MappedVariant.clingen_allele_id)))
72-
clingen_ids = db.scalars(
73-
select(distinct(MappedVariant.clingen_allele_id)).where(MappedVariant.clingen_allele_id.is_not(None))
74-
).all()
73+
if urns:
74+
clingen_ids = db.scalars(
75+
select(distinct(MappedVariant.clingen_allele_id))
76+
.join(Variant)
77+
.join(ScoreSet)
78+
.where(MappedVariant.current.is_(True), MappedVariant.post_mapped.is_not(None))
79+
.where(and_(
80+
MappedVariant.clingen_allele_id.is_not(None),
81+
MappedVariant.current.is_(True),
82+
ScoreSet.urn.in_(urns)
83+
))
84+
).all()
85+
else:
86+
clingen_ids = db.scalars(
87+
select(distinct(MappedVariant.clingen_allele_id)).where(MappedVariant.clingen_allele_id.is_not(None))
88+
).all()
89+
total_variants_with_clingen_ids = len(clingen_ids)
7590

7691
logger.info(f"Fetching ClinGen data for {total_variants_with_clingen_ids} variants.")
7792
for index, clingen_id in enumerate(clingen_ids):
78-
if total_variants_with_clingen_ids > 0 and index % (total_variants_with_clingen_ids // 100) == 0:
93+
if total_variants_with_clingen_ids > 0 and index % (max(total_variants_with_clingen_ids // 100, 1)) == 0:
7994
logger.info(f"Progress: {index / total_variants_with_clingen_ids:.0%}")
8095

8196
# Guaranteed based on our query filters.
@@ -116,6 +131,8 @@ def refresh_clinvar_variants(db: Session, month: Optional[str], year: Optional[s
116131
select(MappedVariant).where(MappedVariant.clingen_allele_id == clingen_id)
117132
).all()
118133
for mapped_variant in variants_with_clingen_allele_id:
134+
if clinvar_variant.id in [c.id for c in mapped_variant.clinical_controls]:
135+
continue
119136
mapped_variant.clinical_controls.append(clinvar_variant)
120137
db.add(mapped_variant)
121138

@@ -127,10 +144,11 @@ def refresh_clinvar_variants(db: Session, month: Optional[str], year: Optional[s
127144

128145
@click.command()
129146
@with_database_session
147+
@click.argument("urns", nargs=-1)
130148
@click.option("--month", default=None, help="Populate mapped variants for every score set in MaveDB.")
131149
@click.option("--year", default=None, help="Populate mapped variants for every score set in MaveDB.")
132-
def refresh_clinvar_variants_command(db: Session, month: Optional[str], year: Optional[str]) -> None:
133-
refresh_clinvar_variants(db, month, year)
150+
def refresh_clinvar_variants_command(db: Session, month: Optional[str], year: Optional[str], urns: Sequence[str]) -> None:
151+
refresh_clinvar_variants(db, month, year, urns)
134152

135153

136154
if __name__ == "__main__":

tests/lib/clingen/test_content_constructors.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
)
1212
from mavedb.lib.clingen.constants import LDH_ENTITY_NAME, LDH_SUBMISSION_TYPE
1313
from mavedb import __version__
14+
import pytest
1415

1516
from tests.helpers.constants import (
1617
TEST_HGVS_IDENTIFIER,
@@ -44,6 +45,7 @@ def test_construct_ldh_submission_event():
4445
"type": "Variant",
4546
"format": "hgvs",
4647
"add": True,
48+
"iri": None
4749
}
4850
assert result["triggered"]["by"] == {
4951
"host": MAVEDB_BASE_GIT,
@@ -52,30 +54,40 @@ def test_construct_ldh_submission_event():
5254
}
5355

5456

55-
def test_construct_ldh_submission_entity(mock_variant, mock_mapped_variant):
56-
result = construct_ldh_submission_entity(mock_variant, mock_mapped_variant)
57+
@pytest.mark.parametrize("has_mapped_variant", [(True), (False)])
58+
def test_construct_ldh_submission_entity(mock_variant, mock_mapped_variant, has_mapped_variant: bool):
59+
mapped_variant = mock_mapped_variant if has_mapped_variant else None
60+
result = construct_ldh_submission_entity(mock_variant, mapped_variant)
5761

5862
assert "MaveDBMapping" in result
5963
assert len(result["MaveDBMapping"]) == 1
6064
mapping = result["MaveDBMapping"][0]
6165

6266
assert mapping["entContent"]["mavedb_id"] == VALID_VARIANT_URN
63-
assert mapping["entContent"]["pre_mapped"] == TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X
64-
assert mapping["entContent"]["post_mapped"] == TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X
65-
assert mapping["entContent"]["mapping_api_version"] == "pytest.mapping.1.0"
6667
assert mapping["entContent"]["score"] == 1.0
6768

69+
if has_mapped_variant:
70+
assert mapping["entContent"]["pre_mapped"] == TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X
71+
assert mapping["entContent"]["post_mapped"] == TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X
72+
assert mapping["entContent"]["mapping_api_version"] == "pytest.mapping.1.0"
73+
else:
74+
assert "pre_mapped" not in mapping["entContent"]
75+
assert "post_mapped" not in mapping["entContent"]
76+
assert "mapping_api_version" not in mapping["entContent"]
77+
6878
assert mapping["entId"] == VALID_VARIANT_URN
6979
assert (
7080
mapping["entIri"]
7181
== f"{MAVEDB_FRONTEND_URL}/score-sets/{quote_plus(VALID_SCORE_SET_URN)}?variant={quote_plus(VALID_VARIANT_URN)}"
7282
)
7383

7484

75-
def test_construct_ldh_submission(mock_variant, mock_mapped_variant):
85+
@pytest.mark.parametrize("has_mapped_variant", [(True), (False)])
86+
def test_construct_ldh_submission(mock_variant, mock_mapped_variant, has_mapped_variant: bool):
87+
mapped_variant = mock_mapped_variant if has_mapped_variant else None
7688
variant_content = [
77-
(TEST_HGVS_IDENTIFIER, mock_variant, mock_mapped_variant),
78-
(TEST_HGVS_IDENTIFIER, mock_variant, mock_mapped_variant),
89+
(TEST_HGVS_IDENTIFIER, mock_variant, mapped_variant),
90+
(TEST_HGVS_IDENTIFIER, mock_variant, mapped_variant),
7991
]
8092

8193
uuid_1 = UUID("12345678-1234-5678-1234-567812345678")

0 commit comments

Comments
 (0)