Skip to content

Commit e407383

Browse files
committed
Script for populating mapped gene targets via ClinGen
For historical data, we do not always have the genes that we mapped a target to. This script can be used to infer the mapped gene from the HGVS string of a mapped variant.
1 parent 8076cf5 commit e407383

File tree

1 file changed

+126
-0
lines changed

1 file changed

+126
-0
lines changed
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import json
2+
import logging
3+
import requests
4+
from typing import Sequence, Optional
5+
6+
import click
7+
from sqlalchemy import select
8+
from sqlalchemy.orm import Session
9+
10+
from mavedb.models.score_set import ScoreSet
11+
from mavedb.models.mapped_variant import MappedVariant
12+
from mavedb.models.target_gene import TargetGene
13+
from mavedb.models.variant import Variant
14+
15+
from mavedb.scripts.environment import script_environment, with_database_session
16+
17+
logger = logging.getLogger(__name__)
18+
logger.setLevel(logging.DEBUG)
19+
20+
CLINGEN_API_URL = "https://reg.test.genome.network/allele"
21+
22+
23+
def get_gene_symbol_from_clingen(hgvs_string: str) -> Optional[str]:
24+
response = requests.get(f"{CLINGEN_API_URL}?hgvs={hgvs_string}")
25+
if response.status_code != 200:
26+
logger.error(f"Failed to query ClinGen API for {hgvs_string}: {response.status_code}")
27+
return None
28+
29+
data = response.json()
30+
if "aminoAcidAlleles" in data:
31+
return data["aminoAcidAlleles"][0].get("geneSymbol")
32+
elif "transcriptAlleles" in data:
33+
return data["transcriptAlleles"][0].get("geneSymbol")
34+
35+
return None
36+
37+
38+
@script_environment.command()
39+
@with_database_session
40+
@click.argument("urns", nargs=-1)
41+
@click.option("--all", help="Generate gene mappings for every score set in MaveDB.", is_flag=True)
42+
def generate_gene_mappings(db: Session, urns: Sequence[Optional[str]], all: bool):
43+
score_set_ids: Sequence[Optional[int]]
44+
if all:
45+
score_set_ids = db.scalars(select(ScoreSet.id)).all()
46+
logger.info(
47+
f"Command invoked with --all. Routine will generate gene mappings for {len(score_set_ids)} score sets."
48+
)
49+
else:
50+
score_set_ids = db.scalars(select(ScoreSet.id).where(ScoreSet.urn.in_(urns))).all()
51+
logger.info(f"Generating gene mappings for the provided score sets ({len(score_set_ids)}).")
52+
53+
for ss_id in score_set_ids:
54+
if not ss_id:
55+
continue
56+
57+
score_set = db.scalar(select(ScoreSet).where(ScoreSet.id == ss_id))
58+
if not score_set:
59+
logger.warning(f"Could not fetch score set with id={ss_id}.")
60+
continue
61+
62+
try:
63+
mapped_variant = db.scalars(
64+
select(MappedVariant)
65+
.join(Variant)
66+
.where(
67+
Variant.score_set_id == ss_id,
68+
MappedVariant.current.is_(True),
69+
MappedVariant.post_mapped.isnot(None),
70+
)
71+
.limit(1)
72+
).one_or_none()
73+
74+
if not mapped_variant:
75+
logger.info(f"No current mapped variant found for score set {score_set.urn}.")
76+
continue
77+
78+
# From line 69, this object must not be None.
79+
hgvs_string = mapped_variant.post_mapped.get("expressions", {})[0].get("value") # type: ignore
80+
if not hgvs_string:
81+
logger.warning(f"No HGVS string found in post_mapped for variant {mapped_variant.id}.")
82+
continue
83+
84+
gene_symbol = get_gene_symbol_from_clingen(hgvs_string)
85+
if not gene_symbol:
86+
logger.warning(f"No gene symbol found for HGVS string {hgvs_string}.")
87+
continue
88+
89+
# This script has been designed to work prior to the introduction of multi-target mapping.
90+
# This .one() call reflects those assumptions.
91+
target_gene = db.scalars(select(TargetGene).where(TargetGene.score_set_id == ss_id)).one()
92+
93+
if target_gene.post_mapped_metadata is None:
94+
logger.warning(
95+
f"Target gene for score set {score_set.urn} has no post_mapped_metadata despite containing current mapped variants."
96+
)
97+
continue
98+
99+
# Cannot update JSONB fields directly. They can be converted to dictionaries over Mypy's objections.
100+
post_mapped_metadata = json.loads(json.dumps(dict(target_gene.post_mapped_metadata.copy()))) # type: ignore
101+
if "genomic" in post_mapped_metadata:
102+
key = "genomic"
103+
elif "protein" in post_mapped_metadata:
104+
key = "protein"
105+
else:
106+
logger.warning(f"Unknown post_mapped type for variant {mapped_variant.id}.")
107+
108+
if "sequence_genes" not in post_mapped_metadata[key]:
109+
post_mapped_metadata[key]["sequence_genes"] = []
110+
post_mapped_metadata[key]["sequence_genes"].append(gene_symbol)
111+
112+
target_gene.post_mapped_metadata = post_mapped_metadata
113+
114+
db.add(target_gene)
115+
db.commit()
116+
logger.info(f"Gene symbol {gene_symbol} added to target gene for score set {score_set.urn}.")
117+
118+
except Exception as e:
119+
logger.error(f"Failed to generate gene mappings for score set {score_set.urn}: {str(e)}")
120+
db.rollback()
121+
122+
logger.info("Done generating gene mappings.")
123+
124+
125+
if __name__ == "__main__":
126+
generate_gene_mappings()

0 commit comments

Comments
 (0)