Skip to content

Commit aeeddd8

Browse files
authored
Merge pull request #553 from VariantEffect/store-all-hgvs
Populate and surface post-mapped HGVS expressions and VEP functional consequence, and surface gnomAD AF
2 parents 6ab205d + 21a771f commit aeeddd8

File tree

12 files changed

+704
-71
lines changed

12 files changed

+704
-71
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""Add functional consequence
2+
3+
Revision ID: 1e08b947679d
4+
Revises: 019eb75ad9ae
5+
Create Date: 2025-09-17 11:15:52.091271
6+
7+
"""
8+
9+
from alembic import op
10+
import sqlalchemy as sa
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "1e08b947679d"
14+
down_revision = "019eb75ad9ae"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
20+
# ### commands auto generated by Alembic - please adjust! ###
21+
op.add_column("mapped_variants", sa.Column("vep_functional_consequence", sa.String(), nullable=True))
22+
op.add_column("mapped_variants", sa.Column("vep_access_date", sa.Date(), nullable=True))
23+
# ### end Alembic commands ###
24+
25+
26+
def downgrade():
27+
# ### commands auto generated by Alembic - please adjust! ###
28+
op.drop_column("mapped_variants", "vep_access_date")
29+
op.drop_column("mapped_variants", "vep_functional_consequence")
30+
# ### end Alembic commands ###
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Add mapped hgvs
2+
3+
Revision ID: b22b450d409c
4+
Revises: 1e08b947679d
5+
Create Date: 2025-10-09 09:53:47.903249
6+
7+
"""
8+
9+
from alembic import op
10+
import sqlalchemy as sa
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "b22b450d409c"
14+
down_revision = "1e08b947679d"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
20+
# ### commands auto generated by Alembic - please adjust! ###
21+
op.add_column("mapped_variants", sa.Column("hgvs_assay_level", sa.String(), nullable=True))
22+
op.add_column("mapped_variants", sa.Column("hgvs_g", sa.String(), nullable=True))
23+
op.add_column("mapped_variants", sa.Column("hgvs_c", sa.String(), nullable=True))
24+
op.add_column("mapped_variants", sa.Column("hgvs_p", sa.String(), nullable=True))
25+
# ### end Alembic commands ###
26+
27+
28+
def downgrade():
29+
# ### commands auto generated by Alembic - please adjust! ###
30+
op.drop_column("mapped_variants", "hgvs_p")
31+
op.drop_column("mapped_variants", "hgvs_c")
32+
op.drop_column("mapped_variants", "hgvs_g")
33+
op.drop_column("mapped_variants", "hgvs_assay_level")
34+
# ### end Alembic commands ###

src/mavedb/lib/clingen/allele_registry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
logger = logging.getLogger(__name__)
55
logger.setLevel(logging.DEBUG)
66

7-
CLINGEN_API_URL = "https://reg.test.genome.network/allele"
7+
CLINGEN_API_URL = "https://reg.genome.network/allele"
88

99

1010
def get_canonical_pa_ids(clingen_allele_id: str) -> list[str]:

src/mavedb/lib/score_sets.py

Lines changed: 141 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from mavedb.lib.mave.utils import is_csv_null
2626
from mavedb.lib.validation.constants.general import null_values_list
2727
from mavedb.lib.validation.utilities import is_null as validate_is_null
28-
from mavedb.lib.variants import get_digest_from_post_mapped, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p
28+
from mavedb.lib.variants import get_digest_from_post_mapped
2929
from mavedb.models.contributor import Contributor
3030
from mavedb.models.controlled_keyword import ControlledKeyword
3131
from mavedb.models.doi_identifier import DoiIdentifier
@@ -35,6 +35,7 @@
3535
from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation
3636
from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation
3737
from mavedb.models.experiment_set import ExperimentSet
38+
from mavedb.models.gnomad_variant import GnomADVariant
3839
from mavedb.models.mapped_variant import MappedVariant
3940
from mavedb.models.publication_identifier import PublicationIdentifier
4041
from mavedb.models.refseq_identifier import RefseqIdentifier
@@ -501,7 +502,7 @@ def find_publish_or_private_superseded_score_set_tail(
501502
def get_score_set_variants_as_csv(
502503
db: Session,
503504
score_set: ScoreSet,
504-
namespaces: List[Literal["scores", "counts"]],
505+
namespaces: List[Literal["scores", "counts", "vep", "gnomad"]],
505506
namespaced: Optional[bool] = None,
506507
start: Optional[int] = None,
507508
limit: Optional[int] = None,
@@ -518,8 +519,8 @@ def get_score_set_variants_as_csv(
518519
The database session to use.
519520
score_set : ScoreSet
520521
The score set to get the variants from.
521-
namespaces : List[Literal["scores", "counts"]]
522-
The namespaces for data. Now there are only scores and counts. There will be ClinVar and gnomAD.
522+
namespaces : List[Literal["scores", "counts", "vep", "gnomad"]]
523+
The namespaces for data. Now there are only scores, counts, VEP, and gnomAD. ClinVar will be added in the future.
523524
namespaced: Optional[bool] = None
524525
Whether to namespace the columns or not.
525526
start : int, optional
@@ -531,8 +532,8 @@ def get_score_set_variants_as_csv(
531532
include_custom_columns : bool, optional
532533
Whether to include custom columns defined in the score set. Defaults to True.
533534
include_post_mapped_hgvs : bool, optional
534-
Whether to include post-mapped HGVS notations in the output. Defaults to False. If True, the output will include
535-
columns for both post-mapped HGVS genomic (g.) and protein (p.) notations.
535+
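Whether to include post-mapped HGVS notations in the output. Defaults to False. If True, the output will include
536+
columns for post-mapped HGVS genomic (g.), protein (p.), and coding (c.) notations, plus the post-mapped HGVS expression at the assay level. The VEP functional consequence column is requested separately through the "vep" namespace; mapped variants are only loaded when this flag is True, so that column is filled with the NA value otherwise.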
536537
537538
Returns
538539
-------
@@ -547,9 +548,12 @@ def get_score_set_variants_as_csv(
547548
if include_post_mapped_hgvs:
548549
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_g")
549550
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_p")
551+
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_c")
552+
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_at_assay_level")
550553
namespaced_score_set_columns["mavedb"].append("post_mapped_vrs_digest")
551554
for namespace in namespaces:
552555
namespaced_score_set_columns[namespace] = []
556+
553557
if include_custom_columns:
554558
if "scores" in namespaced_score_set_columns:
555559
namespaced_score_set_columns["scores"] = [
@@ -561,10 +565,51 @@ def get_score_set_variants_as_csv(
561565
]
562566
elif "scores" in namespaced_score_set_columns:
563567
namespaced_score_set_columns["scores"].append(REQUIRED_SCORE_COLUMN)
568+
if "vep" in namespaced_score_set_columns:
569+
namespaced_score_set_columns["vep"].append("vep_functional_consequence")
570+
if "gnomad" in namespaced_score_set_columns:
571+
namespaced_score_set_columns["gnomad"].append("gnomad_af")
564572
variants: Sequence[Variant] = []
565573
mappings: Optional[list[Optional[MappedVariant]]] = None
574+
gnomad_data: Optional[list[Optional[GnomADVariant]]] = None
566575

567-
if include_post_mapped_hgvs:
576+
if "gnomad" in namespaces and include_post_mapped_hgvs:
577+
variants_mappings_and_gnomad_query = (
578+
select(Variant, MappedVariant, GnomADVariant)
579+
.join(
580+
MappedVariant,
581+
and_(Variant.id == MappedVariant.variant_id, MappedVariant.current.is_(True)),
582+
isouter=True,
583+
)
584+
.join(MappedVariant.gnomad_variants.of_type(GnomADVariant), isouter=True)
585+
.where(
586+
and_(
587+
Variant.score_set_id == score_set.id,
588+
or_(
589+
and_(
590+
GnomADVariant.db_name == "gnomAD",
591+
GnomADVariant.db_version == "v4.1",
592+
),
593+
GnomADVariant.id.is_(None),
594+
),
595+
)
596+
)
597+
.order_by(cast(func.split_part(Variant.urn, "#", 2), Integer))
598+
)
599+
if start:
600+
variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query.offset(start)
601+
if limit:
602+
variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query.limit(limit)
603+
variants_mappings_and_gnomad = db.execute(variants_mappings_and_gnomad_query).all()
604+
605+
variants = []
606+
mappings = []
607+
gnomad_data = []
608+
for variant, mapping, gnomad in variants_mappings_and_gnomad:
609+
variants.append(variant)
610+
mappings.append(mapping)
611+
gnomad_data.append(gnomad)
612+
elif include_post_mapped_hgvs:
568613
variants_and_mappings_query = (
569614
select(Variant, MappedVariant)
570615
.join(
@@ -586,6 +631,40 @@ def get_score_set_variants_as_csv(
586631
for variant, mapping in variants_and_mappings:
587632
variants.append(variant)
588633
mappings.append(mapping)
634+
elif "gnomad" in namespaces:
635+
variants_and_gnomad_query = (
636+
select(Variant, GnomADVariant)
637+
.join(
638+
MappedVariant,
639+
and_(Variant.id == MappedVariant.variant_id, MappedVariant.current.is_(True)),
640+
isouter=True,
641+
)
642+
.join(MappedVariant.gnomad_variants.of_type(GnomADVariant), isouter=True)
643+
.where(
644+
and_(
645+
Variant.score_set_id == score_set.id,
646+
or_(
647+
and_(
648+
GnomADVariant.db_name == "gnomAD",
649+
GnomADVariant.db_version == "v4.1",
650+
),
651+
GnomADVariant.id.is_(None),
652+
),
653+
)
654+
)
655+
.order_by(cast(func.split_part(Variant.urn, "#", 2), Integer))
656+
)
657+
if start:
658+
variants_and_gnomad_query = variants_and_gnomad_query.offset(start)
659+
if limit:
660+
variants_and_gnomad_query = variants_and_gnomad_query.limit(limit)
661+
variants_and_gnomad = db.execute(variants_and_gnomad_query).all()
662+
663+
variants = []
664+
gnomad_data = []
665+
for variant, gnomad in variants_and_gnomad:
666+
variants.append(variant)
667+
gnomad_data.append(gnomad)
589668
else:
590669
variants_query = (
591670
select(Variant)
@@ -598,7 +677,11 @@ def get_score_set_variants_as_csv(
598677
variants_query = variants_query.limit(limit)
599678
variants = db.scalars(variants_query).all()
600679
rows_data = variants_to_csv_rows(
601-
variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings
680+
variants,
681+
columns=namespaced_score_set_columns,
682+
namespaced=namespaced,
683+
mappings=mappings,
684+
gnomad_data=gnomad_data,
602685
) # type: ignore
603686
rows_columns = [
604687
(
@@ -654,6 +737,7 @@ def variant_to_csv_row(
654737
variant: Variant,
655738
columns: dict[str, list[str]],
656739
mapping: Optional[MappedVariant] = None,
740+
gnomad_data: Optional[GnomADVariant] = None,
657741
namespaced: Optional[bool] = None,
658742
na_rep="NA",
659743
) -> dict[str, Any]:
@@ -668,6 +752,10 @@ def variant_to_csv_row(
668752
Columns to serialize.
669753
namespaced: Optional[bool] = None
670754
Namespace the columns or not.
755+
mapping : variant.models.MappedVariant, optional
756+
Mapped variant corresponding to the variant.
757+
gnomad_data : variant.models.GnomADVariant, optional
758+
gnomAD variant data corresponding to the variant.
671759
na_rep : str
672760
String to represent null values.
673761
@@ -693,24 +781,29 @@ def variant_to_csv_row(
693781
row[column_key] = value
694782
for column_key in columns.get("mavedb", []):
695783
if column_key == "post_mapped_hgvs_g":
696-
hgvs_str = get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
697-
if hgvs_str is not None and is_hgvs_g(hgvs_str):
698-
value = hgvs_str
699-
else:
700-
value = ""
784+
value = str(mapping.hgvs_g) if mapping and mapping.hgvs_g else na_rep
701785
elif column_key == "post_mapped_hgvs_p":
702-
hgvs_str = get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
703-
if hgvs_str is not None and is_hgvs_p(hgvs_str):
704-
value = hgvs_str
705-
else:
706-
value = ""
786+
value = str(mapping.hgvs_p) if mapping and mapping.hgvs_p else na_rep
787+
elif column_key == "post_mapped_hgvs_c":
788+
value = str(mapping.hgvs_c) if mapping and mapping.hgvs_c else na_rep
789+
elif column_key == "post_mapped_hgvs_at_assay_level":
790+
value = str(mapping.hgvs_assay_level) if mapping and mapping.hgvs_assay_level else na_rep
707791
elif column_key == "post_mapped_vrs_digest":
708792
digest = get_digest_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
709-
value = digest if digest is not None else ""
793+
value = digest if digest is not None else na_rep
710794
if is_null(value):
711795
value = na_rep
712796
key = f"mavedb.{column_key}" if namespaced else column_key
713797
row[key] = value
798+
for column_key in columns.get("vep", []):
799+
if column_key == "vep_functional_consequence":
800+
vep_functional_consequence = mapping.vep_functional_consequence if mapping else None
801+
if vep_functional_consequence is not None:
802+
value = vep_functional_consequence
803+
else:
804+
value = na_rep
805+
key = f"vep.{column_key}" if namespaced else column_key
806+
row[key] = value
714807
for column_key in columns.get("scores", []):
715808
parent = variant.data.get("score_data") if variant.data else None
716809
value = str(parent.get(column_key)) if parent else na_rep
@@ -721,13 +814,23 @@ def variant_to_csv_row(
721814
value = str(parent.get(column_key)) if parent else na_rep
722815
key = f"counts.{column_key}" if namespaced else column_key
723816
row[key] = value
817+
for column_key in columns.get("gnomad", []):
818+
if column_key == "gnomad_af":
819+
gnomad_af = gnomad_data.allele_frequency if gnomad_data else None
820+
if gnomad_af is not None:
821+
value = str(gnomad_af)
822+
else:
823+
value = na_rep
824+
key = f"gnomad.{column_key}" if namespaced else column_key
825+
row[key] = value
724826
return row
725827

726828

727829
def variants_to_csv_rows(
728830
variants: Sequence[Variant],
729831
columns: dict[str, list[str]],
730832
mappings: Optional[Sequence[Optional[MappedVariant]]] = None,
833+
gnomad_data: Optional[Sequence[Optional[GnomADVariant]]] = None,
731834
namespaced: Optional[bool] = None,
732835
na_rep="NA",
733836
) -> Iterable[dict[str, Any]]:
@@ -742,18 +845,36 @@ def variants_to_csv_rows(
742845
Columns to serialize.
743846
namespaced: Optional[bool] = None
744847
Namespace the columns or not.
848+
mappings : list[Optional[variant.models.MappedVariant]], optional
849+
List of mapped variants corresponding to the variants.
850+
gnomad_data : list[Optional[variant.models.GnomADVariant]], optional
851+
List of gnomAD variant data corresponding to the variants.
745852
na_rep : str
746853
String to represent null values.
747854
748855
Returns
749856
-------
750857
Iterable[dict[str, Any]]
751858
"""
752-
if mappings is not None:
859+
if mappings is not None and gnomad_data is not None:
860+
return map(
861+
lambda zipped: variant_to_csv_row(
862+
zipped[0], columns, mapping=zipped[1], gnomad_data=zipped[2], namespaced=namespaced, na_rep=na_rep
863+
),
864+
zip(variants, mappings, gnomad_data),
865+
)
866+
elif mappings is not None:
753867
return map(
754868
lambda pair: variant_to_csv_row(pair[0], columns, mapping=pair[1], namespaced=namespaced, na_rep=na_rep),
755869
zip(variants, mappings),
756870
)
871+
elif gnomad_data is not None:
872+
return map(
873+
lambda pair: variant_to_csv_row(
874+
pair[0], columns, gnomad_data=pair[1], namespaced=namespaced, na_rep=na_rep
875+
),
876+
zip(variants, gnomad_data),
877+
)
757878
return map(lambda v: variant_to_csv_row(v, columns, namespaced=namespaced, na_rep=na_rep), variants)
758879

759880

src/mavedb/models/mapped_variant.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ class MappedVariant(Base):
3434

3535
clingen_allele_id = Column(String, index=True, nullable=True)
3636

37+
vep_functional_consequence = Column(String, nullable=True)
38+
vep_access_date = Column(Date, nullable=True)
39+
40+
# mapped hgvs
41+
hgvs_assay_level = Column(String, nullable=True)
42+
hgvs_g = Column(String, nullable=True)
43+
hgvs_c = Column(String, nullable=True)
44+
hgvs_p = Column(String, nullable=True)
45+
3746
clinical_controls: Mapped[list["ClinicalControl"]] = relationship(
3847
"ClinicalControl",
3948
secondary=mapped_variants_clinical_controls_association_table,

0 commit comments

Comments
 (0)