Skip to content

Commit d78189e

Browse files
authored
Merge pull request #577 from VariantEffect/feature/bencap/550/vrs-digest-in-post-mapped-csv
feat: output VRS digest with post mapped HGVS strings
2 parents 9acd4a4 + 7eb4d02 commit d78189e

File tree

4 files changed

+75
-11
lines changed

4 files changed

+75
-11
lines changed

src/mavedb/lib/score_sets.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
1-
from collections import Counter
21
import csv
32
import io
43
import logging
5-
from operator import attrgetter
64
import re
7-
from typing import Any, BinaryIO, Iterable, List, Optional, TYPE_CHECKING, Sequence, Literal
5+
from collections import Counter
6+
from operator import attrgetter
7+
from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Literal, Optional, Sequence
88

9-
from mavedb.models.mapped_variant import MappedVariant
109
import numpy as np
1110
import pandas as pd
1211
from pandas.testing import assert_index_equal
1312
from sqlalchemy import Integer, and_, cast, func, or_, select
14-
from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, Query, selectinload
13+
from sqlalchemy.orm import Query, Session, aliased, contains_eager, joinedload, selectinload
1514

1615
from mavedb.lib.exceptions import ValidationError
1716
from mavedb.lib.logging.context import logging_context, save_to_logging_context
@@ -26,7 +25,7 @@
2625
from mavedb.lib.mave.utils import is_csv_null
2726
from mavedb.lib.validation.constants.general import null_values_list
2827
from mavedb.lib.validation.utilities import is_null as validate_is_null
29-
from mavedb.lib.variants import get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p
28+
from mavedb.lib.variants import get_digest_from_post_mapped, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p
3029
from mavedb.models.contributor import Contributor
3130
from mavedb.models.controlled_keyword import ControlledKeyword
3231
from mavedb.models.doi_identifier import DoiIdentifier
@@ -36,6 +35,7 @@
3635
from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation
3736
from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation
3837
from mavedb.models.experiment_set import ExperimentSet
38+
from mavedb.models.mapped_variant import MappedVariant
3939
from mavedb.models.publication_identifier import PublicationIdentifier
4040
from mavedb.models.refseq_identifier import RefseqIdentifier
4141
from mavedb.models.refseq_offset import RefseqOffset
@@ -547,6 +547,7 @@ def get_score_set_variants_as_csv(
547547
if include_post_mapped_hgvs:
548548
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_g")
549549
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_p")
550+
namespaced_score_set_columns["mavedb"].append("post_mapped_vrs_digest")
550551
for namespace in namespaces:
551552
namespaced_score_set_columns[namespace] = []
552553
if include_custom_columns:
@@ -596,7 +597,9 @@ def get_score_set_variants_as_csv(
596597
if limit:
597598
variants_query = variants_query.limit(limit)
598599
variants = db.scalars(variants_query).all()
599-
rows_data = variants_to_csv_rows(variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings) # type: ignore
600+
rows_data = variants_to_csv_rows(
601+
variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings
602+
) # type: ignore
600603
rows_columns = [
601604
(
602605
f"{namespace}.{col}"
@@ -701,6 +704,9 @@ def variant_to_csv_row(
701704
value = hgvs_str
702705
else:
703706
value = ""
707+
elif column_key == "post_mapped_vrs_digest":
708+
digest = get_digest_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
709+
value = digest if digest is not None else ""
704710
if is_null(value):
705711
value = na_rep
706712
key = f"mavedb.{column_key}" if namespaced else column_key

src/mavedb/lib/variants.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import re
22
from typing import Any, Optional
33

4-
54
HGVS_G_REGEX = re.compile(r"(^|:)g\.")
65
HGVS_P_REGEX = re.compile(r"(^|:)p\.")
76

@@ -48,6 +47,23 @@ def get_hgvs_from_post_mapped(post_mapped_vrs: Optional[Any]) -> Optional[str]:
4847
return variations_hgvs[0]
4948

5049

50+
def get_digest_from_post_mapped(post_mapped_vrs: Optional[Any]) -> Optional[str]:
    """
    Extract the digest value from a post-mapped VRS object.

    Args:
        post_mapped_vrs: A post-mapped VRS (Variation Representation Specification) object
            that may contain a top-level ``digest`` field. Can be None.

    Returns:
        The value stored under the ``digest`` key when ``post_mapped_vrs`` is a mapping
        that contains it; otherwise None. None, empty mappings, and values that are not
        mappings at all (e.g. lists or strings) yield None rather than raising.
    """
    # Imported locally so this function does not depend on a module-level import.
    from collections.abc import Mapping

    # A single guard covers None, non-mapping values (which have no ``.get``), and lets
    # empty mappings fall through to ``.get`` returning None.
    if not isinstance(post_mapped_vrs, Mapping):
        return None

    return post_mapped_vrs.get("digest")  # type: ignore
65+
66+
5167
# TODO (https://github.com/VariantEffect/mavedb-api/issues/440) Temporarily, we are using these functions to distinguish
5268
# genomic and protein HGVS strings produced by the mapper. Using hgvs.parser.Parser is too slow, and we won't need to do
5369
# this once the mapper extracts separate g., c., and p. post-mapped HGVS strings.

tests/lib/test_variants.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,22 @@
11
import pytest
22

3-
from mavedb.lib.variants import hgvs_from_vrs_allele, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p
4-
3+
from mavedb.lib.variants import (
4+
get_digest_from_post_mapped,
5+
get_hgvs_from_post_mapped,
6+
hgvs_from_vrs_allele,
7+
is_hgvs_g,
8+
is_hgvs_p,
9+
)
510
from tests.helpers.constants import (
611
TEST_HGVS_IDENTIFIER,
712
TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X,
813
TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X,
9-
TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE,
1014
TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK,
15+
TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE,
1116
)
1217

18+
### Tests for hgvs_from_vrs_allele function ###
19+
1320

1421
def test_hgvs_from_vrs_allele_vrs_1():
1522
with pytest.raises(ValueError):
@@ -26,6 +33,9 @@ def test_hgvs_from_vrs_allele_invalid():
2633
hgvs_from_vrs_allele({"invalid_key": "invalid_value"})
2734

2835

36+
### Tests for get_hgvs_from_post_mapped function ###
37+
38+
2939
def test_get_hgvs_from_post_mapped_haplotype():
3040
with pytest.raises(ValueError):
3141
get_hgvs_from_post_mapped(TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE)
@@ -61,6 +71,36 @@ def test_get_hgvs_from_post_mapped_invalid_structure():
6171
get_hgvs_from_post_mapped({"invalid_key": "InvalidType"})
6272

6373

74+
### Tests for get_digest_from_post_mapped function ###
75+
76+
77+
def test_get_digest_from_post_mapped_with_digest():
    """A mapping that carries a ``digest`` key yields that digest value."""
    allele = {"digest": "test_digest_value", "type": "Allele"}
    assert get_digest_from_post_mapped(allele) == "test_digest_value"
81+
82+
83+
def test_get_digest_from_post_mapped_without_digest():
    """A mapping that lacks a ``digest`` key produces no digest."""
    allele_without_digest = {"type": "Allele", "other_field": "value"}
    digest = get_digest_from_post_mapped(allele_without_digest)
    assert digest is None
89+
90+
91+
def test_get_digest_from_post_mapped_none_input():
    """Passing ``None`` produces no digest."""
    assert get_digest_from_post_mapped(None) is None
94+
95+
96+
def test_get_digest_from_post_mapped_empty_dict():
    """Passing an empty dict produces no digest."""
    assert get_digest_from_post_mapped({}) is None
99+
100+
101+
### Tests for is_hgvs_g and is_hgvs_p functions ###
102+
103+
64104
@pytest.mark.parametrize(
65105
"hgvs,expected",
66106
[

tests/routers/test_score_set.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2706,6 +2706,7 @@ def test_download_variants_data_file(
27062706
"hgvs_pro",
27072707
"mavedb.post_mapped_hgvs_g",
27082708
"mavedb.post_mapped_hgvs_p",
2709+
"mavedb.post_mapped_vrs_digest",
27092710
"scores.score",
27102711
]
27112712
)
@@ -2871,6 +2872,7 @@ def test_download_scores_counts_and_post_mapped_variants_file(
28712872
"hgvs_pro",
28722873
"mavedb.post_mapped_hgvs_g",
28732874
"mavedb.post_mapped_hgvs_p",
2875+
"mavedb.post_mapped_vrs_digest",
28742876
"scores.score",
28752877
"scores.s_0",
28762878
"scores.s_1",

0 commit comments

Comments
 (0)