Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions src/mavedb/lib/score_sets.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
from collections import Counter
import csv
import io
import logging
from operator import attrgetter
import re
from typing import Any, BinaryIO, Iterable, List, Optional, TYPE_CHECKING, Sequence, Literal
from collections import Counter
from operator import attrgetter
from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Literal, Optional, Sequence

from mavedb.models.mapped_variant import MappedVariant
import numpy as np
import pandas as pd
from pandas.testing import assert_index_equal
from sqlalchemy import Integer, and_, cast, func, or_, select
from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, Query, selectinload
from sqlalchemy.orm import Query, Session, aliased, contains_eager, joinedload, selectinload

from mavedb.lib.exceptions import ValidationError
from mavedb.lib.logging.context import logging_context, save_to_logging_context
Expand All @@ -26,7 +25,7 @@
from mavedb.lib.mave.utils import is_csv_null
from mavedb.lib.validation.constants.general import null_values_list
from mavedb.lib.validation.utilities import is_null as validate_is_null
from mavedb.lib.variants import get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p
from mavedb.lib.variants import get_digest_from_post_mapped, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p
from mavedb.models.contributor import Contributor
from mavedb.models.controlled_keyword import ControlledKeyword
from mavedb.models.doi_identifier import DoiIdentifier
Expand All @@ -36,6 +35,7 @@
from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation
from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation
from mavedb.models.experiment_set import ExperimentSet
from mavedb.models.mapped_variant import MappedVariant
from mavedb.models.publication_identifier import PublicationIdentifier
from mavedb.models.refseq_identifier import RefseqIdentifier
from mavedb.models.refseq_offset import RefseqOffset
Expand Down Expand Up @@ -547,6 +547,7 @@ def get_score_set_variants_as_csv(
if include_post_mapped_hgvs:
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_g")
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_p")
namespaced_score_set_columns["mavedb"].append("post_mapped_vrs_digest")
for namespace in namespaces:
namespaced_score_set_columns[namespace] = []
if include_custom_columns:
Expand Down Expand Up @@ -596,7 +597,9 @@ def get_score_set_variants_as_csv(
if limit:
variants_query = variants_query.limit(limit)
variants = db.scalars(variants_query).all()
rows_data = variants_to_csv_rows(variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings) # type: ignore
rows_data = variants_to_csv_rows(
variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings
) # type: ignore
rows_columns = [
(
f"{namespace}.{col}"
Expand Down Expand Up @@ -701,6 +704,9 @@ def variant_to_csv_row(
value = hgvs_str
else:
value = ""
elif column_key == "post_mapped_vrs_digest":
digest = get_digest_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
value = digest if digest is not None else ""
if is_null(value):
value = na_rep
key = f"mavedb.{column_key}" if namespaced else column_key
Expand Down
18 changes: 17 additions & 1 deletion src/mavedb/lib/variants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import re
from typing import Any, Optional


HGVS_G_REGEX = re.compile(r"(^|:)g\.")
HGVS_P_REGEX = re.compile(r"(^|:)p\.")

Expand Down Expand Up @@ -48,6 +47,23 @@ def get_hgvs_from_post_mapped(post_mapped_vrs: Optional[Any]) -> Optional[str]:
return variations_hgvs[0]


def get_digest_from_post_mapped(post_mapped_vrs: Optional[Any]) -> Optional[str]:
    """
    Extract the digest value from a post-mapped VRS object.

    Args:
        post_mapped_vrs: A post-mapped VRS (Variation Representation Specification) object
            that may contain a top-level ``digest`` key. Can be None, empty, or a value
            that is not mapping-like.

    Returns:
        The value stored under ``digest`` if the object is mapping-like and has that key,
        otherwise None. Inputs that are falsy, or that do not expose a ``get`` method
        (e.g. a list or a plain string), also yield None instead of raising.
    """
    if not post_mapped_vrs:
        return None

    try:
        return post_mapped_vrs.get("digest")  # type: ignore
    except AttributeError:
        # The parameter is typed as Any, so a truthy non-mapping value can reach this
        # point. The documented contract is "digest or None", so report "no digest"
        # rather than letting an AttributeError escape to the caller.
        return None


# TODO (https://github.com/VariantEffect/mavedb-api/issues/440) Temporarily, we are using these functions to distinguish
# genomic and protein HGVS strings produced by the mapper. Using hgvs.parser.Parser is too slow, and we won't need to do
# this once the mapper extracts separate g., c., and p. post-mapped HGVS strings.
Expand Down
46 changes: 43 additions & 3 deletions tests/lib/test_variants.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
import pytest

from mavedb.lib.variants import hgvs_from_vrs_allele, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p

from mavedb.lib.variants import (
get_digest_from_post_mapped,
get_hgvs_from_post_mapped,
hgvs_from_vrs_allele,
is_hgvs_g,
is_hgvs_p,
)
from tests.helpers.constants import (
TEST_HGVS_IDENTIFIER,
TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X,
TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X,
TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE,
TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK,
TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE,
)

### Tests for hgvs_from_vrs_allele function ###


def test_hgvs_from_vrs_allele_vrs_1():
with pytest.raises(ValueError):
Expand All @@ -26,6 +33,9 @@ def test_hgvs_from_vrs_allele_invalid():
hgvs_from_vrs_allele({"invalid_key": "invalid_value"})


### Tests for get_hgvs_from_post_mapped function ###


def test_get_hgvs_from_post_mapped_haplotype():
with pytest.raises(ValueError):
get_hgvs_from_post_mapped(TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE)
Expand Down Expand Up @@ -61,6 +71,36 @@ def test_get_hgvs_from_post_mapped_invalid_structure():
get_hgvs_from_post_mapped({"invalid_key": "InvalidType"})


### Tests for get_digest_from_post_mapped function ###


def test_get_digest_from_post_mapped_with_digest():
    """A top-level ``digest`` entry is returned unchanged."""
    expected_digest = "test_digest_value"
    vrs_object = {"digest": expected_digest, "type": "Allele"}

    assert get_digest_from_post_mapped(vrs_object) == expected_digest


def test_get_digest_from_post_mapped_without_digest():
    """A VRS object that lacks a ``digest`` key yields ``None``."""
    vrs_without_digest = {"type": "Allele", "other_field": "value"}
    assert get_digest_from_post_mapped(vrs_without_digest) is None


def test_get_digest_from_post_mapped_none_input():
    """Passing ``None`` in place of a VRS object yields ``None``."""
    assert get_digest_from_post_mapped(None) is None


def test_get_digest_from_post_mapped_empty_dict():
    """An empty mapping has no digest, so the result is ``None``."""
    empty_vrs_object = {}
    assert get_digest_from_post_mapped(empty_vrs_object) is None


### Tests for is_hgvs_g and is_hgvs_p functions ###


@pytest.mark.parametrize(
"hgvs,expected",
[
Expand Down
17 changes: 5 additions & 12 deletions tests/routers/test_score_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -2738,6 +2738,7 @@ def test_download_variants_data_file(
"hgvs_pro",
"mavedb.post_mapped_hgvs_g",
"mavedb.post_mapped_hgvs_p",
"mavedb.post_mapped_vrs_digest",
"scores.score",
]
)
Expand Down Expand Up @@ -2862,16 +2863,7 @@ def test_download_scores_and_counts_file(session, data_provider, client, setup_r
download_scores_and_counts_csv = download_scores_and_counts_csv_response.text
reader = csv.DictReader(StringIO(download_scores_and_counts_csv))
assert sorted(reader.fieldnames) == sorted(
[
"accession",
"hgvs_nt",
"hgvs_pro",
"scores.score",
"scores.s_0",
"scores.s_1",
"counts.c_0",
"counts.c_1"
]
["accession", "hgvs_nt", "hgvs_pro", "scores.score", "scores.s_0", "scores.s_1", "counts.c_0", "counts.c_1"]
)


Expand All @@ -2885,7 +2877,7 @@ def test_download_scores_and_counts_file(session, data_provider, client, setup_r
ids=["without_post_mapped_vrs", "with_post_mapped_hgvs_g", "with_post_mapped_hgvs_p"],
)
def test_download_scores_counts_and_post_mapped_variants_file(
session, data_provider, client, setup_router_db, data_files, mapped_variant, has_hgvs_g, has_hgvs_p
session, data_provider, client, setup_router_db, data_files, mapped_variant, has_hgvs_g, has_hgvs_p
):
experiment = create_experiment(client)
score_set = create_seq_score_set(client, experiment["urn"])
Expand All @@ -2912,11 +2904,12 @@ def test_download_scores_counts_and_post_mapped_variants_file(
"hgvs_pro",
"mavedb.post_mapped_hgvs_g",
"mavedb.post_mapped_hgvs_p",
"mavedb.post_mapped_vrs_digest",
"scores.score",
"scores.s_0",
"scores.s_1",
"counts.c_0",
"counts.c_1"
"counts.c_1",
]
)

Expand Down