Skip to content

Commit 1f54107

Browse files
authored
Merge pull request #466 from VariantEffect/release-2025.2.1
Release 2025.2.1
2 parents 0c1b391 + df3a82d commit 1f54107

File tree

9 files changed

+400
-27
lines changed

9 files changed

+400
-27
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
66
name = "mavedb"
7-
version = "2025.2.0"
7+
version = "2025.2.1"
88
description = "API for MaveDB, the database of Multiplexed Assays of Variant Effect."
99
license = "AGPL-3.0-only"
1010
readme = "README.md"

src/mavedb/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@
66
logger = module_logging.getLogger(__name__)
77

88
__project__ = "mavedb-api"
9-
__version__ = "2025.2.0"
9+
__version__ = "2025.2.1"
1010

1111
logger.info(f"MaveDB {__version__}")

src/mavedb/lib/score_sets.py

Lines changed: 165 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
from operator import attrgetter
66
from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal
77

8+
from mavedb.models.mapped_variant import MappedVariant
89
import numpy as np
910
import pandas as pd
1011
from pandas.testing import assert_index_equal
11-
from sqlalchemy import Integer, cast, func, or_, select
12+
from sqlalchemy import Integer, and_, cast, func, or_, select
1213
from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, selectinload
1314

1415
from mavedb.lib.exceptions import ValidationError
@@ -17,6 +18,7 @@
1718
HGVS_NT_COLUMN,
1819
HGVS_PRO_COLUMN,
1920
HGVS_SPLICE_COLUMN,
21+
REQUIRED_SCORE_COLUMN,
2022
VARIANT_COUNT_DATA,
2123
VARIANT_SCORE_DATA,
2224
)
@@ -57,6 +59,9 @@
5759

5860
logger = logging.getLogger(__name__)
5961

62+
HGVS_G_REGEX = re.compile(r"(^|:)g\.")
63+
HGVS_P_REGEX = re.compile(r"(^|:)p\.")
64+
6065

6166
class HGVSColumns:
6267
NUCLEOTIDE: str = "hgvs_nt" # dataset.constants.hgvs_nt_column
@@ -402,26 +407,90 @@ def get_score_set_variants_as_csv(
402407
start: Optional[int] = None,
403408
limit: Optional[int] = None,
404409
drop_na_columns: Optional[bool] = None,
410+
include_custom_columns: bool = True,
411+
include_post_mapped_hgvs: bool = False,
405412
) -> str:
413+
"""
414+
Get the variant data from a score set as a CSV string.
415+
416+
Parameters
417+
----------
418+
db : Session
419+
The database session to use.
420+
score_set : ScoreSet
421+
The score set to get the variants from.
422+
data_type : {'scores', 'counts'}
423+
The type of data to get. Either 'scores' or 'counts'.
424+
start : int, optional
425+
The index to start from. If None, starts from the beginning.
426+
limit : int, optional
427+
The maximum number of variants to return. If None, returns all variants.
428+
drop_na_columns : bool, optional
429+
Whether to drop columns that contain only NA values. Defaults to False.
430+
include_custom_columns : bool, optional
431+
Whether to include custom columns defined in the score set. Defaults to True.
432+
include_post_mapped_hgvs : bool, optional
433+
Whether to include post-mapped HGVS notations in the output. Defaults to False. If True, the output will include
434+
columns for both post-mapped HGVS genomic (g.) and protein (p.) notations.
435+
436+
Returns
437+
-------
438+
str
439+
The CSV string containing the variant data.
440+
"""
406441
assert type(score_set.dataset_columns) is dict
407-
dataset_cols = "score_columns" if data_type == "scores" else "count_columns"
442+
custom_columns_set = "score_columns" if data_type == "scores" else "count_columns"
408443
type_column = "score_data" if data_type == "scores" else "count_data"
409444

410-
count_columns = [str(x) for x in list(score_set.dataset_columns.get(dataset_cols, []))]
411-
columns = ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"] + count_columns
412-
413-
variants_query = (
414-
select(Variant)
415-
.where(Variant.score_set_id == score_set.id)
416-
.order_by(cast(func.split_part(Variant.urn, "#", 2), Integer))
417-
)
418-
if start:
419-
variants_query = variants_query.offset(start)
420-
if limit:
421-
variants_query = variants_query.limit(limit)
422-
variants = db.scalars(variants_query).all()
445+
columns = ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"]
446+
if include_post_mapped_hgvs:
447+
columns.append("post_mapped_hgvs_g")
448+
columns.append("post_mapped_hgvs_p")
449+
450+
if include_custom_columns:
451+
custom_columns = [str(x) for x in list(score_set.dataset_columns.get(custom_columns_set, []))]
452+
columns += custom_columns
453+
elif data_type == "scores":
454+
columns.append(REQUIRED_SCORE_COLUMN)
455+
456+
variants: Sequence[Variant] = []
457+
mappings: Optional[list[Optional[MappedVariant]]] = None
458+
459+
if include_post_mapped_hgvs:
460+
variants_and_mappings_query = (
461+
select(Variant, MappedVariant)
462+
.join(
463+
MappedVariant,
464+
and_(Variant.id == MappedVariant.variant_id, MappedVariant.current.is_(True)),
465+
isouter=True,
466+
)
467+
.where(Variant.score_set_id == score_set.id)
468+
.order_by(cast(func.split_part(Variant.urn, "#", 2), Integer))
469+
)
470+
if start:
471+
variants_and_mappings_query = variants_and_mappings_query.offset(start)
472+
if limit:
473+
variants_and_mappings_query = variants_and_mappings_query.limit(limit)
474+
variants_and_mappings = db.execute(variants_and_mappings_query).all()
475+
476+
variants = []
477+
mappings = []
478+
for variant, mapping in variants_and_mappings:
479+
variants.append(variant)
480+
mappings.append(mapping)
481+
else:
482+
variants_query = (
483+
select(Variant)
484+
.where(Variant.score_set_id == score_set.id)
485+
.order_by(cast(func.split_part(Variant.urn, "#", 2), Integer))
486+
)
487+
if start:
488+
variants_query = variants_query.offset(start)
489+
if limit:
490+
variants_query = variants_query.limit(limit)
491+
variants = db.scalars(variants_query).all()
423492

424-
rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column) # type: ignore
493+
rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column, mappings=mappings) # type: ignore
425494
if drop_na_columns:
426495
rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns)
427496

@@ -462,7 +531,63 @@ def is_null(value):
462531
return null_values_re.fullmatch(value) or not value
463532

464533

465-
def variant_to_csv_row(variant: Variant, columns: list[str], dtype: str, na_rep="NA") -> dict[str, Any]:
534+
def hgvs_from_vrs_allele(allele: dict) -> str:
535+
"""
536+
Extract the HGVS notation from the VRS allele.
537+
"""
538+
try:
539+
# VRS 2.X
540+
return allele["expressions"][0]["value"]
541+
except KeyError:
542+
raise ValueError("VRS 1.X format not supported.")
543+
# VRS 1.X. We don't want to allow this.
544+
# return allele["variation"]["expressions"][0]["value"]
545+
546+
547+
def get_hgvs_from_mapped_variant(post_mapped_vrs: Any) -> Optional[str]:
548+
if post_mapped_vrs["type"] == "Haplotype": # type: ignore
549+
variations_hgvs = [hgvs_from_vrs_allele(allele) for allele in post_mapped_vrs["members"]]
550+
elif post_mapped_vrs["type"] == "CisPhasedBlock": # type: ignore
551+
variations_hgvs = [hgvs_from_vrs_allele(allele) for allele in post_mapped_vrs["members"]]
552+
elif post_mapped_vrs["type"] == "Allele": # type: ignore
553+
variations_hgvs = [hgvs_from_vrs_allele(post_mapped_vrs)]
554+
else:
555+
return None
556+
557+
if len(variations_hgvs) == 0:
558+
return None
559+
# raise ValueError(f"No variations found in variant {variant_urn}.")
560+
if len(variations_hgvs) > 1:
561+
return None
562+
# raise ValueError(f"Multiple variations found in variant {variant_urn}.")
563+
564+
return variations_hgvs[0]
565+
566+
567+
# TODO (https://github.com/VariantEffect/mavedb-api/issues/440) Temporarily, we are using these functions to distinguish
568+
# genomic and protein HGVS strings produced by the mapper. Using hgvs.parser.Parser is too slow, and we won't need to do
569+
# this once the mapper extracts separate g., c., and p. post-mapped HGVS strings.
570+
def is_hgvs_g(hgvs: str) -> bool:
571+
"""
572+
Check if the given HGVS string is a genomic HGVS (g.) string.
573+
"""
574+
return bool(HGVS_G_REGEX.search(hgvs))
575+
576+
577+
def is_hgvs_p(hgvs: str) -> bool:
578+
"""
579+
Check if the given HGVS string is a protein HGVS (p.) string.
580+
"""
581+
return bool(HGVS_P_REGEX.search(hgvs))
582+
583+
584+
def variant_to_csv_row(
585+
variant: Variant,
586+
columns: list[str],
587+
dtype: str,
588+
mapping: Optional[MappedVariant] = None,
589+
na_rep="NA",
590+
) -> dict[str, Any]:
466591
"""
467592
Format a variant into a containing the keys specified in `columns`.
468593
@@ -491,6 +616,18 @@ def variant_to_csv_row(variant: Variant, columns: list[str], dtype: str, na_rep=
491616
value = str(variant.hgvs_splice)
492617
elif column_key == "accession":
493618
value = str(variant.urn)
619+
elif column_key == "post_mapped_hgvs_g":
620+
hgvs_str = get_hgvs_from_mapped_variant(mapping.post_mapped) if mapping and mapping.post_mapped else None
621+
if hgvs_str is not None and is_hgvs_g(hgvs_str):
622+
value = hgvs_str
623+
else:
624+
value = ""
625+
elif column_key == "post_mapped_hgvs_p":
626+
hgvs_str = get_hgvs_from_mapped_variant(mapping.post_mapped) if mapping and mapping.post_mapped else None
627+
if hgvs_str is not None and is_hgvs_p(hgvs_str):
628+
value = hgvs_str
629+
else:
630+
value = ""
494631
else:
495632
parent = variant.data.get(dtype) if variant.data else None
496633
value = str(parent.get(column_key)) if parent else na_rep
@@ -502,7 +639,11 @@ def variant_to_csv_row(variant: Variant, columns: list[str], dtype: str, na_rep=
502639

503640

504641
def variants_to_csv_rows(
505-
variants: Sequence[Variant], columns: list[str], dtype: str, na_rep="NA"
642+
variants: Sequence[Variant],
643+
columns: list[str],
644+
dtype: str,
645+
mappings: Optional[Sequence[Optional[MappedVariant]]] = None,
646+
na_rep="NA",
506647
) -> Iterable[dict[str, Any]]:
507648
"""
508649
Format each variant into a dictionary row containing the keys specified in `columns`.
@@ -522,7 +663,12 @@ def variants_to_csv_rows(
522663
-------
523664
list[dict[str, Any]]
524665
"""
525-
return map(lambda v: variant_to_csv_row(v, columns, dtype, na_rep), variants)
666+
if mappings is not None:
667+
return map(
668+
lambda pair: variant_to_csv_row(pair[0], columns, dtype, mapping=pair[1], na_rep=na_rep),
669+
zip(variants, mappings),
670+
)
671+
return map(lambda v: variant_to_csv_row(v, columns, dtype, na_rep=na_rep), variants)
526672

527673

528674
def find_meta_analyses_for_score_sets(db: Session, urns: list[str]) -> list[ScoreSet]:

src/mavedb/routers/score_sets.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,95 @@ async def show_score_set(
225225
return score_set.ScoreSet.from_orm(item).copy(update={"experiment": enriched_experiment})
226226

227227

228+
@router.get(
229+
"/score-sets/{urn}/variants/data",
230+
status_code=200,
231+
responses={
232+
200: {
233+
"content": {"text/csv": {}},
234+
"description": """Variant data in CSV format, with four fixed columns (accession, hgvs_nt, hgvs_pro,"""
235+
""" and hgvs_splice), plus score columns defined by the score set.""",
236+
}
237+
},
238+
)
239+
def get_score_set_variants_csv(
240+
*,
241+
urn: str,
242+
start: int = Query(default=None, description="Start index for pagination"),
243+
limit: int = Query(default=None, description="Maximum number of variants to return"),
244+
drop_na_columns: Optional[bool] = None,
245+
db: Session = Depends(deps.get_db),
246+
user_data: Optional[UserData] = Depends(get_current_user),
247+
) -> Any:
248+
"""
249+
Return tabular variant data from a score set, identified by URN, in CSV format.
250+
251+
This differs from get_score_set_scores_csv() in that it returns only the HGVS columns, score column, and mapped HGVS
252+
string.
253+
254+
TODO (https://github.com/VariantEffect/mavedb-api/issues/446) We may want to turn this into a general-purpose CSV
255+
export endpoint, with options governing which columns to include.
256+
257+
Parameters
258+
__________
259+
260+
Parameters
261+
__________
262+
urn : str
263+
The URN of the score set to fetch variants from.
264+
start : Optional[int]
265+
The index to start from. If None, starts from the beginning.
266+
limit : Optional[int]
267+
The maximum number of variants to return. If None, returns all variants.
268+
drop_na_columns : bool, optional
269+
Whether to drop columns that contain only NA values. Defaults to False.
270+
db : Session
271+
The database session to use.
272+
user_data : Optional[UserData]
273+
The user data of the current user. If None, no user-specific permissions are checked.
274+
275+
Returns
276+
-------
277+
str
278+
The CSV string containing the variant data.
279+
"""
280+
save_to_logging_context(
281+
{
282+
"requested_resource": urn,
283+
"resource_property": "scores",
284+
"start": start,
285+
"limit": limit,
286+
"drop_na_columns": drop_na_columns,
287+
}
288+
)
289+
290+
if start and start < 0:
291+
logger.info(msg="Could not fetch scores with negative start index.", extra=logging_context())
292+
raise HTTPException(status_code=400, detail="Start index must be non-negative")
293+
if limit is not None and limit <= 0:
294+
logger.info(msg="Could not fetch scores with non-positive limit.", extra=logging_context())
295+
raise HTTPException(status_code=400, detail="Limit must be positive")
296+
297+
score_set = db.query(ScoreSet).filter(ScoreSet.urn == urn).first()
298+
if not score_set:
299+
logger.info(msg="Could not fetch the requested scores; No such score set exists.", extra=logging_context())
300+
raise HTTPException(status_code=404, detail=f"score set with URN '{urn}' not found")
301+
302+
assert_permission(user_data, score_set, Action.READ)
303+
304+
csv_str = get_score_set_variants_as_csv(
305+
db,
306+
score_set,
307+
"scores",
308+
start,
309+
limit,
310+
drop_na_columns,
311+
include_custom_columns=False,
312+
include_post_mapped_hgvs=True,
313+
)
314+
return StreamingResponse(iter([csv_str]), media_type="text/csv")
315+
316+
228317
@router.get(
229318
"/score-sets/{urn}/scores",
230319
status_code=200,

src/mavedb/routers/variants.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ def lookup_variants(
3434
db: Session = Depends(deps.get_db),
3535
user_data: UserData = Depends(get_current_user),
3636
):
37+
save_to_logging_context({"requested_resource": "clingen-allele-id-lookups"})
38+
save_to_logging_context({"clingen_allele_ids_to_lookup": request.clingen_allele_ids})
39+
logger.debug(msg="Looking up variants by Clingen Allele IDs", extra=logging_context())
40+
3741
variants = db.execute(
3842
select(Variant, MappedVariant.clingen_allele_id)
3943
.join(MappedVariant)
@@ -42,12 +46,25 @@ def lookup_variants(
4246
).all()
4347

4448
variants_by_allele_id: dict[str, list[Variant]] = {allele_id: [] for allele_id in request.clingen_allele_ids}
49+
save_to_logging_context({"num_variants_matching_clingen_allele_ids": len(variants)})
50+
logger.debug(msg="Found variants with matching ClinGen Allele IDs", extra=logging_context())
4551

52+
num_variants_matching_clingen_allele_ids_and_permitted = 0
4653
for variant, allele_id in variants:
4754
if has_permission(user_data, variant.score_set, Action.READ).permitted:
4855
variants_by_allele_id[allele_id].append(variant)
56+
num_variants_matching_clingen_allele_ids_and_permitted += 1
57+
58+
save_to_logging_context(
59+
{"clingen_allele_ids_with_permitted_variants": num_variants_matching_clingen_allele_ids_and_permitted}
60+
)
61+
62+
if not any(matched_variants for matched_variants in variants_by_allele_id.values()):
63+
logger.info(msg="No variants found for the provided Clingen Allele IDs.", extra=logging_context())
64+
raise HTTPException(status_code=404, detail="No variants found for the provided Clingen Allele IDs.")
4965

50-
return [variants_by_allele_id[allele_id] for allele_id in request.clingen_allele_ids]
66+
# Dicts preserve insertion order, so .values() yields the per-allele-ID lists in the same order as request.clingen_allele_ids.
67+
return list(variants_by_allele_id.values())
5168

5269

5370
@router.get(

0 commit comments

Comments
 (0)