Skip to content

Commit 2ae432c

Browse files
authored
Merge pull request #541 from VariantEffect/enhancement/estelle/446/namespacedVariantCSVExport
Modify get_score_set_variants_csv to allow downloading multiple data types together.
2 parents 5724fe3 + 1f3af16 commit 2ae432c

File tree

4 files changed

+236
-65
lines changed

4 files changed

+236
-65
lines changed

src/mavedb/lib/score_sets.py

Lines changed: 71 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import logging
55
from operator import attrgetter
66
import re
7-
from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal
7+
from typing import Any, BinaryIO, Iterable, List, Optional, TYPE_CHECKING, Sequence, Literal
88

99
from mavedb.models.mapped_variant import MappedVariant
1010
import numpy as np
@@ -501,12 +501,13 @@ def find_publish_or_private_superseded_score_set_tail(
501501
def get_score_set_variants_as_csv(
502502
db: Session,
503503
score_set: ScoreSet,
504-
data_type: Literal["scores", "counts"],
504+
namespaces: List[Literal["scores", "counts"]],
505+
namespaced: Optional[bool] = None,
505506
start: Optional[int] = None,
506507
limit: Optional[int] = None,
507508
drop_na_columns: Optional[bool] = None,
508-
include_custom_columns: bool = True,
509-
include_post_mapped_hgvs: bool = False,
509+
include_custom_columns: Optional[bool] = True,
510+
include_post_mapped_hgvs: Optional[bool] = False,
510511
) -> str:
511512
"""
512513
Get the variant data from a score set as a CSV string.
@@ -517,8 +518,10 @@ def get_score_set_variants_as_csv(
517518
The database session to use.
518519
score_set : ScoreSet
519520
The score set to get the variants from.
520-
data_type : {'scores', 'counts'}
521-
The type of data to get. Either 'scores' or 'counts'.
521+
namespaces : List[Literal["scores", "counts"]]
522+
The data namespaces to export. Currently only scores and counts are supported; ClinVar and gnomAD may be added later.
523+
namespaced: Optional[bool] = None
524+
Whether to prefix non-core column names with their namespace (e.g. ``scores.<column>``).
522525
start : int, optional
523526
The index to start from. If None, starts from the beginning.
524527
limit : int, optional
@@ -537,20 +540,26 @@ def get_score_set_variants_as_csv(
537540
The CSV string containing the variant data.
538541
"""
539542
assert type(score_set.dataset_columns) is dict
540-
custom_columns_set = "score_columns" if data_type == "scores" else "count_columns"
541-
type_column = "score_data" if data_type == "scores" else "count_data"
542-
543-
columns = ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"]
543+
namespaced_score_set_columns: dict[str, list[str]] = {
544+
"core": ["accession", "hgvs_nt", "hgvs_splice", "hgvs_pro"],
545+
"mavedb": [],
546+
}
544547
if include_post_mapped_hgvs:
545-
columns.append("post_mapped_hgvs_g")
546-
columns.append("post_mapped_hgvs_p")
547-
548+
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_g")
549+
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_p")
550+
for namespace in namespaces:
551+
namespaced_score_set_columns[namespace] = []
548552
if include_custom_columns:
549-
custom_columns = [str(x) for x in list(score_set.dataset_columns.get(custom_columns_set, []))]
550-
columns += custom_columns
551-
elif data_type == "scores":
552-
columns.append(REQUIRED_SCORE_COLUMN)
553-
553+
if "scores" in namespaced_score_set_columns:
554+
namespaced_score_set_columns["scores"] = [
555+
col for col in [str(x) for x in list(score_set.dataset_columns.get("score_columns", []))]
556+
]
557+
if "counts" in namespaced_score_set_columns:
558+
namespaced_score_set_columns["counts"] = [
559+
col for col in [str(x) for x in list(score_set.dataset_columns.get("count_columns", []))]
560+
]
561+
elif "scores" in namespaced_score_set_columns:
562+
namespaced_score_set_columns["scores"].append(REQUIRED_SCORE_COLUMN)
554563
variants: Sequence[Variant] = []
555564
mappings: Optional[list[Optional[MappedVariant]]] = None
556565

@@ -587,13 +596,22 @@ def get_score_set_variants_as_csv(
587596
if limit:
588597
variants_query = variants_query.limit(limit)
589598
variants = db.scalars(variants_query).all()
599+
rows_data = variants_to_csv_rows(variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings) # type: ignore
600+
rows_columns = [
601+
(
602+
f"{namespace}.{col}"
603+
if (namespaced and namespace not in ["core", "mavedb"])
604+
else (f"mavedb.{col}" if namespaced and namespace == "mavedb" else col)
605+
)
606+
for namespace, cols in namespaced_score_set_columns.items()
607+
for col in cols
608+
]
590609

591-
rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column, mappings=mappings) # type: ignore
592610
if drop_na_columns:
593-
rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns)
611+
rows_data, rows_columns = drop_na_columns_from_csv_file_rows(rows_data, rows_columns)
594612

595613
stream = io.StringIO()
596-
writer = csv.DictWriter(stream, fieldnames=columns, quoting=csv.QUOTE_MINIMAL)
614+
writer = csv.DictWriter(stream, fieldnames=rows_columns, quoting=csv.QUOTE_MINIMAL)
597615
writer.writeheader()
598616
writer.writerows(rows_data)
599617
return stream.getvalue()
@@ -631,9 +649,9 @@ def is_null(value):
631649

632650
def variant_to_csv_row(
633651
variant: Variant,
634-
columns: list[str],
635-
dtype: str,
652+
columns: dict[str, list[str]],
636653
mapping: Optional[MappedVariant] = None,
654+
namespaced: Optional[bool] = None,
637655
na_rep="NA",
638656
) -> dict[str, Any]:
639657
"""
@@ -645,17 +663,18 @@ def variant_to_csv_row(
645663
List of variants.
646664
columns : dict[str, list[str]]
647665
Columns to serialize, keyed by namespace ("core", "mavedb", "scores", "counts").
648-
dtype : str, {'scores', 'counts'}
649-
The type of data requested. Either the 'score_data' or 'count_data'.
666+
namespaced: Optional[bool] = None
667+
Whether to prefix non-core column names with their namespace (e.g. ``scores.<column>``).
650668
na_rep : str
651669
String to represent null values.
652670
653671
Returns
654672
-------
655673
dict[str, Any]
656674
"""
657-
row = {}
658-
for column_key in columns:
675+
row: dict[str, Any] = {}
676+
# Handle each column key explicitly as part of its namespace.
677+
for column_key in columns.get("core", []):
659678
if column_key == "hgvs_nt":
660679
value = str(variant.hgvs_nt)
661680
elif column_key == "hgvs_pro":
@@ -664,7 +683,13 @@ def variant_to_csv_row(
664683
value = str(variant.hgvs_splice)
665684
elif column_key == "accession":
666685
value = str(variant.urn)
667-
elif column_key == "post_mapped_hgvs_g":
686+
if is_null(value):
687+
value = na_rep
688+
689+
# export columns in the `core` namespace without a namespace
690+
row[column_key] = value
691+
for column_key in columns.get("mavedb", []):
692+
if column_key == "post_mapped_hgvs_g":
668693
hgvs_str = get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
669694
if hgvs_str is not None and is_hgvs_g(hgvs_str):
670695
value = hgvs_str
@@ -676,21 +701,28 @@ def variant_to_csv_row(
676701
value = hgvs_str
677702
else:
678703
value = ""
679-
else:
680-
parent = variant.data.get(dtype) if variant.data else None
681-
value = str(parent.get(column_key)) if parent else na_rep
682704
if is_null(value):
683705
value = na_rep
684-
row[column_key] = value
685-
706+
key = f"mavedb.{column_key}" if namespaced else column_key
707+
row[key] = value
708+
for column_key in columns.get("scores", []):
709+
parent = variant.data.get("score_data") if variant.data else None
710+
value = str(parent.get(column_key)) if parent else na_rep
711+
key = f"scores.{column_key}" if namespaced else column_key
712+
row[key] = value
713+
for column_key in columns.get("counts", []):
714+
parent = variant.data.get("count_data") if variant.data else None
715+
value = str(parent.get(column_key)) if parent else na_rep
716+
key = f"counts.{column_key}" if namespaced else column_key
717+
row[key] = value
686718
return row
687719

688720

689721
def variants_to_csv_rows(
690722
variants: Sequence[Variant],
691-
columns: list[str],
692-
dtype: str,
723+
columns: dict[str, list[str]],
693724
mappings: Optional[Sequence[Optional[MappedVariant]]] = None,
725+
namespaced: Optional[bool] = None,
694726
na_rep="NA",
695727
) -> Iterable[dict[str, Any]]:
696728
"""
@@ -702,8 +734,8 @@ def variants_to_csv_rows(
702734
List of variants.
703735
columns : dict[str, list[str]]
704736
Columns to serialize, keyed by namespace ("core", "mavedb", "scores", "counts").
705-
dtype : str, {'scores', 'counts'}
706-
The type of data requested. Either the 'score_data' or 'count_data'.
737+
namespaced: Optional[bool] = None
738+
Whether to prefix non-core column names with their namespace (e.g. ``scores.<column>``).
707739
na_rep : str
708740
String to represent null values.
709741
@@ -713,10 +745,10 @@ def variants_to_csv_rows(
713745
"""
714746
if mappings is not None:
715747
return map(
716-
lambda pair: variant_to_csv_row(pair[0], columns, dtype, mapping=pair[1], na_rep=na_rep),
748+
lambda pair: variant_to_csv_row(pair[0], columns, mapping=pair[1], namespaced=namespaced, na_rep=na_rep),
717749
zip(variants, mappings),
718750
)
719-
return map(lambda v: variant_to_csv_row(v, columns, dtype, na_rep=na_rep), variants)
751+
return map(lambda v: variant_to_csv_row(v, columns, namespaced=namespaced, na_rep=na_rep), variants)
720752

721753

722754
def find_meta_analyses_for_score_sets(db: Session, urns: list[str]) -> list[ScoreSet]:

src/mavedb/routers/score_sets.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import json
22
import logging
33
from datetime import date
4-
from typing import Any, List, Optional, Sequence, TypedDict, Union
4+
from typing import Any, List, Literal, Optional, Sequence, TypedDict, Union
55

66
import pandas as pd
77
from arq import ArqRedis
@@ -110,20 +110,28 @@ async def enqueue_variant_creation(
110110
"hgvs_splice",
111111
"hgvs_pro",
112112
] + item.dataset_columns.get("score_columns", [])
113+
# score_columns = {
114+
# "core": ["hgvs_nt", "hgvs_splice", "hgvs_pro"],
115+
# "counts": item.dataset_columns["score_columns"],
116+
# }
113117
existing_scores_df = pd.DataFrame(
114-
variants_to_csv_rows(item.variants, columns=score_columns, dtype="score_data")
118+
variants_to_csv_rows(item.variants, columns=score_columns, namespaced=False)
115119
).replace("NA", pd.NA)
116120

117121
# create CSV from existing variants on the score set if no new dataframe provided
118122
existing_counts_df = None
119123
if new_counts_df is None and item.dataset_columns.get("count_columns"):
124+
# count_columns = {
125+
# "core": ["hgvs_nt", "hgvs_splice", "hgvs_pro"],
126+
# "counts": item.dataset_columns["count_columns"],
127+
# }
120128
count_columns = [
121-
"hgvs_nt",
122-
"hgvs_splice",
123-
"hgvs_pro",
124-
] + item.dataset_columns["count_columns"]
129+
"hgvs_nt",
130+
"hgvs_splice",
131+
"hgvs_pro",
132+
] + item.dataset_columns["count_columns"]
125133
existing_counts_df = pd.DataFrame(
126-
variants_to_csv_rows(item.variants, columns=count_columns, dtype="count_data")
134+
variants_to_csv_rows(item.variants, columns=count_columns, namespaced=False)
127135
).replace("NA", pd.NA)
128136

129137
# Await the insertion of this job into the worker queue, not the job itself.
@@ -638,7 +646,13 @@ def get_score_set_variants_csv(
638646
urn: str,
639647
start: int = Query(default=None, description="Start index for pagination"),
640648
limit: int = Query(default=None, description="Maximum number of variants to return"),
649+
namespaces: List[Literal["scores", "counts"]] = Query(
650+
default=["scores"],
651+
description="One or more data types to include: scores, counts, clinVar, gnomAD"
652+
),
641653
drop_na_columns: Optional[bool] = None,
654+
include_custom_columns: Optional[bool] = None,
655+
include_post_mapped_hgvs: Optional[bool] = None,
642656
db: Session = Depends(deps.get_db),
643657
user_data: Optional[UserData] = Depends(get_current_user),
644658
) -> Any:
@@ -648,12 +662,9 @@ def get_score_set_variants_csv(
648662
This differs from get_score_set_scores_csv() in that it returns only the HGVS columns, score column, and mapped HGVS
649663
string.
650664
651-
TODO (https://github.com/VariantEffect/mavedb-api/issues/446) We may want to turn this into a general-purpose CSV
665+
TODO (https://github.com/VariantEffect/mavedb-api/issues/446) We may add another function for ClinVar and gnomAD, or turn this into a general-purpose CSV
652666
export endpoint, with options governing which columns to include.
653667
654-
Parameters
655-
__________
656-
657668
Parameters
658669
__________
659670
urn : str
@@ -662,6 +673,9 @@ def get_score_set_variants_csv(
662673
The index to start from. If None, starts from the beginning.
663674
limit : Optional[int]
664675
The maximum number of variants to return. If None, returns all variants.
676+
namespaces: List[Literal["scores", "counts"]]
677+
The namespaces of all columns except for accession, hgvs_nt, hgvs_pro, and hgvs_splice.
678+
We may add ClinVar and gnomAD in the future.
665679
drop_na_columns : bool, optional
666680
Whether to drop columns that contain only NA values. Defaults to False.
667681
db : Session
@@ -701,12 +715,13 @@ def get_score_set_variants_csv(
701715
csv_str = get_score_set_variants_as_csv(
702716
db,
703717
score_set,
704-
"scores",
718+
namespaces,
719+
True,
705720
start,
706721
limit,
707722
drop_na_columns,
708-
include_custom_columns=False,
709-
include_post_mapped_hgvs=True,
723+
include_custom_columns,
724+
include_post_mapped_hgvs,
710725
)
711726
return StreamingResponse(iter([csv_str]), media_type="text/csv")
712727

@@ -762,7 +777,7 @@ def get_score_set_scores_csv(
762777

763778
assert_permission(user_data, score_set, Action.READ)
764779

765-
csv_str = get_score_set_variants_as_csv(db, score_set, "scores", start, limit, drop_na_columns)
780+
csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"], False, start, limit, drop_na_columns)
766781
return StreamingResponse(iter([csv_str]), media_type="text/csv")
767782

768783

@@ -817,7 +832,7 @@ async def get_score_set_counts_csv(
817832

818833
assert_permission(user_data, score_set, Action.READ)
819834

820-
csv_str = get_score_set_variants_as_csv(db, score_set, "counts", start, limit, drop_na_columns)
835+
csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"], False, start, limit, drop_na_columns)
821836
return StreamingResponse(iter([csv_str]), media_type="text/csv")
822837

823838

src/mavedb/scripts/export_public_data.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,12 @@ def export_public_data(db: Session):
147147
logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
148148
csv_filename_base = score_set.urn.replace(":", "-")
149149

150-
csv_str = get_score_set_variants_as_csv(db, score_set, "scores")
150+
csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"])
151151
zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)
152152

153153
count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
154154
if count_columns and len(count_columns) > 0:
155-
csv_str = get_score_set_variants_as_csv(db, score_set, "counts")
155+
csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"])
156156
zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)
157157

158158

0 commit comments

Comments
 (0)