Skip to content

Commit dbd2fce

Browse files
committed
Add gnomAD allele frequency (AF) to CSV export (WIP)
1 parent 553bb3b commit dbd2fce

File tree

2 files changed

+70
-7
lines changed

2 files changed

+70
-7
lines changed

src/mavedb/lib/score_sets.py

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation
3636
from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation
3737
from mavedb.models.experiment_set import ExperimentSet
38+
from mavedb.models.gnomad_variant import GnomADVariant
3839
from mavedb.models.publication_identifier import PublicationIdentifier
3940
from mavedb.models.refseq_identifier import RefseqIdentifier
4041
from mavedb.models.refseq_offset import RefseqOffset
@@ -407,6 +408,8 @@ def get_score_set_variants_as_csv(
407408
drop_na_columns: Optional[bool] = None,
408409
include_custom_columns: bool = True,
409410
include_post_mapped_hgvs: bool = False,
411+
include_gnomad: bool = False,
412+
gnomad_version: Optional[Literal["v4.1"]] = None,
410413
) -> str:
411414
"""
412415
Get the variant data from a score set as a CSV string.
@@ -430,6 +433,8 @@ def get_score_set_variants_as_csv(
430433
include_post_mapped_hgvs : bool, optional
431434
Whether to include post-mapped HGVS notations in the output. Defaults to False. If True, the output will include
432435
columns for both post-mapped HGVS genomic (g.) and protein (p.) notations.
436+
include_gnomad : bool, optional
437+
Whether to include gnomAD allele frequency data in the output. Defaults to False.
433438
434439
Returns
435440
-------
@@ -444,6 +449,8 @@ def get_score_set_variants_as_csv(
444449
if include_post_mapped_hgvs:
445450
columns.append("post_mapped_hgvs_g")
446451
columns.append("post_mapped_hgvs_p")
452+
if include_gnomad:
453+
columns.append("gnomad_af")
447454

448455
if include_custom_columns:
449456
custom_columns = [str(x) for x in list(score_set.dataset_columns.get(custom_columns_set, []))]
@@ -453,8 +460,45 @@ def get_score_set_variants_as_csv(
453460

454461
variants: Sequence[Variant] = []
455462
mappings: Optional[list[Optional[MappedVariant]]] = None
463+
gnomad_data: Optional[list[Optional[GnomADVariant]]] = None
456464

457-
if include_post_mapped_hgvs:
465+
if include_gnomad:
466+
variants_mappings_and_gnomad_query = (
467+
select(Variant, MappedVariant, GnomADVariant)
468+
.join(
469+
MappedVariant,
470+
and_(Variant.id == MappedVariant.variant_id, MappedVariant.current.is_(True)),
471+
isouter=True,
472+
)
473+
.join(MappedVariant.gnomad_variants.of_type(GnomADVariant), isouter=True)
474+
.where(
475+
and_(
476+
Variant.score_set_id == score_set.id,
477+
or_(
478+
and_(
479+
GnomADVariant.db_name == "gnomAD",
480+
GnomADVariant.db_version == (gnomad_version if gnomad_version else "v4.1"),
481+
),
482+
GnomADVariant.id.is_(None),
483+
),
484+
)
485+
)
486+
.order_by(cast(func.split_part(Variant.urn, "#", 2), Integer))
487+
)
488+
if start:
489+
variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query.offset(start)
490+
if limit:
491+
variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query.limit(limit)
492+
variants_mappings_and_gnomad = db.execute(variants_mappings_and_gnomad_query).all()
493+
494+
variants = []
495+
mappings = []
496+
gnomad_data = []
497+
for variant, mapping, gnomad in variants_mappings_and_gnomad:
498+
variants.append(variant)
499+
mappings.append(mapping)
500+
gnomad_data.append(gnomad)
501+
elif include_post_mapped_hgvs:
458502
variants_and_mappings_query = (
459503
select(Variant, MappedVariant)
460504
.join(
@@ -488,7 +532,9 @@ def get_score_set_variants_as_csv(
488532
variants_query = variants_query.limit(limit)
489533
variants = db.scalars(variants_query).all()
490534

491-
rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column, mappings=mappings) # type: ignore
535+
rows_data = variants_to_csv_rows(
536+
variants, columns=columns, dtype=type_column, mappings=mappings, gnomad_data=gnomad_data
537+
) # type: ignore
492538
if drop_na_columns:
493539
rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns)
494540

@@ -534,6 +580,7 @@ def variant_to_csv_row(
534580
columns: list[str],
535581
dtype: str,
536582
mapping: Optional[MappedVariant] = None,
583+
gnomad_data: Optional[GnomADVariant] = None,
537584
na_rep="NA",
538585
) -> dict[str, Any]:
539586
"""
@@ -547,6 +594,10 @@ def variant_to_csv_row(
547594
Columns to serialize.
548595
dtype : str, {'scores', 'counts'}
549596
The type of data requested. Either the 'score_data' or 'count_data'.
597+
mapping : variant.models.MappedVariant, optional
598+
Mapped variant corresponding to the variant.
599+
gnomad_data : variant.models.GnomADVariant, optional
600+
gnomAD variant data corresponding to the variant.
550601
na_rep : str
551602
String to represent null values.
552603
@@ -576,6 +627,8 @@ def variant_to_csv_row(
576627
value = hgvs_str
577628
else:
578629
value = ""
630+
elif column_key == "gnomad_af":
631+
value = str(gnomad_data.allele_frequency) if gnomad_data else ""
579632
else:
580633
parent = variant.data.get(dtype) if variant.data else None
581634
value = str(parent.get(column_key)) if parent else na_rep
@@ -591,6 +644,7 @@ def variants_to_csv_rows(
591644
columns: list[str],
592645
dtype: str,
593646
mappings: Optional[Sequence[Optional[MappedVariant]]] = None,
647+
gnomad_data: Optional[Sequence[Optional[GnomADVariant]]] = None,
594648
na_rep="NA",
595649
) -> Iterable[dict[str, Any]]:
596650
"""
@@ -604,14 +658,25 @@ def variants_to_csv_rows(
604658
Columns to serialize.
605659
dtype : str, {'scores', 'counts'}
606660
The type of data requested. Either the 'score_data' or 'count_data'.
661+
mappings : list[Optional[variant.models.MappedVariant]], optional
662+
List of mapped variants corresponding to the variants.
663+
gnomad_data : list[Optional[variant.models.GnomADVariant]], optional
664+
List of gnomAD variant data corresponding to the variants.
607665
na_rep : str
608666
String to represent null values.
609667
610668
Returns
611669
-------
612670
list[dict[str, Any]]
613671
"""
614-
if mappings is not None:
672+
if mappings is not None and gnomad_data is not None:
673+
return map(
674+
lambda zipped: variant_to_csv_row(
675+
zipped[0], columns, dtype, mapping=zipped[1], gnomad_data=zipped[2], na_rep=na_rep
676+
),
677+
zip(variants, mappings, gnomad_data),
678+
)
679+
elif mappings is not None:
615680
return map(
616681
lambda pair: variant_to_csv_row(pair[0], columns, dtype, mapping=pair[1], na_rep=na_rep),
617682
zip(variants, mappings),

src/mavedb/routers/score_sets.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@ def get_score_set_variants_csv(
318318
drop_na_columns,
319319
include_custom_columns=False,
320320
include_post_mapped_hgvs=True,
321+
include_gnomad=True,
321322
)
322323
return StreamingResponse(iter([csv_str]), media_type="text/csv")
323324

@@ -1465,10 +1466,7 @@ async def get_clinical_controls_for_score_set(
14651466
select(ClinicalControl)
14661467
.join(ClinicalControl.mapped_variants)
14671468
.join(MappedVariant.variant)
1468-
.options(
1469-
contains_eager(ClinicalControl.mapped_variants)
1470-
.contains_eager(MappedVariant.variant)
1471-
)
1469+
.options(contains_eager(ClinicalControl.mapped_variants).contains_eager(MappedVariant.variant))
14721470
.filter(MappedVariant.current.is_(True))
14731471
.filter(Variant.score_set_id == item.id)
14741472
)

0 commit comments

Comments
 (0)