2525from mavedb .lib .mave .utils import is_csv_null
2626from mavedb .lib .validation .constants .general import null_values_list
2727from mavedb .lib .validation .utilities import is_null as validate_is_null
28- from mavedb .lib .variants import get_digest_from_post_mapped , get_hgvs_from_post_mapped , is_hgvs_g , is_hgvs_p
28+ from mavedb .lib .variants import get_digest_from_post_mapped
2929from mavedb .models .contributor import Contributor
3030from mavedb .models .controlled_keyword import ControlledKeyword
3131from mavedb .models .doi_identifier import DoiIdentifier
3535from mavedb .models .experiment_controlled_keyword import ExperimentControlledKeywordAssociation
3636from mavedb .models .experiment_publication_identifier import ExperimentPublicationIdentifierAssociation
3737from mavedb .models .experiment_set import ExperimentSet
38+ from mavedb .models .gnomad_variant import GnomADVariant
3839from mavedb .models .mapped_variant import MappedVariant
3940from mavedb .models .publication_identifier import PublicationIdentifier
4041from mavedb .models .refseq_identifier import RefseqIdentifier
@@ -501,7 +502,7 @@ def find_publish_or_private_superseded_score_set_tail(
501502def get_score_set_variants_as_csv (
502503 db : Session ,
503504 score_set : ScoreSet ,
504- namespaces : List [Literal ["scores" , "counts" ]],
505+ namespaces : List [Literal ["scores" , "counts" , "vep" , "gnomad" ]],
505506 namespaced : Optional [bool ] = None ,
506507 start : Optional [int ] = None ,
507508 limit : Optional [int ] = None ,
@@ -518,8 +519,8 @@ def get_score_set_variants_as_csv(
518519 The database session to use.
519520 score_set : ScoreSet
520521 The score set to get the variants from.
521- namespaces : List[Literal["scores", "counts"]]
522- The namespaces for data. Now there are only scores and counts. There will be ClinVar and gnomAD .
522+ namespaces : List[Literal["scores", "counts", "vep", "gnomad" ]]
523+ The namespaces for data. Now there are only scores, counts, VEP, and gnomAD. ClinVar will be added in the future .
523524 namespaced: Optional[bool] = None
524525 Whether namespace the columns or not.
525526 start : int, optional
@@ -531,8 +532,8 @@ def get_score_set_variants_as_csv(
531532 include_custom_columns : bool, optional
532533 Whether to include custom columns defined in the score set. Defaults to True.
533534 include_post_mapped_hgvs : bool, optional
534- Whether to include post-mapped HGVS notations in the output. Defaults to False. If True, the output will include
535- columns for both post-mapped HGVS genomic (g.) and protein (p.) notations.
535+ Whether to include post-mapped HGVS notations and VEP functional consequence in the output. Defaults to False. If True, the output will include
536+ columns for post-mapped HGVS genomic (g.) and protein (p.) notations, and VEP functional consequence .
536537
537538 Returns
538539 _______
@@ -547,9 +548,12 @@ def get_score_set_variants_as_csv(
547548 if include_post_mapped_hgvs :
548549 namespaced_score_set_columns ["mavedb" ].append ("post_mapped_hgvs_g" )
549550 namespaced_score_set_columns ["mavedb" ].append ("post_mapped_hgvs_p" )
551+ namespaced_score_set_columns ["mavedb" ].append ("post_mapped_hgvs_c" )
552+ namespaced_score_set_columns ["mavedb" ].append ("post_mapped_hgvs_at_assay_level" )
550553 namespaced_score_set_columns ["mavedb" ].append ("post_mapped_vrs_digest" )
551554 for namespace in namespaces :
552555 namespaced_score_set_columns [namespace ] = []
556+
553557 if include_custom_columns :
554558 if "scores" in namespaced_score_set_columns :
555559 namespaced_score_set_columns ["scores" ] = [
@@ -561,10 +565,51 @@ def get_score_set_variants_as_csv(
561565 ]
562566 elif "scores" in namespaced_score_set_columns :
563567 namespaced_score_set_columns ["scores" ].append (REQUIRED_SCORE_COLUMN )
568+ if "vep" in namespaced_score_set_columns :
569+ namespaced_score_set_columns ["vep" ].append ("vep_functional_consequence" )
570+ if "gnomad" in namespaced_score_set_columns :
571+ namespaced_score_set_columns ["gnomad" ].append ("gnomad_af" )
564572 variants : Sequence [Variant ] = []
565573 mappings : Optional [list [Optional [MappedVariant ]]] = None
574+ gnomad_data : Optional [list [Optional [GnomADVariant ]]] = None
566575
567- if include_post_mapped_hgvs :
576+ if "gnomad" in namespaces and include_post_mapped_hgvs :
577+ variants_mappings_and_gnomad_query = (
578+ select (Variant , MappedVariant , GnomADVariant )
579+ .join (
580+ MappedVariant ,
581+ and_ (Variant .id == MappedVariant .variant_id , MappedVariant .current .is_ (True )),
582+ isouter = True ,
583+ )
584+ .join (MappedVariant .gnomad_variants .of_type (GnomADVariant ), isouter = True )
585+ .where (
586+ and_ (
587+ Variant .score_set_id == score_set .id ,
588+ or_ (
589+ and_ (
590+ GnomADVariant .db_name == "gnomAD" ,
591+ GnomADVariant .db_version == "v4.1" ,
592+ ),
593+ GnomADVariant .id .is_ (None ),
594+ ),
595+ )
596+ )
597+ .order_by (cast (func .split_part (Variant .urn , "#" , 2 ), Integer ))
598+ )
599+ if start :
600+ variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query .offset (start )
601+ if limit :
602+ variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query .limit (limit )
603+ variants_mappings_and_gnomad = db .execute (variants_mappings_and_gnomad_query ).all ()
604+
605+ variants = []
606+ mappings = []
607+ gnomad_data = []
608+ for variant , mapping , gnomad in variants_mappings_and_gnomad :
609+ variants .append (variant )
610+ mappings .append (mapping )
611+ gnomad_data .append (gnomad )
612+ elif include_post_mapped_hgvs :
568613 variants_and_mappings_query = (
569614 select (Variant , MappedVariant )
570615 .join (
@@ -586,6 +631,40 @@ def get_score_set_variants_as_csv(
586631 for variant , mapping in variants_and_mappings :
587632 variants .append (variant )
588633 mappings .append (mapping )
634+ elif "gnomad" in namespaces :
635+ variants_and_gnomad_query = (
636+ select (Variant , GnomADVariant )
637+ .join (
638+ MappedVariant ,
639+ and_ (Variant .id == MappedVariant .variant_id , MappedVariant .current .is_ (True )),
640+ isouter = True ,
641+ )
642+ .join (MappedVariant .gnomad_variants .of_type (GnomADVariant ), isouter = True )
643+ .where (
644+ and_ (
645+ Variant .score_set_id == score_set .id ,
646+ or_ (
647+ and_ (
648+ GnomADVariant .db_name == "gnomAD" ,
649+ GnomADVariant .db_version == "v4.1" ,
650+ ),
651+ GnomADVariant .id .is_ (None ),
652+ ),
653+ )
654+ )
655+ .order_by (cast (func .split_part (Variant .urn , "#" , 2 ), Integer ))
656+ )
657+ if start :
658+ variants_and_gnomad_query = variants_and_gnomad_query .offset (start )
659+ if limit :
660+ variants_and_gnomad_query = variants_and_gnomad_query .limit (limit )
661+ variants_and_gnomad = db .execute (variants_and_gnomad_query ).all ()
662+
663+ variants = []
664+ gnomad_data = []
665+ for variant , gnomad in variants_and_gnomad :
666+ variants .append (variant )
667+ gnomad_data .append (gnomad )
589668 else :
590669 variants_query = (
591670 select (Variant )
@@ -598,7 +677,11 @@ def get_score_set_variants_as_csv(
598677 variants_query = variants_query .limit (limit )
599678 variants = db .scalars (variants_query ).all ()
600679 rows_data = variants_to_csv_rows (
601- variants , columns = namespaced_score_set_columns , namespaced = namespaced , mappings = mappings
680+ variants ,
681+ columns = namespaced_score_set_columns ,
682+ namespaced = namespaced ,
683+ mappings = mappings ,
684+ gnomad_data = gnomad_data ,
602685 ) # type: ignore
603686 rows_columns = [
604687 (
@@ -654,6 +737,7 @@ def variant_to_csv_row(
654737 variant : Variant ,
655738 columns : dict [str , list [str ]],
656739 mapping : Optional [MappedVariant ] = None ,
740+ gnomad_data : Optional [GnomADVariant ] = None ,
657741 namespaced : Optional [bool ] = None ,
658742 na_rep = "NA" ,
659743) -> dict [str , Any ]:
@@ -668,6 +752,10 @@ def variant_to_csv_row(
668752 Columns to serialize.
669753 namespaced: Optional[bool] = None
670754 Namespace the columns or not.
755+ mapping : variant.models.MappedVariant, optional
756+ Mapped variant corresponding to the variant.
757+ gnomad_data : variant.models.GnomADVariant, optional
758+ gnomAD variant data corresponding to the variant.
671759 na_rep : str
672760 String to represent null values.
673761
@@ -693,24 +781,29 @@ def variant_to_csv_row(
693781 row [column_key ] = value
694782 for column_key in columns .get ("mavedb" , []):
695783 if column_key == "post_mapped_hgvs_g" :
696- hgvs_str = get_hgvs_from_post_mapped (mapping .post_mapped ) if mapping and mapping .post_mapped else None
697- if hgvs_str is not None and is_hgvs_g (hgvs_str ):
698- value = hgvs_str
699- else :
700- value = ""
784+ value = str (mapping .hgvs_g ) if mapping and mapping .hgvs_g else na_rep
701785 elif column_key == "post_mapped_hgvs_p" :
702- hgvs_str = get_hgvs_from_post_mapped (mapping .post_mapped ) if mapping and mapping .post_mapped else None
703- if hgvs_str is not None and is_hgvs_p ( hgvs_str ) :
704- value = hgvs_str
705- else :
706- value = ""
786+ value = str (mapping .hgvs_p ) if mapping and mapping .hgvs_p else na_rep
787+ elif column_key == "post_mapped_hgvs_c" :
788+ value = str ( mapping . hgvs_c ) if mapping and mapping . hgvs_c else na_rep
789+ elif column_key == "post_mapped_hgvs_at_assay_level" :
790+ value = str ( mapping . hgvs_assay_level ) if mapping and mapping . hgvs_assay_level else na_rep
707791 elif column_key == "post_mapped_vrs_digest" :
708792 digest = get_digest_from_post_mapped (mapping .post_mapped ) if mapping and mapping .post_mapped else None
709- value = digest if digest is not None else ""
793+ value = digest if digest is not None else na_rep
710794 if is_null (value ):
711795 value = na_rep
712796 key = f"mavedb.{ column_key } " if namespaced else column_key
713797 row [key ] = value
798+ for column_key in columns .get ("vep" , []):
799+ if column_key == "vep_functional_consequence" :
800+ vep_functional_consequence = mapping .vep_functional_consequence if mapping else None
801+ if vep_functional_consequence is not None :
802+ value = vep_functional_consequence
803+ else :
804+ value = na_rep
805+ key = f"vep.{ column_key } " if namespaced else column_key
806+ row [key ] = value
714807 for column_key in columns .get ("scores" , []):
715808 parent = variant .data .get ("score_data" ) if variant .data else None
716809 value = str (parent .get (column_key )) if parent else na_rep
@@ -721,13 +814,23 @@ def variant_to_csv_row(
721814 value = str (parent .get (column_key )) if parent else na_rep
722815 key = f"counts.{ column_key } " if namespaced else column_key
723816 row [key ] = value
817+ for column_key in columns .get ("gnomad" , []):
818+ if column_key == "gnomad_af" :
819+ gnomad_af = gnomad_data .allele_frequency if gnomad_data else None
820+ if gnomad_af is not None :
821+ value = str (gnomad_af )
822+ else :
823+ value = na_rep
824+ key = f"gnomad.{ column_key } " if namespaced else column_key
825+ row [key ] = value
724826 return row
725827
726828
def variants_to_csv_rows(
    variants: Sequence[Variant],
    columns: dict[str, list[str]],
    mappings: Optional[Sequence[Optional[MappedVariant]]] = None,
    gnomad_data: Optional[Sequence[Optional[GnomADVariant]]] = None,
    namespaced: Optional[bool] = None,
    na_rep="NA",
) -> Iterable[dict[str, Any]]:
    """
    Serialize each variant into a dictionary row via ``variant_to_csv_row``.

    Parameters
    ----------
    variants : Sequence[Variant]
        Variants to serialize, one row per variant.
    columns : dict[str, list[str]]
        Columns to serialize, passed through unchanged to ``variant_to_csv_row``.
    mappings : Sequence[Optional[MappedVariant]], optional
        Mapped variants paired positionally with ``variants``.
    gnomad_data : Sequence[Optional[GnomADVariant]], optional
        gnomAD variant data paired positionally with ``variants``.
    namespaced : Optional[bool] = None
        Namespace the columns or not.
    na_rep : str
        String to represent null values.

    Returns
    -------
    Iterable[dict[str, Any]]
        A lazy ``map`` over the inputs. When companion sequences are supplied,
        iteration stops at the shortest of the sequences involved.
    """

    def _emit(variant, **companions):
        # Shared call site: only the per-variant companion objects differ between branches.
        return variant_to_csv_row(variant, columns, namespaced=namespaced, na_rep=na_rep, **companions)

    has_mappings = mappings is not None
    has_gnomad = gnomad_data is not None

    # Nothing to pair with the variants: serialize them on their own.
    if not has_mappings and not has_gnomad:
        return map(_emit, variants)

    # Only gnomAD data accompanies the variants.
    if not has_mappings:
        return map(lambda v, g: _emit(v, gnomad_data=g), variants, gnomad_data)

    # Only mapped variants accompany the variants.
    if not has_gnomad:
        return map(lambda v, m: _emit(v, mapping=m), variants, mappings)

    # Both companions are present; walk all three sequences in lockstep.
    return map(
        lambda v, m, g: _emit(v, mapping=m, gnomad_data=g),
        variants,
        mappings,
        gnomad_data,
    )
759880
0 commit comments