@@ -254,14 +254,16 @@ def _get_mane_p(mane_data: Dict, mane_c_pos_range: Tuple[int, int]) -> Dict:
254254 :return: MANE transcripts accessions and position change on
255255 p. coordinate
256256 """
257+ start = mane_c_pos_range [0 ] / 3
258+ end = mane_c_pos_range [1 ] / 3
259+ start = math .floor (start ) if start == end else math .ceil (start )
260+ end = math .floor (end )
261+
257262 return dict (
258263 gene = mane_data ["symbol" ],
259264 refseq = mane_data ["RefSeq_prot" ],
260265 ensembl = mane_data ["Ensembl_prot" ],
261- pos = (
262- math .ceil (mane_c_pos_range [0 ] / 3 ),
263- math .floor (mane_c_pos_range [1 ] / 3 ),
264- ),
266+ pos = (start , end ),
265267 strand = mane_data ["chr_strand" ],
266268 status = TranscriptPriority (
267269 "_" .join (mane_data ["MANE_status" ].split ()).lower ()
@@ -507,27 +509,35 @@ def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> List:
507509
508510 async def get_longest_compatible_transcript (
509511 self ,
510- gene : str ,
511512 start_pos : int ,
512513 end_pos : int ,
513514 start_annotation_layer : AnnotationLayer ,
515+ gene : Optional [str ] = None ,
514516 ref : Optional [str ] = None ,
515517 residue_mode : ResidueMode = ResidueMode .RESIDUE ,
516518 mane_transcripts : Optional [Set ] = None ,
517519 alt_ac : Optional [str ] = None ,
520+ end_annotation_layer : Optional [
521+ Union [AnnotationLayer .PROTEIN , AnnotationLayer .CDNA ]
522+ ] = None ,
518523 ) -> Optional [Dict ]:
519524 """Get longest compatible transcript from a gene.
520525 Try GRCh38 first, then GRCh37.
521526 Transcript is compatible if it passes validation checks.
522527
523- :param gene: Gene symbol
524528 :param start_pos: Start position change
525529 :param end_pos: End position change
526- :param start_annotation_layer: Starting annotation layer.
530+ :param start_annotation_layer: Starting annotation layer
531+ :param gene: HGNC gene symbol
527532 :param ref: Reference at position given during input
528533 :param residue_mode: Residue mode for `start_pos` and `end_pos`
529534 :param mane_transcripts: Attempted mane transcripts that were not compatible
530535 :param alt_ac: Genomic accession
536+ :param end_annotation_layer: The end annotation layer. If not provided, will be
537+ set to the following
538+ `AnnotationLayer.PROTEIN` if
539+ `start_annotation_layer == AnnotationLayer.PROTEIN`
540+ `AnnotationLayer.CDNA` otherwise
531541 :return: Data for longest compatible transcript
532542 """
533543 inter_residue_pos , _ = get_inter_residue_pos (
@@ -548,13 +558,14 @@ async def get_longest_compatible_transcript(
548558
549559 # Data Frame that contains transcripts associated to a gene
550560 if is_p_or_c_start_anno :
551- df = await self .uta_db .get_transcripts_from_gene (
552- gene , c_start_pos , c_end_pos , use_tx_pos = True , alt_ac = alt_ac
561+ df = await self .uta_db .get_transcripts (
562+ c_start_pos , c_end_pos , gene = gene , use_tx_pos = True , alt_ac = alt_ac
553563 )
554564 else :
555- df = await self .uta_db .get_transcripts_from_gene (
556- gene , start_pos , end_pos , use_tx_pos = False , alt_ac = alt_ac
565+ df = await self .uta_db .get_transcripts (
566+ start_pos , end_pos , gene = gene , use_tx_pos = False , alt_ac = alt_ac
557567 )
568+
558569 if df .is_empty ():
559570 logger .warning (f"Unable to get transcripts from gene { gene } " )
560571 return None
@@ -651,7 +662,13 @@ async def get_longest_compatible_transcript(
651662 if not valid_references :
652663 continue
653664
654- if start_annotation_layer == AnnotationLayer .PROTEIN :
665+ if not end_annotation_layer :
666+ if start_annotation_layer == AnnotationLayer .PROTEIN :
667+ end_annotation_layer = AnnotationLayer .PROTEIN
668+ else :
669+ end_annotation_layer = AnnotationLayer .CDNA
670+
671+ if end_annotation_layer == AnnotationLayer .PROTEIN :
655672 pos = (
656673 math .ceil (lcr_c_data ["pos" ][0 ] / 3 ),
657674 math .floor (lcr_c_data ["pos" ][1 ] / 3 ),
@@ -699,7 +716,7 @@ async def get_mane_transcript(
699716 :param start_annotation_layer: Starting annotation layer.
700717 :param end_pos: End position change. If `None` assumes both `start_pos` and
701718 `end_pos` have same values.
702- :param gene: Gene symbol
719+ :param gene: HGNC gene symbol
703720 :param ref: Reference at position given during input
704721 :param try_longest_compatible: `True` if should try longest compatible remaining
705722 if mane transcript was not compatible. `False` otherwise.
@@ -793,21 +810,21 @@ async def get_mane_transcript(
793810 if try_longest_compatible :
794811 if start_annotation_layer == AnnotationLayer .PROTEIN :
795812 return await self .get_longest_compatible_transcript (
796- g ["gene" ],
797813 start_pos ,
798814 end_pos ,
799815 AnnotationLayer .PROTEIN ,
800- ref ,
816+ ref = ref ,
817+ gene = g ["gene" ],
801818 residue_mode = residue_mode ,
802819 mane_transcripts = mane_transcripts ,
803820 )
804821 else :
805822 return await self .get_longest_compatible_transcript (
806- g ["gene" ],
807823 c_pos [0 ],
808824 c_pos [1 ],
809825 AnnotationLayer .CDNA ,
810- ref ,
826+ ref = ref ,
827+ gene = g ["gene" ],
811828 residue_mode = residue_mode ,
812829 mane_transcripts = mane_transcripts ,
813830 )
@@ -1005,3 +1022,94 @@ async def g_to_mane_c(
10051022 ensembl_c_ac = current_mane_data ["Ensembl_nuc" ],
10061023 alt_ac = grch38 ["ac" ] if grch38 else None ,
10071024 )
1025+
1026+ async def grch38_to_mane_p (
1027+ self ,
1028+ alt_ac : str ,
1029+ start_pos : int ,
1030+ end_pos : int ,
1031+ gene : Optional [str ] = None ,
1032+ residue_mode : ResidueMode = ResidueMode .RESIDUE ,
1033+ try_longest_compatible : bool = False ,
1034+ ) -> Optional [Dict ]:
1035+ """Given GRCh38 genomic representation, return protein representation.
1036+ Will try MANE Select and then MANE Plus Clinical. If neither is found and
1037+ `try_longest_compatible` is set to `true`, will also try to find the longest
1038+ compatible remaining representation.
1039+
1040+ :param alt_ac: Genomic RefSeq accession on GRCh38
1041+ :param start_pos: Start position
1042+ :param end_pos: End position
1043+ :param gene: HGNC gene symbol
1044+ :param residue_mode: Starting residue mode for `start_pos` and `end_pos`. Will
1045+ always return coordinates as inter-residue.
1046+ :param try_longest_compatible: `True` if should try longest compatible remaining
1047+ if mane transcript(s) not compatible. `False` otherwise.
1048+ :return: If successful, return MANE data or longest compatible remaining (if
1049+ `try_longest_compatible` set to `True`) protein representation. Will return
1050+ inter-residue coordinates.
1051+ """
1052+ # Step 1: Get MANE data to map to
1053+ if gene :
1054+ mane_data = self .mane_transcript_mappings .get_gene_mane_data (gene )
1055+ else :
1056+ mane_data = self .mane_transcript_mappings .get_mane_data_from_chr_pos (
1057+ alt_ac , start_pos , end_pos
1058+ )
1059+
1060+ if not mane_data and not try_longest_compatible :
1061+ return None
1062+
1063+ # Step 2: Get inter-residue position
1064+ inter_residue_pos , _ = get_inter_residue_pos (
1065+ start_pos , residue_mode , end_pos = end_pos
1066+ )
1067+ if not inter_residue_pos :
1068+ return None
1069+ start_pos , end_pos = inter_residue_pos
1070+ residue_mode = ResidueMode .INTER_RESIDUE
1071+
1072+ # Step 3: Try getting MANE protein representation
1073+ mane_transcripts = set () # Used if getting longest compatible remaining
1074+ for current_mane_data in mane_data :
1075+ mane_c_ac = current_mane_data ["RefSeq_nuc" ]
1076+ mane_transcripts |= set ((mane_c_ac , current_mane_data ["Ensembl_nuc" ]))
1077+
1078+ # GRCh38 -> MANE C
1079+ mane_tx_genomic_data = await self .uta_db .get_mane_c_genomic_data (
1080+ mane_c_ac , None , start_pos , end_pos
1081+ )
1082+ if not mane_tx_genomic_data :
1083+ continue
1084+
1085+ # Get MANE C positions
1086+ coding_start_site = mane_tx_genomic_data ["coding_start_site" ]
1087+ mane_c_pos_change = self .get_mane_c_pos_change (
1088+ mane_tx_genomic_data , coding_start_site
1089+ )
1090+
1091+ # Validate MANE C positions
1092+ if not self ._validate_index (
1093+ mane_c_ac , mane_c_pos_change , coding_start_site
1094+ ):
1095+ logger .warning (
1096+ f"{ mane_c_pos_change } are not valid positions on { mane_c_ac } with "
1097+ f"coding start site { coding_start_site } "
1098+ )
1099+ continue
1100+
1101+ # MANE C -> MANE P
1102+ return self ._get_mane_p (current_mane_data , mane_c_pos_change )
1103+
1104+ if try_longest_compatible :
1105+ return await self .get_longest_compatible_transcript (
1106+ start_pos ,
1107+ end_pos ,
1108+ AnnotationLayer .GENOMIC ,
1109+ residue_mode = residue_mode ,
1110+ alt_ac = alt_ac ,
1111+ end_annotation_layer = AnnotationLayer .PROTEIN ,
1112+ mane_transcripts = mane_transcripts ,
1113+ )
1114+ else :
1115+ return None
0 commit comments