Skip to content

Commit af67d8e

Browse files
authored
feat!: create method for GRCh38 to MANE protein representation (#211)
* `MANETranscript` updates: * Add method `grch38_to_mane_p` * Given GRCh38 representation, will return MANE protein representation. Also allows for longest compatible remaining representation to be returned if no compatible MANE representation found. * Updates `get_longest_compatible_transcript` * `gene` is now optional * Adds `end_annotation_layer` as a parameter. This will allow you to either return protein or cdna representation when set. * `MANETranscriptMappings` updates: * `get_gene_mane_data` always returns list. If no MANE data found, will return empty list (previously returned None) * `UTADatabase` updates: * `get_transcripts_from_gene` renamed to `get_transcripts` * `gene` is now an optional parameter
1 parent a74262a commit af67d8e

File tree

6 files changed

+313
-52
lines changed

6 files changed

+313
-52
lines changed

cool_seq_tool/mappers/mane_transcript.py

Lines changed: 125 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -254,14 +254,16 @@ def _get_mane_p(mane_data: Dict, mane_c_pos_range: Tuple[int, int]) -> Dict:
254254
:return: MANE transcripts accessions and position change on
255255
p. coordinate
256256
"""
257+
start = mane_c_pos_range[0] / 3
258+
end = mane_c_pos_range[1] / 3
259+
start = math.floor(start) if start == end else math.ceil(start)
260+
end = math.floor(end)
261+
257262
return dict(
258263
gene=mane_data["symbol"],
259264
refseq=mane_data["RefSeq_prot"],
260265
ensembl=mane_data["Ensembl_prot"],
261-
pos=(
262-
math.ceil(mane_c_pos_range[0] / 3),
263-
math.floor(mane_c_pos_range[1] / 3),
264-
),
266+
pos=(start, end),
265267
strand=mane_data["chr_strand"],
266268
status=TranscriptPriority(
267269
"_".join(mane_data["MANE_status"].split()).lower()
@@ -507,27 +509,35 @@ def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> List:
507509

508510
async def get_longest_compatible_transcript(
509511
self,
510-
gene: str,
511512
start_pos: int,
512513
end_pos: int,
513514
start_annotation_layer: AnnotationLayer,
515+
gene: Optional[str] = None,
514516
ref: Optional[str] = None,
515517
residue_mode: ResidueMode = ResidueMode.RESIDUE,
516518
mane_transcripts: Optional[Set] = None,
517519
alt_ac: Optional[str] = None,
520+
end_annotation_layer: Optional[
521+
Union[AnnotationLayer.PROTEIN, AnnotationLayer.CDNA]
522+
] = None,
518523
) -> Optional[Dict]:
519524
"""Get longest compatible transcript from a gene.
520525
Try GRCh38 first, then GRCh37.
521526
Transcript is compatible if it passes validation checks.
522527
523-
:param gene: Gene symbol
524528
:param start_pos: Start position change
525529
:param end_pos: End position change
526-
:param start_annotation_layer: Starting annotation layer.
530+
:param start_annotation_layer: Starting annotation layer
531+
:param gene: HGNC gene symbol
527532
:param ref: Reference at position given during input
528533
:param residue_mode: Residue mode for `start_pos` and `end_pos`
529534
:param mane_transcripts: Attempted mane transcripts that were not compatible
530535
:param alt_ac: Genomic accession
536+
:param end_annotation_layer: The end annotation layer. If not provided, will be
537+
set to the following
538+
`AnnotationLayer.PROTEIN` if
539+
`start_annotation_layer == AnnotationLayer.PROTEIN`
540+
`AnnotationLayer.CDNA` otherwise
531541
:return: Data for longest compatible transcript
532542
"""
533543
inter_residue_pos, _ = get_inter_residue_pos(
@@ -548,13 +558,14 @@ async def get_longest_compatible_transcript(
548558

549559
# Data Frame that contains transcripts associated to a gene
550560
if is_p_or_c_start_anno:
551-
df = await self.uta_db.get_transcripts_from_gene(
552-
gene, c_start_pos, c_end_pos, use_tx_pos=True, alt_ac=alt_ac
561+
df = await self.uta_db.get_transcripts(
562+
c_start_pos, c_end_pos, gene=gene, use_tx_pos=True, alt_ac=alt_ac
553563
)
554564
else:
555-
df = await self.uta_db.get_transcripts_from_gene(
556-
gene, start_pos, end_pos, use_tx_pos=False, alt_ac=alt_ac
565+
df = await self.uta_db.get_transcripts(
566+
start_pos, end_pos, gene=gene, use_tx_pos=False, alt_ac=alt_ac
557567
)
568+
558569
if df.is_empty():
559570
logger.warning(f"Unable to get transcripts from gene {gene}")
560571
return None
@@ -651,7 +662,13 @@ async def get_longest_compatible_transcript(
651662
if not valid_references:
652663
continue
653664

654-
if start_annotation_layer == AnnotationLayer.PROTEIN:
665+
if not end_annotation_layer:
666+
if start_annotation_layer == AnnotationLayer.PROTEIN:
667+
end_annotation_layer = AnnotationLayer.PROTEIN
668+
else:
669+
end_annotation_layer = AnnotationLayer.CDNA
670+
671+
if end_annotation_layer == AnnotationLayer.PROTEIN:
655672
pos = (
656673
math.ceil(lcr_c_data["pos"][0] / 3),
657674
math.floor(lcr_c_data["pos"][1] / 3),
@@ -699,7 +716,7 @@ async def get_mane_transcript(
699716
:param start_annotation_layer: Starting annotation layer.
700717
:param end_pos: End position change. If `None` assumes both `start_pos` and
701718
`end_pos` have same values.
702-
:param gene: Gene symbol
719+
:param gene: HGNC gene symbol
703720
:param ref: Reference at position given during input
704721
:param try_longest_compatible: `True` if should try longest compatible remaining
705722
if mane transcript was not compatible. `False` otherwise.
@@ -793,21 +810,21 @@ async def get_mane_transcript(
793810
if try_longest_compatible:
794811
if start_annotation_layer == AnnotationLayer.PROTEIN:
795812
return await self.get_longest_compatible_transcript(
796-
g["gene"],
797813
start_pos,
798814
end_pos,
799815
AnnotationLayer.PROTEIN,
800-
ref,
816+
ref=ref,
817+
gene=g["gene"],
801818
residue_mode=residue_mode,
802819
mane_transcripts=mane_transcripts,
803820
)
804821
else:
805822
return await self.get_longest_compatible_transcript(
806-
g["gene"],
807823
c_pos[0],
808824
c_pos[1],
809825
AnnotationLayer.CDNA,
810-
ref,
826+
ref=ref,
827+
gene=g["gene"],
811828
residue_mode=residue_mode,
812829
mane_transcripts=mane_transcripts,
813830
)
@@ -1005,3 +1022,94 @@ async def g_to_mane_c(
10051022
ensembl_c_ac=current_mane_data["Ensembl_nuc"],
10061023
alt_ac=grch38["ac"] if grch38 else None,
10071024
)
1025+
1026+
async def grch38_to_mane_p(
1027+
self,
1028+
alt_ac: str,
1029+
start_pos: int,
1030+
end_pos: int,
1031+
gene: Optional[str] = None,
1032+
residue_mode: ResidueMode = ResidueMode.RESIDUE,
1033+
try_longest_compatible: bool = False,
1034+
) -> Optional[Dict]:
1035+
"""Given GRCh38 genomic representation, return protein representation.
1036+
Will try MANE Select and then MANE Plus Clinical. If neither is found and
1037+
`try_longest_compatible` is set to `true`, will also try to find the longest
1038+
compatible remaining representation.
1039+
1040+
:param alt_ac: Genomic RefSeq accession on GRCh38
1041+
:param start_pos: Start position
1042+
:param end_pos: End position
1043+
:param gene: HGNC gene symbol
1044+
:param residue_mode: Starting residue mode for `start_pos` and `end_pos`. Will
1045+
always return coordinates as inter-residue.
1046+
:param try_longest_compatible: `True` if should try longest compatible remaining
1047+
if mane transcript(s) not compatible. `False` otherwise.
1048+
:return: If successful, return MANE data or longest compatible remaining (if
1049+
`try_longest_compatible` set to `True`) protein representation. Will return
1050+
inter-residue coordinates.
1051+
"""
1052+
# Step 1: Get MANE data to map to
1053+
if gene:
1054+
mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
1055+
else:
1056+
mane_data = self.mane_transcript_mappings.get_mane_data_from_chr_pos(
1057+
alt_ac, start_pos, end_pos
1058+
)
1059+
1060+
if not mane_data and not try_longest_compatible:
1061+
return None
1062+
1063+
# Step 2: Get inter-residue position
1064+
inter_residue_pos, _ = get_inter_residue_pos(
1065+
start_pos, residue_mode, end_pos=end_pos
1066+
)
1067+
if not inter_residue_pos:
1068+
return None
1069+
start_pos, end_pos = inter_residue_pos
1070+
residue_mode = ResidueMode.INTER_RESIDUE
1071+
1072+
# Step 3: Try getting MANE protein representation
1073+
mane_transcripts = set() # Used if getting longest compatible remaining
1074+
for current_mane_data in mane_data:
1075+
mane_c_ac = current_mane_data["RefSeq_nuc"]
1076+
mane_transcripts |= set((mane_c_ac, current_mane_data["Ensembl_nuc"]))
1077+
1078+
# GRCh38 -> MANE C
1079+
mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data(
1080+
mane_c_ac, None, start_pos, end_pos
1081+
)
1082+
if not mane_tx_genomic_data:
1083+
continue
1084+
1085+
# Get MANE C positions
1086+
coding_start_site = mane_tx_genomic_data["coding_start_site"]
1087+
mane_c_pos_change = self.get_mane_c_pos_change(
1088+
mane_tx_genomic_data, coding_start_site
1089+
)
1090+
1091+
# Validate MANE C positions
1092+
if not self._validate_index(
1093+
mane_c_ac, mane_c_pos_change, coding_start_site
1094+
):
1095+
logger.warning(
1096+
f"{mane_c_pos_change} are not valid positions on {mane_c_ac} with "
1097+
f"coding start site {coding_start_site}"
1098+
)
1099+
continue
1100+
1101+
# MANE C -> MANE P
1102+
return self._get_mane_p(current_mane_data, mane_c_pos_change)
1103+
1104+
if try_longest_compatible:
1105+
return await self.get_longest_compatible_transcript(
1106+
start_pos,
1107+
end_pos,
1108+
AnnotationLayer.GENOMIC,
1109+
residue_mode=residue_mode,
1110+
alt_ac=alt_ac,
1111+
end_annotation_layer=AnnotationLayer.PROTEIN,
1112+
mane_transcripts=mane_transcripts,
1113+
)
1114+
else:
1115+
return None

cool_seq_tool/sources/mane_transcript_mappings.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""The module for loading MANE Transcript mappings to genes."""
22
import logging
33
from pathlib import Path
4-
from typing import Dict, List, Optional
4+
from typing import Dict, List
55

66
import polars as pl
77

@@ -26,7 +26,7 @@ def _load_mane_transcript_data(self) -> pl.DataFrame:
2626
"""
2727
return pl.read_csv(self.mane_data_path, separator="\t")
2828

29-
def get_gene_mane_data(self, gene_symbol: str) -> Optional[List[Dict]]:
29+
def get_gene_mane_data(self, gene_symbol: str) -> List[Dict]:
3030
"""Return MANE Transcript data for a gene.
3131
:param str gene_symbol: HGNC Gene Symbol
3232
:return: List of MANE Transcript data (Transcript accessions,
@@ -39,7 +39,7 @@ def get_gene_mane_data(self, gene_symbol: str) -> Optional[List[Dict]]:
3939
logger.warning(
4040
f"Unable to get MANE Transcript data for gene: " f"{gene_symbol}"
4141
)
42-
return None
42+
return []
4343

4444
data = data.sort(by="MANE_status", descending=True)
4545
return data.to_dicts()

cool_seq_tool/sources/uta_database.py

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -863,30 +863,36 @@ async def get_gene_from_ac(
863863

864864
return [r[0] for r in results]
865865

866-
async def get_transcripts_from_gene(
866+
async def get_transcripts(
867867
self,
868-
gene: str,
869868
start_pos: Optional[int] = None,
870869
end_pos: Optional[int] = None,
870+
gene: Optional[str] = None,
871871
use_tx_pos: bool = True,
872872
alt_ac: Optional[str] = None,
873873
) -> pl.DataFrame:
874-
"""Get transcripts associated to a gene.
874+
"""Get transcripts for a given `gene` or `alt_ac` related to optional positions.
875875
876-
:param gene: HGNC gene symbol
877-
:param start_pos: Start position change.
878-
If not provided and `end_pos` not provided, all transcripts associated with the gene and/or accession will be returned.
876+
:param start_pos: Start position change
877+
If not provided and `end_pos` not provided, all transcripts associated with
878+
the gene and/or accession will be returned
879879
:param end_pos: End position change
880-
If not provided and `start_pos` not provided, all transcripts associated with the gene and/or accession will be returned.
880+
If not provided and `start_pos` not provided, all transcripts associated
881+
with the gene and/or accession will be returned
882+
:param gene: HGNC gene symbol
881883
:param use_tx_pos: `True` if querying on transcript position. This means
882-
`start_pos` and `end_pos` are c. coordinate positions. `False` if querying on
883-
genomic position. This means `start_pos` and `end_pos` are g. coordinate
884+
`start_pos` and `end_pos` are c. coordinate positions. `False` if querying
885+
on genomic position. This means `start_pos` and `end_pos` are g. coordinate
884886
positions
885887
:param alt_ac: Genomic accession. If not provided, must provide `gene`
886-
:return: Data Frame containing transcripts associated with a gene. Transcripts
887-
are ordered by most recent NC accession, then by descending transcript
888-
length.
888+
:return: Data Frame containing transcripts associated with a gene.
889+
Transcripts are ordered by most recent NC accession, then by
890+
descending transcript length
889891
"""
892+
schema = ["pro_ac", "tx_ac", "alt_ac", "cds_start_i"]
893+
if not gene and not alt_ac:
894+
return pl.DataFrame([], schema=schema)
895+
890896
pos_cond = ""
891897
if start_pos is not None and end_pos is not None:
892898
if use_tx_pos:
@@ -915,24 +921,24 @@ async def get_transcripts_from_gene(
915921
else:
916922
alt_ac_cond = "AND ALIGN.alt_ac LIKE 'NC_00%'"
917923

924+
gene_cond = f"AND T.hgnc = '{gene}'" if gene else ""
925+
918926
query = f"""
919927
SELECT AA.pro_ac, AA.tx_ac, ALIGN.alt_ac, T.cds_start_i
920928
FROM {self.schema}.associated_accessions as AA
921929
JOIN {self.schema}.transcript as T ON T.ac = AA.tx_ac
922930
JOIN {self.schema}.tx_exon_aln_v as ALIGN ON T.ac = ALIGN.tx_ac
923-
WHERE T.hgnc = '{gene}'
931+
WHERE ALIGN.alt_aln_method = 'splign'
932+
{gene_cond}
924933
{alt_ac_cond}
925-
AND ALIGN.alt_aln_method = 'splign'
926934
{pos_cond}
927935
{order_by_cond}
928936
"""
929937
results = await self.execute_query(query)
930938
results = [
931939
(r["pro_ac"], r["tx_ac"], r["alt_ac"], r["cds_start_i"]) for r in results
932940
]
933-
return pl.DataFrame(
934-
results, schema=["pro_ac", "tx_ac", "alt_ac", "cds_start_i"]
935-
).unique()
941+
return pl.DataFrame(results, schema=schema).unique()
936942

937943
async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]:
938944
"""Get chromosome and assembly for NC accession if not in GRCh38.

0 commit comments

Comments
 (0)