Skip to content

Commit 3a7c914

Browse files
committed
Add support for providing assembly
1 parent 9bc2c31 commit 3a7c914

File tree

2 files changed

+56
-100
lines changed

2 files changed

+56
-100
lines changed

src/cool_seq_tool/mappers/exon_genomic_coords.py

Lines changed: 36 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,7 @@ async def genomic_to_tx_segment(
413413
transcript: str | None = None,
414414
gene: str | None = None,
415415
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
416+
assembly: Assembly = Assembly.GRCH38,
416417
) -> GenomicTxSegService:
417418
"""Get transcript segment data for genomic data, lifted over to GRCh38.
418419
@@ -452,6 +453,8 @@ async def genomic_to_tx_segment(
452453
value is provided.
453454
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
454455
``seg_end_genomic``. Expects inter-residue coordinates by default
456+
:param assembly: The assembly that the supplied coordinate comes from. Set to
457+
GRCh38 be default
455458
:return: Genomic data (inter-residue coordinates)
456459
"""
457460
errors = []
@@ -477,6 +480,7 @@ async def genomic_to_tx_segment(
477480
gene=gene,
478481
is_seg_start=True,
479482
coordinate_type=coordinate_type,
483+
assembly=assembly,
480484
)
481485
if start_tx_seg_data.errors:
482486
return _return_service_errors(start_tx_seg_data.errors)
@@ -497,6 +501,7 @@ async def genomic_to_tx_segment(
497501
gene=gene,
498502
is_seg_start=False,
499503
coordinate_type=coordinate_type,
504+
assembly=assembly,
500505
)
501506
if end_tx_seg_data.errors:
502507
return _return_service_errors(end_tx_seg_data.errors)
@@ -727,6 +732,7 @@ async def _genomic_to_tx_segment(
727732
gene: str | None = None,
728733
is_seg_start: bool = True,
729734
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
735+
assembly: Assembly = Assembly.GRCH38,
730736
) -> GenomicTxSeg:
731737
"""Given genomic data, generate a boundary for a transcript segment.
732738
@@ -749,6 +755,8 @@ async def _genomic_to_tx_segment(
749755
``False`` if ``genomic_pos`` is where the transcript segment ends.
750756
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
751757
``seg_end_genomic``. Expects inter-residue coordinates by default
758+
:param assembly: The assembly that the supplied coordinate comes from. Set to
759+
GRCh38 be default
752760
:return: Data for a transcript segment boundary (inter-residue coordinates)
753761
"""
754762
params = {key: None for key in GenomicTxSeg.model_fields}
@@ -771,6 +779,9 @@ async def _genomic_to_tx_segment(
771779
genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac)
772780
if not genomic_ac_validation:
773781
return GenomicTxSeg(errors=[f"{genomic_ac} does not exist in UTA"])
782+
if assembly == Assembly.GRCH37:
783+
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
784+
genomic_ac = grch38_ac[0]
774785
else:
775786
genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
776787

@@ -780,13 +791,15 @@ async def _genomic_to_tx_segment(
780791
)
781792
genomic_ac = genomic_acs[0]
782793

783-
# Always liftover to GRCh38
784-
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
785-
genomic_ac,
786-
genomic_pos,
787-
)
788-
if err_msg:
789-
return GenomicTxSeg(errors=[err_msg])
794+
# Liftover to GRCh38 if the provided assembly is GRCh37
795+
if assembly == Assembly.GRCH37:
796+
genomic_pos = await self._get_grch38_pos(genomic_pos, genomic_ac)
797+
if not genomic_pos:
798+
return GenomicTxSeg(
799+
errors=[
800+
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful."
801+
]
802+
)
790803

791804
# Select a transcript if not provided
792805
if not transcript:
@@ -896,59 +909,23 @@ async def _genomic_to_tx_segment(
896909
tx_ac=transcript,
897910
)
898911

899-
async def _get_grch38_ac_pos(
900-
self,
901-
genomic_ac: str,
902-
genomic_pos: int,
903-
grch38_ac: str | None = None,
904-
) -> tuple[str | None, int | None, str | None]:
905-
"""Get GRCh38 genomic representation for accession and position
906-
907-
:param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
908-
:param genomic_pos: Genomic position on ``genomic_ac``
909-
:param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
910-
provided, will attempt to retrieve associated GRCh38 accession from UTA.
911-
:return: Tuple containing GRCh38 accession, GRCh38 position, and error message
912-
if unable to get GRCh38 representation
913-
"""
914-
# Validate accession exists
915-
if not grch38_ac:
916-
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
917-
if not grch38_ac:
918-
return None, None, f"Unrecognized genomic accession: {genomic_ac}."
919-
920-
grch38_ac = grch38_ac[0]
921-
922-
if grch38_ac != genomic_ac:
923-
# Ensure genomic_ac is GRCh37
924-
chromosome, _ = self.seqrepo_access.translate_identifier(
925-
genomic_ac, Assembly.GRCH37.value
926-
)
927-
if not chromosome:
928-
_logger.warning(
929-
"SeqRepo could not find associated %s assembly for genomic accession %s.",
930-
Assembly.GRCH37.value,
931-
genomic_ac,
932-
)
933-
return (
934-
None,
935-
None,
936-
f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
937-
)
938-
chromosome = chromosome[-1].split(":")[-1]
939-
liftover_data = self.liftover.get_liftover(
940-
chromosome, genomic_pos, Assembly.GRCH38
941-
)
942-
if liftover_data is None:
943-
return (
944-
None,
945-
None,
946-
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
947-
)
948-
genomic_pos = liftover_data[1]
949-
genomic_ac = grch38_ac
912+
async def _get_grch38_pos(self, genomic_pos: int, genomic_ac: str) -> int | None:
913+
"""Liftover a GRCh37 coordinate to GRCh38
950914
951-
return genomic_ac, genomic_pos, None
915+
:param genomic_pos: A genomic coordinate in GRCh37
916+
:param genomic_ac: The genomic accession in GRCh38
917+
:return The genomic coordinate in GRCh38
918+
"""
919+
chromosome, _ = self.seqrepo_access.translate_identifier(
920+
genomic_ac, target_namespaces=Assembly.GRCH38.value
921+
)
922+
chromosome = chromosome[-1].split(":")[-1]
923+
liftover_data = self.liftover.get_liftover(
924+
chromosome, genomic_pos, Assembly.GRCH38
925+
)
926+
if liftover_data is None:
927+
return None
928+
return liftover_data[1]
952929

953930
async def _validate_gene_coordinates(
954931
self,

tests/mappers/test_exon_genomic_coords.py

Lines changed: 20 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
_ExonCoord,
1111
)
1212
from cool_seq_tool.schemas import (
13+
Assembly,
1314
CoordinateType,
1415
Strand,
1516
)
@@ -704,48 +705,15 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True):
704705

705706

706707
@pytest.mark.asyncio()
707-
async def test_get_grch38_ac_pos(test_egc_mapper):
708-
"""Test that _get_grch38_ac_pos works correctly"""
709-
grch38_ac = "NC_000001.11"
710-
grch38_pos = 154192135
711-
expected = grch38_ac, grch38_pos, None
712-
713-
# GRCh37 provided
714-
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.10", 154164611)
715-
assert grch38_data == expected
716-
717-
# GRCh38 provided, no grch38_ac
718-
grch38_data = await test_egc_mapper._get_grch38_ac_pos(grch38_ac, grch38_pos)
719-
assert grch38_data == expected
720-
721-
# GRCh38 and grch38_ac provided
722-
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
723-
grch38_ac, grch38_pos, grch38_ac=grch38_ac
724-
)
725-
assert grch38_data == expected
726-
727-
# Unrecognized accession
728-
invalid_ac = "NC_0000026.10"
729-
grch38_data = await test_egc_mapper._get_grch38_ac_pos(invalid_ac, 154164611)
730-
assert grch38_data == (None, None, f"Unrecognized genomic accession: {invalid_ac}.")
731-
732-
# GRCh36 used
733-
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.9", 154164611)
734-
assert grch38_data == (
735-
None,
736-
None,
737-
"`genomic_ac` must use GRCh37 or GRCh38 assembly.",
738-
)
708+
async def test_get_grch38_pos(test_egc_mapper):
709+
"""Test that get_grch38_pos works correctly"""
710+
genomic_pos = await test_egc_mapper._get_grch38_pos(9609996, "NC_000011.10")
711+
assert genomic_pos == 9588449
739712

740-
# Unsuccessful liftover
741-
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
742-
"NC_000001.10", 9999999999999999999
743-
)
744-
assert grch38_data == (
745-
None,
746-
None,
747-
"Lifting over 9999999999999999999 on NC_000001.10 from GRCh37 to GRCh38 was unsuccessful.",
713+
genomic_pos = await test_egc_mapper._get_grch38_pos(
714+
9609996999999999, "NC_000011.10"
748715
)
716+
assert genomic_pos is None
749717

750718

751719
@pytest.mark.asyncio()
@@ -1268,6 +1236,7 @@ async def test_braf(test_egc_mapper, mane_braf):
12681236
"seg_start_genomic": 140501359, # GRCh38 coords: 140801559
12691237
"seg_end_genomic": 140453136, # GRCh38 coords: 140753336
12701238
"gene": "BRAF",
1239+
"assembly": Assembly.GRCH37.value,
12711240
}
12721241
# MANE
12731242
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
@@ -1291,6 +1260,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
12911260
"seg_start_genomic": 9597639,
12921261
"seg_end_genomic": 9609996,
12931262
"transcript": "NM_003390.3",
1263+
"assembly": Assembly.GRCH37.value,
12941264
}
12951265
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
12961266
genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11)
@@ -1307,6 +1277,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
13071277
"seg_end_genomic": 9609996,
13081278
"transcript": "NM_003390.3",
13091279
"gene": "WEE1",
1280+
"assembly": Assembly.GRCH37.value,
13101281
}
13111282
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
13121283
genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11)
@@ -1322,6 +1293,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
13221293
"seg_start_genomic": 9597639, # GRCh38 coords: 9576092
13231294
"seg_end_genomic": 9609996, # GRCh38 coords: 9588449
13241295
"gene": "WEE1",
1296+
"assembly": Assembly.GRCH37.value,
13251297
}
13261298
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
13271299
genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11)
@@ -1433,11 +1405,17 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
14331405
"gene": "WEE1",
14341406
"genomic_ac": "NC_000011.9",
14351407
"seg_end_genomic": 9609996,
1408+
"assembly": Assembly.GRCH37.value,
14361409
}
14371410
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
14381411
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))
14391412

1440-
inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9588449}
1413+
inputs = {
1414+
"gene": "WEE1",
1415+
"chromosome": "11",
1416+
"seg_end_genomic": 9609996,
1417+
"assembly": Assembly.GRCH37.value,
1418+
}
14411419
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
14421420
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))
14431421

@@ -1471,6 +1449,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
14711449
seg_start_genomic=73442503,
14721450
seg_end_genomic=73457929, # not on an exon
14731451
gene="ELN",
1452+
assembly=Assembly.GRCH37.value,
14741453
)
14751454
genomic_tx_seg_service_checks(resp, eln_grch38_intronic)
14761455

0 commit comments

Comments
 (0)