Skip to content

Commit 2f36fdb

Browse files
committed
Add work to support residue coords + refactor offset calculation
1 parent c73e4de commit 2f36fdb

File tree

2 files changed

+196
-55
lines changed

2 files changed

+196
-55
lines changed

src/cool_seq_tool/mappers/exon_genomic_coords.py

Lines changed: 66 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from cool_seq_tool.schemas import (
1111
Assembly,
1212
BaseModelForbidExtra,
13+
CoordinateType,
1314
ServiceMeta,
1415
Strand,
1516
)
@@ -412,6 +413,7 @@ async def genomic_to_tx_segment(
412413
transcript: str | None = None,
413414
get_nearest_transcript_junction: bool = False,
414415
gene: str | None = None,
416+
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
415417
) -> GenomicTxSegService:
416418
"""Get transcript segment data for genomic data, lifted over to GRCh38.
417419
@@ -485,6 +487,7 @@ async def genomic_to_tx_segment(
485487
gene=gene,
486488
get_nearest_transcript_junction=get_nearest_transcript_junction,
487489
is_seg_start=True,
490+
coordinate_type=coordinate_type,
488491
)
489492
if start_tx_seg_data.errors:
490493
return _return_service_errors(start_tx_seg_data.errors)
@@ -505,6 +508,7 @@ async def genomic_to_tx_segment(
505508
gene=gene,
506509
get_nearest_transcript_junction=get_nearest_transcript_junction,
507510
is_seg_start=False,
511+
coordinate_type=coordinate_type,
508512
)
509513
if end_tx_seg_data.errors:
510514
return _return_service_errors(end_tx_seg_data.errors)
@@ -735,6 +739,7 @@ async def _genomic_to_tx_segment(
735739
gene: str | None = None,
736740
get_nearest_transcript_junction: bool = False,
737741
is_seg_start: bool = True,
742+
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
738743
) -> GenomicTxSeg:
739744
"""Given genomic data, generate a boundary for a transcript segment.
740745
@@ -763,6 +768,8 @@ async def _genomic_to_tx_segment(
763768
preceding the breakpoint for the 3' end.
764769
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
765770
``False`` if ``genomic_pos`` is where the transcript segment ends.
771+
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
772+
``seg_end_genomic``
766773
:return: Data for a transcript segment boundary (inter-residue coordinates)
767774
"""
768775
params = {key: None for key in GenomicTxSeg.model_fields}
@@ -835,6 +842,13 @@ async def _genomic_to_tx_segment(
835842

836843
strand = Strand(tx_exons[0].alt_strand)
837844
params["strand"] = strand
845+
use_alt_start_i = self._use_alt_start_i(
846+
is_seg_start=is_seg_start, strand=strand
847+
)
848+
if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
849+
genomic_pos = (
850+
genomic_pos - 1
851+
) # Convert residue coordinate to inter-residue
838852

839853
# Check if breakpoint occurs on an exon.
840854
# If not, determine the adjacent exon given the selected transcript
@@ -847,15 +861,11 @@ async def _genomic_to_tx_segment(
847861
)
848862

849863
offset = self._get_exon_offset(
850-
start_i=tx_exons[exon_num].alt_start_i,
851-
end_i=tx_exons[exon_num].alt_end_i,
864+
genomic_pos=genomic_pos,
865+
exon_boundary=tx_exons[exon_num].alt_start_i
866+
if use_alt_start_i
867+
else tx_exons[exon_num].alt_end_i,
852868
strand=strand,
853-
use_start_i=strand == Strand.POSITIVE
854-
if is_seg_start
855-
else strand != Strand.POSITIVE,
856-
is_in_exon=False,
857-
start=genomic_pos if is_seg_start else None,
858-
end=genomic_pos if not is_seg_start else None,
859869
)
860870

861871
genomic_location, err_msg = self._get_vrs_seq_loc(
@@ -931,7 +941,12 @@ async def _genomic_to_tx_segment(
931941
)
932942

933943
return await self._get_tx_seg_genomic_metadata(
934-
genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
944+
genomic_ac,
945+
genomic_pos,
946+
is_seg_start,
947+
gene,
948+
tx_ac=transcript,
949+
coordinate_type=coordinate_type,
935950
)
936951

937952
async def _get_grch38_ac_pos(
@@ -1049,6 +1064,7 @@ async def _get_tx_seg_genomic_metadata(
10491064
is_seg_start: bool,
10501065
gene: str,
10511066
tx_ac: str | None,
1067+
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
10521068
) -> GenomicTxSeg:
10531069
"""Get transcript segment data and associated genomic metadata.
10541070
@@ -1110,15 +1126,18 @@ async def _get_tx_seg_genomic_metadata(
11101126
)
11111127

11121128
tx_exon_aln_data = tx_exon_aln_data[0]
1113-
1129+
strand = Strand(tx_exon_aln_data.alt_strand)
1130+
use_alt_start_i = self._use_alt_start_i(
1131+
is_seg_start=is_seg_start, strand=strand
1132+
)
1133+
if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
1134+
genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue
11141135
offset = self._get_exon_offset(
1115-
start_i=tx_exon_aln_data.alt_start_i,
1116-
end_i=tx_exon_aln_data.alt_end_i,
1117-
strand=Strand(tx_exon_aln_data.alt_strand),
1118-
use_start_i=False, # This doesn't impact anything since we're on the exon
1119-
is_in_exon=True,
1120-
start=genomic_pos if is_seg_start else None,
1121-
end=genomic_pos if not is_seg_start else None,
1136+
genomic_pos=genomic_pos,
1137+
exon_boundary=tx_exon_aln_data.alt_start_i
1138+
if use_alt_start_i
1139+
else tx_exon_aln_data.alt_end_i,
1140+
strand=strand,
11221141
)
11231142

11241143
genomic_location, err_msg = self._get_vrs_seq_loc(
@@ -1150,6 +1169,24 @@ def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool
11501169
exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
11511170
)
11521171

1172+
@staticmethod
1173+
def _use_alt_start_i(is_seg_start: bool, strand: Strand) -> bool:
1174+
"""Determine whether to use alt_start_i or alt_end_i from UTA when computing
1175+
exon offset
1176+
1177+
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
1178+
``False`` if ``genomic_pos`` is where the transcript segment ends.
1179+
:param strand: The transcribed strand
1180+
:return ``True`` if alt_start_i should be used, ``False`` if alt_end_i should
1181+
be used
1182+
"""
1183+
return bool(
1184+
is_seg_start
1185+
and strand == Strand.POSITIVE
1186+
or not is_seg_start
1187+
and strand == Strand.NEGATIVE
1188+
)
1189+
11531190
@staticmethod
11541191
def _get_adjacent_exon(
11551192
tx_exons_genomic_coords: list[_ExonCoord],
@@ -1205,38 +1242,21 @@ def _get_adjacent_exon(
12051242

12061243
@staticmethod
12071244
def _get_exon_offset(
1208-
start_i: int,
1209-
end_i: int,
1245+
genomic_pos: int,
1246+
exon_boundary: int,
12101247
strand: Strand,
1211-
use_start_i: bool = True,
1212-
is_in_exon: bool = True,
1213-
start: int | None = None,
1214-
end: int | None = None,
12151248
) -> int:
12161249
"""Compute offset from exon start or end index
12171250
1218-
:param start_i: Exon start index (inter-residue)
1219-
:param end_i: Exon end index (inter-residue)
1220-
:param strand: Strand
1221-
:param use_start_i: Whether or not ``start_i`` should be used to compute the
1222-
offset, defaults to ``True``. This is only used when ``is_in_exon`` is
1223-
``False``.
1224-
:param is_in_exon: Whether or not the position occurs in an exon, defaults to
1225-
``True``
1226-
:param start: Provided start position, defaults to ``None``. Must provide
1227-
``start`` or ``end``, not both.
1228-
:param end: Provided end position, defaults to ``None``. Must provide ``start``
1229-
or ``end``, not both
1251+
:param genomic_pos: The supplied genomic position. This can represent, for
1252+
example, a fusion junction breakpoint
1253+
:param exon_boundary: The genomic position for the exon boundary that the offset
1254+
is being computed against
1255+
:paran strand: The transcribed strand
12301256
:return: Offset from exon start or end index
12311257
"""
1232-
if is_in_exon:
1233-
if start is not None:
1234-
offset = start - start_i if strand == Strand.POSITIVE else end_i - start
1235-
else:
1236-
offset = end - end_i if strand == Strand.POSITIVE else start_i - end
1237-
else:
1238-
if strand == Strand.POSITIVE:
1239-
offset = start - start_i if use_start_i else end - end_i
1240-
else:
1241-
offset = start_i - end if use_start_i else end_i - start
1242-
return offset
1258+
return (
1259+
genomic_pos - exon_boundary
1260+
if strand == Strand.POSITIVE
1261+
else (genomic_pos - exon_boundary) * -1
1262+
)

tests/mappers/test_exon_genomic_coords.py

Lines changed: 130 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
_ExonCoord,
1111
)
1212
from cool_seq_tool.schemas import (
13+
CoordinateType,
1314
Strand,
1415
)
1516

@@ -880,6 +881,21 @@ def test_is_exonic_breakpoint(test_egc_mapper, nm_001105539_exons_genomic_coords
880881
assert resp is True # Breakpoint does occur on an exon
881882

882883

884+
def test_use_alt_start_i(test_egc_mapper):
885+
"""Test when to use alt_start_i or alt_end_i from UTA"""
886+
resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.POSITIVE)
887+
assert resp
888+
889+
resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.NEGATIVE)
890+
assert resp
891+
892+
resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.NEGATIVE)
893+
assert not resp
894+
895+
resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.POSITIVE)
896+
assert not resp
897+
898+
883899
@pytest.mark.asyncio()
884900
async def test_genomic_to_transcript_fusion_context(
885901
test_egc_mapper,
@@ -900,15 +916,6 @@ async def test_genomic_to_transcript_fusion_context(
900916
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
901917
genomic_tx_seg_service_checks(resp, zbtb10_exon3_end)
902918

903-
inputs = {
904-
"chromosome": "chr8",
905-
"seg_end_genomic": 80514010,
906-
"gene": "ZBTB10",
907-
"get_nearest_transcript_junction": True,
908-
}
909-
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
910-
genomic_tx_seg_service_checks(resp, zbtb10_exon3_end)
911-
912919
inputs = {
913920
"chromosome": "8",
914921
"seg_start_genomic": 80518580,
@@ -981,6 +988,67 @@ async def test_genomic_to_transcript_fusion_context(
981988
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
982989
genomic_tx_seg_service_checks(resp, gusbp3_exon5_start)
983990

991+
# Test with residue coordinates
992+
inputs = {
993+
"chromosome": "8",
994+
"seg_end_genomic": 80514010,
995+
"gene": "ZBTB10",
996+
"get_nearest_transcript_junction": True,
997+
"coordinate_type": CoordinateType.RESIDUE,
998+
}
999+
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
1000+
genomic_tx_seg_service_checks(resp, zbtb10_exon3_end)
1001+
1002+
inputs = {
1003+
"chromosome": "8",
1004+
"seg_start_genomic": 80518581,
1005+
"gene": "ZBTB10",
1006+
"get_nearest_transcript_junction": True,
1007+
"coordinate_type": CoordinateType.RESIDUE,
1008+
}
1009+
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
1010+
genomic_tx_seg_service_checks(resp, zbtb10_exon5_start)
1011+
1012+
inputs = {
1013+
"chromosome": "1",
1014+
"seg_end_genomic": 154171411,
1015+
"gene": "TPM3",
1016+
"get_nearest_transcript_junction": True,
1017+
"coordinate_type": CoordinateType.RESIDUE,
1018+
}
1019+
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
1020+
genomic_tx_seg_service_checks(resp, tpm3_exon6_end)
1021+
1022+
inputs = {
1023+
"chromosome": "1",
1024+
"seg_start_genomic": 154173080,
1025+
"gene": "TPM3",
1026+
"get_nearest_transcript_junction": True,
1027+
"coordinate_type": CoordinateType.RESIDUE,
1028+
}
1029+
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
1030+
genomic_tx_seg_service_checks(resp, tpm3_exon5_start)
1031+
1032+
inputs = {
1033+
"chromosome": "5",
1034+
"seg_end_genomic": 69680765,
1035+
"gene": "GUSBP3",
1036+
"get_nearest_transcript_junction": True,
1037+
"coordinate_type": CoordinateType.RESIDUE,
1038+
}
1039+
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
1040+
genomic_tx_seg_service_checks(resp, gusbp3_exon2_end)
1041+
1042+
inputs = {
1043+
"chromosome": "5",
1044+
"seg_start_genomic": 69645878,
1045+
"gene": "GUSBP3",
1046+
"get_nearest_transcript_junction": True,
1047+
"coordinate_type": CoordinateType.RESIDUE,
1048+
}
1049+
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
1050+
genomic_tx_seg_service_checks(resp, gusbp3_exon5_start)
1051+
9841052

9851053
@pytest.mark.asyncio()
9861054
async def test_get_alt_ac_start_and_end(
@@ -1070,6 +1138,59 @@ async def test_genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8):
10701138
)
10711139
genomic_tx_seg_checks(resp, tpm3_exon8)
10721140

1141+
# Test with residue coordinates
1142+
resp = await test_egc_mapper._genomic_to_tx_segment(
1143+
154192135,
1144+
genomic_ac="NC_000001.11",
1145+
transcript="NM_152263.3",
1146+
gene="TPM3",
1147+
coordinate_type=CoordinateType.RESIDUE,
1148+
)
1149+
genomic_tx_seg_checks(resp, tpm3_exon1)
1150+
1151+
resp = await test_egc_mapper._genomic_to_tx_segment(
1152+
154192135,
1153+
chromosome="1",
1154+
transcript="NM_152263.3",
1155+
coordinate_type=CoordinateType.RESIDUE,
1156+
)
1157+
genomic_tx_seg_checks(resp, tpm3_exon1)
1158+
1159+
resp = await test_egc_mapper._genomic_to_tx_segment(
1160+
154192135,
1161+
chromosome="1",
1162+
transcript="NM_152263.3",
1163+
coordinate_type=CoordinateType.RESIDUE,
1164+
)
1165+
genomic_tx_seg_checks(resp, tpm3_exon1)
1166+
1167+
resp = await test_egc_mapper._genomic_to_tx_segment(
1168+
154170400,
1169+
genomic_ac="NC_000001.11",
1170+
transcript="NM_152263.3",
1171+
is_seg_start=False,
1172+
coordinate_type=CoordinateType.RESIDUE,
1173+
)
1174+
genomic_tx_seg_checks(resp, tpm3_exon8)
1175+
1176+
resp = await test_egc_mapper._genomic_to_tx_segment(
1177+
154170400,
1178+
chromosome="1",
1179+
transcript="NM_152263.3",
1180+
is_seg_start=False,
1181+
coordinate_type=CoordinateType.RESIDUE,
1182+
)
1183+
genomic_tx_seg_checks(resp, tpm3_exon8)
1184+
1185+
resp = await test_egc_mapper._genomic_to_tx_segment(
1186+
154170400,
1187+
chromosome="1",
1188+
transcript="NM_152263.3",
1189+
is_seg_start=False,
1190+
coordinate_type=CoordinateType.RESIDUE,
1191+
)
1192+
genomic_tx_seg_checks(resp, tpm3_exon8)
1193+
10731194

10741195
@pytest.mark.asyncio()
10751196
async def test_tpm3(

0 commit comments

Comments
 (0)