diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 189ec38..1509cc3 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -865,6 +865,22 @@ async def _genomic_to_tx_segment( if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue + # Validate that the breakpoint occurs within 150 bp of the first and last exon for the selected transcript. + # A breakpoint beyond this range is likely erroneous. + coordinate_check = await self._validate_genomic_breakpoint( + pos=genomic_pos, genomic_ac=genomic_ac, tx_ac=transcript + ) + if not coordinate_check: + msg = ( + f"{genomic_pos} on {genomic_ac} occurs more than 150 bp outside the " + f"exon boundaries of the {transcript} transcript, indicating this may not " + f"be a chimeric transcript junction and is unlikely to represent a " + f"contiguous coding sequence. Confirm that the genomic position " + f"{genomic_pos} is being used to represent transcript junction and not " + f"DNA breakpoint." + ) + _logger.warning(msg) + # Check if breakpoint occurs on an exon. # If not, determine the adjacent exon given the selected transcript if not self._is_exonic_breakpoint(genomic_pos, tx_exons): @@ -932,6 +948,37 @@ async def _get_grch38_pos( ) return liftover_data[1] if liftover_data else None + async def _validate_genomic_breakpoint( + self, + pos: int, + genomic_ac: str, + tx_ac: str, + ) -> bool: + """Validate that a genomic coordinate falls within the first and last exon + for a transcript on a given accession + + :param pos: Genomic position on ``genomic_ac`` + :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"`` + :param transcript: A transcript accession + :return: ``True`` if the coordinate falls within 150bp of the first and last exon + for the transcript, ``False`` if not. Breakpoints past this threshold + are likely erroneous. + """ + query = f""" + WITH tx_boundaries AS ( + SELECT + MIN(alt_start_i) AS min_start, + MAX(alt_end_i) AS max_end + FROM {self.uta_db.schema}.tx_exon_aln_v + WHERE tx_ac = '{tx_ac}' + AND alt_ac = '{genomic_ac}' + ) + SELECT * FROM tx_boundaries + WHERE {pos} between (tx_boundaries.min_start - 150) and (tx_boundaries.max_end + 150) + """ # noqa: S608 + results = await self.uta_db.execute_query(query) + return bool(results) + async def _get_tx_ac_gene( self, tx_ac: str, diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 114f3e1..8a3ce10 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1,5 +1,6 @@ """Module for testing that Cool Seq Tool works correctly.""" +import logging from datetime import datetime import pytest @@ -1463,7 +1464,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): @pytest.mark.asyncio -async def test_invalid(test_egc_mapper): +async def test_invalid(test_egc_mapper, caplog): """Test that invalid queries return `None`.""" resp = await test_egc_mapper.genomic_to_tx_segment( transcript="NM_152263 3", @@ -1514,6 +1515,37 @@ async def test_invalid(test_egc_mapper): genomic_tx_seg_service_checks(resp, is_valid=False) assert resp.errors == ["Must provide either `gene` or `transcript`"] + # Check 150 bp warning statement + with caplog.at_level(logging.WARNING): + await test_egc_mapper.genomic_to_tx_segment( + genomic_ac="NC_000001.11", + seg_start_genomic=9999999999998, + seg_end_genomic=9999999999999, + transcript="NM_152263.3", + ) + assert any( + ( + "9999999999998 on NC_000001.11 occurs more than 150 bp outside the exon " + "boundaries of the NM_152263.3 transcript, indicating this may not be " + "a chimeric transcript junction and is unlikely to represent a " + "contiguous coding sequence. Confirm that the genomic position 9999999999998 is " + "being used to represent transcript junction and not DNA breakpoint." + ) + in record.message + for record in caplog.records + ) + assert any( + ( + "9999999999999 on NC_000001.11 occurs more than 150 bp outside the exon " + "boundaries of the NM_152263.3 transcript, indicating this may not be " + "a chimeric transcript junction and is unlikely to represent a " + "contiguous coding sequence. Confirm that the genomic position 9999999999999 is " + "being used to represent transcript junction and not DNA breakpoint." + ) + in record.message + for record in caplog.records + ) + # Exon 22 does not exist resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=None,