@@ -865,6 +865,22 @@ async def _genomic_to_tx_segment(
865865 if use_alt_start_i and coordinate_type == CoordinateType .RESIDUE :
866866 genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue
867867
868+ # Validate that the breakpoint occurs within 150 bp of the first and last exon for the selected transcript.
869+ # A breakpoint beyond this range is likely erroneous.
870+ coordinate_check = await self ._validate_genomic_breakpoint (
871+ pos = genomic_pos , genomic_ac = genomic_ac , tx_ac = transcript
872+ )
873+ if not coordinate_check :
874+ msg = (
875+ f"{ genomic_pos } on { genomic_ac } occurs more than 150 bp outside the "
876+ f"exon boundaries of the { transcript } transcript, indicating this may not "
877+ f"be a chimeric transcript junction and is unlikely to represent a "
878+ f"contiguous coding sequence. Confirm that the genomic position "
879+ f"{ genomic_pos } is being used to represent transcript junction and not "
880+ f"DNA breakpoint."
881+ )
882+ _logger .warning (msg )
883+
868884 # Check if breakpoint occurs on an exon.
869885 # If not, determine the adjacent exon given the selected transcript
870886 if not self ._is_exonic_breakpoint (genomic_pos , tx_exons ):
@@ -932,6 +948,37 @@ async def _get_grch38_pos(
932948 )
933949 return liftover_data [1 ] if liftover_data else None
934950
951+ async def _validate_genomic_breakpoint (
952+ self ,
953+ pos : int ,
954+ genomic_ac : str ,
955+ tx_ac : str ,
956+ ) -> bool :
957+ """Validate that a genomic coordinate falls within the first and last exon
958+ for a transcript on a given accession
959+
960+ :param pos: Genomic position on ``genomic_ac``
961+ :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
962+ :param transcript: A transcript accession
963+ :return: ``True`` if the coordinate falls within 150bp of the first and last exon
964+ for the transcript, ``False`` if not. Breakpoints past this threshold
965+ are likely erroneous.
966+ """
967+ query = f"""
968+ WITH tx_boundaries AS (
969+ SELECT
970+ MIN(alt_start_i) AS min_start,
971+ MAX(alt_end_i) AS max_end
972+ FROM { self .uta_db .schema } .tx_exon_aln_v
973+ WHERE tx_ac = '{ tx_ac } '
974+ AND alt_ac = '{ genomic_ac } '
975+ )
976+ SELECT * FROM tx_boundaries
977+ WHERE { pos } between (tx_boundaries.min_start - 150) and (tx_boundaries.max_end + 150)
978+ """ # noqa: S608
979+ results = await self .uta_db .execute_query (query )
980+ return bool (results )
981+
935982 async def _get_tx_ac_gene (
936983 self ,
937984 tx_ac : str ,
0 commit comments