diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index c8d398d..40756fb 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -413,6 +413,7 @@ async def genomic_to_tx_segment( transcript: str | None = None, gene: str | None = None, coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, + starting_assembly: Assembly = Assembly.GRCH38, ) -> GenomicTxSegService: """Get transcript segment data for genomic data, lifted over to GRCh38. @@ -441,7 +442,9 @@ async def genomic_to_tx_segment( used. :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, must provide ``chromosome. If ``chromosome`` is also provided, - ``genomic_ac`` will be used. + ``genomic_ac`` will be used. If the genomic accession is from GRCh37, it + will be lifted over to GRCh38 and the original accession version will be + ignored :param seg_start_genomic: Genomic position where the transcript segment starts :param seg_end_genomic: Genomic position where the transcript segment ends :param transcript: The transcript to use. If this is not given, we will try the @@ -452,6 +455,8 @@ async def genomic_to_tx_segment( value is provided. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic``. Expects inter-residue coordinates by default + :param starting_assembly: The assembly that the supplied coordinate comes from. Set to + GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37 :return: Genomic data (inter-residue coordinates) """ errors = [] @@ -477,6 +482,7 @@ async def genomic_to_tx_segment( gene=gene, is_seg_start=True, coordinate_type=coordinate_type, + starting_assembly=starting_assembly, ) if start_tx_seg_data.errors: return _return_service_errors(start_tx_seg_data.errors) @@ -497,6 +503,7 @@ async def genomic_to_tx_segment( gene=gene, is_seg_start=False, coordinate_type=coordinate_type, + starting_assembly=starting_assembly, ) if end_tx_seg_data.errors: return _return_service_errors(end_tx_seg_data.errors) @@ -727,6 +734,7 @@ async def _genomic_to_tx_segment( gene: str | None = None, is_seg_start: bool = True, coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE, + starting_assembly: Assembly = Assembly.GRCH38, ) -> GenomicTxSeg: """Given genomic data, generate a boundary for a transcript segment. @@ -743,7 +751,8 @@ async def _genomic_to_tx_segment( If ``genomic_ac`` is also provided, ``genomic_ac`` will be used. :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac`` - will be used. + will be used. If the genomic accession is from GRCh37, it will be lifted + over to GRCh38 and the original accession version will be ignored :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript @@ -752,6 +761,8 @@ async def _genomic_to_tx_segment( ``False`` if ``genomic_pos`` is where the transcript segment ends. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic``. Expects inter-residue coordinates by default + :param starting_assembly: The assembly that the supplied coordinate comes from. Set to + GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37 :return: Data for a transcript segment boundary (inter-residue coordinates) """ params = {key: None for key in GenomicTxSeg.model_fields} @@ -770,8 +781,10 @@ async def _genomic_to_tx_segment( ) if genomic_ac: - genomic_ac_validation = await self.uta_db.validate_genomic_ac(genomic_ac) - if not genomic_ac_validation: + grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) + if grch38_ac: + genomic_ac = grch38_ac[0] + else: return GenomicTxSeg( errors=[f"Genomic accession does not exist in UTA: {genomic_ac}"] ) @@ -784,13 +797,17 @@ async def _genomic_to_tx_segment( ) genomic_ac = genomic_acs[0] - # Always liftover to GRCh38 - genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( - genomic_ac, - genomic_pos, - ) - if err_msg: - return GenomicTxSeg(errors=[err_msg]) + # Liftover to GRCh38 if the provided assembly is GRCh37 + if starting_assembly == Assembly.GRCH37: + genomic_pos = await self._get_grch38_pos( + genomic_ac, genomic_pos, chromosome=chromosome if chromosome else None + ) + if not genomic_pos: + return GenomicTxSeg( + errors=[ + f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful." + ] + ) # Select a transcript if not provided if not transcript: @@ -903,59 +920,28 @@ async def _genomic_to_tx_segment( ), ) - async def _get_grch38_ac_pos( + async def _get_grch38_pos( self, genomic_ac: str, genomic_pos: int, - grch38_ac: str | None = None, - ) -> tuple[str | None, int | None, str | None]: + chromosome: str | None = None, + ) -> int | None: """Get GRCh38 genomic representation for accession and position - :param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly) - :param genomic_pos: Genomic position on ``genomic_ac`` - :param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not - provided, will attempt to retrieve associated GRCh38 accession from UTA. - :return: Tuple containing GRCh38 accession, GRCh38 position, and error message - if unable to get GRCh38 representation + :param genomic_pos: A genomic coordinate in GRCh37 + :param genomic_ac: The genomic accession in GRCh38 + :param chromosome: The chromosome that genomic_pos occurs on + :return The genomic coordinate in GRCh38 """ - # Validate accession exists - if not grch38_ac: - grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) - if not grch38_ac: - return None, None, f"Unrecognized genomic accession: {genomic_ac}." - - grch38_ac = grch38_ac[0] - - if grch38_ac != genomic_ac: - # Ensure genomic_ac is GRCh37 + if not chromosome: chromosome, _ = self.seqrepo_access.translate_identifier( - genomic_ac, Assembly.GRCH37.value + genomic_ac, target_namespaces=Assembly.GRCH38.value ) - if not chromosome: - _logger.warning( - "SeqRepo could not find associated %s assembly for genomic accession %s.", - Assembly.GRCH37.value, - genomic_ac, - ) - return ( - None, - None, - f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.", - ) chromosome = chromosome[-1].split(":")[-1] - liftover_data = self.liftover.get_liftover( - chromosome, genomic_pos, Assembly.GRCH38 - ) - if liftover_data is None: - return ( - None, - None, - f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.", - ) - genomic_pos = liftover_data[1] - genomic_ac = grch38_ac - - return genomic_ac, genomic_pos, None + liftover_data = self.liftover.get_liftover( + chromosome, genomic_pos, Assembly.GRCH38 + ) + return liftover_data[1] if liftover_data else None async def _validate_gene_coordinates( self, diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 5790ec2..57f277c 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -10,6 +10,7 @@ _ExonCoord, ) from cool_seq_tool.schemas import ( + Assembly, CoordinateType, Strand, ) @@ -703,51 +704,6 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True): assert len(actual.errors) > 0 -@pytest.mark.asyncio() -async def test_get_grch38_ac_pos(test_egc_mapper): - """Test that _get_grch38_ac_pos works correctly""" - grch38_ac = "NC_000001.11" - grch38_pos = 154192135 - expected = grch38_ac, grch38_pos, None - - # GRCh37 provided - grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.10", 154164611) - assert grch38_data == expected - - # GRCh38 provided, no grch38_ac - grch38_data = await test_egc_mapper._get_grch38_ac_pos(grch38_ac, grch38_pos) - assert grch38_data == expected - - # GRCh38 and grch38_ac provided - grch38_data = await test_egc_mapper._get_grch38_ac_pos( - grch38_ac, grch38_pos, grch38_ac=grch38_ac - ) - assert grch38_data == expected - - # Unrecognized accession - invalid_ac = "NC_0000026.10" - grch38_data = await test_egc_mapper._get_grch38_ac_pos(invalid_ac, 154164611) - assert grch38_data == (None, None, f"Unrecognized genomic accession: {invalid_ac}.") - - # GRCh36 used - grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.9", 154164611) - assert grch38_data == ( - None, - None, - "`genomic_ac` must use GRCh37 or GRCh38 assembly.", - ) - - # Unsuccessful liftover - grch38_data = await test_egc_mapper._get_grch38_ac_pos( - "NC_000001.10", 9999999999999999999 - ) - assert grch38_data == ( - None, - None, - "Lifting over 9999999999999999999 on NC_000001.10 from GRCh37 to GRCh38 was unsuccessful.", - ) - - @pytest.mark.asyncio() async def test_get_all_exon_coords( test_egc_mapper, nm_152263_exons, nm_152263_exons_genomic_coords @@ -893,6 +849,21 @@ def test_is_exonic_breakpoint(test_egc_mapper, nm_001105539_exons_genomic_coords assert resp is True # Breakpoint does occur on an exon +def test_use_alt_start_i(test_egc_mapper): + """Test when to use alt_start_i or alt_end_i from UTA""" + resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.POSITIVE) + assert resp + + resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.NEGATIVE) + assert resp + + resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.NEGATIVE) + assert not resp + + resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.POSITIVE) + assert not resp + + @pytest.mark.asyncio() async def test_genomic_to_transcript_fusion_context( test_egc_mapper, @@ -1250,14 +1221,25 @@ async def test_braf(test_egc_mapper, mane_braf): """ inputs = { "genomic_ac": "NC_000007.13", - "seg_start_genomic": 140501359, # GRCh38 coords: 140801559 - "seg_end_genomic": 140453136, # GRCh38 coords: 140753336 + "seg_start_genomic": 140501359, + "seg_end_genomic": 140453136, "gene": "BRAF", + "starting_assembly": Assembly.GRCH37.value, } # MANE g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, mane_braf) + inputs = { + "genomic_ac": "NC_000007.14", + "seg_start_genomic": 140801559, + "seg_end_genomic": 140753336, + "gene": "BRAF", + "starting_assembly": Assembly.GRCH38.value, + } + g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(g_to_t_resp, mane_braf) + expected = mane_braf.model_copy(deep=True) expected.seg_start.genomic_location.end = 140801559 t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( @@ -1276,6 +1258,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_start_genomic": 9597639, "seg_end_genomic": 9609996, "transcript": "NM_003390.3", + "starting_assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) @@ -1292,6 +1275,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_end_genomic": 9609996, "transcript": "NM_003390.3", "gene": "WEE1", + "starting_assembly": Assembly.GRCH37.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) @@ -1304,9 +1288,20 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): # MANE since no transcript provided inputs = { "genomic_ac": "NC_000011.9", - "seg_start_genomic": 9597639, # GRCh38 coords: 9576092 - "seg_end_genomic": 9609996, # GRCh38 coords: 9588449 + "seg_start_genomic": 9597639, + "seg_end_genomic": 9609996, + "gene": "WEE1", + "starting_assembly": Assembly.GRCH37.value, + } + g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11) + + inputs = { + "genomic_ac": "NC_000011.10", + "seg_start_genomic": 9576092, + "seg_end_genomic": 9588449, "gene": "WEE1", + "starting_assembly": Assembly.GRCH38.value, } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11) @@ -1418,13 +1413,19 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): "gene": "WEE1", "genomic_ac": "NC_000011.9", "seg_end_genomic": 9609996, + "starting_assembly": Assembly.GRCH37.value, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) - # inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9588449} - # resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - # assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) + inputs = { + "gene": "WEE1", + "chromosome": "11", + "seg_end_genomic": 9609996, + "starting_assembly": Assembly.GRCH37.value, + } + resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) inputs = {"transcript": "NM_003390.3", "exon_start": 2} resp = await test_egc_mapper.tx_segment_to_genomic(**inputs) @@ -1456,6 +1457,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): seg_start_genomic=73442503, seg_end_genomic=73457929, # not on an exon gene="ELN", + starting_assembly=Assembly.GRCH37.value, ) genomic_tx_seg_service_checks(resp, eln_grch38_intronic) @@ -1497,13 +1499,13 @@ async def test_invalid(test_egc_mapper): # Invalid accession resp = await test_egc_mapper.genomic_to_tx_segment( - genomic_ac="NC_000001.200", + genomic_ac="NC_000035.200", seg_start_genomic=154191901, seg_end_genomic=154192135, transcript="NM_152263.3", ) genomic_tx_seg_service_checks(resp, is_valid=False) - assert resp.errors == ["Genomic accession does not exist in UTA: NC_000001.200"] + assert resp.errors == ["Genomic accession does not exist in UTA: NC_000035.200"] # Invalid coordinates resp = await test_egc_mapper.genomic_to_tx_segment(