diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 5c410926..c0e655b5 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -55,6 +55,7 @@ Data Mappers cool_seq_tool.mappers.alignment cool_seq_tool.mappers.exon_genomic_coords + cool_seq_tool.mappers.feature_overlap cool_seq_tool.mappers.liftover cool_seq_tool.mappers.mane_transcript diff --git a/pyproject.toml b/pyproject.toml index 23c7f940..746b7867 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,8 +33,9 @@ dependencies = [ "biocommons.seqrepo", "pydantic >=2.0,<3.0", "ga4gh.vrs >=2.1.3,<3.0", - "wags-tails ~= 0.3.2", + "wags-tails ~= 0.4.0", "bioutils", + "polars" ] dynamic = ["version"] diff --git a/src/cool_seq_tool/mappers/__init__.py b/src/cool_seq_tool/mappers/__init__.py index 1ae9e4c7..03b1afeb 100644 --- a/src/cool_seq_tool/mappers/__init__.py +++ b/src/cool_seq_tool/mappers/__init__.py @@ -4,6 +4,13 @@ from .liftover import LiftOver from .mane_transcript import ManeTranscript from .exon_genomic_coords import ExonGenomicCoordsMapper +from .feature_overlap import FeatureOverlap -__all__ = ["AlignmentMapper", "ExonGenomicCoordsMapper", "LiftOver", "ManeTranscript"] +__all__ = [ + "AlignmentMapper", + "ExonGenomicCoordsMapper", + "FeatureOverlap", + "LiftOver", + "ManeTranscript", +] diff --git a/src/cool_seq_tool/mappers/feature_overlap.py b/src/cool_seq_tool/mappers/feature_overlap.py new file mode 100644 index 00000000..409a4e7b --- /dev/null +++ b/src/cool_seq_tool/mappers/feature_overlap.py @@ -0,0 +1,251 @@ +"""Module for getting feature (gene/exon) overlap""" + +import re +from pathlib import Path + +import polars as pl +from ga4gh.core import ga4gh_identify +from ga4gh.vrs.models import SequenceLocation, SequenceReference + +from cool_seq_tool.handlers import SeqRepoAccess +from cool_seq_tool.resources.data_files import DataFile, get_data_file +from cool_seq_tool.schemas import Assembly, CdsOverlap, CoordinateType + +# Pattern for chromosome +CHR_PATTERN = r"X|Y|([1-9]|1[0-9]|2[0-2])" + + +class FeatureOverlapError(Exception): + """Custom exception for the Feature Overlap class""" + + +class FeatureOverlap: + """The class for getting feature overlap""" + + def __init__( + self, + seqrepo_access: SeqRepoAccess, + mane_refseq_genomic_path: Path | None = None, + from_local: bool = False, + ) -> None: + """Initialize the FeatureOverlap class. Will load RefSeq data and store as df. + + :param seqrepo_access: Client for accessing SeqRepo data + :param mane_refseq_genomic_path: Path to MANE RefSeq Genomic GFF data + :param from_local: if ``True``, don't check for or acquire latest version -- + just provide most recent locally available file, if possible, and raise + error otherwise + """ + if not mane_refseq_genomic_path: + mane_refseq_genomic_path = get_data_file( + DataFile.MANE_REFSEQ_GENOMIC, from_local + ) + self.seqrepo_access = seqrepo_access + self.mane_refseq_genomic_path = mane_refseq_genomic_path + self.df = self._load_mane_refseq_gff_data() + + def _load_mane_refseq_gff_data(self) -> pl.DataFrame: + """Load MANE RefSeq GFF data file into DataFrame. + + :return: DataFrame containing MANE RefSeq Genomic GFF data for CDS. Columns + include `type`, `chromosome` (chromosome without 'chr' prefix), `cds_start`, + `cds_stop`, `info_name` (name of record), and `gene`. `cds_start` and + `cds_stop` use inter-residue coordinates. + """ + df = pl.read_csv( + self.mane_refseq_genomic_path, + separator="\t", + has_header=False, + skip_rows=9, + columns=[0, 2, 3, 4, 8], + ) + df.columns = ["chromosome", "type", "cds_start", "cds_stop", "info"] + + # Restrict to only feature of interest: CDS (which has gene info) + df = df.filter(pl.col("type") == "CDS") + + # Get name from the info field + # Get gene from the info field + # Get chromosome names without prefix and without suffix for alternate transcripts + # Convert start and stop to ints + # Convert to inter-residue coordinates + # Only return certain columns + return df.with_columns( + (pl.col("info").str.extract(r"Name=([^;]+)", 1).alias("info_name")), + (pl.col("info").str.extract(r"gene=([^;]+)", 1).alias("gene")), + (pl.col("chromosome").str.extract(r"^chr?([^_]+)", 1).alias("chromosome")), + (pl.col("cds_start").cast(pl.Int64) - 1).alias("cds_start"), + (pl.col("cds_stop").cast(pl.Int64).alias("cds_stop")), + ).select( + [ + pl.col("type"), + pl.col("chromosome"), + pl.col("cds_start"), + pl.col("cds_stop"), + pl.col("info_name"), + pl.col("gene"), + ] + ) + + def _get_chr_from_alt_ac(self, identifier: str) -> str: + """Get chromosome given genomic identifier + + :param identifier: Genomic identifier on GRCh38 assembly + :raises FeatureOverlapError: If unable to find associated GRCh38 chromosome + :return: Chromosome. 1..22, X, Y. No 'chr' prefix. + """ + aliases, error_msg = self.seqrepo_access.translate_identifier( + identifier, Assembly.GRCH38.value + ) + + if error_msg: + raise FeatureOverlapError(str(error_msg)) + + if not aliases: + error_msg = ( + f"Unable to find {Assembly.GRCH38.value} aliases for: {identifier}" + ) + raise FeatureOverlapError(error_msg) + + assembly_chr_pattern = ( + rf"^{Assembly.GRCH38.value}:(?P{CHR_PATTERN})$" + ) + for a in aliases: + chr_match = re.match(assembly_chr_pattern, a) + if chr_match: + break + + if not chr_match: + error_msg = ( + f"Unable to find {Assembly.GRCH38.value} chromosome for: {identifier}" + ) + raise FeatureOverlapError(error_msg) + + chr_groupdict = chr_match.groupdict() + return chr_groupdict["chromosome"] + + def get_grch38_mane_gene_cds_overlap( + self, + start: int, + end: int, + chromosome: str | None = None, + identifier: str | None = None, + coordinate_type: CoordinateType = CoordinateType.RESIDUE, + ) -> dict[str, list[CdsOverlap]] | None: + """Given GRCh38 genomic data, find the overlapping MANE features (gene and cds). + The genomic data is specified as a sequence location by `chromosome`, `start`, + `end`. All CDS regions with which the input sequence location has nonzero base + pair overlap will be returned. + + :param start: GRCh38 start position + :param end: GRCh38 end position + :param chromosome: Chromosome. 1..22, X, or Y. If not provided, must provide + `identifier`. If both `chromosome` and `identifier` are provided, + `chromosome` will be used. + :param identifier: Genomic identifier on GRCh38 assembly. If not provided, must + provide `chromosome`. If both `chromosome` and `identifier` are provided, + `chromosome` will be used. + :param coordinate_type: Coordinate type for ``start`` and ``end`` + :raise FeatureOverlapError: If missing required fields or unable to find + associated ga4gh identifier + :return: MANE feature (gene/cds) overlap data represented as a dict. The + dictionary will be keyed by genes which overlap the input sequence location. + Each gene contains a list of the overlapping CDS regions with the beginning + and end of the input sequence location's overlap with each + """ + ga4gh_seq_id = None + if chromosome: + if not re.match(f"^{CHR_PATTERN}$", chromosome): + error_msg = "`chromosome` must be 1, ..., 22, X, or Y" + raise FeatureOverlapError(error_msg) + else: + if identifier: + chromosome = self._get_chr_from_alt_ac(identifier) + if identifier.startswith("ga4gh:SQ."): + ga4gh_seq_id = identifier + else: + error_msg = "Must provide either `chromosome` or `identifier`" + raise FeatureOverlapError(error_msg) + + # Convert residue to inter-residue + if coordinate_type == CoordinateType.RESIDUE: + start -= 1 + + # Get feature dataframe (df uses inter-residue) + feature_df = self.df.filter( + (pl.col("chromosome") == chromosome) + & (pl.col("cds_start") <= end) + & (pl.col("cds_stop") >= start) + ) + + if feature_df.is_empty(): + return None + + # Add overlap columns + feature_df = feature_df.with_columns( + [ + pl.when(pl.col("cds_start") < start) + .then(start) + .otherwise(pl.col("cds_start")) + .alias("overlap_start"), + pl.when(pl.col("cds_stop") > end) + .then(end) + .otherwise(pl.col("cds_stop")) + .alias("overlap_stop"), + ] + ) + + # Get ga4gh identifier for chromosome + if not ga4gh_seq_id: + grch38_chr = f"{Assembly.GRCH38.value}:{chromosome}" + ga4gh_aliases, error_msg = self.seqrepo_access.translate_identifier( + grch38_chr, "ga4gh" + ) + + # Errors should never happen but catching just in case + if error_msg: + raise FeatureOverlapError(str(error_msg)) + + if not ga4gh_aliases: + error_msg = f"Unable to find ga4gh identifier for: {grch38_chr}" + raise FeatureOverlapError(error_msg) + + ga4gh_seq_id = ga4gh_aliases[0] + + def _get_seq_loc(start_pos: int, stop_pos: int, refget_ac: str) -> dict: + """Get VRS Sequence Location represented as a dict + + :param start_pos: Start position + :param stop_pos: Stop position + :param refget_ac: Refget Accession (SQ.) + :return: VRS Sequence Location represented as dictionary with the ga4gh ID + included + """ + _sl = SequenceLocation( + sequenceReference=SequenceReference( + refgetAccession=refget_ac, + ), + start=start_pos, + end=stop_pos, + ) + ga4gh_identify(_sl) + return _sl.model_dump(exclude_none=True) + + resp = {} + refget_ac = ga4gh_seq_id.split("ga4gh:")[-1] + for gene, group in feature_df.group_by("gene"): + gene = gene[0] + _gene_overlap_data = [ + CdsOverlap( + cds=_get_seq_loc( + cds_row["cds_start"], cds_row["cds_stop"], refget_ac + ), + overlap=_get_seq_loc( + cds_row["overlap_start"], cds_row["overlap_stop"], refget_ac + ), + ).model_dump(by_alias=True, exclude_none=True) + for cds_row in group.iter_rows(named=True) + ] + resp[gene] = _gene_overlap_data + + return resp diff --git a/src/cool_seq_tool/resources/data_files.py b/src/cool_seq_tool/resources/data_files.py index f686611c..647d01b9 100644 --- a/src/cool_seq_tool/resources/data_files.py +++ b/src/cool_seq_tool/resources/data_files.py @@ -6,7 +6,11 @@ from os import environ from pathlib import Path -from wags_tails import NcbiLrgRefSeqGeneData, NcbiManeSummaryData +from wags_tails import ( + NcbiLrgRefSeqGeneData, + NcbiManeRefSeqGenomicData, + NcbiManeSummaryData, +) _logger = logging.getLogger(__name__) @@ -16,6 +20,7 @@ class DataFile(str, Enum): TRANSCRIPT_MAPPINGS = "transcript_mappings" MANE_SUMMARY = "mane_summary" + MANE_REFSEQ_GENOMIC = "mane_refseq_genomic" LRG_REFSEQGENE = "lrg_refseqgene" def lower(self) -> str: @@ -37,6 +42,12 @@ def lower(self) -> str: from_local=from_local )[0], ), + DataFile.MANE_REFSEQ_GENOMIC: ( + "MANE_REFSEQ_GENOMIC_PATH", + lambda from_local: NcbiManeRefSeqGenomicData(silent=True).get_latest( + from_local=from_local + )[0], + ), DataFile.LRG_REFSEQGENE: ( "LRG_REFSEQGENE_PATH", lambda from_local: NcbiLrgRefSeqGeneData(silent=True).get_latest( @@ -53,6 +64,7 @@ def get_data_file(resource: DataFile, from_local: bool = False) -> Path: * ``Resource.TRANSCRIPT_MAPPINGS`` -> ``TRANSCRIPT_MAPPINGS_PATH`` * ``Resource.MANE_SUMMARY`` -> ``MANE_SUMMARY_PATH`` + * ``Resource.MANE_REFSEQ_GENOMIC`` -> ``MANE_REFSEQ_GENOMIC_PATH`` * ``Resource.LRG_REFSEQGENE`` -> ``LRG_REFSEQGENE_PATH`` Otherwise, this function falls back on default expected locations: diff --git a/src/cool_seq_tool/schemas.py b/src/cool_seq_tool/schemas.py index 38e6d31a..8036f6d9 100644 --- a/src/cool_seq_tool/schemas.py +++ b/src/cool_seq_tool/schemas.py @@ -4,6 +4,7 @@ from enum import Enum, IntEnum from typing import Literal +from ga4gh.vrs.models import SequenceLocation from pydantic import ( BaseModel, ConfigDict, @@ -167,3 +168,37 @@ class ServiceMeta(BaseModelForbidExtra): } } ) + + +class CdsOverlap(BaseModelForbidExtra): + """Create model for representing CDS start/stop and Overlap start/stop""" + + cds: SequenceLocation + overlap: SequenceLocation + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "cds": { + "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726493, + "end": 140726516, + }, + "overlap": { + "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726493, + "end": 140726516, + }, + } + } + ) diff --git a/tests/handlers/test_feature_overlap.py b/tests/handlers/test_feature_overlap.py new file mode 100644 index 00000000..b8365bfe --- /dev/null +++ b/tests/handlers/test_feature_overlap.py @@ -0,0 +1,492 @@ +"""Module for testing Feature Overlap class""" + +import polars as pl +import pytest + +from cool_seq_tool.mappers.feature_overlap import ( + FeatureOverlap, + FeatureOverlapError, +) +from cool_seq_tool.schemas import CoordinateType + + +@pytest.fixture(scope="module") +def test_feature_overlap(test_seqrepo_access): + """Build Feature Overlap test fixture""" + return FeatureOverlap(test_seqrepo_access) + + +def test_df(test_feature_overlap): + """Test that the dataframe contains correct data""" + # We only store CDS data + assert list(test_feature_overlap.df["type"].unique()) == ["CDS"] + + assert set(test_feature_overlap.df.columns) == { + "type", + "chromosome", + "cds_start", + "cds_stop", + "info_name", + "gene", + } + + assert test_feature_overlap.df["cds_start"].dtype == pl.Int64 + assert test_feature_overlap.df["cds_stop"].dtype == pl.Int64 + + assert set(test_feature_overlap.df["chromosome"].unique()) == { + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "X", + "Y", + } + + +def test_get_chr_from_alt_ac(test_feature_overlap): + """Test that _get_chr_from_alt_ac works correctly""" + resp = test_feature_overlap._get_chr_from_alt_ac("NC_000001.11") + assert resp == "1" + + resp = test_feature_overlap._get_chr_from_alt_ac("NC_000023.11") + assert resp == "X" + + # identifier is invalid (no version) + with pytest.raises(FeatureOverlapError) as e: + test_feature_overlap._get_chr_from_alt_ac("NC_000001") + assert str(e.value) == "SeqRepo unable to get translated identifiers for NC_000001" + + # identifier is grch37 + with pytest.raises(FeatureOverlapError) as e: + test_feature_overlap._get_chr_from_alt_ac("NC_000001.10") + assert str(e.value) == "Unable to find GRCh38 aliases for: NC_000001.10" + + +def test_get_grch38_cds_overlap(test_feature_overlap): + """Test that get_grch38_mane_gene_cds_overlap works correctly""" + # Variant fully contains exon (negative strand) + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 140726490, 140726520, identifier="ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul" + ) + assert resp == { + "BRAF": [ + { + "cds": { + "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726493, + "end": 140726516, + }, + "overlap": { + "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726493, + "end": 140726516, + }, + } + ] + } + + expected = { + "BRAF": [ + { + "cds": { + "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726493, + "end": 140726516, + }, + "overlap": { + "id": "ga4gh:SL.rMJlP7STVHBdvcCMgHkA4XJXXIdXnsix", + "digest": "rMJlP7STVHBdvcCMgHkA4XJXXIdXnsix", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726500, + "end": 140726501, + }, + } + ] + } + + # Using residue (start == stop) + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 140726501, + 140726501, + identifier="ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + coordinate_type=CoordinateType.RESIDUE, + ) + assert resp == expected + + # Using inter-residue + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 140726500, + 140726501, + identifier="ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + coordinate_type=CoordinateType.INTER_RESIDUE, + ) + assert resp == expected + + # Variant is fully contained within exon (positive strand) + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 55019308, 55019341, chromosome="7" + ) + assert resp == { + "EGFR": [ + { + "cds": { + "id": "ga4gh:SL.vjxcgicBFEkN8b8AXhagvUDC7FZgZgCp", + "digest": "vjxcgicBFEkN8b8AXhagvUDC7FZgZgCp", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 55019277, + "end": 55019365, + }, + "overlap": { + "id": "ga4gh:SL.a_MHSA9TJ5zMkxd52eBuRUNb5ZIXHH7T", + "digest": "a_MHSA9TJ5zMkxd52eBuRUNb5ZIXHH7T", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 55019307, + "end": 55019341, + }, + } + ] + } + + # Variant partially overlaps with exon, from the exon's start side (negative strand) + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 140726503, 140726520, chromosome="7" + ) + assert resp == { + "BRAF": [ + { + "cds": { + "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726493, + "end": 140726516, + }, + "overlap": { + "id": "ga4gh:SL.MdSOBEGp0l8wT3y1taeRvVIEi_XDBIGK", + "digest": "MdSOBEGp0l8wT3y1taeRvVIEi_XDBIGK", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726502, + "end": 140726516, + }, + } + ] + } + + # Variant partially overlaps with exon, from the exon's stop side (negative strand) + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 140726490, 140726505, identifier="NC_000007.14" + ) + assert resp == { + "BRAF": [ + { + "cds": { + "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726493, + "end": 140726516, + }, + "overlap": { + "id": "ga4gh:SL.Rjvup1y8hPgveXiYnj7dipqYkt3BaFZE", + "digest": "Rjvup1y8hPgveXiYnj7dipqYkt3BaFZE", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "type": "SequenceReference", + }, + "start": 140726493, + "end": 140726505, + }, + } + ] + } + + # Variant overlaps with multiple exons (positive strand) + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 21522390, 21523491, chromosome="Y" + ) + assert resp == { + "RBMY1B": [ + { + "cds": { + "id": "ga4gh:SL.3fbgdG4Z2a1fqkqkY8M2bQMfBhJsVr_i", + "digest": "3fbgdG4Z2a1fqkqkY8M2bQMfBhJsVr_i", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + "type": "SequenceReference", + }, + "start": 21522382, + "end": 21522493, + }, + "overlap": { + "id": "ga4gh:SL.XSqmOKSXECFtfjhcvVDmga72xQ0jVGO7", + "digest": "XSqmOKSXECFtfjhcvVDmga72xQ0jVGO7", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + "type": "SequenceReference", + }, + "start": 21522389, + "end": 21522493, + }, + }, + { + "cds": { + "id": "ga4gh:SL.wi_fCVQHmZCOUf--3UPKm6johAu3zQYJ", + "digest": "wi_fCVQHmZCOUf--3UPKm6johAu3zQYJ", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + "type": "SequenceReference", + }, + "start": 21522934, + "end": 21523045, + }, + "overlap": { + "id": "ga4gh:SL.wi_fCVQHmZCOUf--3UPKm6johAu3zQYJ", + "digest": "wi_fCVQHmZCOUf--3UPKm6johAu3zQYJ", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + "type": "SequenceReference", + }, + "start": 21522934, + "end": 21523045, + }, + }, + { + "cds": { + "id": "ga4gh:SL.cnULzfdHZPiY6rSQP2Prfzocf1_YOBGA", + "digest": "cnULzfdHZPiY6rSQP2Prfzocf1_YOBGA", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + "type": "SequenceReference", + }, + "start": 21523479, + "end": 21523590, + }, + "overlap": { + "id": "ga4gh:SL.kHLoQs5cjOqjztK9yZyyVR9ScKCM1S8f", + "digest": "kHLoQs5cjOqjztK9yZyyVR9ScKCM1S8f", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + "type": "SequenceReference", + }, + "start": 21523479, + "end": 21523491, + }, + }, + ] + } + + # Variant overlaps with multiple exons (negative strand) + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 154779177, 154781317, chromosome="X" + ) + assert resp == { + "MPP1": [ + { + "cds": { + "id": "ga4gh:SL.dnHzxh-VwjVdanLcvKkI1otKhZeY223-", + "digest": "dnHzxh-VwjVdanLcvKkI1otKhZeY223-", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "type": "SequenceReference", + }, + "start": 154781238, + "end": 154781313, + }, + "overlap": { + "id": "ga4gh:SL.dnHzxh-VwjVdanLcvKkI1otKhZeY223-", + "digest": "dnHzxh-VwjVdanLcvKkI1otKhZeY223-", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "type": "SequenceReference", + }, + "start": 154781238, + "end": 154781313, + }, + }, + { + "cds": { + "id": "ga4gh:SL.Z4jQtiT0-FplZWGVA1wNhdCaCMKGQ17D", + "digest": "Z4jQtiT0-FplZWGVA1wNhdCaCMKGQ17D", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "type": "SequenceReference", + }, + "start": 154779176, + "end": 154779353, + }, + "overlap": { + "id": "ga4gh:SL.Z4jQtiT0-FplZWGVA1wNhdCaCMKGQ17D", + "digest": "Z4jQtiT0-FplZWGVA1wNhdCaCMKGQ17D", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "type": "SequenceReference", + }, + "start": 154779176, + "end": 154779353, + }, + }, + ] + } + + # Variant overlap with cds in multiple genes and alt chromosome accession + # chr19_KI270930v1_alt with exact start/stop CDS + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 135329, 135381, chromosome="19" + ) + expected = { + "KIR2DL5B": [ + { + "cds": { + "id": "ga4gh:SL.tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K", + "digest": "tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", + "type": "SequenceReference", + }, + "start": 135328, + "end": 135381, + }, + "overlap": { + "id": "ga4gh:SL.tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K", + "digest": "tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", + "type": "SequenceReference", + }, + "start": 135328, + "end": 135381, + }, + } + ], + "FCGBP": [ + { + "cds": { + "id": "ga4gh:SL.3G3gZfvJ56-y-TRWNSAUHUmyPi_8X3qK", + "digest": "3G3gZfvJ56-y-TRWNSAUHUmyPi_8X3qK", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", + "type": "SequenceReference", + }, + "start": 135263, + "end": 135807, + }, + "overlap": { + "id": "ga4gh:SL.tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K", + "digest": "tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K", + "type": "SequenceLocation", + "sequenceReference": { + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", + "type": "SequenceReference", + }, + "start": 135328, + "end": 135381, + }, + } + ], + } + assert resp == expected + + # Using inter-residue (start != stop) + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 135328, 135381, chromosome="19", coordinate_type=CoordinateType.INTER_RESIDUE + ) + assert resp == expected + + # No overlap found + resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(1, 2, chromosome="19") + assert resp is None + + # Testing invalid + + # chromosome does not match regex pattern + with pytest.raises(FeatureOverlapError) as e: + test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 154779177, 154781317, chromosome="chrX" + ) + assert str(e.value) == "`chromosome` must be 1, ..., 22, X, or Y" + + # identifier is GRCh37 + with pytest.raises(FeatureOverlapError) as e: + test_feature_overlap.get_grch38_mane_gene_cds_overlap( + 154779177, 154781317, identifier="NC_000023.10" + ) + assert str(e.value) == "Unable to find GRCh38 aliases for: NC_000023.10" + + # no identifier or chromosome provided + with pytest.raises(FeatureOverlapError) as e: + test_feature_overlap.get_grch38_mane_gene_cds_overlap(154779177, 154781317) + assert str(e.value) == "Must provide either `chromosome` or `identifier`"