VariantEffect
diff --git a/‎Dockerfile‎
Lines changed: 6 additions & 0 deletions b/‎Dockerfile‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/dcd_mapping/align.py‎
Lines changed: 133 additions & 33 deletions b/‎src/dcd_mapping/align.py‎
Lines changed: 133 additions & 33 deletions
diff --git a/‎src/dcd_mapping/annotate.py‎
Lines changed: 24 additions & 2 deletions b/‎src/dcd_mapping/annotate.py‎
Lines changed: 24 additions & 2 deletions
diff --git a/‎src/dcd_mapping/lookup.py‎
Lines changed: 36 additions & 0 deletions b/‎src/dcd_mapping/lookup.py‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎src/dcd_mapping/main.py‎
Lines changed: 12 additions & 2 deletions b/‎src/dcd_mapping/main.py‎
Lines changed: 12 additions & 2 deletions
@@ -37,6 +37,12 @@ RUN curl -L https://github.com/samtools/htslib/releases/download/${htsversion}/h
     curl -L https://github.com/samtools/bcftools/releases/download/${htsversion}/bcftools-${htsversion}.tar.bz2 | tar xj && \
     (cd bcftools-${htsversion} && ./configure --enable-libgsl --enable-perl-filters --with-htslib=system && make install)
 
+# Fetch and index GRCh37 and GRCh38 assemblies for cdot
+# Each NCBI RefSeq download is decompressed and re-compressed with bgzip before indexing,
+# because samtools faidx can index BGZF-compressed FASTA but not plain gzip.
+RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz | gzip -d | bgzip >  GCF_000001405.25_GRCh37.p13_genomic.fna.gz
+RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.39_GRCh38.p13_genomic.fna.gz
+RUN samtools faidx GCF_000001405.25_GRCh37.p13_genomic.fna.gz
+RUN samtools faidx GCF_000001405.39_GRCh38.p13_genomic.fna.gz
+
 RUN mkdir /usr/src/app
 WORKDIR /usr/src/app
 COPY . .
 
@@ -35,6 +35,7 @@ dependencies = [
     "requests",
     "biopython",
     "tqdm",
+    "cdot",
     "click",
     "cool-seq-tool==0.4.0.dev3",
     "ga4gh.vrs==2.0.0-a6",
 
@@ -14,9 +14,7 @@
 from cool_seq_tool.schemas import Strand
 
 from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
-from dcd_mapping.mavedb_data import (
-    LOCAL_STORE_PATH,
-)
+from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError
 from dcd_mapping.resource_utils import (
     ResourceAcquisitionError,
     http_download,
@@ -180,36 +178,35 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any:  # noqa:
     :return: dict where keys are target gene identifiers and values are BLAT query result objects
     :raise AlignmentError: if BLAT subprocess returns error code
     """
-    return parse_blat(f"{metadata.urn}_blat.psl", "blat-psl")
-    # with tempfile.NamedTemporaryFile() as tmp_file:
-    #     query_file = _build_query_file(metadata, Path(tmp_file.name))
-    #     target_sequence_type = _get_target_sequence_type(metadata)
-    #     if target_sequence_type == TargetSequenceType.PROTEIN:
-    #         target_args = "-q=prot -t=dnax"
-    #     elif target_sequence_type == TargetSequenceType.DNA:
-    #         target_args = ""
-    #     else:
-    #         # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
-    #         msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
-    #         raise NotImplementedError(msg)
-    #     process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
-    #     out_file = _write_blat_output_tempfile(process_result)
-
-    #     try:
-    #         output = parse_blat(out_file, "blat-psl")
-
-    #     # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied?
-    #     except ValueError:
-    #         target_args = "-q=dnax -t=dnax"
-    #         process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
-    #         out_file = _write_blat_output_tempfile(process_result)
-    #         try:
-    #             output = parse_blat(out_file, "blat-psl")
-    #         except ValueError as e:
-    #             msg = f"Unable to run successful BLAT on {metadata.urn}"
-    #             raise AlignmentError(msg) from e
-
-    # return output
+    with tempfile.NamedTemporaryFile() as tmp_file:
+        query_file = _build_query_file(metadata, Path(tmp_file.name))
+        target_sequence_type = _get_target_sequence_type(metadata)
+        if target_sequence_type == TargetSequenceType.PROTEIN:
+            target_args = "-q=prot -t=dnax"
+        elif target_sequence_type == TargetSequenceType.DNA:
+            target_args = ""
+        else:
+            # TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
+            msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
+            raise NotImplementedError(msg)
+        process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
+        out_file = _write_blat_output_tempfile(process_result)
+
+        try:
+            output = parse_blat(out_file, "blat-psl")
+
+        # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied?
+        except ValueError:
+            target_args = "-q=dnax -t=dnax"
+            process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
+            out_file = _write_blat_output_tempfile(process_result)
+            try:
+                output = parse_blat(out_file, "blat-psl")
+            except ValueError as e:
+                msg = f"Unable to run successful BLAT on {metadata.urn}"
+                raise AlignmentError(msg) from e
+
+    return output
 
 
 def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit:
@@ -342,3 +339,106 @@ def align(
         target_gene = scoreset_metadata.target_genes[target_label]
         alignment_results[target_label] = _get_best_match(blat_result, target_gene)
     return alignment_results
+
+
+def fetch_alignment(
+    metadata: ScoresetMetadata, silent: bool
+) -> dict[str, AlignmentResult | None]:
+    """Look up genomic alignments for accession-based target genes via cdot.
+
+    :param metadata: score set metadata whose target genes carry accession ids
+    :param silent: forwarded unchanged to ``parse_cdot_mapping``
+    :return: dict keyed by target accession id. The value is ``None`` for
+        accessions that are not aligned (protein and contig/chromosome ids),
+        otherwise the alignment parsed from the cdot transcript record
+    :raise ResourceAcquisitionError: if the cdot request returns an HTTP error status
+    """
+    # protein and contig/chromosome accession ids do not need to be aligned to the genome
+    unaligned_prefixes = ("NP", "ENSP", "NC_")
+    results = {}
+    for gene in metadata.target_genes.values():
+        accession = gene.target_accession_id
+        if accession.startswith(unaligned_prefixes):
+            results[accession] = None
+            continue
+
+        url = f"https://cdot.cc/transcript/{accession}"
+        response = requests.get(url, timeout=30)
+        try:
+            response.raise_for_status()
+        except requests.HTTPError as e:
+            msg = f"Received HTTPError from {url} for scoreset {metadata.urn}"
+            _logger.error(msg)
+            raise ResourceAcquisitionError(msg) from e
+
+        results[accession] = parse_cdot_mapping(response.json(), silent)
+    return results
+
+
+def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult:
+    """Convert a cdot transcript record into an ``AlignmentResult``.
+
+    The GRCh38 mapping is used when present, otherwise the GRCh37 mapping.
+
+    :param cdot_mapping: JSON-decoded cdot transcript record
+    :param silent: not used by this function
+    :return: alignment whose subranges follow the order of the cdot exon list and
+        whose overall ranges enclose all subranges (``start <= end`` on both strands)
+    :raise AlignmentError: if the record has neither a GRCh38 nor a GRCh37 mapping,
+        or if the selected mapping lists no exons
+    """
+    # blat psl & AlignmentResult: 0-based, start inclusive, stop exclusive
+    # cdot: 1-based, start inclusive, stop inclusive
+    # so, to "translate" cdot ranges to AlignmentResult-style ranges:
+    # subtract 1 from start and end to go from 1-based to 0-based coord,
+    # and then add 1 to the stop to go from inclusive to exclusive
+    # so just subtract 1 from start and do nothing to end
+    # NOTE(review): the cdot coordinate convention above is taken as given - confirm
+    # against the cdot data format for both the genomic and transcript columns
+
+    genome_builds = cdot_mapping.get("genome_builds", {})
+    mapping = genome_builds.get("GRCh38") or genome_builds.get("GRCh37")
+    # an empty build entry is as unusable as a missing one
+    if not mapping:
+        msg = f"Cdot transcript results for transcript {cdot_mapping.get('id')} do not include GRCh37 or GRCh38 mapping"
+        raise AlignmentError(msg)
+
+    chrom = mapping["contig"]
+    strand = Strand.POSITIVE if mapping["strand"] == "+" else Strand.NEGATIVE
+
+    exons = mapping["exons"]
+    if not exons:
+        msg = f"Cdot transcript results for transcript {cdot_mapping.get('id')} do not include any exons"
+        raise AlignmentError(msg)
+
+    # exon[3]/exon[4] are the transcript (query) coordinates, exon[0]/exon[1] the genomic (hit) ones
+    query_subranges = [SequenceRange(start=exon[3] - 1, end=exon[4]) for exon in exons]
+    hit_subranges = [SequenceRange(start=exon[0] - 1, end=exon[1]) for exon in exons]
+
+    # Take the enclosing range rather than relying on first/last exon position:
+    # on the negative strand transcript and genomic coordinates run in opposite
+    # directions, so any fixed first/last choice inverts one of the two ranges.
+    query_range = SequenceRange(
+        start=min(subrange.start for subrange in query_subranges),
+        end=max(subrange.end for subrange in query_subranges),
+    )
+    hit_range = SequenceRange(
+        start=min(subrange.start for subrange in hit_subranges),
+        end=max(subrange.end for subrange in hit_subranges),
+    )
+
+    return AlignmentResult(
+        chrom=chrom,
+        strand=strand,
+        query_range=query_range,
+        query_subranges=query_subranges,
+        hit_range=hit_range,
+        hit_subranges=hit_subranges,
+    )
+
+
+def build_alignment_result(
+    metadata: ScoresetMetadata, silent: bool
+) -> dict[str, AlignmentResult | None]:
+    """Produce alignment results for a score set, choosing the strategy by target type.
+
+    :param metadata: score set metadata, including its target genes
+    :param silent: forwarded to ``align`` or ``fetch_alignment``
+    :return: the result of ``align`` when every target gene is sequence-based,
+        otherwise the result of ``fetch_alignment``
+    :raise ScoresetNotSupportedError: if the score set mixes accession-based and
+        sequence-based target genes
+    """
+    # NOTE: Score set must contain all accession-based target genes or all sequence-based target genes
+    # This decision was made because it is most efficient to run BLAT all together, so the alignment function
+    # works on an entire score set rather than per target gene.
+    # However, if the need arises, we can allow both types of target genes in a score set.
+
+    # one boolean per distinct kind of target gene: True = accession-based, False = sequence-based
+    accession_flags = {
+        bool(gene.target_accession_id) for gene in metadata.target_genes.values()
+    }
+    if len(accession_flags) > 1:
+        msg = "Score set contains both accession-based and sequence-based target genes. This is not currently supported."
+        raise ScoresetNotSupportedError(msg)
+
+    # only an all-sequence score set goes through BLAT; everything else is fetched
+    if accession_flags == {False}:
+        return align(metadata, silent)
+    return fetch_alignment(metadata, silent)
@@ -419,14 +419,36 @@ def _get_computed_reference_sequence(
     metadata: TargetGene,
     layer: AnnotationLayer,
     tx_output: TxSelectResult | TxSelectError | None = None,
-) -> ComputedReferenceSequence | None:
+) -> ComputedReferenceSequence | MappedReferenceSequence | None:
     """Report the computed reference sequence for a score set
 
     :param metadata: Target gene metadata from MaveDB API
     :param layer: AnnotationLayer
     :param tx_output: Transcript data for a score set
-    :return A ComputedReferenceSequence object
+    :return A ComputedReferenceSequence object,
+    or if the target gene is accession-based, a mapped reference sequence describing the pre-mapped reference
     """
+    # accession-based target genes always use accession id as pre-mapped reference sequence
+    if metadata.target_accession_id:
+        seq_id = get_vrs_id_from_identifier(metadata.target_accession_id)
+        # use MappedReferenceSequence type because there should be an accession id but no sequence.
+        # for accession-based target genes, the object returned by this function describes the provided reference accession
+        # whereas the object returned by _get_mapped_reference_sequence describes the mapped reference accession, which could be a chromosome for ex.
+        seq_type: TargetSequenceType
+        # TODO full list of protein accession id prefixes
+        if metadata.target_accession_id.startswith(("NP", "ENSP")):
+            seq_type = TargetSequenceType.PROTEIN
+        # TODO full list of transcript and contig accession id prefixes
+        elif metadata.target_accession_id.startswith(("NM", "ENST", "NC")):
+            seq_type = TargetSequenceType.DNA
+        else:
+            msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}"
+            raise ValueError(msg)
+        return MappedReferenceSequence(
+            sequence_type=seq_type,
+            sequence_id=seq_id,
+            sequence_accessions=[metadata.target_accession_id],
+        )
     if layer == AnnotationLayer.PROTEIN:
         if tx_output is None or isinstance(tx_output, TxSelectError):
             # TODO catch this error - don't stop whole job for one failed target
 
@@ -12,10 +12,12 @@
 import os
 from pathlib import Path
 
+import hgvs
 import polars as pl
 import requests
 from biocommons.seqrepo import SeqRepo
 from biocommons.seqrepo.seqaliasdb.seqaliasdb import sqlite3
+from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider
 from cool_seq_tool.app import (
     LRG_REFSEQGENE_PATH,
     MANE_SUMMARY_PATH,
@@ -42,6 +44,7 @@
 )
 from ga4gh.vrs.dataproxy import SeqRepoDataProxy, coerce_namespace
 from ga4gh.vrs.extras.translator import AlleleTranslator
+from ga4gh.vrs.utils.hgvs_tools import HgvsTools
 from gene.database import create_db
 from gene.query import QueryHandler
 from gene.schemas import MatchType, SourceName
@@ -70,6 +73,23 @@
 ]
 _logger = logging.getLogger(__name__)
 
+# ---------------------------------- Cdot ---------------------------------- #
+
+
+# Local paths of the GRCh38 and GRCh37 genomic FASTA files.
+# ``seqfetcher`` wraps each path in a ``FastaSeqFetcher``, in the order listed here.
+GENOMIC_FASTA_FILES = [
+    "/home/.local/share/dcd_mapping/GCF_000001405.39_GRCh38.p13_genomic.fna.gz",
+    "/home/.local/share/dcd_mapping/GCF_000001405.25_GRCh37.p13_genomic.fna.gz",
+]
+
+
+def seqfetcher() -> ChainedSeqFetcher:
+    """Build a chained sequence fetcher with one ``FastaSeqFetcher`` per configured FASTA file.
+
+    :return: ``ChainedSeqFetcher`` over the files in ``GENOMIC_FASTA_FILES``, in list order
+    """
+    fasta_fetchers = [FastaSeqFetcher(fasta_path) for fasta_path in GENOMIC_FASTA_FILES]
+    return ChainedSeqFetcher(*fasta_fetchers)
+
+
+def cdot_rest() -> RESTDataProvider:
+    """Create a cdot REST data provider that uses the sequence fetcher from ``seqfetcher``.
+
+    :return: new ``RESTDataProvider`` instance
+    """
+    fetcher = seqfetcher()
+    return RESTDataProvider(seqfetcher=fetcher)
+
+
 # ---------------------------------- Global ---------------------------------- #
 
 
@@ -180,6 +200,15 @@ def __new__(cls) -> QueryHandler:
         return cls.instance
 
 
+def init_hgvs_tools(self, data_proxy=None):  # noqa: ANN202, ANN001
+    """Initialize an ``HgvsTools`` instance with cdot as the hgvs data provider.
+
+    Written as a replacement ``__init__``: it sets the ``parser``, ``data_proxy``,
+    ``normalizer`` and ``variant_mapper`` attributes on ``self``.
+
+    :param self: instance being initialized
+    :param data_proxy: stored as-is on the instance
+    """
+    self.parser = hgvs.parser.Parser()
+    self.data_proxy = data_proxy
+    # one provider instance is shared by the normalizer and the variant mapper
+    provider = cdot_rest()
+    self.normalizer = hgvs.normalizer.Normalizer(provider, validate=True)
+    self.variant_mapper = hgvs.variantmapper.VariantMapper(provider)
+
+
 class TranslatorBuilder:
     """Singleton constructor for VRS Translator instance."""
 
@@ -190,6 +219,8 @@ def __new__(cls, data_proxy: SeqRepoDataProxy) -> AlleleTranslator:
         :return: singleton instance of ``AlleleTranslator``
         """
         if not hasattr(cls, "instance"):
+            # monkey patch to use cdot instead of UTA as HgvsTools data provider
+            HgvsTools.__init__ = init_hgvs_tools
             tr = AlleleTranslator(data_proxy)
             cls.instance = tr
         else:
@@ -430,6 +461,11 @@ def get_chromosome_identifier(chromosome: str) -> str:
     :return: latest ID if available
     :raise KeyError: if unable to retrieve identifier
     """
+    # target sequence alignment references are chromosome names like ``"8"``, ``"X"``
+    # but accession alignment information from cdot has reference accessions, beginning with "NC_"
+    # for "NC_" identifiers, just return the identifier
+    if chromosome.startswith("NC_"):
+        return chromosome
     if not chromosome.startswith("chr"):
         chromosome = f"chr{chromosome}"
     sr = get_seqrepo()
 
@@ -8,7 +8,7 @@
 import click
 from requests import HTTPError
 
-from dcd_mapping.align import AlignmentError, BlatNotFoundError, align
+from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result
 from dcd_mapping.annotate import (
     annotate,
     save_mapped_output_json,
@@ -156,7 +156,8 @@ async def map_scoreset(
 
     _emit_info(f"Performing alignment for {metadata.urn}...", silent)
     try:
-        alignment_results = align(metadata, silent)
+        # dictionary where keys are target gene labels or accession ids, and values are alignment result objects
+        alignment_results = build_alignment_result(metadata, silent)
     except BlatNotFoundError as e:
         msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
         _emit_info(msg, silent, logging.ERROR)
@@ -175,6 +176,15 @@ async def map_scoreset(
         )
         _emit_info(f"Score set mapping output saved to: {final_output}.", silent)
         return
+    except ScoresetNotSupportedError as e:
+        _emit_info(f"Score set not supported: {e}", silent, logging.ERROR)
+        final_output = write_scoreset_mapping_to_json(
+            metadata.urn,
+            ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")),
+            output_path,
+        )
+        _emit_info(f"Score set mapping output saved to: {final_output}.", silent)
+        return
     _emit_info("Alignment complete.", silent)
 
     _emit_info("Selecting reference sequence...", silent)