VariantEffect
diff --git a/‎src/dcd_mapping/align.py‎
Lines changed: 60 additions & 22 deletions b/‎src/dcd_mapping/align.py‎
Lines changed: 60 additions & 22 deletions
diff --git a/‎src/dcd_mapping/lookup.py‎
Lines changed: 20 additions & 16 deletions b/‎src/dcd_mapping/lookup.py‎
Lines changed: 20 additions & 16 deletions
diff --git a/‎src/dcd_mapping/main.py‎
Lines changed: 7 additions & 7 deletions b/‎src/dcd_mapping/main.py‎
Lines changed: 7 additions & 7 deletions
@@ -4,11 +4,12 @@
 import subprocess
 import tempfile
 from pathlib import Path
+from typing import Any
 from urllib.parse import urlparse
 
 import requests
 from Bio.SearchIO import HSP
-from Bio.SearchIO import read as read_blat
+from Bio.SearchIO import parse as parse_blat
 from Bio.SearchIO._model import Hit, QueryResult
 from cool_seq_tool.schemas import Strand
 
@@ -25,6 +26,7 @@
     GeneLocation,
     ScoresetMetadata,
     SequenceRange,
+    TargetGene,
     TargetSequenceType,
 )
 
@@ -61,7 +63,10 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata, query_file: Path) ->
     :return: Yielded Path to constructed file. Deletes file once complete.
     """
     _logger.debug("Writing BLAT query to %s", query_file)
-    lines = [">query", scoreset_metadata.target_sequence]
+    lines = []
+    for target_gene in scoreset_metadata.target_genes:
+        lines.append(f">{target_gene}")
+        lines.append(scoreset_metadata.target_genes[target_gene].target_sequence)
     _write_query_file(query_file, lines)
     return query_file
 
@@ -143,50 +148,77 @@ def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str:
     return tmp.name
 
 
-def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
+def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType | str:
+    """Get overall target sequence type for a score set's target genes.
+
+    Protein if all target sequences are protein sequences, nucleotide if all target
+    sequences are nucleotide sequences, and mixed if there is a mix within the score set.
+
+    :param metadata: object containing score set attributes
+    :return: TargetSequenceType enum (protein or nucleotide) or string "mixed"
+    :raise ValueError: if the score set has no target genes to take a type from
+    """
+    # collapse the per-target types into the distinct values present
+    target_sequence_types = {
+        metadata.target_genes[target_gene].target_sequence_type
+        for target_gene in metadata.target_genes
+    }
+    if not target_sequence_types:
+        msg = f"Target sequence types not available for score set {metadata.urn}"
+        raise ValueError(msg)
+    if len(target_sequence_types) > 1:
+        return "mixed"
+    # exactly one distinct type remains: that is the score set's overall type
+    return target_sequence_types.pop()
+
+
+def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any:  # noqa: ANN401
-    """Run a BLAT query and returns a path to the output object.
+    """Run a BLAT query over every target sequence and return the parsed results.
 
     If unable to produce a valid query the first time, then try a query using ``dnax``
     bases.
 
-    :param scoreset_metadata: object containing scoreset attributes
+    :param metadata: object containing scoreset attributes
     :param silent: suppress BLAT command output
-    :return: BLAT query result
+    :return: list of BLAT query result objects, one per query sequence found in the
+        BLAT output; each result's ``id`` is the label written for that query
     :raise AlignmentError: if BLAT subprocess returns error code
+        or if neither attempt yields any parseable query results
+    :raise NotImplementedError: if the overall target sequence type is neither protein
+        nor DNA (e.g. a mix of both)
     """
     with tempfile.NamedTemporaryFile() as tmp_file:
         query_file = _build_query_file(metadata, Path(tmp_file.name))
-        if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
+        target_sequence_type = _get_target_sequence_type(metadata)
+        if target_sequence_type == TargetSequenceType.PROTEIN:
             target_args = "-q=prot -t=dnax"
-        else:
+        elif target_sequence_type == TargetSequenceType.DNA:
             target_args = ""
+        else:
+            # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
+            msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
+            raise NotImplementedError(msg)
         process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
         out_file = _write_blat_output_tempfile(process_result)
 
         try:
-            output = read_blat(out_file, "blat-psl")
+            # ``parse`` is a lazy generator: materialize it here so that parse
+            # errors are raised inside this ``try`` instead of in the caller.
+            output = list(parse_blat(out_file, "blat-psl"))
+            if not output:
+                # ``read`` raised ValueError on empty output; keep doing so in
+                # order that the ``dnax`` retry below is still attempted.
+                msg = f"No BLAT query results found for {metadata.urn}"
+                raise ValueError(msg)
+
+        # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied?
         except ValueError:
             target_args = "-q=dnax -t=dnax"
             process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
             out_file = _write_blat_output_tempfile(process_result)
             try:
-                output = read_blat(out_file, "blat-psl")
+                output = list(parse_blat(out_file, "blat-psl"))
+                if not output:
+                    msg = f"No BLAT query results found for {metadata.urn}"
+                    raise ValueError(msg)
             except ValueError as e:
                 msg = f"Unable to run successful BLAT on {metadata.urn}"
                 raise AlignmentError(msg) from e
 
     return output
 
 
-def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
+def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit:
     """Get best hit from BLAT output.
 
     First, try to return hit corresponding to expected chromosome taken from scoreset
     metadata. If chromosome doesn't match any of the outputs or is unavailable, take
     the hit with the single highest-scoring HSP.
 
     :param output: BLAT output
-    :param urn: scoreset URN to use in error messages
     :param chromosome: refseq chromosome ID, e.g. ``"NC_000001.11"``
     :return: best Hit
     :raise AlignmentError: if unable to get hits from output
@@ -207,8 +239,8 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
                 hit_chrs = [h.id for h in output]
                 # TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome
                 _logger.warning(
-                    "Failed to match hit chromosomes during alignment. URN: %s, expected chromosome: %s, hit chromosomes: %s",
-                    urn,
+                    "Failed to match hit chromosomes during alignment for target %s. Expected chromosome: %s, hit chromosomes: %s",
+                    output.id,
                     chromosome,
                     hit_chrs,
                 )
@@ -222,21 +254,20 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
             best_score_hit = hit
 
     if best_score_hit is None:
-        msg = f"Couldn't get BLAT hits from {urn}"
+        msg = f"Couldn't get BLAT hits for target {output.id}."
         raise AlignmentError(msg)
 
     return best_score_hit
 
 
-def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP:
+def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None) -> HSP:
     """Retrieve preferred HSP from BLAT Hit object.
 
     If gene location data is available, prefer the HSP with the least distance
     between the start of the hit and the start coordinate of the gene. Otherwise,
     take the HSP with the highest score value.
 
     :param hit: hit object from BLAT result
-    :param urn: scoreset identifier for use in error messages
     :param gene_location: location data acquired by normalizing scoreset metadata
     :return: Preferred HSP object
     :raise AlignmentError: if hit object appears to be empty (should be impossible)
@@ -252,17 +283,17 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP
     return best_hsp
 
 
-def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> AlignmentResult:
+def _get_best_match(output: QueryResult, target_gene: TargetGene) -> AlignmentResult:
     """Obtain best high-scoring pairs (HSP) object for query sequence.
 
-    :param metadata: scoreset metadata
+    :param target_gene: metadata for the target gene the query sequence belongs to
     :param output: BLAT result object
     :return: alignment result ??
     """
-    location = get_gene_location(metadata)
+    location = get_gene_location(target_gene)
     chromosome = location.chromosome if location else None
-    best_hit = _get_best_hit(output, metadata.urn, chromosome)
-    best_hsp = _get_best_hsp(best_hit, metadata.urn, location)
+    best_hit = _get_best_hit(output, chromosome)
+    best_hsp = _get_best_hsp(best_hit, location)
 
     strand = Strand.POSITIVE if best_hsp[0].query_strand == 1 else Strand.NEGATIVE
     coverage = 100 * (best_hsp.query_end - best_hsp.query_start) / output.seq_len
@@ -291,12 +322,19 @@ def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> Alignmen
     )
 
 
-def align(scoreset_metadata: ScoresetMetadata, silent: bool = True) -> AlignmentResult:
+def align(
+    scoreset_metadata: ScoresetMetadata, silent: bool = True
+) -> dict[str, AlignmentResult]:
-    """Align target sequence to a reference genome.
+    """Align each target sequence of a score set to a reference genome.
 
     :param scoreset_metadata: object containing scoreset metadata
     :param silent: suppress BLAT process output if true
-    :return: data wrapper containing alignment results
+    :return: dictionary where keys are target gene identifiers and values are alignment
+        result objects; only targets present in the BLAT output get an entry
     """
     blat_output = _get_blat_output(scoreset_metadata, silent)
-    return _get_best_match(blat_output, scoreset_metadata)
+    alignment_results = {}
+    for blat_result in blat_output:
+        # the query file labels each sequence with its target gene key, so the
+        # result id doubles as the lookup key into ``target_genes``
+        target_label = blat_result.id
+        target_gene = scoreset_metadata.target_genes[target_label]
+        alignment_results[target_label] = _get_best_match(blat_result, target_gene)
+    return alignment_results
@@ -46,7 +46,11 @@
 from gene.query import QueryHandler
 from gene.schemas import MatchType, SourceName
 
-from dcd_mapping.schemas import GeneLocation, ManeDescription, ScoresetMetadata
+from dcd_mapping.schemas import (
+    GeneLocation,
+    ManeDescription,
+    TargetGene,
+)
 
 __all__ = [
     "CoolSeqToolBuilder",
@@ -287,25 +291,25 @@ def _get_hgnc_symbol(term: str) -> str | None:
     return None
 
 
-def get_gene_symbol(metadata: ScoresetMetadata) -> str | None:
-    """Acquire HGNC gene symbol given provided metadata from scoreset.
+def get_gene_symbol(target_gene: TargetGene) -> str | None:
+    """Acquire HGNC gene symbol given provided target gene metadata from MaveDB.
 
     Right now, we use two sources for normalizing:
     1. UniProt ID, if available
     2. Target name: specifically, we try the first word in the name (this could
     cause some problems and we should double-check it)
 
-    :param ScoresetMetadata: data given by MaveDB API
+    :param target_gene: target gene metadata given by MaveDB API
-    :return: gene symbol if available
+    :return: gene symbol if available, ``None`` if neither lookup produces one
     """
+    # first choice: look the symbol up by UniProt ID; fall through when it has no match
-    if metadata.target_uniprot_ref:
-        result = _get_hgnc_symbol(metadata.target_uniprot_ref.id)
+    if target_gene.target_uniprot_ref:
+        result = _get_hgnc_symbol(target_gene.target_uniprot_ref.id)
         if result:
             return result
 
     # try taking the first word in the target name
-    if metadata.target_gene_name:
-        parsed_name = metadata.target_gene_name.split(" ")[0]
+    if target_gene.target_gene_name:
+        parsed_name = target_gene.target_gene_name.split(" ")[0]
         return _get_hgnc_symbol(parsed_name)
     return None
 
@@ -324,21 +328,21 @@ def _normalize_gene(term: str) -> Gene | None:
 
 
 def _get_normalized_gene_response(
-    metadata: ScoresetMetadata,
+    target_gene: TargetGene,
 ) -> Gene | None:
     """Fetch best normalized concept given available scoreset metadata.
 
-    :param metadata: salient scoreset metadata items
+    :param target_gene: salient target gene metadata items
     :return: Normalized gene if available
     """
-    if metadata.target_uniprot_ref:
-        gene_descriptor = _normalize_gene(metadata.target_uniprot_ref.id)
+    if target_gene.target_uniprot_ref:
+        gene_descriptor = _normalize_gene(target_gene.target_uniprot_ref.id)
         if gene_descriptor:
             return gene_descriptor
 
     # try taking the first word in the target name
-    if metadata.target_gene_name:
-        parsed_name = metadata.target_gene_name.split(" ")[0]
+    if target_gene.target_gene_name:
+        parsed_name = target_gene.target_gene_name.split(" ")[0]
         gene_descriptor = _normalize_gene(parsed_name)
         if gene_descriptor:
             return gene_descriptor
@@ -371,7 +375,7 @@ def _get_genomic_interval(
     return None
 
 
-def get_gene_location(metadata: ScoresetMetadata) -> GeneLocation | None:
+def get_gene_location(target_gene: TargetGene) -> GeneLocation | None:
     """Acquire gene location data from gene normalizer using metadata provided by
     scoreset.
 
@@ -380,10 +384,10 @@ def get_gene_location(metadata: ScoresetMetadata) -> GeneLocation | None:
     2. Target name: specifically, we try the first word in the name (this could
     cause some problems and we should double-check it)
 
-    :param metadata: data given by MaveDB API
+    :param target_gene: data given by MaveDB API
     :return: gene location data if available
     """
-    gene_descriptor = _get_normalized_gene_response(metadata)
+    gene_descriptor = _get_normalized_gene_response(target_gene)
     if not gene_descriptor or not gene_descriptor.extensions:
         return None
 
 
@@ -33,7 +33,7 @@
     ScoresetMetadata,
     VrsVersion,
 )
-from dcd_mapping.transcripts import TxSelectError, select_transcript
+from dcd_mapping.transcripts import TxSelectError, select_transcripts
 from dcd_mapping.vrs_map import VrsMapError, vrs_map
 
 _logger = logging.getLogger(__name__)
@@ -156,7 +156,7 @@ async def map_scoreset(
 
     _emit_info(f"Performing alignment for {metadata.urn}...", silent)
     try:
-        alignment_result = align(metadata, silent)
+        alignment_results = align(metadata, silent)
     except BlatNotFoundError as e:
         msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
         _emit_info(msg, silent, logging.ERROR)
@@ -179,7 +179,7 @@ async def map_scoreset(
 
     _emit_info("Selecting reference sequence...", silent)
     try:
-        transcript = await select_transcript(metadata, records, alignment_result)
+        transcripts = await select_transcripts(metadata, records, alignment_results)
     except (TxSelectError, KeyError, ValueError) as e:
         _emit_info(
             f"Transcript selection failed for scoreset {metadata.urn}",
@@ -211,7 +211,7 @@ async def map_scoreset(
 
     _emit_info("Mapping to VRS...", silent)
     try:
-        vrs_results = vrs_map(metadata, alignment_result, records, transcript, silent)
+        vrs_results = vrs_map(metadata, alignment_results, records, transcripts, silent)
     except VrsMapError as e:
         _emit_info(
             f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR
@@ -239,7 +239,7 @@ async def map_scoreset(
 
     _emit_info("Annotating metadata and saving to file...", silent)
     try:
-        vrs_results = annotate(vrs_results, transcript, metadata, vrs_version)
+        vrs_results = annotate(vrs_results, transcripts, metadata, vrs_version)
     except Exception as e:  # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors
         _emit_info(
             f"VRS annotation failed for scoreset {metadata.urn}", silent, logging.ERROR
@@ -267,8 +267,8 @@ async def map_scoreset(
         final_output = save_mapped_output_json(
             metadata,
             vrs_results,
-            alignment_result,
-            transcript,
+            alignment_results,
+            transcripts,
             prefer_genomic,
             output_path,
         )