Skip to content

Commit dc7c48e

Browse files
committed
Fix BLAT-incompatible target names in multi-target score sets
BLAT automatically removes certain characters from query names, including removing all characters after a space. If the BLAT result name does not match any target genes in the score set, attempt to match based on BLAT's query name shortening patterns. If multiple matches (could happen if labels are something like "Gene 1" and "Gene 2", in which case both would be shortened to "Gene"), raise an error.
1 parent 20e604d commit dc7c48e

File tree

1 file changed

+28
-10
lines changed

1 file changed

+28
-10
lines changed

src/dcd_mapping/align.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import subprocess
55
import tempfile
66
from pathlib import Path
7-
from typing import Any
87
from urllib.parse import urlparse
98

109
import requests
@@ -167,7 +166,9 @@ def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType
167166
raise ValueError(msg)
168167

169168

170-
def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: ANN401
169+
def _get_blat_output(
170+
metadata: ScoresetMetadata, silent: bool
171+
) -> dict[str, QueryResult]:
171172
"""Run a BLAT query and returns a path to the output object.
172173
173174
If unable to produce a valid query the first time, then try a query using ``dnax``
@@ -195,7 +196,6 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa:
195196
try:
196197
output = parse_blat(out_file, "blat-psl")
197198

198-
# TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied?
199199
except ValueError:
200200
target_args = "-q=dnax -t=dnax"
201201
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
@@ -336,13 +336,31 @@ def align(
336336
# blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets
337337
if target_label == "query" and len(scoreset_metadata.target_genes) == 1:
338338
target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015
339-
# NOTE this is a temporary fix that will not work for multi-target score sets!
340-
# blat automatically reformats query names.
341-
if (
342-
target_label not in scoreset_metadata.target_genes
343-
and len(scoreset_metadata.target_genes) == 1
344-
):
345-
target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015
339+
# blat automatically reformats query names, so sometimes they don't match our metadata
340+
if target_label not in scoreset_metadata.target_genes:
341+
# if single-target score set, don't need to match by name
342+
if len(scoreset_metadata.target_genes) == 1:
343+
target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015
344+
else:
345+
# try to match query name to a target gene in the metadata
346+
matches = 0
347+
for target_gene_name in scoreset_metadata.target_genes:
348+
blat_target_gene_name = (
349+
target_gene_name.split(" ")[0]
350+
.replace("(", "")
351+
.replace(")", "")
352+
.replace(",", "")
353+
)
354+
if blat_target_gene_name == target_label:
355+
target_label = target_gene_name
356+
matches += 1
357+
# we may be missing some blat reformatting rules here - if so, this error will be thrown
358+
if matches == 0:
359+
msg = f"BLAT result {target_label} does not match any target gene names in scoreset {scoreset_metadata.urn}."
360+
raise AlignmentError(msg)
361+
if matches > 1:
362+
# could happen if multiple target genes have the same first word in their label (unlikely)
363+
msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}"
346364
target_gene = scoreset_metadata.target_genes[target_label]
347365
alignment_results[target_label] = _get_best_match(blat_result, target_gene)
348366
return alignment_results

0 commit comments

Comments
 (0)