|
4 | 4 | import subprocess |
5 | 5 | import tempfile |
6 | 6 | from pathlib import Path |
7 | | -from typing import Any |
8 | 7 | from urllib.parse import urlparse |
9 | 8 |
|
10 | 9 | import requests |
@@ -167,7 +166,9 @@ def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType |
167 | 166 | raise ValueError(msg) |
168 | 167 |
|
169 | 168 |
|
170 | | -def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: ANN401 |
| 169 | +def _get_blat_output( |
| 170 | + metadata: ScoresetMetadata, silent: bool |
| 171 | +) -> dict[str, QueryResult]: |
171 | 172 | """Run a BLAT query and returns a path to the output object. |
172 | 173 |
|
173 | 174 | If unable to produce a valid query the first time, then try a query using ``dnax`` |
@@ -195,7 +196,6 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: |
195 | 196 | try: |
196 | 197 | output = parse_blat(out_file, "blat-psl") |
197 | 198 |
|
198 | | - # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? |
199 | 199 | except ValueError: |
200 | 200 | target_args = "-q=dnax -t=dnax" |
201 | 201 | process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) |
@@ -336,13 +336,31 @@ def align( |
336 | 336 | # blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets |
337 | 337 | if target_label == "query" and len(scoreset_metadata.target_genes) == 1: |
338 | 338 | target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 |
339 | | - # NOTE this is a temporary fix that will not work for multi-target score sets! |
340 | | - # blat automatically reformats query names. |
341 | | - if ( |
342 | | - target_label not in scoreset_metadata.target_genes |
343 | | - and len(scoreset_metadata.target_genes) == 1 |
344 | | - ): |
345 | | - target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 |
| 339 | + # blat automatically reformats query names, so sometimes they don't match our metadata |
| 340 | + if target_label not in scoreset_metadata.target_genes: |
| 341 | + # if single-target score set, don't need to match by name |
| 342 | + if len(scoreset_metadata.target_genes) == 1: |
| 343 | + target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 |
| 344 | + else: |
| 345 | + # try to match query name to a target gene in the metadata |
| 346 | + matches = 0 |
| 347 | + for target_gene_name in scoreset_metadata.target_genes: |
| 348 | + blat_target_gene_name = ( |
| 349 | + target_gene_name.split(" ")[0] |
| 350 | + .replace("(", "") |
| 351 | + .replace(")", "") |
| 352 | + .replace(",", "") |
| 353 | + ) |
| 354 | + if blat_target_gene_name == target_label: |
| 355 | + target_label = target_gene_name |
| 356 | + matches += 1 |
| 357 | + # we may be missing some blat reformatting rules here - if so, this error will be thrown |
| 358 | + if matches == 0: |
| 359 | + msg = f"BLAT result {target_label} does not match any target gene names in scoreset {scoreset_metadata.urn}." |
| 360 | + raise AlignmentError(msg) |
| 361 | + if matches > 1: |
| 362 | + # could happen if multiple target genes have the same first word in their label (unlikely) |
| 363 | + msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}" |
346 | 364 | target_gene = scoreset_metadata.target_genes[target_label] |
347 | 365 | alignment_results[target_label] = _get_best_match(blat_result, target_gene) |
348 | 366 | return alignment_results |
|
0 commit comments