Skip to content

Commit c8a31e7

Browse files
committed
Multi-target mapping partial draft: alignment and transcript selection
1 parent c9a4ca3 commit c8a31e7

File tree

6 files changed

+210
-106
lines changed

6 files changed

+210
-106
lines changed

src/dcd_mapping/align.py

Lines changed: 60 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
import subprocess
55
import tempfile
66
from pathlib import Path
7+
from typing import Any
78
from urllib.parse import urlparse
89

910
import requests
1011
from Bio.SearchIO import HSP
11-
from Bio.SearchIO import read as read_blat
12+
from Bio.SearchIO import parse as parse_blat
1213
from Bio.SearchIO._model import Hit, QueryResult
1314
from cool_seq_tool.schemas import Strand
1415

@@ -25,6 +26,7 @@
2526
GeneLocation,
2627
ScoresetMetadata,
2728
SequenceRange,
29+
TargetGene,
2830
TargetSequenceType,
2931
)
3032

@@ -61,7 +63,10 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata, query_file: Path) ->
6163
:return: Yielded Path to constructed file. Deletes file once complete.
6264
"""
6365
_logger.debug("Writing BLAT query to %s", query_file)
64-
lines = [">query", scoreset_metadata.target_sequence]
66+
lines = []
67+
for target_gene in scoreset_metadata.target_genes:
68+
lines.append(f">{target_gene}")
69+
lines.append(scoreset_metadata.target_genes[target_gene].target_sequence)
6570
_write_query_file(query_file, lines)
6671
return query_file
6772

@@ -143,50 +148,77 @@ def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str:
143148
return tmp.name
144149

145150

146-
def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
151+
def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType | str:
152+
"""Get overall target sequence type for a score set's target genes.
153+
Protein if all target sequences are protein sequences, nucleotide if all target
154+
sequences are nucleotide sequences, and mixed if there is a mix within the score set.
155+
:param metadata: object containing score set attributes
156+
:return: TargetSequenceType enum (protein or nucleotide) or string "mixed"
157+
"""
158+
target_sequence_types = set()
159+
for target_gene in metadata.target_genes:
160+
target_sequence_types.add(
161+
metadata.target_genes[target_gene].target_sequence_type
162+
)
163+
if len(target_sequence_types) > 1:
164+
return "mixed"
165+
elif len(target_sequence_types) == 1: # noqa: RET505
166+
return target_sequence_types.pop()
167+
else:
168+
msg = f"Target sequence types not available for score set {metadata.urn}"
169+
raise ValueError(msg)
170+
171+
172+
def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: ANN401
147173
"""Run a BLAT query and returns a path to the output object.
148174
149175
If unable to produce a valid query the first time, then try a query using ``dnax``
150176
bases.
151177
152178
:param scoreset_metadata: object containing scoreset attributes
153179
:param silent: suppress BLAT command output
154-
:return: BLAT query result
180+
:return: dict where keys are target gene identifiers and values are BLAT query result objects
155181
:raise AlignmentError: if BLAT subprocess returns error code
156182
"""
157183
with tempfile.NamedTemporaryFile() as tmp_file:
158184
query_file = _build_query_file(metadata, Path(tmp_file.name))
159-
if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
185+
target_sequence_type = _get_target_sequence_type(metadata)
186+
if target_sequence_type == TargetSequenceType.PROTEIN:
160187
target_args = "-q=prot -t=dnax"
161-
else:
188+
elif target_sequence_type == TargetSequenceType.DNA:
162189
target_args = ""
190+
else:
191+
# TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
192+
msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
193+
raise NotImplementedError(msg)
163194
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
164195
out_file = _write_blat_output_tempfile(process_result)
165196

166197
try:
167-
output = read_blat(out_file, "blat-psl")
198+
output = parse_blat(out_file, "blat-psl")
199+
200+
# TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied?
168201
except ValueError:
169202
target_args = "-q=dnax -t=dnax"
170203
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
171204
out_file = _write_blat_output_tempfile(process_result)
172205
try:
173-
output = read_blat(out_file, "blat-psl")
206+
output = parse_blat(out_file, "blat-psl")
174207
except ValueError as e:
175208
msg = f"Unable to run successful BLAT on {metadata.urn}"
176209
raise AlignmentError(msg) from e
177210

178211
return output
179212

180213

181-
def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
214+
def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit:
182215
"""Get best hit from BLAT output.
183216
184217
First, try to return hit corresponding to expected chromosome taken from scoreset
185218
metadata. If chromosome doesn't match any of the outputs or is unavailable, take
186219
the hit with the single highest-scoring HSP.
187220
188221
:param output: BLAT output
189-
:param urn: scoreset URN to use in error messages
190222
:param chromosome: refseq chromosome ID, e.g. ``"NC_000001.11"``
191223
:return: best Hit
192224
:raise AlignmentError: if unable to get hits from output
@@ -207,8 +239,8 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
207239
hit_chrs = [h.id for h in output]
208240
# TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome
209241
_logger.warning(
210-
"Failed to match hit chromosomes during alignment. URN: %s, expected chromosome: %s, hit chromosomes: %s",
211-
urn,
242+
"Failed to match hit chromosomes during alignment for target %s. Expected chromosome: %s, hit chromosomes: %s",
243+
output.id,
212244
chromosome,
213245
hit_chrs,
214246
)
@@ -222,21 +254,20 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
222254
best_score_hit = hit
223255

224256
if best_score_hit is None:
225-
msg = f"Couldn't get BLAT hits from {urn}"
257+
msg = f"Couldn't get BLAT hits for target {output.id}."
226258
raise AlignmentError(msg)
227259

228260
return best_score_hit
229261

230262

231-
def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP:
263+
def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None) -> HSP:
232264
"""Retrieve preferred HSP from BLAT Hit object.
233265
234266
If gene location data is available, prefer the HSP with the least distance
235267
between the start of the hit and the start coordinate of the gene. Otherwise,
236268
take the HSP with the highest score value.
237269
238270
:param hit: hit object from BLAT result
239-
:param urn: scoreset identifier for use in error messages
240271
:param gene_location: location data acquired by normalizing scoreset metadata
241272
:return: Preferred HSP object
242273
:raise AlignmentError: if hit object appears to be empty (should be impossible)
@@ -252,17 +283,17 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP
252283
return best_hsp
253284

254285

255-
def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> AlignmentResult:
286+
def _get_best_match(output: QueryResult, target_gene: TargetGene) -> AlignmentResult:
256287
"""Obtain best high-scoring pairs (HSP) object for query sequence.
257288
258289
:param metadata: scoreset metadata
259290
:param output: BLAT result object
260291
:return: alignment result ??
261292
"""
262-
location = get_gene_location(metadata)
293+
location = get_gene_location(target_gene)
263294
chromosome = location.chromosome if location else None
264-
best_hit = _get_best_hit(output, metadata.urn, chromosome)
265-
best_hsp = _get_best_hsp(best_hit, metadata.urn, location)
295+
best_hit = _get_best_hit(output, chromosome)
296+
best_hsp = _get_best_hsp(best_hit, location)
266297

267298
strand = Strand.POSITIVE if best_hsp[0].query_strand == 1 else Strand.NEGATIVE
268299
coverage = 100 * (best_hsp.query_end - best_hsp.query_start) / output.seq_len
@@ -291,12 +322,19 @@ def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> Alignmen
291322
)
292323

293324

294-
def align(scoreset_metadata: ScoresetMetadata, silent: bool = True) -> AlignmentResult:
325+
def align(
326+
scoreset_metadata: ScoresetMetadata, silent: bool = True
327+
) -> dict[str, AlignmentResult]:
295328
"""Align target sequence to a reference genome.
296329
297330
:param scoreset_metadata: object containing scoreset metadata
298331
:param silent: suppress BLAT process output if true
299-
:return: data wrapper containing alignment results
332+
:return: dictionary where keys are target gene identifiers and values are alignment result objects
300333
"""
301334
blat_output = _get_blat_output(scoreset_metadata, silent)
302-
return _get_best_match(blat_output, scoreset_metadata)
335+
alignment_results = {}
336+
for blat_result in blat_output:
337+
target_label = blat_result.id
338+
target_gene = scoreset_metadata.target_genes[target_label]
339+
alignment_results[target_label] = _get_best_match(blat_result, target_gene)
340+
return alignment_results

src/dcd_mapping/lookup.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,11 @@
4646
from gene.query import QueryHandler
4747
from gene.schemas import MatchType, SourceName
4848

49-
from dcd_mapping.schemas import GeneLocation, ManeDescription, ScoresetMetadata
49+
from dcd_mapping.schemas import (
50+
GeneLocation,
51+
ManeDescription,
52+
TargetGene,
53+
)
5054

5155
__all__ = [
5256
"CoolSeqToolBuilder",
@@ -287,25 +291,25 @@ def _get_hgnc_symbol(term: str) -> str | None:
287291
return None
288292

289293

290-
def get_gene_symbol(metadata: ScoresetMetadata) -> str | None:
291-
"""Acquire HGNC gene symbol given provided metadata from scoreset.
294+
def get_gene_symbol(target_gene: TargetGene) -> str | None:
295+
"""Acquire HGNC gene symbol given provided target gene metadata from MaveDB.
292296
293297
Right now, we use two sources for normalizing:
294298
1. UniProt ID, if available
295299
2. Target name: specifically, we try the first word in the name (this could
296300
cause some problems and we should double-check it)
297301
298-
:param ScoresetMetadata: data given by MaveDB API
302+
:param target_gene: target gene metadata given by MaveDB API
299303
:return: gene symbol if available
300304
"""
301-
if metadata.target_uniprot_ref:
302-
result = _get_hgnc_symbol(metadata.target_uniprot_ref.id)
305+
if target_gene.target_uniprot_ref:
306+
result = _get_hgnc_symbol(target_gene.target_uniprot_ref.id)
303307
if result:
304308
return result
305309

306310
# try taking the first word in the target name
307-
if metadata.target_gene_name:
308-
parsed_name = metadata.target_gene_name.split(" ")[0]
311+
if target_gene.target_gene_name:
312+
parsed_name = target_gene.target_gene_name.split(" ")[0]
309313
return _get_hgnc_symbol(parsed_name)
310314
return None
311315

@@ -324,21 +328,21 @@ def _normalize_gene(term: str) -> Gene | None:
324328

325329

326330
def _get_normalized_gene_response(
327-
metadata: ScoresetMetadata,
331+
target_gene: TargetGene,
328332
) -> Gene | None:
329333
"""Fetch best normalized concept given available scoreset metadata.
330334
331335
:param metadata: salient scoreset metadata items
332336
:return: Normalized gene if available
333337
"""
334-
if metadata.target_uniprot_ref:
335-
gene_descriptor = _normalize_gene(metadata.target_uniprot_ref.id)
338+
if target_gene.target_uniprot_ref:
339+
gene_descriptor = _normalize_gene(target_gene.target_uniprot_ref.id)
336340
if gene_descriptor:
337341
return gene_descriptor
338342

339343
# try taking the first word in the target name
340-
if metadata.target_gene_name:
341-
parsed_name = metadata.target_gene_name.split(" ")[0]
344+
if target_gene.target_gene_name:
345+
parsed_name = target_gene.target_gene_name.split(" ")[0]
342346
gene_descriptor = _normalize_gene(parsed_name)
343347
if gene_descriptor:
344348
return gene_descriptor
@@ -371,7 +375,7 @@ def _get_genomic_interval(
371375
return None
372376

373377

374-
def get_gene_location(metadata: ScoresetMetadata) -> GeneLocation | None:
378+
def get_gene_location(target_gene: TargetGene) -> GeneLocation | None:
375379
"""Acquire gene location data from gene normalizer using metadata provided by
376380
scoreset.
377381
@@ -380,10 +384,10 @@ def get_gene_location(metadata: ScoresetMetadata) -> GeneLocation | None:
380384
2. Target name: specifically, we try the first word in the name (this could
381385
cause some problems and we should double-check it)
382386
383-
:param metadata: data given by MaveDB API
387+
:param target_gene: data given by MaveDB API
384388
:return: gene location data if available
385389
"""
386-
gene_descriptor = _get_normalized_gene_response(metadata)
390+
gene_descriptor = _get_normalized_gene_response(target_gene)
387391
if not gene_descriptor or not gene_descriptor.extensions:
388392
return None
389393

src/dcd_mapping/main.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
ScoresetMetadata,
3434
VrsVersion,
3535
)
36-
from dcd_mapping.transcripts import TxSelectError, select_transcript
36+
from dcd_mapping.transcripts import TxSelectError, select_transcripts
3737
from dcd_mapping.vrs_map import VrsMapError, vrs_map
3838

3939
_logger = logging.getLogger(__name__)
@@ -156,7 +156,7 @@ async def map_scoreset(
156156

157157
_emit_info(f"Performing alignment for {metadata.urn}...", silent)
158158
try:
159-
alignment_result = align(metadata, silent)
159+
alignment_results = align(metadata, silent)
160160
except BlatNotFoundError as e:
161161
msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
162162
_emit_info(msg, silent, logging.ERROR)
@@ -179,7 +179,7 @@ async def map_scoreset(
179179

180180
_emit_info("Selecting reference sequence...", silent)
181181
try:
182-
transcript = await select_transcript(metadata, records, alignment_result)
182+
transcripts = await select_transcripts(metadata, records, alignment_results)
183183
except (TxSelectError, KeyError, ValueError) as e:
184184
_emit_info(
185185
f"Transcript selection failed for scoreset {metadata.urn}",
@@ -211,7 +211,7 @@ async def map_scoreset(
211211

212212
_emit_info("Mapping to VRS...", silent)
213213
try:
214-
vrs_results = vrs_map(metadata, alignment_result, records, transcript, silent)
214+
vrs_results = vrs_map(metadata, alignment_results, records, transcripts, silent)
215215
except VrsMapError as e:
216216
_emit_info(
217217
f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR
@@ -239,7 +239,7 @@ async def map_scoreset(
239239

240240
_emit_info("Annotating metadata and saving to file...", silent)
241241
try:
242-
vrs_results = annotate(vrs_results, transcript, metadata, vrs_version)
242+
vrs_results = annotate(vrs_results, transcripts, metadata, vrs_version)
243243
except Exception as e: # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors
244244
_emit_info(
245245
f"VRS annotation failed for scoreset {metadata.urn}", silent, logging.ERROR
@@ -267,8 +267,8 @@ async def map_scoreset(
267267
final_output = save_mapped_output_json(
268268
metadata,
269269
vrs_results,
270-
alignment_result,
271-
transcript,
270+
alignment_results,
271+
transcripts,
272272
prefer_genomic,
273273
output_path,
274274
)

0 commit comments

Comments
 (0)