Skip to content

Commit 4ed6cb9

Browse files
committed
Map score sets with multiple targets
1 parent 0b68f5a commit 4ed6cb9

File tree

6 files changed

+229
-147
lines changed

6 files changed

+229
-147
lines changed

src/dcd_mapping/annotate.py

Lines changed: 102 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,12 @@
3939
ScoreAnnotationWithLayer,
4040
ScoresetMapping,
4141
ScoresetMetadata,
42+
TargetGene,
4243
TargetSequenceType,
4344
TxSelectResult,
4445
VrsVersion,
4546
)
47+
from dcd_mapping.transcripts import TxSelectError
4648

4749
_logger = logging.getLogger(__name__)
4850

@@ -133,14 +135,15 @@ def _offset_allele_ref_seq(ss: str, start: int, end: int) -> tuple[int, int]:
133135

134136

135137
def _get_vrs_ref_allele_seq(
136-
allele: Allele, metadata: ScoresetMetadata, tx_select_results: TxSelectResult | None
138+
allele: Allele,
139+
metadata: TargetGene,
140+
urn: str,
141+
tx_select_results: TxSelectResult | None,
137142
) -> Extension:
138143
"""Create `vrs_ref_allele_seq` property."""
139-
start, end = _offset_allele_ref_seq(
140-
metadata.urn, allele.location.start, allele.location.end
141-
)
144+
start, end = _offset_allele_ref_seq(urn, allele.location.start, allele.location.end)
142145
if (
143-
metadata.urn.startswith(
146+
urn.startswith(
144147
(
145148
"urn:mavedb:00000047",
146149
"urn:mavedb:00000048",
@@ -149,6 +152,7 @@ def _get_vrs_ref_allele_seq(
149152
)
150153
)
151154
and tx_select_results is not None
155+
and isinstance(tx_select_results, TxSelectResult)
152156
):
153157
seq = tx_select_results.sequence
154158
ref = seq[start:end]
@@ -241,16 +245,19 @@ def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]:
241245

242246
def _annotate_allele_mapping(
243247
mapped_score: MappedScore,
244-
tx_results: TxSelectResult | None,
245-
metadata: ScoresetMetadata,
248+
tx_results: TxSelectResult | TxSelectError | None,
249+
metadata: TargetGene,
250+
urn: str,
246251
vrs_version: VrsVersion = VrsVersion.V_2,
247252
) -> ScoreAnnotationWithLayer:
248253
"""Perform annotations and, if necessary, create VRS 1.3 equivalents for allele mappings."""
249254
pre_mapped: Allele = mapped_score.pre_mapped
250255
post_mapped: Allele = mapped_score.post_mapped
251256

252257
# get vrs_ref_allele_seq for pre-mapped variants
253-
pre_mapped.extensions = [_get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results)]
258+
pre_mapped.extensions = [
259+
_get_vrs_ref_allele_seq(pre_mapped, metadata, urn, tx_results)
260+
]
254261

255262
if post_mapped:
256263
# Determine reference sequence
@@ -262,7 +269,7 @@ def _annotate_allele_mapping(
262269
if accession.startswith("refseq:"):
263270
accession = accession[7:]
264271
else:
265-
if tx_results is None:
272+
if tx_results is None or isinstance(tx_results, TxSelectError):
266273
raise ValueError # impossible by definition
267274
accession = tx_results.np
268275

@@ -295,16 +302,17 @@ def _annotate_allele_mapping(
295302

296303
def _annotate_haplotype_mapping(
297304
mapped_score: MappedScore,
298-
tx_results: TxSelectResult | None,
299-
metadata: ScoresetMetadata,
305+
tx_results: TxSelectResult | TxSelectError | None,
306+
metadata: TargetGene,
307+
urn: str,
300308
vrs_version: VrsVersion = VrsVersion.V_2,
301309
) -> ScoreAnnotationWithLayer:
302310
"""Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings."""
303311
pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore
304312
post_mapped: Haplotype = mapped_score.post_mapped # type: ignore
305313
# get vrs_ref_allele_seq for pre-mapped variants
306314
for allele in pre_mapped.members:
307-
allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, tx_results)]
315+
allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, urn, tx_results)]
308316

309317
if post_mapped:
310318
# Determine reference sequence
@@ -316,7 +324,7 @@ def _annotate_haplotype_mapping(
316324
if accession.startswith("refseq:"):
317325
accession = accession[7:]
318326
else:
319-
if tx_results is None:
327+
if tx_results is None or isinstance(tx_results, TxSelectError):
320328
raise ValueError # impossible by definition
321329
accession = tx_results.np
322330

@@ -350,8 +358,9 @@ def _annotate_haplotype_mapping(
350358

351359
def annotate(
352360
mapped_scores: list[MappedScore],
353-
tx_results: TxSelectResult | None,
354-
metadata: ScoresetMetadata,
361+
tx_results: TxSelectResult | TxSelectError | None,
362+
metadata: TargetGene,
363+
urn: str,
355364
vrs_version: VrsVersion = VrsVersion.V_2,
356365
) -> list[ScoreAnnotationWithLayer]:
357366
"""Given a list of mappings, add additional contextual data:
@@ -367,7 +376,8 @@ def annotate(
367376
368377
:param vrs_results: in-progress variant mappings
369378
:param tx_select_results: transcript selection if available
370-
:param metadata: MaveDB scoreset metadata
379+
:param metadata: Target gene metadata from MaveDB API
380+
:param urn: Score set URN
371381
:return: annotated mappings objects
372382
"""
373383
score_annotations = []
@@ -386,7 +396,7 @@ def annotate(
386396
):
387397
score_annotations.append(
388398
_annotate_haplotype_mapping(
389-
mapped_score, tx_results, metadata, vrs_version
399+
mapped_score, tx_results, metadata, urn, vrs_version
390400
)
391401
)
392402
elif isinstance(mapped_score.pre_mapped, Allele) and (
@@ -395,7 +405,7 @@ def annotate(
395405
):
396406
score_annotations.append(
397407
_annotate_allele_mapping(
398-
mapped_score, tx_results, metadata, vrs_version
408+
mapped_score, tx_results, metadata, urn, vrs_version
399409
)
400410
)
401411
else:
@@ -406,20 +416,22 @@ def annotate(
406416

407417

408418
def _get_computed_reference_sequence(
409-
metadata: ScoresetMetadata,
419+
metadata: TargetGene,
410420
layer: AnnotationLayer,
411-
tx_output: TxSelectResult | None = None,
412-
) -> ComputedReferenceSequence:
421+
tx_output: TxSelectResult | TxSelectError | None = None,
422+
) -> ComputedReferenceSequence | None:
413423
"""Report the computed reference sequence for a score set
414424
415-
:param ss: A score set string
425+
:param metadata: Target gene metadata from MaveDB API
416426
:param layer: AnnotationLayer
417427
:param tx_output: Transcript data for a score set
418428
:return A ComputedReferenceSequence object
419429
"""
420430
if layer == AnnotationLayer.PROTEIN:
421-
if tx_output is None:
422-
raise ValueError
431+
if tx_output is None or isinstance(tx_output, TxSelectError):
432+
# TODO catch this error - don't stop whole job for one failed target
433+
# raise ValueError
434+
return None
423435
seq_id = f"ga4gh:SQ.{sha512t24u(tx_output.sequence.encode('ascii'))}"
424436
return ComputedReferenceSequence(
425437
sequence=tx_output.sequence,
@@ -436,24 +448,31 @@ def _get_computed_reference_sequence(
436448

437449
def _get_mapped_reference_sequence(
438450
layer: AnnotationLayer,
439-
tx_output: TxSelectResult | None = None,
451+
tx_output: TxSelectResult | TxSelectError | None = None,
440452
align_result: AlignmentResult | None = None,
441-
) -> MappedReferenceSequence:
453+
) -> MappedReferenceSequence | None:
442454
"""Report the mapped reference sequence for a score set
443455
444-
:param ss: A score set string
445456
:param layer: AnnotationLayer
446457
:param tx_output: Transcript data for a score set
447458
:return A MappedReferenceSequence object
448459
"""
449-
if layer == AnnotationLayer.PROTEIN and tx_output is not None:
460+
if (
461+
layer == AnnotationLayer.PROTEIN
462+
and tx_output is not None
463+
and isinstance(tx_output, TxSelectResult)
464+
):
450465
if tx_output.np is None:
451-
msg = "No NP accession associated with reference transcript"
452-
raise ValueError(msg)
466+
# TODO catch this error, don't fail whole job for one target
467+
# msg = "No NP accession associated with reference transcript"
468+
# raise ValueError(msg)
469+
return None
453470
vrs_id = get_vrs_id_from_identifier(tx_output.np)
454471
if vrs_id is None:
455-
msg = "ID could not be acquired from Seqrepo for transcript identifier"
456-
raise ValueError(msg)
472+
# TODO catch this error, don't fail whole job for one target
473+
# msg = "ID could not be acquired from Seqrepo for transcript identifier"
474+
# raise ValueError(msg)
475+
return None
457476
return MappedReferenceSequence(
458477
sequence_type=TargetSequenceType.PROTEIN,
459478
sequence_id=vrs_id,
@@ -462,8 +481,10 @@ def _get_mapped_reference_sequence(
462481
seq_id = get_chromosome_identifier(align_result.chrom)
463482
vrs_id = get_vrs_id_from_identifier(seq_id)
464483
if vrs_id is None:
465-
msg = "ID could not be acquired from Seqrepo for chromosome identifier"
466-
raise ValueError(msg)
484+
# TODO catch this error, don't fail whole job for one target
485+
# msg = "ID could not be acquired from Seqrepo for chromosome identifier"
486+
# raise ValueError(msg)
487+
return None
467488
return MappedReferenceSequence(
468489
sequence_type=TargetSequenceType.DNA,
469490
sequence_id=vrs_id,
@@ -513,64 +534,69 @@ def write_scoreset_mapping_to_json(
513534

514535
def save_mapped_output_json(
515536
metadata: ScoresetMetadata,
516-
mappings: list[ScoreAnnotationWithLayer],
517-
align_result: AlignmentResult,
518-
tx_output: TxSelectResult | None,
537+
mappings: dict[str, ScoreAnnotationWithLayer],
538+
align_results: dict[str, AlignmentResult],
539+
tx_output: dict[str, TxSelectResult | TxSelectError | None],
519540
preferred_layer_only: bool = False,
520541
output_path: Path | None = None,
521542
) -> Path:
522543
"""Save mapping output for a score set in a JSON file
523544
524545
:param urn: Score set accession
525-
:param mave_vrs_mappings: A dictionary of VrsObject1_x objects
526-
:param align_result: Alignment information for a score set
527-
:param tx_output: Transcript output for a score set
546+
:param mappings: A dictionary of annotated VRS mappings for each target
547+
:param align_result: A dictionary of alignment information for each target
548+
:param tx_output: A dictionary of transcript output for each target
528549
:param output_path: specific location to save output to. Default to
529550
<dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
530551
:return: output location
531552
"""
532-
if preferred_layer_only:
533-
preferred_layers = {
534-
_set_scoreset_layer(metadata.urn, mappings),
553+
# set preferred layers for each target, to allow a mix of coding and noncoding targets
554+
# TODO maybe this should be reevaluated and we should only allow one preferred layer per score set,
555+
# since I can't imagine an experimental assay where some variants are assayed as nucleotide variants
556+
# and others are assayed as amino acid variants.
557+
reference_sequences: dict[str, dict] = {}
558+
mapped_scores: list[ScoreAnnotation] = []
559+
for target_gene in mappings:
560+
if preferred_layer_only:
561+
preferred_layers = {
562+
_set_scoreset_layer(metadata.urn, mappings[target_gene]),
563+
}
564+
else:
565+
preferred_layers = {
566+
mapping.annotation_layer for mapping in mappings[target_gene]
567+
}
568+
569+
reference_sequences[target_gene] = {
570+
layer: {
571+
"computed_reference_sequence": None,
572+
"mapped_reference_sequence": None,
573+
}
574+
for layer in AnnotationLayer
535575
}
536-
else:
537-
preferred_layers = {mapping.annotation_layer for mapping in mappings}
538576

539-
reference_sequences = {
540-
layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None}
541-
for layer in AnnotationLayer
542-
}
543-
544-
for layer in preferred_layers:
545-
reference_sequences[layer][
546-
"computed_reference_sequence"
547-
] = _get_computed_reference_sequence(metadata, layer, tx_output)
548-
reference_sequences[layer][
549-
"mapped_reference_sequence"
550-
] = _get_mapped_reference_sequence(layer, tx_output, align_result)
577+
for layer in preferred_layers:
578+
reference_sequences[target_gene][layer][
579+
"computed_reference_sequence"
580+
] = _get_computed_reference_sequence(
581+
metadata.target_genes[target_gene], layer, tx_output[target_gene]
582+
)
583+
reference_sequences[target_gene][layer][
584+
"mapped_reference_sequence"
585+
] = _get_mapped_reference_sequence(
586+
layer, tx_output[target_gene], align_results[target_gene]
587+
)
551588

552-
mapped_scores: list[ScoreAnnotation] = []
553-
for m in mappings:
554-
if m.pre_mapped is None:
555-
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
556-
elif m.annotation_layer in preferred_layers:
557-
# drop annotation layer from mapping object
558-
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
589+
for m in mappings[target_gene]:
590+
if m.pre_mapped is None:
591+
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
592+
elif m.annotation_layer in preferred_layers:
593+
# drop annotation layer from mapping object
594+
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
559595

596+
# TODO drop any Nonetype reference sequences
560597
output = ScoresetMapping(
561598
metadata=metadata.model_dump(),
562-
computed_protein_reference_sequence=reference_sequences[
563-
AnnotationLayer.PROTEIN
564-
]["computed_reference_sequence"],
565-
mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][
566-
"mapped_reference_sequence"
567-
],
568-
computed_genomic_reference_sequence=reference_sequences[
569-
AnnotationLayer.GENOMIC
570-
]["computed_reference_sequence"],
571-
mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][
572-
"mapped_reference_sequence"
573-
],
599+
reference_sequences=reference_sequences,
574600
mapped_scores=mapped_scores,
575601
)
576602

0 commit comments

Comments
 (0)