3939 ScoreAnnotationWithLayer ,
4040 ScoresetMapping ,
4141 ScoresetMetadata ,
42+ TargetGene ,
4243 TargetSequenceType ,
4344 TxSelectResult ,
4445 VrsVersion ,
4546)
47+ from dcd_mapping .transcripts import TxSelectError
4648
4749_logger = logging .getLogger (__name__ )
4850
@@ -133,14 +135,15 @@ def _offset_allele_ref_seq(ss: str, start: int, end: int) -> tuple[int, int]:
133135
134136
135137def _get_vrs_ref_allele_seq (
136- allele : Allele , metadata : ScoresetMetadata , tx_select_results : TxSelectResult | None
138+ allele : Allele ,
139+ metadata : TargetGene ,
140+ urn : str ,
141+ tx_select_results : TxSelectResult | None ,
137142) -> Extension :
138143 """Create `vrs_ref_allele_seq` property."""
139- start , end = _offset_allele_ref_seq (
140- metadata .urn , allele .location .start , allele .location .end
141- )
144+ start , end = _offset_allele_ref_seq (urn , allele .location .start , allele .location .end )
142145 if (
143- metadata . urn .startswith (
146+ urn .startswith (
144147 (
145148 "urn:mavedb:00000047" ,
146149 "urn:mavedb:00000048" ,
@@ -149,6 +152,7 @@ def _get_vrs_ref_allele_seq(
149152 )
150153 )
151154 and tx_select_results is not None
155+ and isinstance (tx_select_results , TxSelectResult )
152156 ):
153157 seq = tx_select_results .sequence
154158 ref = seq [start :end ]
@@ -241,16 +245,19 @@ def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]:
241245
242246def _annotate_allele_mapping (
243247 mapped_score : MappedScore ,
244- tx_results : TxSelectResult | None ,
245- metadata : ScoresetMetadata ,
248+ tx_results : TxSelectResult | TxSelectError | None ,
249+ metadata : TargetGene ,
250+ urn : str ,
246251 vrs_version : VrsVersion = VrsVersion .V_2 ,
247252) -> ScoreAnnotationWithLayer :
248253 """Perform annotations and, if necessary, create VRS 1.3 equivalents for allele mappings."""
249254 pre_mapped : Allele = mapped_score .pre_mapped
250255 post_mapped : Allele = mapped_score .post_mapped
251256
252257 # get vrs_ref_allele_seq for pre-mapped variants
253- pre_mapped .extensions = [_get_vrs_ref_allele_seq (pre_mapped , metadata , tx_results )]
258+ pre_mapped .extensions = [
259+ _get_vrs_ref_allele_seq (pre_mapped , metadata , urn , tx_results )
260+ ]
254261
255262 if post_mapped :
256263 # Determine reference sequence
@@ -262,7 +269,7 @@ def _annotate_allele_mapping(
262269 if accession .startswith ("refseq:" ):
263270 accession = accession [7 :]
264271 else :
265- if tx_results is None :
272+ if tx_results is None or isinstance ( tx_results , TxSelectError ) :
266273 raise ValueError # impossible by definition
267274 accession = tx_results .np
268275
@@ -295,16 +302,17 @@ def _annotate_allele_mapping(
295302
296303def _annotate_haplotype_mapping (
297304 mapped_score : MappedScore ,
298- tx_results : TxSelectResult | None ,
299- metadata : ScoresetMetadata ,
305+ tx_results : TxSelectResult | TxSelectError | None ,
306+ metadata : TargetGene ,
307+ urn : str ,
300308 vrs_version : VrsVersion = VrsVersion .V_2 ,
301309) -> ScoreAnnotationWithLayer :
302310 """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings."""
303311 pre_mapped : Haplotype = mapped_score .pre_mapped # type: ignore
304312 post_mapped : Haplotype = mapped_score .post_mapped # type: ignore
305313 # get vrs_ref_allele_seq for pre-mapped variants
306314 for allele in pre_mapped .members :
307- allele .extensions = [_get_vrs_ref_allele_seq (allele , metadata , tx_results )]
315+ allele .extensions = [_get_vrs_ref_allele_seq (allele , metadata , urn , tx_results )]
308316
309317 if post_mapped :
310318 # Determine reference sequence
@@ -316,7 +324,7 @@ def _annotate_haplotype_mapping(
316324 if accession .startswith ("refseq:" ):
317325 accession = accession [7 :]
318326 else :
319- if tx_results is None :
327+ if tx_results is None or isinstance ( tx_results , TxSelectError ) :
320328 raise ValueError # impossible by definition
321329 accession = tx_results .np
322330
@@ -350,8 +358,9 @@ def _annotate_haplotype_mapping(
350358
351359def annotate (
352360 mapped_scores : list [MappedScore ],
353- tx_results : TxSelectResult | None ,
354- metadata : ScoresetMetadata ,
361+ tx_results : TxSelectResult | TxSelectError | None ,
362+ metadata : TargetGene ,
363+ urn : str ,
355364 vrs_version : VrsVersion = VrsVersion .V_2 ,
356365) -> list [ScoreAnnotationWithLayer ]:
357366 """Given a list of mappings, add additional contextual data:
@@ -367,7 +376,8 @@ def annotate(
367376
368377 :param vrs_results: in-progress variant mappings
369378 :param tx_select_results: transcript selection if available
370- :param metadata: MaveDB scoreset metadata
379+ :param metadata: Target gene metadata from MaveDB API
380+ :param urn: Score set URN
371381 :return: annotated mappings objects
372382 """
373383 score_annotations = []
@@ -386,7 +396,7 @@ def annotate(
386396 ):
387397 score_annotations .append (
388398 _annotate_haplotype_mapping (
389- mapped_score , tx_results , metadata , vrs_version
399+ mapped_score , tx_results , metadata , urn , vrs_version
390400 )
391401 )
392402 elif isinstance (mapped_score .pre_mapped , Allele ) and (
@@ -395,7 +405,7 @@ def annotate(
395405 ):
396406 score_annotations .append (
397407 _annotate_allele_mapping (
398- mapped_score , tx_results , metadata , vrs_version
408+ mapped_score , tx_results , metadata , urn , vrs_version
399409 )
400410 )
401411 else :
@@ -406,20 +416,22 @@ def annotate(
406416
407417
408418def _get_computed_reference_sequence (
409- metadata : ScoresetMetadata ,
419+ metadata : TargetGene ,
410420 layer : AnnotationLayer ,
411- tx_output : TxSelectResult | None = None ,
412- ) -> ComputedReferenceSequence :
421+ tx_output : TxSelectResult | TxSelectError | None = None ,
422+ ) -> ComputedReferenceSequence | None :
413423 """Report the computed reference sequence for a score set
414424
415- :param ss: A score set string
425+ :param metadata: Target gene metadata from MaveDB API
416426 :param layer: AnnotationLayer
417427 :param tx_output: Transcript data for a score set
418428 :return A ComputedReferenceSequence object
419429 """
420430 if layer == AnnotationLayer .PROTEIN :
421- if tx_output is None :
422- raise ValueError
431+ if tx_output is None or isinstance (tx_output , TxSelectError ):
432+ # TODO catch this error - don't stop whole job for one failed target
433+ # raise ValueError
434+ return None
423435 seq_id = f"ga4gh:SQ.{ sha512t24u (tx_output .sequence .encode ('ascii' ))} "
424436 return ComputedReferenceSequence (
425437 sequence = tx_output .sequence ,
@@ -436,24 +448,31 @@ def _get_computed_reference_sequence(
436448
437449def _get_mapped_reference_sequence (
438450 layer : AnnotationLayer ,
439- tx_output : TxSelectResult | None = None ,
451+ tx_output : TxSelectResult | TxSelectError | None = None ,
440452 align_result : AlignmentResult | None = None ,
441- ) -> MappedReferenceSequence :
453+ ) -> MappedReferenceSequence | None :
442454 """Report the mapped reference sequence for a score set
443455
444- :param ss: A score set string
445456 :param layer: AnnotationLayer
446457 :param tx_output: Transcript data for a score set
447458 :return A MappedReferenceSequence object
448459 """
449- if layer == AnnotationLayer .PROTEIN and tx_output is not None :
460+ if (
461+ layer == AnnotationLayer .PROTEIN
462+ and tx_output is not None
463+ and isinstance (tx_output , TxSelectResult )
464+ ):
450465 if tx_output .np is None :
451- msg = "No NP accession associated with reference transcript"
452- raise ValueError (msg )
466+ # TODO catch this error, don't fail whole job for one target
467+ # msg = "No NP accession associated with reference transcript"
468+ # raise ValueError(msg)
469+ return None
453470 vrs_id = get_vrs_id_from_identifier (tx_output .np )
454471 if vrs_id is None :
455- msg = "ID could not be acquired from Seqrepo for transcript identifier"
456- raise ValueError (msg )
472+ # TODO catch this error, don't fail whole job for one target
473+ # msg = "ID could not be acquired from Seqrepo for transcript identifier"
474+ # raise ValueError(msg)
475+ return None
457476 return MappedReferenceSequence (
458477 sequence_type = TargetSequenceType .PROTEIN ,
459478 sequence_id = vrs_id ,
@@ -462,8 +481,10 @@ def _get_mapped_reference_sequence(
462481 seq_id = get_chromosome_identifier (align_result .chrom )
463482 vrs_id = get_vrs_id_from_identifier (seq_id )
464483 if vrs_id is None :
465- msg = "ID could not be acquired from Seqrepo for chromosome identifier"
466- raise ValueError (msg )
484+ # TODO catch this error, don't fail whole job for one target
485+ # msg = "ID could not be acquired from Seqrepo for chromosome identifier"
486+ # raise ValueError(msg)
487+ return None
467488 return MappedReferenceSequence (
468489 sequence_type = TargetSequenceType .DNA ,
469490 sequence_id = vrs_id ,
@@ -513,64 +534,69 @@ def write_scoreset_mapping_to_json(
513534
514535def save_mapped_output_json (
515536 metadata : ScoresetMetadata ,
516- mappings : list [ ScoreAnnotationWithLayer ],
517- align_result : AlignmentResult ,
518- tx_output : TxSelectResult | None ,
537+ mappings : dict [ str , ScoreAnnotationWithLayer ],
538+ align_results : dict [ str , AlignmentResult ] ,
539+ tx_output : dict [ str , TxSelectResult | TxSelectError | None ] ,
519540 preferred_layer_only : bool = False ,
520541 output_path : Path | None = None ,
521542) -> Path :
522543 """Save mapping output for a score set in a JSON file
523544
524545 :param urn: Score set accession
525- :param mave_vrs_mappings : A dictionary of VrsObject1_x objects
526- :param align_result: Alignment information for a score set
527- :param tx_output: Transcript output for a score set
546+ :param mappings : A dictionary of annotated VRS mappings for each target
547+ :param align_result: A dictionary of alignment information for each target
548+ :param tx_output: A dictionary of transcript output for each target
528549 :param output_path: specific location to save output to. Default to
529550 <dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
530551 :return: output location
531552 """
532- if preferred_layer_only :
533- preferred_layers = {
534- _set_scoreset_layer (metadata .urn , mappings ),
553+ # set preferred layers for each target, to allow a mix of coding and noncoding targets
554+ # TODO maybe this should be reevaluated and we should only allow one preferred layer per score set,
555+ # since I can't imagine an experimental assay where some variants are assayed as nucleotide variants
556+ # and others are assayed as amino acid variants.
557+ reference_sequences : dict [str , dict ] = {}
558+ mapped_scores : list [ScoreAnnotation ] = []
559+ for target_gene in mappings :
560+ if preferred_layer_only :
561+ preferred_layers = {
562+ _set_scoreset_layer (metadata .urn , mappings [target_gene ]),
563+ }
564+ else :
565+ preferred_layers = {
566+ mapping .annotation_layer for mapping in mappings [target_gene ]
567+ }
568+
569+ reference_sequences [target_gene ] = {
570+ layer : {
571+ "computed_reference_sequence" : None ,
572+ "mapped_reference_sequence" : None ,
573+ }
574+ for layer in AnnotationLayer
535575 }
536- else :
537- preferred_layers = {mapping .annotation_layer for mapping in mappings }
538576
539- reference_sequences = {
540- layer : {"computed_reference_sequence" : None , "mapped_reference_sequence" : None }
541- for layer in AnnotationLayer
542- }
543-
544- for layer in preferred_layers :
545- reference_sequences [layer ][
546- "computed_reference_sequence"
547- ] = _get_computed_reference_sequence (metadata , layer , tx_output )
548- reference_sequences [layer ][
549- "mapped_reference_sequence"
550- ] = _get_mapped_reference_sequence (layer , tx_output , align_result )
577+ for layer in preferred_layers :
578+ reference_sequences [target_gene ][layer ][
579+ "computed_reference_sequence"
580+ ] = _get_computed_reference_sequence (
581+ metadata .target_genes [target_gene ], layer , tx_output [target_gene ]
582+ )
583+ reference_sequences [target_gene ][layer ][
584+ "mapped_reference_sequence"
585+ ] = _get_mapped_reference_sequence (
586+ layer , tx_output [target_gene ], align_results [target_gene ]
587+ )
551588
552- mapped_scores : list [ScoreAnnotation ] = []
553- for m in mappings :
554- if m .pre_mapped is None :
555- mapped_scores .append (ScoreAnnotation (** m .model_dump ()))
556- elif m .annotation_layer in preferred_layers :
557- # drop annotation layer from mapping object
558- mapped_scores .append (ScoreAnnotation (** m .model_dump ()))
589+ for m in mappings [target_gene ]:
590+ if m .pre_mapped is None :
591+ mapped_scores .append (ScoreAnnotation (** m .model_dump ()))
592+ elif m .annotation_layer in preferred_layers :
593+ # drop annotation layer from mapping object
594+ mapped_scores .append (ScoreAnnotation (** m .model_dump ()))
559595
596+ # TODO drop any Nonetype reference sequences
560597 output = ScoresetMapping (
561598 metadata = metadata .model_dump (),
562- computed_protein_reference_sequence = reference_sequences [
563- AnnotationLayer .PROTEIN
564- ]["computed_reference_sequence" ],
565- mapped_protein_reference_sequence = reference_sequences [AnnotationLayer .PROTEIN ][
566- "mapped_reference_sequence"
567- ],
568- computed_genomic_reference_sequence = reference_sequences [
569- AnnotationLayer .GENOMIC
570- ]["computed_reference_sequence" ],
571- mapped_genomic_reference_sequence = reference_sequences [AnnotationLayer .GENOMIC ][
572- "mapped_reference_sequence"
573- ],
599+ reference_sequences = reference_sequences ,
574600 mapped_scores = mapped_scores ,
575601 )
576602
0 commit comments