Skip to content

Commit c078256

Browse files
sallybg and bencap committed
Output all mappings unless preferred layer is specified
Co-authored-by: Ben Capodanno <[email protected]>
1 parent 04acef7 commit c078256

File tree

5 files changed

+57
-31
lines changed

5 files changed

+57
-31
lines changed

src/dcd_mapping/annotate.py

Lines changed: 35 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
"""Annotate MaveDB score set metadata with mapped scores."""
2+
23
import datetime
34
import json
45
import logging
@@ -464,6 +465,7 @@ def save_mapped_output_json(
464465
align_result: AlignmentResult,
465466
tx_output: TxSelectResult | None,
466467
include_vrs_2: bool = False,
468+
preferred_layer_only: bool = False,
467469
output_path: Path | None = None,
468470
) -> Path:
469471
"""Save mapping output for a score set in a JSON file
@@ -477,24 +479,47 @@ def save_mapped_output_json(
477479
<dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
478480
:return: output location
479481
"""
480-
preferred_layer = _set_scoreset_layer(urn, mappings)
481482
metadata = get_raw_scoreset_metadata(urn)
482-
computed_reference_sequence = _get_computed_reference_sequence(
483-
urn, preferred_layer, tx_output
484-
)
485-
mapped_reference_sequence = _get_mapped_reference_sequence(
486-
preferred_layer, tx_output, align_result
487-
)
483+
if preferred_layer_only:
484+
preferred_layers = {
485+
_set_scoreset_layer(urn, mappings),
486+
}
487+
else:
488+
preferred_layers = {mapping.annotation_layer for mapping in mappings}
489+
490+
reference_sequences = {
491+
layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None}
492+
for layer in AnnotationLayer
493+
}
494+
495+
for layer in preferred_layers:
496+
reference_sequences[layer][
497+
"computed_reference_sequence"
498+
] = _get_computed_reference_sequence(urn, layer, tx_output)
499+
reference_sequences[layer][
500+
"mapped_reference_sequence"
501+
] = _get_mapped_reference_sequence(layer, tx_output, align_result)
502+
488503
mapped_scores: list[ScoreAnnotation] = []
489504
for m in mappings:
490-
if m.annotation_layer == preferred_layer:
505+
if m.annotation_layer in preferred_layers:
491506
# drop annotation layer from mapping object
492507
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
493508

494509
output = ScoresetMapping(
495510
metadata=metadata,
496-
computed_reference_sequence=computed_reference_sequence,
497-
mapped_reference_sequence=mapped_reference_sequence,
511+
computed_protein_reference_sequence=reference_sequences[
512+
AnnotationLayer.PROTEIN
513+
]["computed_reference_sequence"],
514+
mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][
515+
"mapped_reference_sequence"
516+
],
517+
computed_genomic_reference_sequence=reference_sequences[
518+
AnnotationLayer.GENOMIC
519+
]["computed_reference_sequence"],
520+
mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][
521+
"mapped_reference_sequence"
522+
],
498523
mapped_scores=mapped_scores,
499524
)
500525

src/dcd_mapping/cli.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Provide command-line interface for accessing mapping functions."""
2+
23
import asyncio
34
import logging
45
from pathlib import Path
@@ -38,11 +39,18 @@
3839
default=False,
3940
help="Include VRS 2.0 mappings",
4041
)
42+
@click.option(
43+
"--prefer_genomic",
44+
is_flag=True,
45+
default=False,
46+
help="If mapped variants are available relative to a genomic sequence, only output the genomic mappings",
47+
)
4148
def cli(
4249
urn: str,
4350
debug: bool,
4451
output: Path | None,
4552
include_vrs_2: bool,
53+
prefer_genomic: bool,
4654
) -> None:
4755
"""Get VRS mapping on preferred transcript for URN.
4856
@@ -63,7 +71,9 @@ def cli(
6371
)
6472
_logger.debug("debug logging enabled")
6573
try:
66-
asyncio.run(map_scoreset_urn(urn, output, include_vrs_2, silent=False))
74+
asyncio.run(
75+
map_scoreset_urn(urn, output, include_vrs_2, prefer_genomic, silent=False)
76+
)
6777
except (
6878
LookupError,
6979
AlignmentError,

src/dcd_mapping/main.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Provide core MaveDB mapping methods."""
2+
23
import logging
34
import os
45
import subprocess
@@ -123,6 +124,7 @@ async def map_scoreset(
123124
records: list[ScoreRow],
124125
output_path: Path | None = None,
125126
include_vrs_2: bool = False,
127+
prefer_genomic: bool = False,
126128
silent: bool = True,
127129
) -> None:
128130
"""Given information about a MAVE experiment, map to VRS and save output as JSON.
@@ -182,6 +184,7 @@ async def map_scoreset(
182184
alignment_result,
183185
transcript,
184186
include_vrs_2,
187+
prefer_genomic,
185188
output_path,
186189
)
187190
_emit_info(f"Annotated scores saved to: {final_output}.", silent)
@@ -191,6 +194,7 @@ async def map_scoreset_urn(
191194
urn: str,
192195
output_path: Path | None = None,
193196
include_vrs_2: bool = False,
197+
prefer_genomic: bool = False,
194198
silent: bool = True,
195199
) -> None:
196200
"""Perform end-to-end mapping for a scoreset.
@@ -208,4 +212,6 @@ async def map_scoreset_urn(
208212
_logger.critical(msg)
209213
click.echo(f"Error: {msg}")
210214
raise e
211-
await map_scoreset(metadata, records, output_path, include_vrs_2, silent)
215+
await map_scoreset(
216+
metadata, records, output_path, include_vrs_2, prefer_genomic, silent
217+
)

src/dcd_mapping/schemas.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,8 @@ class ScoresetMapping(BaseModel):
180180
"""Provide all mapped scores for a scoreset."""
181181

182182
metadata: Any # TODO get exact MaveDB metadata structure?
183-
computed_reference_sequence: ComputedReferenceSequence
184-
mapped_reference_sequence: MappedReferenceSequence
183+
computed_protein_reference_sequence: ComputedReferenceSequence | None
184+
mapped_protein_reference_sequence: MappedReferenceSequence | None
185+
computed_genomic_reference_sequence: ComputedReferenceSequence | None
186+
mapped_genomic_reference_sequence: MappedReferenceSequence | None
185187
mapped_scores: list[ScoreAnnotation]

src/dcd_mapping/vrs_map.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Map transcripts to VRS objects."""
22

33
import logging
4-
import re
54
from collections.abc import Iterable
65
from itertools import cycle
76

@@ -18,7 +17,6 @@
1817
SequenceString,
1918
)
2019
from ga4gh.vrs.normalize import normalize
21-
from mavehgvs import patterns
2220
from mavehgvs.util import parse_variant_strings
2321
from mavehgvs.variant import Variant
2422

@@ -105,11 +103,6 @@ def _create_pre_mapped_hgvs_strings(
105103
msg = f"Variant could not be parsed by mavehgvs: {error}"
106104
raise ValueError(msg)
107105

108-
# TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
109-
if re.search(patterns.protein.pro_fs, str(variant)):
110-
msg = f"Pre-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}"
111-
raise NotImplementedError(msg)
112-
113106
# Ideally we would create an HGVS string namespaced to GA4GH. The line below
114107
# creates such a string, but it is not able to be parsed by the GA4GH VRS translator.
115108
# hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant))
@@ -164,11 +157,6 @@ def _create_post_mapped_hgvs_strings(
164157
msg = f"Variant could not be parsed by mavehgvs: {error}"
165158
raise ValueError(msg)
166159

167-
# TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
168-
if re.search(patterns.protein.pro_fs, str(variant)):
169-
msg = f"Post-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}"
170-
raise NotImplementedError(msg)
171-
172160
if layer is AnnotationLayer.PROTEIN:
173161
assert tx # noqa: S101. mypy help
174162

@@ -586,12 +574,7 @@ def _construct_vrs_allele(
586574
) -> Allele | Haplotype:
587575
alleles: list[Allele] = []
588576
for hgvs_string in hgvs_strings:
589-
# Generate VRS Allele structure. Set VA digests and SL digests to None
590577
allele = translate_hgvs_to_vrs(hgvs_string)
591-
# allele.id = None
592-
# allele.digest = None
593-
# allele.location.id = None
594-
# allele.location.digest = None
595578

596579
if pre_map:
597580
if sequence_id is None:

0 commit comments

Comments
 (0)