Skip to content

Commit 340b0f1

Browse files
authored
Merge pull request #51 from VariantEffect/fix_key_errors_after_mapping
Handle blank BLAT results, and fix BLAT results for some targets
2 parents 2bd58d2 + d60e81a commit 340b0f1

File tree

5 files changed

+34
-1
lines changed

5 files changed

+34
-1
lines changed

src/api/routers/map.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
get_raw_scoreset_metadata,
2020
get_scoreset_metadata,
2121
get_scoreset_records,
22+
patch_target_sequence_type,
2223
with_mavedb_score_set,
2324
)
2425
from dcd_mapping.resource_utils import ResourceAcquisitionError
@@ -48,6 +49,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
4849
try:
4950
metadata = get_scoreset_metadata(urn, store_path)
5051
records = get_scoreset_records(metadata, True, store_path)
52+
metadata = patch_target_sequence_type(metadata, records)
5153
except ScoresetNotSupportedError as e:
5254
return JSONResponse(
5355
content=ScoresetMapping(

src/dcd_mapping/align.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,11 @@ def align(
363363
msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}"
364364
target_gene = scoreset_metadata.target_genes[target_label]
365365
alignment_results[target_label] = _get_best_match(blat_result, target_gene)
366+
# confirm that there is an alignment result for each target gene
367+
for target_gene in scoreset_metadata.target_genes:
368+
if target_gene not in alignment_results:
369+
msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}"
370+
raise AlignmentError(msg)
366371
return alignment_results
367372

368373

src/dcd_mapping/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
ScoresetNotSupportedError,
2525
get_scoreset_metadata,
2626
get_scoreset_records,
27+
patch_target_sequence_type,
2728
with_mavedb_score_set,
2829
)
2930
from dcd_mapping.resource_utils import ResourceAcquisitionError
@@ -332,6 +333,7 @@ async def map_scoreset_urn(
332333
try:
333334
metadata = get_scoreset_metadata(urn, store_path)
334335
records = get_scoreset_records(metadata, silent, store_path)
336+
metadata = patch_target_sequence_type(metadata, records)
335337
except ScoresetNotSupportedError as e:
336338
_emit_info(f"Score set not supported: {e}", silent, logging.ERROR)
337339
final_output = write_scoreset_mapping_to_json(

src/dcd_mapping/mavedb_data.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,10 @@
3030
ScoresetMapping,
3131
ScoresetMetadata,
3232
TargetGene,
33+
TargetSequenceType,
3334
UniProtRef,
3435
)
36+
from dcd_mapping.transcripts import _get_protein_sequence
3537

3638
__all__ = [
3739
"get_scoreset_urns",
@@ -324,6 +326,28 @@ def get_scoreset_records(
324326
return _load_scoreset_records(scores_csv, metadata)
325327

326328

329+
def patch_target_sequence_type(
    metadata: ScoresetMetadata, records: dict
) -> ScoresetMetadata:
    """Switch DNA targets to protein when every variant record has a protein HGVS string.

    This avoids BLAT errors in cases where the target sequence was codon-optimized
    for a non-human organism.

    A target is patched only when its sequence type is DNA, it has at least one
    record, and none of its records is missing ``hgvs_pro`` (empty/``None`` or the
    literal string ``"NA"``). Targets with no records are left untouched: there is
    no evidence that the score set is protein-level, so rewriting the sequence
    would be a guess.

    :param metadata: score set metadata; qualifying entries of ``target_genes`` are
        modified in place
    :param records: mapping from target label to an iterable of that target's
        variant records (each exposing ``hgvs_pro``)
    :return: the same ``metadata`` object that was passed in
    """
    for target_label, target in metadata.target_genes.items():
        if target.target_sequence_type != TargetSequenceType.DNA:
            continue

        # Collect the protein HGVS strings up front so that "no records" can be
        # told apart from "every record is protein-level".
        protein_hgvs = [record.hgvs_pro for record in records.get(target_label, [])]
        if not protein_hgvs:
            # An "all records are protein-level" check is vacuously true for an
            # empty collection; do not convert a target on that basis.
            continue

        if any(not hgvs_pro or hgvs_pro == "NA" for hgvs_pro in protein_hgvs):
            continue

        msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because all variants are protein-level"
        _logger.info(msg)
        # The sequence must be replaced before the type is flipped so both
        # fields stay consistent if the helper raises.
        target.target_sequence = _get_protein_sequence(target.target_sequence)
        target.target_sequence_type = TargetSequenceType.PROTEIN
    return metadata
349+
350+
327351
def with_mavedb_score_set(fn: Callable) -> Callable:
328352
@wraps(fn)
329353
async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002

src/dcd_mapping/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Provide dcd mapping version"""
22

3-
dcd_mapping_version = "2025.1.0"
3+
dcd_mapping_version = "2025.2.0"

0 commit comments

Comments
 (0)