Skip to content

Commit ab81645

Browse files
authored
feat!: Modify annotations and schemas (#76)
1 parent 3fbd386 commit ab81645

File tree

9 files changed

+127
-84
lines changed

9 files changed

+127
-84
lines changed

schema.json

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -370,11 +370,7 @@
370370
"title": "Version"
371371
},
372372
"code": {
373-
"allOf": [
374-
{
375-
"$ref": "#/$defs/Code"
376-
}
377-
],
373+
"$ref": "#/$defs/Code",
378374
"description": "A symbol uniquely identifying the concept, as in a syntax defined by the code system. CURIE format is preferred where possible (e.g. 'SO:0000704' is the CURIE form of the Sequence Ontology code for 'gene')."
379375
}
380376
},
@@ -412,19 +408,11 @@
412408
"description": "A mapping to a concept in a terminology or code system.",
413409
"properties": {
414410
"coding": {
415-
"allOf": [
416-
{
417-
"$ref": "#/$defs/Coding"
418-
}
419-
],
411+
"$ref": "#/$defs/Coding",
420412
"description": "A structured representation of a code for a defined concept in a terminology or code system."
421413
},
422414
"relation": {
423-
"allOf": [
424-
{
425-
"$ref": "#/$defs/Relation"
426-
}
427-
],
415+
"$ref": "#/$defs/Relation",
428416
"description": "A mapping relation between concepts as defined by the Simple Knowledge Organization System (SKOS)."
429417
}
430418
},
@@ -439,11 +427,7 @@
439427
"description": "Representation of a variation by a specified nomenclature or syntax for a\nVariation object. Common examples of expressions for the description of molecular\nvariation include the HGVS and ISCN nomenclatures.",
440428
"properties": {
441429
"syntax": {
442-
"allOf": [
443-
{
444-
"$ref": "#/$defs/Syntax"
445-
}
446-
],
430+
"$ref": "#/$defs/Syntax",
447431
"description": "The syntax used to describe the variation. The value should be one of the supported syntaxes."
448432
},
449433
"value": {
@@ -751,11 +735,7 @@
751735
"title": "Mappings"
752736
},
753737
"sequence": {
754-
"allOf": [
755-
{
756-
"$ref": "#/$defs/SequenceString"
757-
}
758-
],
738+
"$ref": "#/$defs/SequenceString",
759739
"description": "the literal sequence"
760740
}
761741
},
@@ -972,21 +952,24 @@
972952
"pre_mapped": {
973953
"anyOf": [
974954
{
975-
"$ref": "#/$defs/Allele"
955+
"$ref": "#/$defs/CisPhasedBlock"
976956
},
977957
{
978-
"$ref": "#/$defs/CisPhasedBlock"
958+
"$ref": "#/$defs/Allele"
979959
}
980960
],
981961
"title": "Pre Mapped"
982962
},
983963
"post_mapped": {
984964
"anyOf": [
965+
{
966+
"$ref": "#/$defs/CisPhasedBlock"
967+
},
985968
{
986969
"$ref": "#/$defs/Allele"
987970
},
988971
{
989-
"$ref": "#/$defs/CisPhasedBlock"
972+
"type": "null"
990973
}
991974
],
992975
"title": "Post Mapped"

src/dcd_mapping/align.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,16 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
165165
"""
166166
with tempfile.NamedTemporaryFile() as query_file:
167167
query_file = _build_query_file(metadata, Path(query_file.name))
168+
if len(metadata.target_sequence) > 25000:
169+
msg = f"Target sequence for {metadata.urn} must have a length <= 25000 to run BLAT"
170+
raise AlignmentError(msg)
171+
168172
if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
169173
target_args = "-q=prot -t=dnax"
170-
elif metadata.target_gene_category == TargetType.PROTEIN_CODING:
174+
elif (
175+
metadata.target_gene_category == TargetType.PROTEIN_CODING
176+
and len(metadata.target_sequence) <= 10000
177+
):
171178
target_args = "-q=dnax -t=dnax"
172179
else:
173180
target_args = ""

src/dcd_mapping/annotate.py

Lines changed: 78 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
CisPhasedBlock,
2222
Expression,
2323
LiteralSequenceExpression,
24+
SequenceString,
2425
)
2526

2627
from dcd_mapping.lookup import (
@@ -53,6 +54,27 @@ def _get_vrs_1_3_ext(allele: Allele) -> Extension:
5354
)
5455

5556

57+
def _get_va_digest(allele: Allele) -> Extension:
58+
"""Return the VA digest for a pre-mapped allele
59+
:param allele: A pre-mapped variant
60+
:return A VRS extension reporting the pre-mapped digest
61+
"""
62+
return Extension(name="pre_mapped_id", value=allele.id)
63+
64+
65+
def _is_valid_allele(allele: Allele, align_result: AlignmentResult) -> bool:
66+
"""Check if a post-mapped allele occurs within the alignment coverage
67+
:param allele: A post-mapped allele
68+
:param align_result: Alignment data
69+
:return True if position occurs in coverage, False if not
70+
"""
71+
return (
72+
align_result.query_range.start
73+
<= allele.location.start
74+
<= align_result.query_range.end
75+
)
76+
77+
5678
def _offset_allele_ref_seq(ss: str, start: int, end: int) -> tuple[int, int]:
5779
"""Handle known edge cases in start and end coordinates for vrs_ref_allele_seq."""
5880
if ss.startswith("urn:mavedb:00000060-a-1"):
@@ -94,7 +116,7 @@ def _get_vrs_ref_allele_seq(
94116
ref = sr.get_sequence(seq, start, end)
95117
if ref is None:
96118
raise ValueError
97-
return Extension(name="vrs_ref_allele_seq", value=ref)
119+
return SequenceString(root=ref)
98120

99121

100122
def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]:
@@ -179,16 +201,19 @@ def _annotate_allele_mapping(
179201
mapped_score: MappedScore,
180202
tx_results: TxSelectResult | None,
181203
metadata: ScoresetMetadata,
204+
align_result: AlignmentResult,
182205
) -> ScoreAnnotationWithLayer:
183206
"""Perform annotations for allele mappings."""
184207
pre_mapped: Allele = mapped_score.pre_mapped
185208
post_mapped: Allele = mapped_score.post_mapped
186209

187210
# get vrs_ref_allele_seq for pre-mapped variants
188211
pre_mapped.extensions = [
189-
_get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results),
190212
_get_vrs_1_3_ext(pre_mapped),
191213
]
214+
pre_mapped.location.sequence = _get_vrs_ref_allele_seq(
215+
pre_mapped, metadata, tx_results
216+
)
192217

193218
# Determine reference sequence
194219
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
@@ -206,17 +231,29 @@ def _annotate_allele_mapping(
206231
sr = get_seqrepo()
207232
loc = mapped_score.post_mapped.location
208233
sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}"
209-
ref = sr.get_sequence(sequence_id, loc.start, loc.end)
234+
post_mapped.location.sequence = SequenceString(
235+
root=sr.get_sequence(sequence_id, loc.start, loc.end)
236+
)
210237
post_mapped.extensions = [
211-
Extension(name="vrs_ref_allele_seq", value=ref),
212238
_get_vrs_1_3_ext(post_mapped),
239+
_get_va_digest(pre_mapped),
213240
]
214241
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
215242
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]
216243

217244
namespace = metadata.urn
218245
val = mapped_score.accession_id.split("#")[1]
219246

247+
# Check if post-mapped allele is valid
248+
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
249+
post_mapped = (
250+
post_mapped if _is_valid_allele(pre_mapped, align_result) else None
251+
)
252+
253+
# Remove extra digest attributes
254+
pre_mapped.digest = None
255+
post_mapped.digest = None
256+
220257
return ScoreAnnotationWithLayer(
221258
pre_mapped=pre_mapped,
222259
post_mapped=post_mapped,
@@ -240,17 +277,21 @@ def _get_vrs_1_3_haplotype_id(cpb: CisPhasedBlock) -> str:
240277

241278

242279
def _annotate_cpb_mapping(
243-
mapping: MappedScore, tx_results: TxSelectResult | None, metadata: ScoresetMetadata
280+
mapping: MappedScore,
281+
tx_results: TxSelectResult | None,
282+
metadata: ScoresetMetadata,
283+
align_result: AlignmentResult,
244284
) -> ScoreAnnotationWithLayer:
245285
"""Perform annotations and create VRS 1.3 equivalents for CisPhasedBlock mappings."""
246286
pre_mapped: CisPhasedBlock = mapping.pre_mapped # type: ignore
247287
post_mapped: CisPhasedBlock = mapping.post_mapped # type: ignore
248288
# get vrs_ref_allele_seq for pre-mapped variants
249289
for allele in pre_mapped.members:
250290
allele.extensions = [
251-
_get_vrs_ref_allele_seq(allele, metadata, tx_results),
252291
_get_vrs_1_3_ext(allele),
253292
]
293+
allele.location.sequence = _get_vrs_ref_allele_seq(allele, metadata, tx_results)
294+
allele.digest = None
254295
# Determine reference sequence
255296
if mapping.annotation_layer == AnnotationLayer.GENOMIC:
256297
sequence_id = (
@@ -267,23 +308,37 @@ def _annotate_cpb_mapping(
267308
accession = tx_results.np
268309

269310
sr = get_seqrepo()
270-
for allele in post_mapped.members:
271-
loc = allele.location
311+
valid_post_mapped_alleles = []
312+
for post_mapped_allele, pre_mapped_allele in zip(
313+
post_mapped.members, pre_mapped.members, strict=True
314+
):
315+
loc = post_mapped_allele.location
272316
sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}"
273-
ref = sr.get_sequence(sequence_id, loc.start, loc.end)
274-
allele.extensions = [
275-
Extension(name="vrs_ref_allele_seq", value=ref),
276-
_get_vrs_1_3_ext(allele),
317+
post_mapped_allele.location.sequence = SequenceString(
318+
root=sr.get_sequence(sequence_id, loc.start, loc.end)
319+
)
320+
post_mapped_allele.extensions = [
321+
_get_vrs_1_3_ext(post_mapped_allele),
322+
_get_va_digest(pre_mapped_allele),
277323
]
278-
hgvs, syntax = _get_hgvs_string(allele, accession)
279-
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
324+
hgvs, syntax = _get_hgvs_string(post_mapped_allele, accession)
325+
post_mapped_allele.expressions = [Expression(syntax=syntax, value=hgvs)]
326+
if mapping.annotation_layer == AnnotationLayer.PROTEIN or _is_valid_allele(
327+
pre_mapped_allele, align_result
328+
):
329+
valid_post_mapped_alleles.append(post_mapped_allele)
330+
post_mapped_allele.digest = None
331+
post_mapped.members = valid_post_mapped_alleles
280332

281333
pre_mapped.extensions = [
282334
Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(pre_mapped))
283335
]
284-
post_mapped.extensions = [
285-
Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(post_mapped))
286-
]
336+
if len(post_mapped.members) >= 2:
337+
post_mapped.extensions = [
338+
Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(post_mapped)),
339+
]
340+
else:
341+
post_mapped = post_mapped.members[0]
287342

288343
namespace = metadata.urn
289344
val = mapping.accession_id.split("#")[1]
@@ -301,6 +356,7 @@ def annotate(
301356
mapped_scores: list[MappedScore],
302357
tx_results: TxSelectResult | None,
303358
metadata: ScoresetMetadata,
359+
align_result: AlignmentResult,
304360
) -> list[ScoreAnnotationWithLayer]:
305361
"""Given a list of mappings, add additional contextual data:
306362
@@ -316,6 +372,7 @@ def annotate(
316372
:param vrs_results: in-progress variant mappings
317373
:param tx_select_results: transcript selection if available
318374
:param metadata: MaveDB scoreset metadata
375+
:param align_result: Alignment data
319376
:return: annotated mappings objects
320377
"""
321378
score_annotations = []
@@ -324,13 +381,15 @@ def annotate(
324381
mapped_score.post_mapped, CisPhasedBlock
325382
):
326383
score_annotations.append(
327-
_annotate_cpb_mapping(mapped_score, tx_results, metadata)
384+
_annotate_cpb_mapping(mapped_score, tx_results, metadata, align_result)
328385
)
329386
elif isinstance(mapped_score.pre_mapped, Allele) and isinstance(
330387
mapped_score.post_mapped, Allele
331388
):
332389
score_annotations.append(
333-
_annotate_allele_mapping(mapped_score, tx_results, metadata)
390+
_annotate_allele_mapping(
391+
mapped_score, tx_results, metadata, align_result
392+
)
334393
)
335394
else:
336395
ValueError("inconsistent variant structure")

src/dcd_mapping/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ async def map_scoreset(
197197
_emit_info("VRS mapping complete.", silent)
198198

199199
_emit_info("Annotating metadata and saving to file...", silent)
200-
vrs_results = annotate(vrs_results, transcript, metadata)
200+
vrs_results = annotate(vrs_results, transcript, metadata, alignment_result)
201201
final_output = save_mapped_output_json(
202202
metadata.urn,
203203
vrs_results,

src/dcd_mapping/schemas.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ class TargetSequenceType(str, Enum):
1818
class TargetType(str, Enum):
1919
"""Define target gene types."""
2020

21-
PROTEIN_CODING = "Protein coding"
22-
REGULATORY = "Regulatory"
23-
OTHER_NC = "Other noncoding"
21+
PROTEIN_CODING = "protein_coding"
22+
REGULATORY = "regulatory"
23+
OTHER_NC = "other_noncoding"
2424

2525

2626
class UniProtRef(BaseModel):
@@ -145,7 +145,7 @@ class MappedScore(BaseModel):
145145
annotation_layer: AnnotationLayer
146146
score: str | None
147147
pre_mapped: Allele | CisPhasedBlock
148-
post_mapped: Allele | CisPhasedBlock | None
148+
post_mapped: Allele | CisPhasedBlock
149149

150150

151151
class ScoreAnnotation(BaseModel):
@@ -155,7 +155,7 @@ class ScoreAnnotation(BaseModel):
155155
"""
156156

157157
pre_mapped: CisPhasedBlock | Allele
158-
post_mapped: CisPhasedBlock | Allele
158+
post_mapped: CisPhasedBlock | Allele | None
159159
mavedb_id: StrictStr
160160
relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to"
161161
score: float | None

src/dcd_mapping/vrs_map.py

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -462,20 +462,7 @@ def _get_variation(
462462

463463
# Run ga4gh_identify to assign VA digest
464464
allele.id = ga4gh_identify(allele)
465-
466-
# Check if the start of an allele is covered by the alignment block for
467-
# post-mapped genomic variants
468-
if layer == AnnotationLayer.GENOMIC:
469-
if pre_map:
470-
alleles.append(allele)
471-
else:
472-
if (
473-
allele.location.start >= alignment.hit_range.start
474-
and allele.location.start < alignment.hit_range.end
475-
):
476-
alleles.append(allele)
477-
else:
478-
alleles.append(allele)
465+
alleles.append(allele)
479466

480467
if not alleles:
481468
return None

0 commit comments

Comments
 (0)