Skip to content

Commit fa54de7

Browse files
committed
Add latest changes
1 parent af28a0f commit fa54de7

File tree

5 files changed

+40
-40
lines changed

5 files changed

+40
-40
lines changed

schema.json

Lines changed: 11 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -370,11 +370,7 @@
370370
"title": "Version"
371371
},
372372
"code": {
373-
"allOf": [
374-
{
375-
"$ref": "#/$defs/Code"
376-
}
377-
],
373+
"$ref": "#/$defs/Code",
378374
"description": "A symbol uniquely identifying the concept, as in a syntax defined by the code system. CURIE format is preferred where possible (e.g. 'SO:0000704' is the CURIE form of the Sequence Ontology code for 'gene')."
379375
}
380376
},
@@ -412,19 +408,11 @@
412408
"description": "A mapping to a concept in a terminology or code system.",
413409
"properties": {
414410
"coding": {
415-
"allOf": [
416-
{
417-
"$ref": "#/$defs/Coding"
418-
}
419-
],
411+
"$ref": "#/$defs/Coding",
420412
"description": "A structured representation of a code for a defined concept in a terminology or code system."
421413
},
422414
"relation": {
423-
"allOf": [
424-
{
425-
"$ref": "#/$defs/Relation"
426-
}
427-
],
415+
"$ref": "#/$defs/Relation",
428416
"description": "A mapping relation between concepts as defined by the Simple Knowledge Organization System (SKOS)."
429417
}
430418
},
@@ -439,11 +427,7 @@
439427
"description": "Representation of a variation by a specified nomenclature or syntax for a\nVariation object. Common examples of expressions for the description of molecular\nvariation include the HGVS and ISCN nomenclatures.",
440428
"properties": {
441429
"syntax": {
442-
"allOf": [
443-
{
444-
"$ref": "#/$defs/Syntax"
445-
}
446-
],
430+
"$ref": "#/$defs/Syntax",
447431
"description": "The syntax used to describe the variation. The value should be one of the supported syntaxes."
448432
},
449433
"value": {
@@ -751,11 +735,7 @@
751735
"title": "Mappings"
752736
},
753737
"sequence": {
754-
"allOf": [
755-
{
756-
"$ref": "#/$defs/SequenceString"
757-
}
758-
],
738+
"$ref": "#/$defs/SequenceString",
759739
"description": "the literal sequence"
760740
}
761741
},
@@ -972,21 +952,24 @@
972952
"pre_mapped": {
973953
"anyOf": [
974954
{
975-
"$ref": "#/$defs/Allele"
955+
"$ref": "#/$defs/CisPhasedBlock"
976956
},
977957
{
978-
"$ref": "#/$defs/CisPhasedBlock"
958+
"$ref": "#/$defs/Allele"
979959
}
980960
],
981961
"title": "Pre Mapped"
982962
},
983963
"post_mapped": {
984964
"anyOf": [
965+
{
966+
"$ref": "#/$defs/CisPhasedBlock"
967+
},
985968
{
986969
"$ref": "#/$defs/Allele"
987970
},
988971
{
989-
"$ref": "#/$defs/CisPhasedBlock"
972+
"type": "null"
990973
}
991974
],
992975
"title": "Post Mapped"

src/dcd_mapping/align.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -165,9 +165,16 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
165165
"""
166166
with tempfile.NamedTemporaryFile() as query_file:
167167
query_file = _build_query_file(metadata, Path(query_file.name))
168+
if len(metadata.target_sequence) > 25000:
169+
msg = f"Target sequence for {metadata.urn} must have a length <= 25000 to run BLAT"
170+
raise AlignmentError(msg)
171+
168172
if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
169173
target_args = "-q=prot -t=dnax"
170-
elif metadata.target_gene_category == TargetType.PROTEIN_CODING:
174+
elif (
175+
metadata.target_gene_category == TargetType.PROTEIN_CODING
176+
and len(metadata.target_sequence) <= 10000
177+
):
171178
target_args = "-q=dnax -t=dnax"
172179
else:
173180
target_args = ""

src/dcd_mapping/annotate.py

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
CisPhasedBlock,
2222
Expression,
2323
LiteralSequenceExpression,
24+
SequenceString,
2425
)
2526

2627
from dcd_mapping.lookup import (
@@ -115,7 +116,7 @@ def _get_vrs_ref_allele_seq(
115116
ref = sr.get_sequence(seq, start, end)
116117
if ref is None:
117118
raise ValueError
118-
return Extension(name="vrs_ref_allele_seq", value=ref)
119+
return SequenceString(root=ref)
119120

120121

121122
def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]:
@@ -208,9 +209,11 @@ def _annotate_allele_mapping(
208209

209210
# get vrs_ref_allele_seq for pre-mapped variants
210211
pre_mapped.extensions = [
211-
_get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results),
212212
_get_vrs_1_3_ext(pre_mapped),
213213
]
214+
pre_mapped.location.sequence = _get_vrs_ref_allele_seq(
215+
pre_mapped, metadata, tx_results
216+
)
214217

215218
# Determine reference sequence
216219
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
@@ -228,9 +231,10 @@ def _annotate_allele_mapping(
228231
sr = get_seqrepo()
229232
loc = mapped_score.post_mapped.location
230233
sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}"
231-
ref = sr.get_sequence(sequence_id, loc.start, loc.end)
234+
post_mapped.location.sequence = SequenceString(
235+
root=sr.get_sequence(sequence_id, loc.start, loc.end)
236+
)
232237
post_mapped.extensions = [
233-
Extension(name="vrs_ref_allele_seq", value=ref),
234238
_get_vrs_1_3_ext(post_mapped),
235239
_get_va_digest(pre_mapped),
236240
]
@@ -246,6 +250,10 @@ def _annotate_allele_mapping(
246250
post_mapped if _is_valid_allele(pre_mapped, align_result) else None
247251
)
248252

253+
# Remove extra digest attributes
254+
pre_mapped.digest = None
255+
post_mapped.digest = None
256+
249257
return ScoreAnnotationWithLayer(
250258
pre_mapped=pre_mapped,
251259
post_mapped=post_mapped,
@@ -280,9 +288,10 @@ def _annotate_cpb_mapping(
280288
# get vrs_ref_allele_seq for pre-mapped variants
281289
for allele in pre_mapped.members:
282290
allele.extensions = [
283-
_get_vrs_ref_allele_seq(allele, metadata, tx_results),
284291
_get_vrs_1_3_ext(allele),
285292
]
293+
allele.location.sequence = _get_vrs_ref_allele_seq(allele, metadata, tx_results)
294+
allele.digest = None
286295
# Determine reference sequence
287296
if mapping.annotation_layer == AnnotationLayer.GENOMIC:
288297
sequence_id = (
@@ -305,9 +314,10 @@ def _annotate_cpb_mapping(
305314
):
306315
loc = post_mapped_allele.location
307316
sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}"
308-
ref = sr.get_sequence(sequence_id, loc.start, loc.end)
317+
post_mapped_allele.location.sequence = SequenceString(
318+
root=sr.get_sequence(sequence_id, loc.start, loc.end)
319+
)
309320
post_mapped_allele.extensions = [
310-
Extension(name="vrs_ref_allele_seq", value=ref),
311321
_get_vrs_1_3_ext(post_mapped_allele),
312322
_get_va_digest(pre_mapped_allele),
313323
]
@@ -317,6 +327,7 @@ def _annotate_cpb_mapping(
317327
pre_mapped_allele, align_result
318328
):
319329
valid_post_mapped_alleles.append(post_mapped_allele)
330+
post_mapped_allele.digest = None
320331
post_mapped.members = valid_post_mapped_alleles
321332

322333
pre_mapped.extensions = [

src/dcd_mapping/schemas.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,9 @@ class TargetSequenceType(str, Enum):
1818
class TargetType(str, Enum):
1919
"""Define target gene types."""
2020

21-
PROTEIN_CODING = "Protein coding"
22-
REGULATORY = "Regulatory"
23-
OTHER_NC = "Other noncoding"
21+
PROTEIN_CODING = "protein_coding"
22+
REGULATORY = "regulatory"
23+
OTHER_NC = "other_noncoding"
2424

2525

2626
class UniProtRef(BaseModel):

src/dcd_mapping/vrs_map.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -462,7 +462,6 @@ def _get_variation(
462462

463463
# Run ga4gh_identify to assign VA digest
464464
allele.id = ga4gh_identify(allele)
465-
allele.digest = None
466465
alleles.append(allele)
467466

468467
if not alleles:

0 commit comments

Comments
 (0)