Skip to content

Commit e1ce877

Browse files
committed
add tests and parse additional concepts from GKS
1 parent f68134d commit e1ce877

File tree

8 files changed

+486
-1271
lines changed

8 files changed

+486
-1271
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ profile_default/
8686
sdist/
8787
share/python-wheels/
8888
target/
89+
uv.lock
8990
var/
9091
venv.bak/
9192
venv/

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@ classifiers = [
3030
"Topic :: Software Development :: Libraries :: Python Modules"
3131
]
3232
dependencies = [
33+
"biocommons.seqrepo==0.6.11",
3334
"coloredlogs ~= 15.0",
3435
"ga4gh.cat_vrs~=0.7.1",
3536
"ga4gh.va_spec~=0.4.2",
3637
"ga4gh.vrs==2.1.3",
37-
"pyyaml ~= 6.0",
38-
"pydantic==2.*"
38+
"pydantic==2.*",
39+
"pyyaml ~= 6.0"
3940
]
4041
description = "biocommons.example package (namespaced)"
4142
dynamic = ["version"]
File renamed without changes.

src/biocommons/gks-conversion-tool/converter.py renamed to src/biocommons/gks_conversion_tool/converter.py

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
from typing import Any
55

6+
from ga4gh.cat_vrs.models import Constraint
67
from ga4gh.va_spec.base.core import Statement
78
from ga4gh.vrs.models import Allele, Expression, SequenceLocation
89

@@ -82,27 +83,39 @@ def convert_gks_to_hl7_v2(statement: Statement) -> dict[str, Any]:
8283
# 505 - Discrete Genetic Variant (placeholder until models solidify)
8384
# TODO: wait for the models for this to be finalized, or find out what the expected format is
8485

85-
members = subject_variant.members or []
86-
genomic_allele, genomic_location = _find_genomic_allele_and_location(members)
86+
constraints = subject_variant.constraints or []
87+
allele, location = None, None
88+
if constraints:
89+
allele, location = _find_genomic_allele_and_location(constraints)
90+
else:
91+
err = "subjectVariant.constraints is missing or empty"
92+
raise ValueError(err)
8793

8894
# Get hgvs.g expression from the allele (e.g., 'NC_000007.13:g.140453136A>T')
8995
# use seqrepo here instead
90-
expression = _find_expression(genomic_allele, syntax="hgvs.g")
91-
hgvs_g = expression.value if expression else None
92-
chromosome, g_dot = _parse_hgvs_g(hgvs_g)
96+
genomic_expression = _find_expression(allele, syntax="hgvs.g")
97+
hgvs_g = genomic_expression.value if genomic_expression else None
98+
chromosome_ref_seq, g_dot = _parse_hgvs_dot(hgvs_g)
9399

94100
# 511 - Allele start/end
95-
allele_start, allele_end = _get_location_interval(genomic_location)
101+
allele_start, allele_end = _get_location_interval(location)
96102

97103
# 513 - DNA Region
98104

99105
# 514 - Gene Studied
106+
gene_studied = proposition.geneContextQualifier.name
100107

101108
# 516 - Transcript Reference Sequence ID
102109

103110
# 518 - DNA Change
111+
coding_expression = _find_expression(allele, syntax="hgvs.c")
112+
hgvs_c = coding_expression.value if coding_expression else None
113+
c_dot = _parse_hgvs_dot(hgvs_c)[1]
104114

105115
# 520 - Amino Acid Change
116+
protein_expression = _find_expression(allele, syntax="hgvs.p")
117+
hgvs_p = protein_expression.value if protein_expression else None
118+
p_dot = _parse_hgvs_dot(hgvs_p)[1]
106119

107120
# 521 - Molecular Consequence - on hold until approved
108121

@@ -148,11 +161,17 @@ def convert_gks_to_hl7_v2(statement: Statement) -> dict[str, Any]:
148161

149162
# 575 - Interpretation Note
150163

151-
result: dict[str, Any] = {}
152-
result[HL7V2["VARIANT_NAME"]] = variant_name
153-
result[HL7V2["CHROMOSOME"]] = chromosome
154-
result[HL7V2["ALLELE_START"]] = allele_start
155-
result[HL7V2["ALLELE_END"]] = allele_end
164+
result: dict[str, Any] = HL7V2.copy() # start with all keys
165+
result["VARIANT_NAME"] = variant_name
166+
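# TODO: CHROMOSOME currently holds the reference sequence ID parsed from hgvs.g (e.g. 'NC_000007.13'); it needs to be converted to shorthand (presumably the short chromosome name - confirm expected format)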
167+
result["CHROMOSOME"] = chromosome_ref_seq
168+
result["ALLELE_START"] = allele_start
169+
result["ALLELE_END"] = allele_end
170+
result["GENE_STUDIED"] = gene_studied
171+
result["DNA_CHANGE"] = c_dot
172+
result["AMINO_ACID_CHANGE"] = p_dot
173+
result["GENOMIC_DNA_CHANGE"] = g_dot
174+
result["GENOMIC_REFERENCE_SEQUENCE_ID"] = chromosome_ref_seq
156175

157176
return result
158177

@@ -161,24 +180,19 @@ def convert_gks_to_hl7_v2(statement: Statement) -> dict[str, Any]:
161180

162181

163182
def _find_genomic_allele_and_location(
164-
members: list[Allele],
183+
constraints: list[Constraint],
165184
) -> tuple[Allele, SequenceLocation] | None:
166185
"""
167-
From a list of members, return the first (allele, location)
168-
whose location.sequenceReference.moleculeType == 'genomic'.
169-
# TODO: not sure if this is a reliable field to check for getting the genomic alleles -
170-
# consider checking expressions instead or as a backup. -> yes
186+
From a list of constraints, return the (allele, location) of the first DefiningAlleleConstraint whose allele has a location, or None if there is none.
171187
"""
172-
for allele in members:
188+
for constraint in constraints:
189+
if constraint.root.type != "DefiningAlleleConstraint":
190+
continue
191+
allele = constraint.root.allele
173192
location = allele.location
174193
if location is None:
175194
continue
176-
seq_ref = location.sequenceReference
177-
molecule_type = seq_ref.moleculeType if seq_ref else None
178-
# TODO: it would be nice to make this helper take this as a parameter for more potential usability later
179-
# TODO: use seq refget and lookup in seqrepo to get hgvs.g
180-
if molecule_type == "genomic":
181-
return allele, location
195+
return allele, location
182196
return None
183197

184198

@@ -190,7 +204,7 @@ def _find_expression(allele: Allele, syntax: str) -> Expression | None:
190204
expressions = allele.expressions or []
191205

192206
for expr in expressions:
193-
s = expr.get("syntax")
207+
s = expr.syntax
194208
if s == syntax:
195209
return expr
196210
# TODO: raise error?
@@ -209,16 +223,16 @@ def _get_location_interval(location: SequenceLocation) -> tuple[int, int]:
209223
# --- Helpers: transformation / parsing ---------------------------------------
210224

211225

212-
def _parse_hgvs_g(hgvs_g_value: str) -> tuple[str, str]:
226+
def _parse_hgvs_dot(hgvs_value: str) -> tuple[str, str]:
213227
"""
214-
Parse an hgvs.g expression.
228+
Parse an hgvs.g, hgvs.c, or hgvs.p expression.
215229
216230
Expected styles:
217231
- 'NC_000007.13:g.140453136A>T'
218232
219233
Returns:
220-
(chromosome, g_dot) where chromosome is the left of ':', and g_dot includes 'g.' onwards.
234+
(chromosome, dot) where chromosome is the text to the left of the first ':', and dot is everything after it, starting with 'g.', 'c.', or 'p.'.
221235
"""
222-
chromosome, g_dot = hgvs_g_value.split(":", 1)
236+
chromosome, dot = hgvs_value.split(":", 1)
223237

224-
return chromosome, g_dot
238+
return chromosome, dot

0 commit comments

Comments
 (0)