Skip to content

Commit 277de67

Browse files
committed
refactor: clean up code and extract out helpers
1 parent 1a034e5 commit 277de67

File tree

1 file changed

+139
-33
lines changed

1 file changed

+139
-33
lines changed
Lines changed: 139 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,159 @@
11
"""Converter for GKS <-> HL7 v2"""
22

33
import logging
4-
from ga4gh.va_spec.base.core import (
5-
Statement,
6-
)
4+
from typing import Any
5+
6+
from ga4gh.va_spec.base.core import Statement
7+
from ga4gh.vrs.models import Allele, Expression, SequenceLocation
78

89
_logger = logging.getLogger(__name__)
910

11+
# TODO: make this a pydantic class to enforce required vs optional fields and types for the values
12+
# Lookup table from a readable field name to the HL7 v2 field identifier used
# as a key in the dict returned by convert_gks_to_hl7_v2. Identifiers are kept
# as strings; a ".1" / ".2" suffix distinguishes the start and end halves of a
# paired field (e.g. ALLELE_START "511.1" / ALLELE_END "511.2").
HL7V2 = {
    "VARIANT_NAME": "504",
    "DISCRETE_VARIANT": "505",
    "CHROMOSOME": "510",
    "ALLELE_START": "511.1",
    "ALLELE_END": "511.2",
    "DNA_REGION": "513",
    "GENE_STUDIED": "514",
    "TRANSCRIPT_REFERENCE_SEQUENCE_ID": "516",
    "DNA_CHANGE": "518",
    "AMINO_ACID_CHANGE": "520",
    "MOLECULAR_CONSEQUENCE": "521",
    "PROTEIN_REFERENCE_SEQUENCE": "522",
    "GENOMIC_REFERENCE_SEQUENCE_ID": "524",
    # Disabled for now ("AMPLIFICATION": "525").
    # TODO: I don't think we can go from GKS to this yet / no guarantee this is in extensions
    "REFERENCE_ALLELE": "526",
    "OBSERVED_ALLELE": "527",
    "GENOMIC_DNA_CHANGE": "528",
    "CYTOGENETIC_LOCATION": "532",
    "PENETRANCE": "534",
    "GENETIC_VARIANT_SOURCE": "535",
    "ALLELE_LENGTH": "545",
    "STRUCTURAL_INNER_START": "546.1",
    "STRUCTURAL_INNER_END": "546.2",
    "STRUCTURAL_OUTER_START": "547.1",
    "STRUCTURAL_OUTER_END": "547.2",
    "COPY_NUMBER": "550",
    # Disabled for now ("FUSED_GENES": "551"): not supported until Cat-VRS 2.0.
    "VARIANT_CLASSIFICATION": "553",
    "INTERPRETATION": "554",
    "MODE_OF_INHERITANCE": "560",
    # NOTE(review): the original author recorded that FUNCTIONAL_EFFECT "has a
    # dashed arrow" (presumably in a mapping diagram) but not why -- TODO confirm
    # whether this field needs special handling.
    "FUNCTIONAL_EFFECT": "561",
    "REPEAT_NUCLEOTIDES": "564",
    "REPEAT_NUMBER": "565",
    "AFFECTED_EXON_START": "572.1",
    "AFFECTED_EXON_END": "572.2",
    "AFFECTED_INTRON_START": "573.1",
    "AFFECTED_INTRON_END": "573.2",
    "INTERPRETATION_NOTE": "575",
}
53+
54+
55+
def convert_gks_to_hl7_v2(statement: Statement) -> dict[str, Any]:
    """
    Convert a VA-Spec Statement to an HL7 v2-compatible dictionary of fields.

    Reads ``statement.proposition.subjectVariant`` and currently fills in:

    - 504   variant name (``None`` with a warning when missing or empty)
    - 510   chromosome, taken from the left-hand side of the genomic
            allele's hgvs.g expression (``None`` with a warning when the
            allele carries no hgvs.g expression)
    - 511.1 / 511.2  start and end of the genomic allele's location

    Returns a dict keyed by HL7 field identifiers (see HL7V2 constants).
    Raises ValueError if required data are missing, i.e. when no member of
    the subject variant is a genomic allele, or when its hgvs.g expression
    is malformed.
    """
    proposition = statement.proposition
    subject_variant = proposition.subjectVariant

    # 504 - Variant Name
    variant_name = subject_variant.name
    if not variant_name:
        _logger.warning("subjectVariant.name is missing or empty")
        # TODO: error here because I'm pretty sure this is required?
        variant_name = None

    # 505 - Discrete Genetic Variant (placeholder until models solidify)
    # TODO: need to wait for models for this or find out what expected format is

    members = subject_variant.members or []
    # The helper returns None (not a 2-tuple) when nothing matches, so the
    # result must be checked before it is unpacked.
    found = _find_genomic_allele_and_location(members)
    if found is None:
        msg = "subjectVariant.members contains no allele with a genomic location"
        raise ValueError(msg)
    genomic_allele, genomic_location = found

    # 510 - Chromosome
    # Get hgvs.g expression from the allele (e.g., 'NC_000007.13:g.140453136A>T')
    expression = _find_expression(genomic_allele, syntax="hgvs.g")
    if expression is None:
        hgvs_g = None
    elif isinstance(expression, dict):
        # Tolerate plain-dict expressions as well as model objects.
        hgvs_g = expression.get("value")
    else:
        hgvs_g = expression.value

    if hgvs_g:
        # The 'g.' part is not mapped to an HL7 field yet, so it is discarded.
        chromosome, _ = _parse_hgvs_g(hgvs_g)
    else:
        # Mirror the variant-name handling: report and leave the field empty
        # instead of crashing inside the parser on a None value.
        _logger.warning("genomic allele has no hgvs.g expression")
        chromosome = None

    # 511 - Allele start/end
    allele_start, allele_end = _get_location_interval(genomic_location)

    result: dict[str, Any] = {}
    result[HL7V2["VARIANT_NAME"]] = variant_name
    result[HL7V2["CHROMOSOME"]] = chromosome
    result[HL7V2["ALLELE_START"]] = allele_start
    result[HL7V2["ALLELE_END"]] = allele_end

    return result
93+
1994

20-
# 505 - Discrete Genetic Variant
21-
# wait for types and models for this
95+
# --- Helpers: extract from VA objects -------------------------------------
2296

23-
# 510 - Chromosome
24-
members = subject_variant.members
25-
genomic_allele = None
26-
genomic_location = None
97+
98+
def _find_genomic_allele_and_location(
    members: list[Allele],
    molecule_type: str = "genomic",
) -> tuple[Allele, SequenceLocation] | None:
    """
    Return the first (allele, location) pair whose
    ``location.sequenceReference.moleculeType`` equals ``molecule_type``.

    Args:
        members: Alleles to search, in priority order. ``None`` is treated
            as an empty list.
        molecule_type: Molecule type to match. Defaults to ``"genomic"``,
            which is the behaviour callers relied on before this parameter
            existed.

    Returns:
        ``(allele, allele.location)`` for the first match, or ``None`` when
        no member matches. Callers must check for ``None`` before unpacking.
    """
    # TODO: not sure if moleculeType is a reliable field to check for getting
    # the genomic alleles - consider checking expressions instead or as a backup.
    for allele in members or []:
        location = allele.location
        if location is None:
            continue
        seq_ref = location.sequenceReference
        # A location without a sequence reference can never match.
        if seq_ref and seq_ref.moleculeType == molecule_type:
            return allele, location
    return None
117+
118+
119+
def _find_expression(allele: Allele, syntax: str) -> Expression | None:
    """
    Find an expression with a given syntax (e.g., 'hgvs.g') from allele.expressions.

    Each expression may be either a plain dict (``{"syntax": ..., "value": ...}``)
    or an object exposing a ``syntax`` attribute; both shapes are accepted so
    that this helper agrees with callers that read ``expression.value``.

    Args:
        allele: Object whose ``expressions`` attribute is searched. ``None``
            or a missing/empty ``expressions`` yields ``None``.
        syntax: Syntax name to look for.

    Returns:
        The first matching expression (unchanged), or ``None`` if there is none.
    """
    if allele is None:
        return None

    for expr in allele.expressions or []:
        if isinstance(expr, dict):
            found = expr.get("syntax")
        else:
            found = getattr(expr, "syntax", None)
        # The syntax may be an enum member rather than a bare string
        # (presumably a str-based enum in the VRS models -- TODO confirm),
        # so compare its ``value`` as well.
        if found == syntax or getattr(found, "value", None) == syntax:
            return expr
    # TODO: raise error?
    return None
45132

46-
# 511 - Allele start/end
47-
allele_start = genomic_location.start
48-
allele_end = genomic_location.end
49133

50-
# TODO: finish tomorrow? :-)
134+
def _get_location_interval(location: SequenceLocation) -> tuple[int, int]:
    """
    Return the ``start`` and ``end`` attributes of a SequenceLocation as a
    ``(start, end)`` pair, without modifying or validating them.
    """
    return location.start, location.end
141+
142+
143+
# --- Helpers: transformation / parsing ---------------------------------------
144+
145+
146+
def _parse_hgvs_g(hgvs_g_value: str) -> tuple[str, str]:
    """
    Parse an hgvs.g expression.

    Expected styles:
      - 'NC_000007.13:g.140453136A>T'

    The value is split at the first ':' only, so any further ':' stays in
    the second part.

    Returns:
      (chromosome, g_dot) where chromosome is the left of ':', and g_dot includes 'g.' onwards.

    Raises:
      ValueError: if the value is not a string, has no ':' separator, or
        has an empty part on either side of it.
    """
    # TODO: should we also accept 'chr7:g....' or '7:g....'?
    if not isinstance(hgvs_g_value, str):
        msg = f"Expected an hgvs.g string, got {type(hgvs_g_value).__name__}"
        raise ValueError(msg)

    chromosome, separator, g_dot = hgvs_g_value.partition(":")
    if not separator or not chromosome or not g_dot:
        msg = f"Malformed hgvs.g expression: {hgvs_g_value!r}"
        raise ValueError(msg)

    return chromosome, g_dot

0 commit comments

Comments
 (0)