|
1 | 1 | """Converter for GKS <-> HL7 v2""" |
2 | 2 |
|
3 | 3 | import logging |
4 | | -from ga4gh.va_spec.base.core import ( |
5 | | - Statement, |
6 | | -) |
| 4 | +from typing import Any |
| 5 | + |
| 6 | +from ga4gh.va_spec.base.core import Statement |
| 7 | +from ga4gh.vrs.models import Allele, Expression, SequenceLocation |
7 | 8 |
|
8 | 9 | _logger = logging.getLogger(__name__) |
9 | 10 |
|
# Symbolic field name -> HL7 v2 field identifier, as a string. Dotted
# identifiers such as "511.1" / "511.2" are the start / end parts of one field.
# TODO: make this a pydantic class to enforce required vs optional fields and
# types for the values.
HL7V2: dict[str, str] = {
    "VARIANT_NAME": "504",
    "DISCRETE_VARIANT": "505",
    "CHROMOSOME": "510",
    "ALLELE_START": "511.1",
    "ALLELE_END": "511.2",
    "DNA_REGION": "513",
    "GENE_STUDIED": "514",
    "TRANSCRIPT_REFERENCE_SEQUENCE_ID": "516",
    "DNA_CHANGE": "518",
    "AMINO_ACID_CHANGE": "520",
    "MOLECULAR_CONSEQUENCE": "521",
    "PROTEIN_REFERENCE_SEQUENCE": "522",
    "GENOMIC_REFERENCE_SEQUENCE_ID": "524",
    # TODO: 525 (AMPLIFICATION) is omitted - it is not clear GKS can be mapped
    # to it yet, and there is no guarantee it is present in extensions.
    "REFERENCE_ALLELE": "526",
    "OBSERVED_ALLELE": "527",
    "GENOMIC_DNA_CHANGE": "528",
    "CYTOGENETIC_LOCATION": "532",
    "PENETRANCE": "534",
    "GENETIC_VARIANT_SOURCE": "535",
    "ALLELE_LENGTH": "545",
    "STRUCTURAL_INNER_START": "546.1",
    "STRUCTURAL_INNER_END": "546.2",
    "STRUCTURAL_OUTER_START": "547.1",
    "STRUCTURAL_OUTER_END": "547.2",
    "COPY_NUMBER": "550",
    # 551 (FUSED_GENES) is omitted - not supported until Cat-VRS 2.0.
    "VARIANT_CLASSIFICATION": "553",
    "INTERPRETATION": "554",
    "MODE_OF_INHERITANCE": "560",
    # NOTE: 561 was drawn with a dashed arrow in the mapping; the reason was
    # not recorded - confirm before relying on it.
    "FUNCTIONAL_EFFECT": "561",
    "REPEAT_NUCLEOTIDES": "564",
    "REPEAT_NUMBER": "565",
    "AFFECTED_EXON_START": "572.1",
    "AFFECTED_EXON_END": "572.2",
    "AFFECTED_INTRON_START": "573.1",
    "AFFECTED_INTRON_END": "573.2",
    "INTERPRETATION_NOTE": "575",
}
| 53 | + |
| 54 | + |
def convert_gks_to_hl7_v2(statement: Statement) -> dict[str, Any]:
    """
    Convert a VA-Spec Statement to an HL7 v2-compatible dictionary of fields.

    Only the variant name (504), chromosome (510) and allele start/end
    (511.1 / 511.2) are populated so far.

    Args:
        statement: statement whose ``proposition.subjectVariant`` provides the
            variant name and the member alleles.

    Returns:
        A dict keyed by HL7 field identifiers (see the HL7V2 constants). The
        variant name is None when ``subjectVariant.name`` is missing or empty,
        and the chromosome is None when the genomic allele carries no hgvs.g
        expression; both cases are logged as warnings.

    Raises:
        ValueError: if no member of the subject variant has a genomic
            location, or if the hgvs.g expression cannot be parsed.
    """
    proposition = statement.proposition
    subject_variant = proposition.subjectVariant

    # 504 - Variant Name
    variant_name = subject_variant.name or None
    if variant_name is None:
        _logger.warning("subjectVariant.name is missing or empty")
        # TODO: error here because I'm pretty sure this is required?

    # 505 - Discrete Genetic Variant (placeholder until models solidify)
    # TODO: need to wait for models for this or find out what expected format is

    members = subject_variant.members or []
    found = _find_genomic_allele_and_location(members)
    if found is None:
        # The finder signals "no match" with None; unpacking it directly would
        # surface as an opaque TypeError instead of a data error.
        msg = "subjectVariant.members contains no allele with a genomic location"
        raise ValueError(msg)
    genomic_allele, genomic_location = found

    # 510 - Chromosome, taken from the hgvs.g expression of the allele
    # (e.g., 'NC_000007.13:g.140453136A>T').
    # NOTE(review): the value is whatever precedes ':' in the expression, i.e.
    # a reference sequence accession in the example above - confirm that this
    # is what field 510 expects.
    expression = _find_expression(genomic_allele, syntax="hgvs.g")
    if expression is None:
        hgvs_g = None
    elif isinstance(expression, dict):
        hgvs_g = expression.get("value")
    else:
        hgvs_g = expression.value

    if hgvs_g:
        # The g. part is parsed but not mapped to an HL7 field yet.
        chromosome, _g_dot = _parse_hgvs_g(hgvs_g)
    else:
        _logger.warning("genomic allele has no hgvs.g expression")
        chromosome = None

    # 511 - Allele start/end
    allele_start, allele_end = _get_location_interval(genomic_location)

    return {
        HL7V2["VARIANT_NAME"]: variant_name,
        HL7V2["CHROMOSOME"]: chromosome,
        HL7V2["ALLELE_START"]: allele_start,
        HL7V2["ALLELE_END"]: allele_end,
    }
| 93 | + |
19 | 94 |
|
20 | | - # 505 - Discrete Genetic Variant |
21 | | - # wait for types and models for this |
| 95 | +# --- Helpers: extract from VA objects ------------------------------------- |
22 | 96 |
|
23 | | - # 510 - Chromosome |
24 | | - members = subject_variant.members |
25 | | - genomic_allele = None |
26 | | - genomic_location = None |
| 97 | + |
| 98 | +def _find_genomic_allele_and_location( |
| 99 | + members: list[Allele], |
| 100 | +) -> tuple[Allele, SequenceLocation] | None: |
| 101 | + """ |
| 102 | + From a list of members, return the first (allele, location) |
| 103 | + whose location.sequenceReference.moleculeType == 'genomic'. |
| 104 | + # TODO: not sure if this is a reliable field to check for getting the genomic alleles - |
| 105 | + # consider checking expressions instead or as a backup. |
| 106 | + """ |
27 | 107 | for allele in members: |
28 | | - # get member where the allele's location's sequence reference's moleculeType is genomic |
29 | | - genomic_location = allele.location |
30 | | - sequence_reference = genomic_location.sequenceReference |
31 | | - if sequence_reference.moleculeType == "genomic": |
32 | | - genomic_allele = allele |
33 | | - break |
34 | | - # get chromosome from allele |
35 | | - expressions = genomic_allele.expressions |
36 | | - expression = None |
| 108 | + location = allele.location |
| 109 | + if location is None: |
| 110 | + continue |
| 111 | + seq_ref = location.sequenceReference |
| 112 | + molecule_type = seq_ref.moleculeType if seq_ref else None |
| 113 | + # TODO: it would be nice to make this helper take this as a parameter for more potential usability later |
| 114 | + if molecule_type == "genomic": |
| 115 | + return allele, location |
| 116 | + return None |
| 117 | + |
| 118 | + |
| 119 | +def _find_expression(allele: Allele, syntax: str) -> Expression | None: |
| 120 | + """ |
| 121 | + Find an expression with a given syntax (e.g., 'hgvs.g') from allele.expressions. |
| 122 | + Returns the first matching expression found. |
| 123 | + """ |
| 124 | + expressions = allele.expressions or [] |
| 125 | + |
37 | 126 | for expr in expressions: |
38 | | - if expr.get("syntax") == "hgvs.g": |
39 | | - expression = expr |
40 | | - break |
41 | | - g_dot_hgvs = expression.get("value") |
42 | | - g_dot_split = g_dot_hgvs.split(":", 2) |
43 | | - chromosome = g_dot_split[0] |
44 | | - g_dot = g_dot_split[1] |
| 127 | + s = expr.get("syntax") |
| 128 | + if s == syntax: |
| 129 | + return expr |
| 130 | + # TODO: raise error? |
| 131 | + return None |
45 | 132 |
|
46 | | - # 511 - Allele start/end |
47 | | - allele_start = genomic_location.start |
48 | | - allele_end = genomic_location.end |
49 | 133 |
|
50 | | - # TODO: finish tomorrow? :-) |
| 134 | +def _get_location_interval(location: SequenceLocation) -> tuple[int, int]: |
| 135 | + """ |
| 136 | + Extract (start, end) from a SequenceLocation. |
| 137 | + """ |
| 138 | + start = location.start |
| 139 | + end = location.end |
| 140 | + return start, end |
| 141 | + |
| 142 | + |
| 143 | +# --- Helpers: transformation / parsing --------------------------------------- |
| 144 | + |
| 145 | + |
| 146 | +def _parse_hgvs_g(hgvs_g_value: str) -> tuple[str, str]: |
| 147 | + """ |
| 148 | + Parse an hgvs.g expression. |
| 149 | +
|
| 150 | + Expected styles: |
| 151 | + - 'NC_000007.13:g.140453136A>T' |
| 152 | + # TODO: should we also accept 'chr7:g....' or '7:g....'? |
51 | 153 |
|
| 154 | + Returns: |
| 155 | + (chromosome, g_dot) where chromosome is the left of ':', and g_dot includes 'g.' onwards. |
| 156 | + """ |
| 157 | + chromosome, g_dot = hgvs_g_value.split(":", 1) |
52 | 158 |
|
53 | | - return {} |
| 159 | + return chromosome, g_dot |
0 commit comments