Skip to content

Commit 32aba22

Browse files
committed
hoo boy
1 parent 6689728 commit 32aba22

15 files changed

+1243
-139
lines changed

src/anyvlm/anyvar/base_client.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Provide abstraction for an AnyVLM-to-AnyVar connection."""
22

33
import abc
4+
from collections.abc import Iterable
45

56
from anyvar.utils.types import VrsVariation
67

@@ -9,23 +10,26 @@ class AnyVarClientError(Exception):
910
"""Generic client-related exception."""
1011

1112

12-
class UnidentifiedObjectError(AnyVarClientError):
13-
"""Raise if input object lacks an ID property"""
13+
class AnyVarClientConnectionError(AnyVarClientError):
14+
"""Raise for failure to connect to AnyVar client.
15+
16+
Likely relevant only for HTTP-based implementation.
17+
"""
1418

1519

1620
class BaseAnyVarClient(abc.ABC):
1721
"""Interface elements for an AnyVar client"""
1822

1923
@abc.abstractmethod
20-
def put_objects(self, objects: list[VrsVariation]) -> None:
21-
"""Register objects with AnyVar
22-
23-
All input objects must have a populated ID field. A validation check for this is
24-
performed before any variants are registered.
25-
26-
:param objects: variation objects to register
27-
:raise AnyVarClientError: for errors relating to specifics of client interface
28-
:raise UnidentifiedObjectError: if *any* provided object lacks a VRS ID
24+
def put_allele_expressions(
25+
self, expressions: Iterable[str], assembly: str = "GRCh38"
26+
) -> list[str | None]:
27+
"""Submit allele expressions to an AnyVar instance and retrieve corresponding VRS IDs
28+
29+
:param expressions: variation expressions to register
30+
:param assembly: reference assembly used in variation expressions
31+
:return: list where the i'th item is either the VRS ID if translation succeeds,
32+
else `None`, for the i'th expression
2933
"""
3034

3135
@abc.abstractmethod
@@ -38,7 +42,7 @@ def search_by_interval(
3842
:param start: start position for genomic region
3943
:param end: end position for genomic region
4044
:return: list of matching variant objects
41-
:raise AnyVarClientError: if connection is unsuccessful during search query
45+
:raise AnyVarClientConnectionError: if connection is unsuccessful during search query
4246
"""
4347

4448
@abc.abstractmethod

src/anyvlm/anyvar/http_client.py

Lines changed: 51 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
"""Provide abstraction for a VLM-to-AnyVar connection."""
22

33
import logging
4+
from collections.abc import Iterable
45

56
import requests
67
from anyvar.utils.types import VrsVariation
78
from ga4gh.vrs import models
89

910
from anyvlm.anyvar.base_client import (
11+
AnyVarClientConnectionError,
1012
AnyVarClientError,
1113
BaseAnyVarClient,
12-
UnidentifiedObjectError,
1314
)
1415

1516
_logger = logging.getLogger(__name__)
@@ -26,38 +27,61 @@ def __init__(
2627
:param hostname: service API root
2728
:param request_timeout: timeout value, in seconds, for HTTP requests
2829
"""
30+
_logger.info("Initializing HTTP-based AnyVar client with hostname %s", hostname)
2931
self.hostname = hostname
3032
self.request_timeout = request_timeout
3133

32-
def put_objects(self, objects: list[VrsVariation]) -> None:
33-
"""Register objects with AnyVar
34+
def put_allele_expressions(
35+
self, expressions: Iterable[str], assembly: str = "GRCh38"
36+
) -> list[str | None]:
37+
"""Submit allele expressions to an AnyVar instance and retrieve corresponding VRS IDs
3438
35-
All input objects must have a populated ID field. A validation check for this is
36-
performed before any variants are registered.
37-
38-
:param objects: variation objects to register
39-
:return: completed VRS objects
40-
:raise AnyVarClientError: if connection is unsuccessful during registration request
41-
:raise UnidentifiedObjectError: if *any* provided object lacks a VRS ID
39+
:param expressions: variation expressions to register
40+
:param assembly: reference assembly used in expressions
41+
:return: list where the i'th item is either the VRS ID if translation succeeds,
42+
else `None`, for the i'th expression
43+
:raise AnyVarClientError: for unexpected errors relating to specifics of client interface
4244
"""
43-
objects_to_submit = []
44-
for vrs_object in objects:
45-
if not vrs_object.id:
46-
_logger.error("Provided variant %s has no VRS ID", vrs_object)
47-
raise UnidentifiedObjectError
48-
objects_to_submit.append(
49-
vrs_object.model_dump(exclude_none=True, mode="json")
50-
)
51-
for vrs_object in objects_to_submit:
52-
response = requests.put(
53-
f"{self.hostname}/vrs_variation",
54-
json=vrs_object,
55-
timeout=self.request_timeout,
56-
)
45+
results = []
46+
for expression in expressions:
47+
url = f"{self.hostname}/variation"
48+
payload = {
49+
"definition": expression,
50+
"assembly_name": assembly,
51+
"input_type": "Allele",
52+
}
53+
try:
54+
response = requests.put(
55+
url,
56+
json=payload,
57+
timeout=self.request_timeout,
58+
)
59+
except requests.ConnectionError as e:
60+
_logger.exception(
61+
"Unable to establish connection using AnyVar configured at %s",
62+
self.hostname,
63+
)
64+
raise AnyVarClientConnectionError from e
5765
try:
5866
response.raise_for_status()
5967
except requests.HTTPError as e:
68+
_logger.exception(
69+
"Encountered HTTP exception submitting payload %s to %s",
70+
payload,
71+
url,
72+
)
6073
raise AnyVarClientError from e
74+
response_json = response.json()
75+
if messages := response_json.get("messages"):
76+
_logger.warning(
77+
"Variant expression `%s` seems to have failed to translate: %s",
78+
expression,
79+
messages,
80+
)
81+
results.append(None)
82+
else:
83+
results.append(response_json["object_id"])
84+
return results
6185

6286
def search_by_interval(
6387
self, accession: str, start: int, end: int
@@ -89,3 +113,6 @@ def close(self) -> None:
89113
90114
This is a no-op for this class.
91115
"""
116+
_logger.info(
117+
"Closing HTTP-based AnyVar client class. This requires no further action."
118+
)

src/anyvlm/anyvar/python_client.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
"""Implement AnyVar client interface for direct Python-based access."""
22

33
import logging
4+
from collections.abc import Iterable
45

56
from anyvar import AnyVar
67
from anyvar.storage.base_storage import Storage
7-
from anyvar.translate.translate import Translator
8+
from anyvar.translate.translate import TranslationError, Translator
89
from anyvar.utils.types import VrsVariation
910

10-
from anyvlm.anyvar.base_client import BaseAnyVarClient, UnidentifiedObjectError
11+
from anyvlm.anyvar.base_client import BaseAnyVarClient
1112

1213
_logger = logging.getLogger(__name__)
1314

@@ -23,20 +24,28 @@ def __init__(self, translator: Translator, storage: Storage) -> None:
2324
"""
2425
self.av = AnyVar(translator, storage)
2526

26-
def put_objects(self, objects: list[VrsVariation]) -> None:
27-
"""Register objects with AnyVar
27+
def put_allele_expressions(
28+
self, expressions: Iterable[str], assembly: str = "GRCh38"
29+
) -> list[str | None]:
30+
"""Submit allele expressions to an AnyVar instance and retrieve corresponding VRS IDs
2831
29-
All input objects must have a populated ID field. A validation check for this is
30-
performed before any variants are registered.
31-
32-
:param objects: variation objects to register
33-
:raise UnidentifiedObjectError: if *any* provided object lacks a VRS ID
32+
:param expressions: variation expressions to register
33+
:param assembly: reference assembly used in expressions
34+
:return: list where the i'th item is either the VRS ID if translation succeeds,
35+
else `None`, for the i'th expression
3436
"""
35-
for variant in objects:
36-
if not variant.id:
37-
_logger.error("Provided variant %s has no VRS ID", variant)
38-
raise UnidentifiedObjectError
39-
self.av.put_objects(objects) # type: ignore[reportArgumentType]
37+
results = []
38+
for expression in expressions:
39+
translated_variation = None
40+
try:
41+
translated_variation = self.av.translator.translate_variation(
42+
expression, assembly=assembly
43+
)
44+
except TranslationError:
45+
_logger.exception("Failed to translate expression: %s", expression)
46+
self.av.put_objects([translated_variation]) # type: ignore
47+
results.append(translated_variation.id) # type: ignore
48+
return results
4049

4150
def search_by_interval(
4251
self, accession: str, start: int, end: int

src/anyvlm/functions/ingest_vcf.py

Lines changed: 18 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,48 @@
11
"""Get a VCF, register its contained variants, and add cohort frequency data to storage"""
22

33
import logging
4-
import os
4+
from collections import namedtuple
55
from collections.abc import Iterator
66
from pathlib import Path
77

88
import pysam
9-
from anyvar.translate.vrs_python import AlleleTranslator
10-
from anyvar.utils.types import VrsVariation
11-
from ga4gh.vrs.dataproxy import create_dataproxy
9+
from ga4gh.va_spec.base import CohortAlleleFrequencyStudyResult
1210

1311
from anyvlm.anyvar.base_client import BaseAnyVarClient
14-
from anyvlm.schemas.domain import AlleleFrequencyAnnotation
1512

1613
_logger = logging.getLogger(__name__)
1714

1815

19-
_Var_Af_Pair = tuple[VrsVariation, AlleleFrequencyAnnotation]
16+
AfData = namedtuple("AfData", ("ac", "an", "ac_het", "ac_hom", "ac_hemi"))
2017

2118

22-
def _yield_var_af_batches(
23-
vcf: pysam.VariantFile,
24-
translator: AlleleTranslator,
25-
assembly: str,
26-
batch_size: int = 1000,
27-
) -> Iterator[_Var_Af_Pair]:
28-
"""Generate a variant-allele frequency data pairing, one at a time
19+
def _yield_expression_af_batches(
20+
vcf: pysam.VariantFile, batch_size: int = 1000
21+
) -> Iterator[list[tuple[str, CohortAlleleFrequencyStudyResult]]]:
22+
"""Generate batches of variant expression-allele frequency data pairings
2923
3024
:param vcf: VCF to pull variants from
3125
:param translator: VRS-Python variant translator for converting VCF expressions to VRS
3226
:param assembly: name of reference assembly used by VCF
3327
:param batch_size: size of return batches
28+
:return: iterator of lists of pairs of variant expressions and AF data classes
3429
"""
35-
batch: list[_Var_Af_Pair] = []
30+
batch = []
3631

3732
for record in vcf:
3833
for i, alt in enumerate(record.alts or []):
3934
if record.ref is None or "*" in record.ref or "*" in alt:
4035
_logger.warning("Skipping missing allele at %s", record)
4136
continue
4237
expression = f"{record.chrom}-{record.pos}-{record.ref}-{alt}"
43-
vrs_variation = translator.translate_from(
44-
expression, "gnomad", assembly_name=assembly
45-
)
46-
af = AlleleFrequencyAnnotation(
38+
af = AfData(
4739
ac=record.info["AC"][i],
4840
an=record.info["AN"],
4941
ac_het=record.info["AC_Het"][i],
5042
ac_hom=record.info["AC_Hom"][i],
5143
ac_hemi=record.info["AC_Hemi"][i],
5244
)
53-
batch.append((vrs_variation, af))
45+
batch.append((expression, af))
5446
if len(batch) >= batch_size:
5547
yield batch
5648
batch = []
@@ -74,14 +66,12 @@ def ingest_vcf(vcf_path: Path, av: BaseAnyVarClient, assembly: str = "GRCh38") -
7466
:param av: AnyVar client
7567
:param assembly: reference assembly used by VCF
7668
"""
77-
dataproxy = create_dataproxy(
78-
os.environ.get("SEQREPO_DATAPROXY_URI", "seqrepo+http://localhost:5000/seqrepo")
79-
)
80-
translator = AlleleTranslator(dataproxy)
8169
vcf = pysam.VariantFile(filename=vcf_path.absolute().as_uri(), mode="r")
8270

83-
for batch in _yield_var_af_batches(vcf, translator, assembly):
84-
variants = [v for v, _ in batch]
85-
av.put_objects(variants)
86-
for variant, af in batch: # noqa: B007
87-
pass # make a call to a storage class to store frequency data -- see issue 23
71+
for batch in _yield_expression_af_batches(vcf):
72+
expressions, afs = zip(*batch, strict=True)
73+
variant_ids = av.put_allele_expressions(expressions, assembly)
74+
for variant_id, af in zip(variant_ids, afs, strict=True): # noqa: B007
75+
if variant_id is None:
76+
continue
77+
# TODO: store the allele frequency data (`af`) for `variant_id` here

tests/data/variations.json

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
},
4444
"state": { "type": "LiteralSequenceExpression", "sequence": "T" }
4545
},
46-
"comment": "BRAF V600E (genomic)"
46+
"comment": "BRAF V600E (genomic)",
47+
"vcf_expression": "7-140753336-A-T"
4748
},
4849
"ga4gh:VA.ruQCmfXJrEylHmOQF-1PG6bLwQesDU2g": {
4950
"variation": {
@@ -66,30 +67,30 @@
6667
"sequence": "G"
6768
}
6869
},
69-
"comment": "Y-2781704-G-G"
70+
"vcf_expression": "Y-2781704-G-G"
7071
},
7172
"ga4gh:VA.IM4QyU9D2kTJzeftUBBD4Vcd1peq0dn1": {
7273
"variation": {
73-
"id": "ga4gh:VA.IM4QyU9D2kTJzeftUBBD4Vcd1peq0dn1",
74-
"digest": "IM4QyU9D2kTJzeftUBBD4Vcd1peq0dn1",
74+
"id": "ga4gh:VA.xbX035HgURWIUAjn6x3cS26jafP8Q_bk",
7575
"type": "Allele",
76+
"digest": "xbX035HgURWIUAjn6x3cS26jafP8Q_bk",
7677
"location": {
77-
"id": "ga4gh:SL.sWfeTXwGUkfIuYRAkiFGPjkSk_mIDuXG",
78-
"digest": "sWfeTXwGUkfIuYRAkiFGPjkSk_mIDuXG",
78+
"id": "ga4gh:SL.sYiBcbbgF-1CANNCTfQ6zwZOU0iHhymR",
7979
"type": "SequenceLocation",
80-
"start": 2781760,
81-
"end": 2781762,
80+
"digest": "sYiBcbbgF-1CANNCTfQ6zwZOU0iHhymR",
8281
"sequenceReference": {
8382
"type": "SequenceReference",
8483
"refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5"
85-
}
84+
},
85+
"start": 2781760,
86+
"end": 2781761
8687
},
8788
"state": {
8889
"type": "LiteralSequenceExpression",
89-
"sequence": "CA"
90+
"sequence": "C"
9091
}
9192
},
92-
"comment": "Y-2781761-C-C"
93+
"vcf_expression": "Y-2781761-C-C"
9394
},
9495
"ga4gh:VA.xbX035HgURWIUAjn6x3cS26jafP8Q_bk": {
9596
"variation": {
@@ -109,7 +110,7 @@
109110
},
110111
"state": { "type": "LiteralSequenceExpression", "sequence": "C" }
111112
},
112-
"comment": "Y-2781761-C-C"
113+
"vcf_expression": "Y-2781761-C-C"
113114
},
114115
"ga4gh:VA.9VDxL0stMBOZwcTKw3yb3UoWQkpaI9OD": {
115116
"variation": {
@@ -129,7 +130,7 @@
129130
},
130131
"state": { "type": "LiteralSequenceExpression", "sequence": "A" }
131132
},
132-
"comment": "Y-2781761-C-A"
133+
"vcf_expression": "Y-2781761-C-A"
133134
},
134135
"ga4gh:VA.yi7A2l0uIUMaInQaJnHU_B2Cf_OuZRJg": {
135136
"variation": {
@@ -154,7 +155,7 @@
154155
"repeatSubunitLength": 1
155156
}
156157
},
157-
"comment": "Y-2781761-CA-C"
158+
"vcf_expression": "Y-2781761-CA-C"
158159
}
159160
}
160161
}

0 commit comments

Comments
 (0)