Skip to content

Commit cccd752

Browse files
jennifer-bowserjsstevensonkorikuzma
authored
feat: build vlm response (issue #13)
* first pass at drafting stub api endpoint for vlm caf request * add enums for genomic reference assembly ids * add validation for 'referenceBases' and 'alternateBases' * refactor to use FastAPI's built-in validation * add validation for 'referenceName' param * add TODOs with issue numbers for work to be completed in future tickets * move endpoint from 'main.py' into 'restapi/vlm.py' * refactor chromosome name validation to be more streamlined * fix casing in function params * add newline to end of file * update endpoint description * fix a 'vlm' string to make it 'anyvlm' - missed in original PR * first pass at creating the VlmResponse object and filling in default gregor values * added some descriptions to the vlm response schema objects * adds more descriptions and clearer TODO messages * move logic out of http endpoint handler into a reusable function * add validation for ResultSet ids + add extra info to TODOs * update zygosity getter func to raise a NotImplementedError * just raise a 'NotImplementedError' for 'build_vlm_response_from_caf_data' for now instead of trying to guess how things will be formatted * update comment for clarity * update 'get_caf' method signature with better typing * update a few names, types, and comments for clarity * add support for mitochondrial DNA * use a pydantic model for 'ReturnedSchema' in the 'Meta' class * update 'Meta.apiVersion' to refer to the _VLM_ API version, not our _own_ API version * use correct casing for GREGoR * update TODOs re: configurability to reference new issue #27 * add descriptions for all Fields that didn't already have them * adds tests for validation code in 'VlmResponse' * use variable for error message matching instead of comparing raw strings * fix typo: 'uscs' > 'ucsc' Co-authored-by: James Stevenson <[email protected]> * use python's built-in 'removeprefix' function Co-authored-by: James Stevenson <[email protected]> * Update 'uscs' > 'ucsc' in all imports/usages * remove docstring from FastAPI 
endpoint since the info is duplicated by the auto-generated documentation * streamline chromosome name validation * update one last instance of 'uscs' > 'ucsc' * Expand the allowable values in the 'GenomicSequence' type Co-authored-by: Kori Kuzma <[email protected]> * rename 'GenomicSequence' to 'NucleotideSequence' for specificity * use 'ConfigDict' instead of raw dictionary object Co-authored-by: Kori Kuzma <[email protected]> * Whoops update import to use 'ConfigDict' Co-authored-by: Kori Kuzma <[email protected]> --------- Co-authored-by: James Stevenson <[email protected]> Co-authored-by: Kori Kuzma <[email protected]>
1 parent 60be88e commit cccd752

File tree

6 files changed

+273
-8
lines changed

6 files changed

+273
-8
lines changed

src/anyvlm/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class Settings(BaseSettings):
1515
"""
1616

1717
model_config = SettingsConfigDict(
18-
env_prefix="vlm_",
18+
env_prefix="anyvlm_",
1919
env_file=".env",
2020
env_file_encoding="utf-8",
2121
extra="ignore",
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""Craft a VlmResponse object from a list of CohortAlleleFrequencyStudyResults"""
2+
3+
from ga4gh.va_spec.base.core import CohortAlleleFrequencyStudyResult
4+
5+
from anyvlm.schemas.vlm import (
6+
VlmResponse,
7+
)
8+
9+
10+
def build_vlm_response_from_caf_data(
    caf_data: list[CohortAlleleFrequencyStudyResult],
) -> VlmResponse:
    """Craft a VlmResponse object from a list of CohortAlleleFrequencyStudyResults.

    This is currently a stub: it performs no work and always raises.

    :param caf_data: A list of `CohortAlleleFrequencyStudyResult` objects that will be used to build the VlmResponse
    :return: A `VlmResponse` object.
    :raises NotImplementedError: always, until the implementation lands (see Issue #16)
    """
    raise NotImplementedError  # TODO: Implement this during/after Issue #16

src/anyvlm/restapi/vlm.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,15 @@
44
from typing import Annotated
55

66
from fastapi import Query, Request
7+
from ga4gh.va_spec.base.core import CohortAlleleFrequencyStudyResult
78

89
from anyvlm.anyvar.base_client import BaseAnyVarClient
10+
from anyvlm.functions.build_vlm_response import build_vlm_response_from_caf_data
911
from anyvlm.functions.get_caf import get_caf
1012
from anyvlm.main import app
11-
from anyvlm.schemas.vlm import VlmResponse
13+
from anyvlm.schemas.vlm import (
14+
VlmResponse,
15+
)
1216
from anyvlm.utils.types import (
1317
ChromosomeName,
1418
EndpointTag,
@@ -51,9 +55,7 @@ def variant_counts(
5155
],
5256
) -> VlmResponse:
5357
anyvar_client: BaseAnyVarClient = request.app.state.anyvar_client
54-
55-
caf_data = get_caf( # noqa: F841 - TODO: remove this noqa when endpoint is complete. See Issue #16 and Issue #13.
58+
caf_data: list[CohortAlleleFrequencyStudyResult] = get_caf(
5659
anyvar_client, assemblyId, referenceName, start, referenceBases, alternateBases
5760
)
58-
59-
return VlmResponse() # TODO: fill this out. See Issue #16 and Issue #13
61+
return build_vlm_response_from_caf_data(caf_data)

src/anyvlm/schemas/vlm.py

Lines changed: 147 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,154 @@
11
"""Schemas relating to VLM API."""
22

3-
from pydantic import BaseModel
3+
from typing import ClassVar, Literal, Self
4+
5+
from pydantic import BaseModel, ConfigDict, Field, model_validator
6+
7+
from anyvlm.utils.types import Zygosity
8+
9+
# ruff: noqa: N815 (allows camelCase vars instead of snake_case to align with expected VLM protocol response)
10+
11+
RESULT_ENTITY_TYPE = "genomicVariant"
12+
13+
14+
class HandoverType(BaseModel):
    """The type of handover the parent `BeaconHandover` represents.

    Both fields currently default to hard-coded GREGoR values.
    """

    # `id` doubles as the `<node_id>` prefix that `VlmResponse.validate_resultset_ids`
    # expects at the start of every `ResultSet.id`.
    id: str = Field(
        default="gregor", description="Node-specific identifier"
    )  # TODO: enable configuration of this field. See Issue #27.
    label: str = Field(
        default="GREGoR AnVIL browser", description="Node-specific label"
    )  # TODO: enable configuration of this field. See Issue #27.
23+
24+
25+
class BeaconHandover(BaseModel):
    """Describes how users can get more information about the results provided in the parent `VlmResponse`"""

    # Defaults to a `HandoverType` built from its own field defaults.
    handoverType: HandoverType = HandoverType()
    url: str = Field(
        default="https://anvil.terra.bio/#workspaces?filter=GREGoR",  # TODO: enable configuration of this field. See Issue #27.
        description="A url which directs users to more detailed information about the results tabulated by the API (ideally human-readable)",
    )
33+
34+
35+
class ReturnedSchema(BaseModel):
    """Fixed [Beacon Schema](https://github.com/ga4gh-beacon/beacon-v2/blob/c6558bf2e6494df3905f7b2df66e903dfe509500/framework/json/common/beaconCommonComponents.json#L241)"""

    entityType: str = Field(
        default=RESULT_ENTITY_TYPE,
        description=f"The type of entity this response describes. Must always be set to '{RESULT_ENTITY_TYPE}'",
    )
    # The Python attribute is `schema_`; the aliased name is `schema`.
    schema_: str = Field(
        default="ga4gh-beacon-variant-v2.0.0",
        # Alias is required because 'schema' is reserved by Pydantic's BaseModel class,
        # But VLM expects a field named 'schema'
        alias="schema",
    )

    # `populate_by_name=True` lets callers construct the model with either
    # `schema_=...` or the alias `schema=...`.
    model_config = ConfigDict(populate_by_name=True)
50+
51+
52+
class Meta(BaseModel):
    """Relevant metadata about the results provided in the parent `VlmResponse`"""

    # Version of the VLM protocol this response follows.
    apiVersion: str = Field(
        default="v1.0",
        description="The version of the VLM API that this response conforms to",
    )
    beaconId: str = Field(
        default="org.gregor.beacon",  # TODO: enable configuration of this field. See Issue #27.
        description="""
The Id of a Beacon. Usually a reversed domain string, but any URI is acceptable. The purpose of this attribute is,
in the context of a Beacon network, to disambiguate responses coming from different Beacons. See the beacon documentation
[here](https://github.com/ga4gh-beacon/beacon-v2/blob/c6558bf2e6494df3905f7b2df66e903dfe509500/framework/src/common/beaconCommonComponents.yaml#L26)
""",
    )
    # Defaults to a single `ReturnedSchema` built from its own field defaults.
    returnedSchemas: list[ReturnedSchema] = [ReturnedSchema()]
68+
69+
70+
class ResponseSummary(BaseModel):
    """A high-level summary of the results provided in the parent `VlmResponse`"""

    # Both fields are required (`...`): there is no default summary.
    exists: bool = Field(
        ..., description="Indicates whether the response contains any results."
    )
    numTotalResults: int = Field(
        ..., description="The total number of results found for the given query"
    )
79+
80+
81+
class ResultSet(BaseModel):
    """A set of cohort allele frequency results. The zygosity of the ResultSet is identified in the `id` field"""

    # `Literal[True]` rejects any other value for `exists` at validation time.
    exists: Literal[True] = Field(
        default=True,
        description="Indicates whether this ResultSet exists. This must always be `True`, even if `resultsCount` = `0`",
    )
    # Required. The format is not checked here; it is checked by the
    # `validate_resultset_ids` validator on `VlmResponse`.
    id: str = Field(
        ...,
        description="id should be constructed of the `HandoverType.id` + the ResultSet's zygosity. See `validate_resultset_ids` validator in `VlmResponse` class.",
        examples=["Geno2MP Homozygous", "MyGene2 Heterozygous"],
    )
    # `min_length=0` together with `max_length=0` only admits an empty list.
    results: list = Field(
        default=[],
        min_length=0,
        max_length=0,
        description="This must always be set to an empty array",
    )
    resultsCount: int = Field(
        ..., description="A count for the zygosity indicated by the ResultSet's `id`"
    )
    setType: str = Field(
        default=RESULT_ENTITY_TYPE,
        description=f"The type of entity relevant to these results. Must always be set to '{RESULT_ENTITY_TYPE}'",
    )
106+
107+
108+
class ResponseField(BaseModel):
    """A list of ResultSets"""

    # Required; an empty list is accepted because no minimum length is set.
    resultSets: list[ResultSet] = Field(
        ..., description="A list of ResultSets for the given query."
    )
4114

5115

6116
class VlmResponse(BaseModel):
    """Define response structure for the variant_counts endpoint.

    `responseSummary` and `response` are required; `beaconHandovers` and `meta`
    fall back to instances built from their own field defaults. After the model
    is built, every `ResultSet.id` in `response.resultSets` is checked by
    `validate_resultset_ids`.
    """

    beaconHandovers: list[BeaconHandover] = [BeaconHandover()]
    meta: Meta = Meta()
    responseSummary: ResponseSummary
    response: ResponseField

    # Common prefix of every ResultSet id validation error. Declared as a
    # ClassVar so it is not treated as a model field.
    resultset_id_error_message_base: ClassVar[str] = (
        "Invalid ResultSet id - ids must be in form '<node_id> <zygosity>'"
    )

    @model_validator(mode="after")
    def validate_resultset_ids(self) -> Self:
        """Ensure each ResultSet.id is correctly constructed.

        An id must be `<node_id> <zygosity>`, where `<node_id>` equals the
        `handoverType.id` of one of `self.beaconHandovers` and `<zygosity>` is
        a `Zygosity` value.

        :return: the validated model, unchanged
        :raises ValueError: if an id has no space separator, names an unknown
            node id, or carries a zygosity that is not a `Zygosity` value
        """
        handover_ids: set[str] = {
            handover.handoverType.id for handover in self.beaconHandovers
        }

        for result_set in self.response.resultSets:
            # Split on the FIRST space only: zygosity values may themselves
            # contain spaces (e.g. "Unknown Zygosity"), so a plain
            # `split(" ")` would wrongly reject such ids.
            # NOTE(review): this assumes node ids never contain a space.
            node_id, separator, zygosity = result_set.id.partition(" ")
            if not separator:
                error_message = f"{self.resultset_id_error_message_base}, but provided id of {result_set.id} contains invalid formatting"
                raise ValueError(error_message)

            if node_id not in handover_ids:
                error_message = f"{self.resultset_id_error_message_base}, but provided node_id of {node_id} does not match any `handoverType.id` provided in `self.beaconHandovers`"
                raise ValueError(error_message)

            try:
                Zygosity(zygosity)
            except ValueError as e:
                # A list (enum definition order) keeps the message deterministic.
                valid_zygosity_values = [member.value for member in Zygosity]
                error_message = f"{self.resultset_id_error_message_base}, but provided zygosity of {zygosity} is not found in allowable value set of: {', '.join(valid_zygosity_values)}"
                raise ValueError(error_message) from e

        return self

src/anyvlm/utils/types.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,12 @@ def _normalize_chromosome_name(chromosome_name: str) -> str:
5959

6060

6161
ChromosomeName = Annotated[str, BeforeValidator(_normalize_chromosome_name)]
62+
63+
64+
class Zygosity(StrEnum):
    """Allowable zygosity values as defined by the VLM protocol"""

    HOMOZYGOUS = "Homozygous"
    HETEROZYGOUS = "Heterozygous"
    HEMIZYGOUS = "Hemizygous"
    # This value contains a space, so code that parses "<node_id> <zygosity>"
    # strings must not assume the zygosity part is a single word.
    UNKNOWN = "Unknown Zygosity"

tests/unit/test_schemas.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""Test schema validation functionality"""
2+
3+
import re
4+
5+
import pytest
6+
7+
from anyvlm.schemas.vlm import (
8+
RESULT_ENTITY_TYPE,
9+
HandoverType,
10+
ResponseField,
11+
ResponseSummary,
12+
ResultSet,
13+
VlmResponse,
14+
)
15+
from anyvlm.utils.types import Zygosity
16+
17+
18+
@pytest.fixture(scope="module")
def valid_handover_id() -> str:
    """Provide the default `HandoverType` id, which is a valid ResultSet node id."""
    default_handover_type = HandoverType()
    return default_handover_type.id
21+
22+
23+
@pytest.fixture(scope="module")
def response_summary() -> ResponseSummary:
    """Provide a minimal summary describing an empty result."""
    summary = ResponseSummary(exists=False, numTotalResults=0)
    return summary
26+
27+
28+
@pytest.fixture(scope="module")
def responses_with_invalid_resultset_ids(valid_handover_id) -> list[ResponseField]:
    """Provide one `ResponseField` per kind of malformed ResultSet id."""
    invalid_ids = [
        # unknown node id
        f"invalid_handover_id {Zygosity.HOMOZYGOUS}",
        # unknown zygosity
        f"{valid_handover_id} invalid_zygosity",
        # incorrect order/formatting
        f"{Zygosity.HOMOZYGOUS}-{valid_handover_id}",
    ]
    return [
        ResponseField(
            resultSets=[
                ResultSet(
                    exists=True,
                    id=invalid_id,
                    resultsCount=0,
                    setType=RESULT_ENTITY_TYPE,
                )
            ]
        )
        for invalid_id in invalid_ids
    ]
62+
63+
64+
def test_valid_resultset_id(response_summary, valid_handover_id):
    """A well-formed '<node_id> <zygosity>' id passes validation untouched."""
    expected_id = f"{valid_handover_id} {Zygosity.HOMOZYGOUS}"
    result_set = ResultSet(
        exists=True,
        id=expected_id,
        resultsCount=0,
        setType=RESULT_ENTITY_TYPE,
    )
    response = ResponseField(resultSets=[result_set])

    # Should NOT raise an error
    vlm_response = VlmResponse(responseSummary=response_summary, response=response)

    first_result_set = vlm_response.response.resultSets[0]
    assert first_result_set.id == expected_id
83+
84+
85+
def test_invalid_resultset_ids(response_summary, responses_with_invalid_resultset_ids):
    """Every malformed ResultSet id is rejected with the shared error prefix."""
    # The base message contains regex metacharacters, so escape it once up front.
    expected_pattern = re.escape(VlmResponse.resultset_id_error_message_base)

    for invalid_response in responses_with_invalid_resultset_ids:
        with pytest.raises(ValueError, match=expected_pattern):
            VlmResponse(responseSummary=response_summary, response=invalid_response)

0 commit comments

Comments
 (0)