Skip to content

Commit 60be88e

Browse files
jennifer-bowserjsstevensonkorikuzma
authored
feat: stub vlm request endpoint (issue #17)
* first pass at drafting stub api endpoint for vlm caf request * add enums for genomic reference assembly ids * add validation for 'referenceBases' and 'alternateBases' * refactor to use FastAPI's built-in validation * add validation for 'referenceName' param * add TODOs with issue numbers for work to be completed in future tickets * move endpoint from 'main.py' into 'restapi/vlm.py' * refactor chromosome name validation to be more streamlined * fix casing in function params * add newline to end of file * update endpoint description * update 'get_caf' method signature with better typing * update a few names, types, and comments for clarity * add support for mitochondrial DNA * fix typo: 'uscs' > 'ucsc' Co-authored-by: James Stevenson <[email protected]> * use python's built-in 'removeprefix' function Co-authored-by: James Stevenson <[email protected]> * Update 'uscs' > 'ucsc' in all imports/usages * remove docstring from FastAPI endpoint since the info is duplicated by the auto-generated documentation * streamline chromosome name validation * update one last instance of 'uscs' > 'ucsc' * Expand the allowable values in the 'GenomicSequence' type Co-authored-by: Kori Kuzma <[email protected]> * rename 'GenomicSequence' to 'NucleotideSequence' for specificity * add tests for chromosome name validation * add support for gap character in NucleotideSequence type --------- Co-authored-by: James Stevenson <[email protected]> Co-authored-by: Kori Kuzma <[email protected]>
1 parent a961068 commit 60be88e

File tree

7 files changed

+203
-14
lines changed

7 files changed

+203
-14
lines changed

src/anyvlm/functions/get_caf.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,30 @@
33
from ga4gh.va_spec.base.core import CohortAlleleFrequencyStudyResult
44

55
from anyvlm.anyvar.base_client import BaseAnyVarClient
6+
from anyvlm.utils.types import (
7+
ChromosomeName,
8+
GrcAssemblyId,
9+
NucleotideSequence,
10+
UcscAssemblyBuild,
11+
)
612

713

814
def get_caf(
9-
av: BaseAnyVarClient, accession_id: str, start: int, end: int
15+
anyvar_client: BaseAnyVarClient,
16+
assembly_id: GrcAssemblyId | UcscAssemblyBuild,
17+
reference_name: ChromosomeName,
18+
start: int,
19+
reference_bases: NucleotideSequence,
20+
alternate_bases: NucleotideSequence,
1021
) -> list[CohortAlleleFrequencyStudyResult]:
1122
"""Retrieve Cohort Allele Frequency data for all known variants matching provided search params
1223
13-
:param av: AnyVar client
14-
:param accession_id: ID for sequence to search upon
24+
:param anyvar_client: AnyVar client
25+
:param assembly_id: The reference assembly to utilize - must be one of: "GRCh37", "GRCh38", "hg38", "hg19"
26+
:param reference_name: The chromosome to search on, with an optional "chr" prefix - e.g., "1", "chr22", "X", "chrY", etc.
1527
:param start: position of the variant on the reference sequence
16-
:param end: end of range to search
28+
:param reference_bases: Reference allele bases ('T', 'AC', etc.)
29+
:param alternate_bases: Alternate allele bases ('T', 'AC', etc.)
1730
:return: list of CAFs for all known variants matching the search params
1831
"""
19-
raise NotImplementedError
32+
raise NotImplementedError # TODO: Implement this. See Issue #16.

src/anyvlm/main.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import logging
44
from collections.abc import AsyncGenerator
55
from contextlib import asynccontextmanager
6-
from enum import Enum
76

87
from anyvar.anyvar import create_storage, create_translator
98
from fastapi import FastAPI
@@ -19,6 +18,9 @@
1918
ServiceOrganization,
2019
ServiceType,
2120
)
21+
from anyvlm.utils.types import (
22+
EndpointTag,
23+
)
2224

2325
_logger = logging.getLogger(__name__)
2426

@@ -79,18 +81,11 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
7981
)
8082

8183

82-
class _Tag(str, Enum):
83-
"""Define tag names for endpoints."""
84-
85-
META = "Meta"
86-
SEARCH = "Search"
87-
88-
8984
@app.get(
9085
"/service-info",
9186
summary="Get basic service information",
9287
description="Retrieve service metadata, such as versioning and contact info. Structured in conformance with the [GA4GH service info API specification](https://www.ga4gh.org/product/service-info/)",
93-
tags=[_Tag.META],
88+
tags=[EndpointTag.META],
9489
)
9590
def service_info() -> ServiceInfo:
9691
"""Provide service info per GA4GH Service Info spec"""

src/anyvlm/restapi/vlm.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,21 @@
11
"""Define route(s) for the variant-level matching (VLM) protocol"""
22

33
from pathlib import Path
4+
from typing import Annotated
5+
6+
from fastapi import Query, Request
7+
8+
from anyvlm.anyvar.base_client import BaseAnyVarClient
9+
from anyvlm.functions.get_caf import get_caf
10+
from anyvlm.main import app
11+
from anyvlm.schemas.vlm import VlmResponse
12+
from anyvlm.utils.types import (
13+
ChromosomeName,
14+
EndpointTag,
15+
GrcAssemblyId,
16+
NucleotideSequence,
17+
UcscAssemblyBuild,
18+
)
419

520

621
def ingest_vcf(vcf_path: Path) -> None:
@@ -9,3 +24,36 @@ def ingest_vcf(vcf_path: Path) -> None:
924
:param vcf_path: VCF file location
1025
"""
1126
raise NotImplementedError
27+
28+
29+
@app.get(
30+
"/variant_counts",
31+
summary="Provides allele counts of a single sequence variant, broken down by zygosity",
32+
description="Search for a single sequence variant and receive allele counts by zygosity, in accordance with the Variant-Level Matching protocol",
33+
tags=[EndpointTag.SEARCH],
34+
)
35+
# ruff: noqa: D103, N803 (allow camelCase args and don't require docstrings)
36+
def variant_counts(
37+
request: Request,
38+
assemblyId: Annotated[
39+
GrcAssemblyId | UcscAssemblyBuild,
40+
Query(..., description="Genome reference assembly"),
41+
],
42+
referenceName: Annotated[
43+
ChromosomeName, Query(..., description="Chromosome with optional 'chr' prefix")
44+
],
45+
start: Annotated[int, Query(..., description="Variant position")],
46+
referenceBases: Annotated[
47+
NucleotideSequence, Query(..., description="Genomic bases ('T', 'AC', etc.)")
48+
],
49+
alternateBases: Annotated[
50+
NucleotideSequence, Query(..., description="Genomic bases ('T', 'AC', etc.)")
51+
],
52+
) -> VlmResponse:
53+
anyvar_client: BaseAnyVarClient = request.app.state.anyvar_client
54+
55+
caf_data = get_caf( # noqa: F841 - TODO: remove this noqa when endpoint is complete. See Issue #16 and Issue #13.
56+
anyvar_client, assemblyId, referenceName, start, referenceBases, alternateBases
57+
)
58+
59+
return VlmResponse() # TODO: fill this out. See Issue #16 and Issue #13

src/anyvlm/schemas/vlm.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,9 @@
11
"""Schemas relating to VLM API."""
2+
3+
from pydantic import BaseModel
4+
5+
6+
class VlmResponse(BaseModel):
7+
"""Define response structure for the variant_counts endpoint."""
8+
9+
# TODO: Fill this in. See Issue #13

src/anyvlm/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Provide utilities."""

src/anyvlm/utils/types.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""Provide helpful type definitions, references, and type-based operations."""
2+
3+
from enum import Enum, StrEnum
4+
from typing import Annotated
5+
6+
from pydantic import BeforeValidator, StringConstraints
7+
8+
9+
class EndpointTag(str, Enum):
10+
"""Define tag names for endpoints."""
11+
12+
META = "Meta"
13+
SEARCH = "Search"
14+
15+
16+
class GrcAssemblyId(StrEnum):
17+
"""Supported GRC assembly identifiers"""
18+
19+
GRCH37 = "GRCh37"
20+
GRCH38 = "GRCh38"
21+
22+
23+
class UcscAssemblyBuild(StrEnum):
24+
"""Supported UCSC assembly builds"""
25+
26+
HG38 = "hg38"
27+
HG19 = "hg19"
28+
29+
30+
NucleotideSequence = Annotated[
31+
str,
32+
BeforeValidator(str.upper),
33+
StringConstraints(pattern=r"^[ACGTURYKMSWBDHVN.-]*$"),
34+
]
35+
36+
37+
def _normalize_chromosome_name(chromosome_name: str) -> str:
38+
"""Normalize a chromosome name. Input must be a string consisting of either a number between 1-22,
39+
or one of the values 'X', 'Y', or 'MT'; optionally prefixed with 'chr'.
40+
41+
:param chromosome_name: The name of the chromosome to normalize, following the rules stated above.
42+
:return: The chromosome name, stripped of it's 'chr' prefix if it was added
43+
"""
44+
chromosome_name = chromosome_name.upper().removeprefix("CHR")
45+
46+
min_chromosome_number = 1
47+
max_chromosome_number = 22
48+
49+
if chromosome_name in {"X", "Y", "MT"} or (
50+
chromosome_name.isdigit()
51+
and min_chromosome_number <= int(chromosome_name) <= max_chromosome_number
52+
):
53+
return chromosome_name
54+
55+
raise ValueError(
56+
"Invalid chromosome. Must be either a number between 1-22, or "
57+
"'one of the values 'X', 'Y', or 'MT'; optionally prefixed with 'chr'."
58+
)
59+
60+
61+
ChromosomeName = Annotated[str, BeforeValidator(_normalize_chromosome_name)]

tests/unit/test_types

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""Tests for types and validator functions found in src/anyvlm/utils/types.py"""
2+
3+
from pydantic import TypeAdapter, ValidationError
4+
import pytest
5+
from anyvlm.utils.types import ChromosomeName, _normalize_chromosome_name
6+
7+
8+
@pytest.fixture
9+
def valid_chromosomes():
10+
return [
11+
("1", "1"),
12+
("22", "22"),
13+
("X", "X"),
14+
("Y", "Y"),
15+
("MT", "MT"),
16+
("chr1", "1"),
17+
("Chr22", "22"),
18+
("cHrX", "X"),
19+
("chrMT", "MT"),
20+
]
21+
22+
@pytest.fixture
23+
def invalid_chromosomes():
24+
return [
25+
"0",
26+
"23",
27+
"chr23",
28+
"M",
29+
"chrM",
30+
"XY",
31+
"",
32+
"chr",
33+
"1a",
34+
None
35+
]
36+
37+
@pytest.fixture
38+
def chromosome_adapter():
39+
return TypeAdapter(ChromosomeName)
40+
41+
42+
### Test chromosome name normalization function ###
43+
def test_normalize_chromosome_name_valid(valid_chromosomes):
44+
for unnormalized_name, expected_name in valid_chromosomes:
45+
assert _normalize_chromosome_name(unnormalized_name) == expected_name
46+
47+
48+
def test_normalize_chromosome_name_invalid(invalid_chromosomes):
49+
for chromosome_name in invalid_chromosomes:
50+
with pytest.raises(ValueError):
51+
_normalize_chromosome_name(chromosome_name)
52+
53+
54+
### Test ChromosomeName annotated type ###
55+
def test_chromosome_name_adapter_valid(chromosome_adapter, valid_chromosomes):
56+
for raw, expected in valid_chromosomes:
57+
assert chromosome_adapter.validate_python(raw) == expected
58+
59+
60+
def test_chromosome_name_adapter_invalid(chromosome_adapter, invalid_chromosomes):
61+
for chromosome_name in invalid_chromosomes:
62+
with pytest.raises(ValidationError):
63+
chromosome_adapter.validate_python(chromosome_name)

0 commit comments

Comments
 (0)