feat: stub vlm request endpoint (issue #17)

jennifer-bowser · jsstevenson · korikuzma · web-flow · commit 60be88ebab10 · 2025-12-15T12:11:00.000-05:00
* first pass at drafting stub api endpoint for vlm caf request

* add enums for genomic reference assembly ids

* add validation for 'referenceBases' and 'alternateBases'

* refactor to use FastAPI's built-in validation

* add validation for 'referenceName' param

* add TODOs with issue numbers for work to be completed in future tickets

* move endpoint from 'main.py' into 'restapi/vlm.py'

* refactor chromosome name validation to be more streamlined

* fix casing in function params

* add newline to end of file

* update endpoint description

* update 'get_caf' method signature with better typing

* update a few names, types, and comments for clarity

* add support for mitochondrial DNA

* fix typo: 'uscs' > 'ucsc'

Co-authored-by: James Stevenson <james.stevenson@nationwidechildrens.org>

* use python's built-in 'removeprefix' function

Co-authored-by: James Stevenson <james.stevenson@nationwidechildrens.org>

* Update 'uscs' > 'ucsc' in all imports/usages

* remove docstring from FastAPI endpoint since the info is duplicated by the auto-generated documentation

* streamline chromosome name validation

* update one last instance of 'uscs' > 'ucsc'

* Expand the allowable values in the 'GenomicSequence' type

Co-authored-by: Kori Kuzma <korikuzma@gmail.com>

* rename 'GenomicSequence' to 'NucleotideSequence' for specificity

* add tests for chromosome name validation

* add support for gap character in NucleotideSequence type

---------

Co-authored-by: James Stevenson <james.stevenson@nationwidechildrens.org>
Co-authored-by: Kori Kuzma <korikuzma@gmail.com>
diff --git a/src/anyvlm/functions/get_caf.py b/src/anyvlm/functions/get_caf.py
@@ -3,17 +3,30 @@
 from ga4gh.va_spec.base.core import CohortAlleleFrequencyStudyResult
 
 from anyvlm.anyvar.base_client import BaseAnyVarClient
+from anyvlm.utils.types import (
+    ChromosomeName,
+    GrcAssemblyId,
+    NucleotideSequence,
+    UcscAssemblyBuild,
+)
 
 
 def get_caf(
-    av: BaseAnyVarClient, accession_id: str, start: int, end: int
+    anyvar_client: BaseAnyVarClient,
+    assembly_id: GrcAssemblyId | UcscAssemblyBuild,
+    reference_name: ChromosomeName,
+    start: int,
+    reference_bases: NucleotideSequence,
+    alternate_bases: NucleotideSequence,
 ) -> list[CohortAlleleFrequencyStudyResult]:
     """Retrieve Cohort Allele Frequency data for all known variants matching provided search params
 
-    :param av: AnyVar client
-    :param accession_id: ID for sequence to search upon
+    :param anyvar_client: AnyVar client
+    :param assembly_id: The reference assembly to utilize - must be one of: "GRCh37", "GRCh38", "hg38", "hg19"
+    :param reference_name: The chromosome to search on, with an optional "chr" prefix - e.g., "1", "chr22", "X", "chrY", etc.
     :param start: start of range search
-    :param end: end of range to search
+    :param reference_bases: Genomic bases ('T', 'AC', etc.)
+    :param alternate_bases: Genomic bases ('T', 'AC', etc.)
     :return: list of CAFs contained in search interval
     """
-    raise NotImplementedError
+    raise NotImplementedError  # TODO: Implement this. See Issue #16.
diff --git a/src/anyvlm/main.py b/src/anyvlm/main.py
@@ -3,7 +3,6 @@
 import logging
 from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from enum import Enum
 
 from anyvar.anyvar import create_storage, create_translator
 from fastapi import FastAPI
@@ -19,6 +18,9 @@
     ServiceOrganization,
     ServiceType,
 )
+from anyvlm.utils.types import (
+    EndpointTag,
+)
 
 _logger = logging.getLogger(__name__)
 
@@ -79,18 +81,11 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
 )
 
 
-class _Tag(str, Enum):
-    """Define tag names for endpoints."""
-
-    META = "Meta"
-    SEARCH = "Search"
-
-
 @app.get(
     "/service-info",
     summary="Get basic service information",
     description="Retrieve service metadata, such as versioning and contact info. Structured in conformance with the [GA4GH service info API specification](https://www.ga4gh.org/product/service-info/)",
-    tags=[_Tag.META],
+    tags=[EndpointTag.META],
 )
 def service_info() -> ServiceInfo:
     """Provide service info per GA4GH Service Info spec"""
diff --git a/src/anyvlm/restapi/vlm.py b/src/anyvlm/restapi/vlm.py
@@ -1,6 +1,21 @@
 """Define route(s) for the variant-level matching (VLM) protocol"""
 
 from pathlib import Path
+from typing import Annotated
+
+from fastapi import Query, Request
+
+from anyvlm.anyvar.base_client import BaseAnyVarClient
+from anyvlm.functions.get_caf import get_caf
+from anyvlm.main import app
+from anyvlm.schemas.vlm import VlmResponse
+from anyvlm.utils.types import (
+    ChromosomeName,
+    EndpointTag,
+    GrcAssemblyId,
+    NucleotideSequence,
+    UcscAssemblyBuild,
+)
 
 
 def ingest_vcf(vcf_path: Path) -> None:
@@ -9,3 +24,36 @@ def ingest_vcf(vcf_path: Path) -> None:
     :param vcf_path: VCF file location
     """
     raise NotImplementedError
+
+
+# Stub route for a single-variant VLM query. FastAPI uses the function's parameter
+# names as the query parameter names, which is why the arguments are camelCase.
+@app.get(
+    "/variant_counts",
+    summary="Provides allele counts of a single sequence variant, broken down by zygosity",
+    description="Search for a single sequence variant and receive allele counts by zygosity, in accordance with the Variant-Level Matching protocol",
+    tags=[EndpointTag.SEARCH],
+)
+# NOTE(review): `# ruff: noqa: ...` directives are file-level in ruff, so the line
+# below likely suppresses D103/N803 for this whole module - confirm that is intended.
+# ruff: noqa: D103, N803 (allow camelCase args and don't require docstrings)
+def variant_counts(
+    request: Request,
+    assemblyId: Annotated[
+        GrcAssemblyId | UcscAssemblyBuild,
+        Query(..., description="Genome reference assembly"),
+    ],
+    referenceName: Annotated[
+        ChromosomeName, Query(..., description="Chromosome with optional 'chr' prefix")
+    ],
+    start: Annotated[int, Query(..., description="Variant position")],
+    referenceBases: Annotated[
+        NucleotideSequence, Query(..., description="Genomic bases ('T', 'AC', etc.)")
+    ],
+    alternateBases: Annotated[
+        NucleotideSequence, Query(..., description="Genomic bases ('T', 'AC', etc.)")
+    ],
+) -> VlmResponse:
+    # The AnyVar client is read from the application state; it is presumably stored
+    # there during app startup (not shown here).
+    anyvar_client: BaseAnyVarClient = request.app.state.anyvar_client
+
+    # NOTE: as of this change `get_caf` is a stub that raises NotImplementedError, so
+    # the lines below it are not reached yet.
+    caf_data = get_caf(  # noqa: F841 - TODO: remove this noqa when endpoint is complete. See Issue #16 and Issue #13.
+        anyvar_client, assemblyId, referenceName, start, referenceBases, alternateBases
+    )
+
+    return VlmResponse()  # TODO: fill this out. See Issue #16 and Issue #13
diff --git a/src/anyvlm/schemas/vlm.py b/src/anyvlm/schemas/vlm.py
@@ -1 +1,9 @@
 """Schemas relating to VLM API."""
+
+from pydantic import BaseModel
+
+
+class VlmResponse(BaseModel):
+    """Define response structure for the variant_counts endpoint.
+
+    Placeholder: no fields are declared yet, so instances carry no data.
+    """
+
+    # TODO: Fill this in. See Issue #13
diff --git a/src/anyvlm/utils/__init__.py b/src/anyvlm/utils/__init__.py
@@ -0,0 +1 @@
+"""Provide utilities."""
diff --git a/src/anyvlm/utils/types.py b/src/anyvlm/utils/types.py
@@ -0,0 +1,61 @@
+"""Provide helpful type definitions, references, and type-based operations."""
+
+from enum import Enum, StrEnum
+from typing import Annotated
+
+from pydantic import BeforeValidator, StringConstraints
+
+
+class EndpointTag(str, Enum):
+    """Define tag names for endpoints.
+
+    Members are passed as ``tags=[...]`` on the route decorators.
+    """
+
+    META = "Meta"
+    SEARCH = "Search"
+
+
+class GrcAssemblyId(StrEnum):
+    """Supported GRC assembly identifiers.
+
+    Accepted, together with ``UcscAssemblyBuild``, as the reference assembly of a
+    variant query.
+    """
+
+    GRCH37 = "GRCh37"
+    GRCH38 = "GRCh38"
+
+
+class UcscAssemblyBuild(StrEnum):
+    """Supported UCSC assembly builds.
+
+    Accepted, together with ``GrcAssemblyId``, as the reference assembly of a
+    variant query.
+    """
+
+    HG38 = "hg38"
+    HG19 = "hg19"
+
+
+# String type for nucleotide sequences. Input is upper-cased first (so lowercase
+# input is accepted), then must consist solely of the characters in the pattern
+# below. The letters appear to be the IUPAC nucleotide codes; '.' and '-' are also
+# allowed (gap support). Because the pattern uses `*`, an empty string also passes.
+NucleotideSequence = Annotated[
+    str,
+    BeforeValidator(str.upper),
+    StringConstraints(pattern=r"^[ACGTURYKMSWBDHVN.-]*$"),
+]
+
+
+def _normalize_chromosome_name(chromosome_name: str) -> str:
+    """Normalize a chromosome name to its canonical, prefix-free form.
+
+    Matching is case-insensitive and a single leading 'chr' prefix is optional, so
+    "chr1", "Chr22", "cHrX" and "MT" are all accepted.
+
+    :param chromosome_name: Either a number between 1-22, or one of the values 'X',
+        'Y', or 'MT'; optionally prefixed with 'chr'.
+    :return: The upper-cased chromosome name with any 'chr' prefix and any leading
+        zeros removed - e.g., "1", "22", "X", "MT".
+    :raises ValueError: If the input is not a string or does not name a supported
+        chromosome.
+    """
+    error_message = (
+        "Invalid chromosome. Must be either a number between 1-22, or "
+        "one of the values 'X', 'Y', or 'MT'; optionally prefixed with 'chr'."
+    )
+
+    # Deliberately ValueError rather than TypeError/AttributeError: pydantic only
+    # turns ValueError/AssertionError from a validator into a ValidationError, so
+    # non-string input such as None must fail this way to be reported cleanly.
+    if not isinstance(chromosome_name, str):
+        raise ValueError(error_message)
+
+    chromosome_name = chromosome_name.upper().removeprefix("CHR")
+    if chromosome_name in {"X", "Y", "MT"}:
+        return chromosome_name
+
+    min_chromosome_number = 1
+    max_chromosome_number = 22
+
+    # `str.isdigit` alone also accepts non-ASCII digits (e.g. Arabic-Indic or
+    # superscript digits); require ASCII so only "0"-"9" get through.
+    if chromosome_name.isascii() and chromosome_name.isdigit():
+        chromosome_number = int(chromosome_name)
+        if min_chromosome_number <= chromosome_number <= max_chromosome_number:
+            # Re-render from the int so zero-padded input ("01") is normalized too.
+            return str(chromosome_number)
+
+    raise ValueError(error_message)
+
+
+# String type whose input is passed through `_normalize_chromosome_name` before
+# pydantic's own validation, so validated values are prefix-free (e.g. "chr1" -> "1").
+ChromosomeName = Annotated[str, BeforeValidator(_normalize_chromosome_name)]
diff --git a/tests/unit/test_types.py b/tests/unit/test_types.py
@@ -0,0 +1,63 @@
+"""Tests for types and validator functions found in src/anyvlm/utils/types.py"""
+
+from pydantic import TypeAdapter, ValidationError
+import pytest
+from anyvlm.utils.types import ChromosomeName, _normalize_chromosome_name
+
+
+@pytest.fixture
+def valid_chromosomes():
+    """Provide ``(raw input, expected normalized name)`` pairs that must be accepted."""
+    return [
+        ("1", "1"),
+        ("22", "22"),
+        ("X", "X"),
+        ("Y", "Y"),
+        ("MT", "MT"),
+        ("chr1", "1"),
+        ("Chr22", "22"),
+        ("cHrX", "X"),
+        ("chrMT", "MT"),
+    ]
+
+@pytest.fixture
+def invalid_chromosomes():
+    """Provide inputs that must be rejected as chromosome names.
+
+    The ``None`` entry exercises non-string input: the validator has to raise
+    ValueError (not AttributeError) for it, because pydantic only converts
+    ValueError/AssertionError raised by a validator into a ValidationError.
+    """
+    return [
+        "0",
+        "23",
+        "chr23",
+        "M",
+        "chrM",
+        "XY",
+        "",
+        "chr",
+        "1a",
+        None
+    ]
+
+@pytest.fixture
+def chromosome_adapter():
+    """Provide a pydantic ``TypeAdapter`` for validating ``ChromosomeName`` values."""
+    return TypeAdapter(ChromosomeName)
+
+
+### Test chromosome name normalization function ###
+def test_normalize_chromosome_name_valid(valid_chromosomes):
+    """Each valid input is normalized to its expected prefix-free name."""
+    for unnormalized_name, expected_name in valid_chromosomes:
+        assert _normalize_chromosome_name(unnormalized_name) == expected_name
+
+
+def test_normalize_chromosome_name_invalid(invalid_chromosomes):
+    """Each invalid input makes the normalizer raise ValueError."""
+    for chromosome_name in invalid_chromosomes:
+        with pytest.raises(ValueError):
+            _normalize_chromosome_name(chromosome_name)
+
+
+### Test ChromosomeName annotated type ###
+def test_chromosome_name_adapter_valid(chromosome_adapter, valid_chromosomes):
+    """Validating through the annotated type yields the normalized name."""
+    for raw, expected in valid_chromosomes:
+        assert chromosome_adapter.validate_python(raw) == expected
+
+
+def test_chromosome_name_adapter_invalid(chromosome_adapter, invalid_chromosomes):
+    """Validating an invalid input through the annotated type raises ValidationError."""
+    for chromosome_name in invalid_chromosomes:
+        with pytest.raises(ValidationError):
+            chromosome_adapter.validate_python(chromosome_name)