Skip to content

Commit 65ed067

Browse files
Add VCF loading HTTP endpoint (#6) (#43)
see issue #6 for design decisions, etc --------- Co-authored-by: James Stevenson <james.stevenson@nationwidechildrens.org>
1 parent 489a530 commit 65ed067

File tree

8 files changed

+657
-10
lines changed

8 files changed

+657
-10
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ ignore-variadic-names = true
154154

155155
[tool.ruff.lint.per-file-ignores]
156156
"__init__.py" = ["F401", "E402"]
157-
"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "INP001", "D", "C400", "PLR2004"]
157+
"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "INP001", "D", "C400", "PLR2004", "PLC0415"]
158158

159159
[tool.ruff.format]
160160
docstring-code-format = true

src/anyvlm/restapi/vlm.py

Lines changed: 249 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
11
"""Define route(s) for the variant-level matching (VLM) protocol"""
22

3+
import gzip
4+
import logging
5+
import tempfile
6+
import uuid
37
from pathlib import Path
4-
from typing import Annotated
8+
from typing import Annotated, BinaryIO, Literal
59

6-
from fastapi import Query, Request
10+
from anyvar.utils.liftover_utils import ReferenceAssembly
11+
from fastapi import HTTPException, Query, Request, UploadFile
12+
from pydantic import BaseModel
713

814
from anyvlm.anyvar.base_client import BaseAnyVarClient
915
from anyvlm.functions.build_vlm_response import build_vlm_response
1016
from anyvlm.functions.get_caf import get_caf
17+
from anyvlm.functions.ingest_vcf import VcfAfColumnsError
18+
from anyvlm.functions.ingest_vcf import ingest_vcf as ingest_vcf_function
1119
from anyvlm.main import app
12-
from anyvlm.schemas.vlm import (
13-
VlmResponse,
14-
)
20+
from anyvlm.schemas.vlm import VlmResponse
1521
from anyvlm.storage.base_storage import Storage
1622
from anyvlm.utils.types import (
1723
AnyVlmCohortAlleleFrequencyResult,
@@ -22,13 +28,247 @@
2228
UcscAssemblyBuild,
2329
)
2430

31+
# Create alias for easier mocking in tests
32+
ingest_vcf = ingest_vcf_function
33+
34+
_logger = logging.getLogger(__name__)
35+
36+
# Constants
37+
MAX_FILE_SIZE = 5 * 1024 * 1024 * 1024 # 5GB
38+
UPLOAD_CHUNK_SIZE = 1024 * 1024 # 1MB
39+
REQUIRED_INFO_FIELDS = {"AC", "AN", "AC_Het", "AC_Hom", "AC_Hemi"}
40+
41+
42+
# ====================
43+
# Response Models
44+
# ====================
45+
46+
47+
class VcfIngestionResponse(BaseModel):
48+
"""Response model for VCF ingestion endpoint."""
49+
50+
status: Literal["success", "error"]
51+
message: str
52+
details: str | None = None
53+
54+
55+
# ====================
56+
# Validation Helpers
57+
# ====================
58+
59+
60+
def validate_filename_extension(filename: str) -> None:
61+
"""Validate that filename has .vcf.gz extension.
62+
63+
:param filename: name of uploaded file
64+
:raise ValueError: if extension is not .vcf.gz
65+
"""
66+
if not filename.endswith(".vcf.gz"):
67+
raise ValueError("Only .vcf.gz files are accepted")
68+
2569

26-
def ingest_vcf(vcf_path: Path) -> None:
27-
"""Ingest variants and cohort allele frequency data from an input VCF
70+
def validate_gzip_magic_bytes(file_obj: BinaryIO) -> None:
71+
"""Validate that file has gzip magic bytes.
2872
29-
:param vcf_path: VCF file location
73+
:param file_obj: file-like object to validate
74+
:raise ValueError: if file is not gzipped
3075
"""
31-
raise NotImplementedError
76+
header = file_obj.read(2)
77+
file_obj.seek(0) # Reset file pointer
78+
79+
if header != b"\x1f\x8b":
80+
raise ValueError("File is not a valid gzip file")
81+
82+
83+
def validate_file_size(size: int) -> None:
84+
"""Validate that file size is within limits.
85+
86+
:param size: file size in bytes
87+
:raise ValueError: if file exceeds maximum size
88+
"""
89+
if size > MAX_FILE_SIZE:
90+
max_gb = MAX_FILE_SIZE / (1024**3)
91+
raise ValueError(f"File too large. Maximum size: {max_gb:.1f}GB")
92+
93+
94+
def validate_vcf_header(file_path: Path) -> None:
95+
"""Validate VCF file format and required INFO fields.
96+
97+
:param file_path: path to VCF file
98+
:raise ValueError: if VCF is malformed or missing required fields
99+
"""
100+
with gzip.open(file_path, "rt") as f:
101+
# Check first line is VCF format declaration
102+
first_line = f.readline().strip()
103+
if not first_line.startswith("##fileformat=VCF"):
104+
raise ValueError("Not a valid VCF file (missing format declaration)")
105+
106+
# Scan headers for required INFO fields
107+
found_fields = set()
108+
109+
for line in f:
110+
if line.startswith("##INFO=<ID="):
111+
# Extract field ID
112+
field_id = line.split("ID=")[1].split(",")[0]
113+
found_fields.add(field_id)
114+
elif line.startswith("#CHROM"):
115+
# End of headers
116+
break
117+
118+
missing = REQUIRED_INFO_FIELDS - found_fields
119+
if missing:
120+
raise ValueError(
121+
f"VCF missing required INFO fields: {', '.join(sorted(missing))}"
122+
)
123+
124+
125+
# ====================
126+
# File Handling
127+
# ====================
128+
129+
130+
async def save_upload_file_temp(upload_file: UploadFile) -> Path:
131+
"""Save uploaded file to temporary location using streaming.
132+
133+
:param upload_file: FastAPI UploadFile object
134+
:return: path to saved temporary file
135+
:raise: Any exceptions during file operations (caller should handle cleanup)
136+
"""
137+
temp_dir = Path(tempfile.gettempdir())
138+
temp_path = temp_dir / f"anyvlm_{uuid.uuid4()}.vcf.gz"
139+
140+
try:
141+
# Stream upload to disk (memory efficient)
142+
# Using blocking I/O here is acceptable as we're writing to local disk
143+
with temp_path.open("wb") as f:
144+
while chunk := await upload_file.read(UPLOAD_CHUNK_SIZE):
145+
f.write(chunk)
146+
except Exception:
147+
# Cleanup on error
148+
if temp_path.exists():
149+
temp_path.unlink()
150+
raise
151+
else:
152+
return temp_path
153+
154+
155+
# ====================
156+
# Endpoints
157+
# ====================
158+
159+
160+
@app.post(
161+
"/ingest_vcf",
162+
summary="Upload and ingest VCF file",
163+
description=(
164+
"Upload a compressed VCF file (.vcf.gz) to register variants and store allele frequency data. "
165+
"**Requirements:** File must be gzip-compressed (.vcf.gz), contain required INFO fields "
166+
"(AC, AN, AC_Het, AC_Hom, AC_Hemi), and be under 5GB. "
167+
"Processing is synchronous with a 30-minute timeout."
168+
),
169+
tags=[EndpointTag.SEARCH],
170+
response_model=VcfIngestionResponse,
171+
)
172+
async def ingest_vcf_endpoint(
173+
request: Request,
174+
file: UploadFile,
175+
assembly: Annotated[
176+
ReferenceAssembly,
177+
Query(..., description="Reference genome assembly (GRCh37 or GRCh38)"),
178+
],
179+
) -> VcfIngestionResponse:
180+
"""Upload and ingest a VCF file with allele frequency data.
181+
182+
Requirements: .vcf.gz format, <5GB, INFO fields (AC, AN, AC_Het, AC_Hom, AC_Hemi).
183+
Synchronous processing with 30-minute timeout. Variants batched in groups of 1000.
184+
185+
:param request: FastAPI request object
186+
:param file: uploaded VCF file
187+
:param assembly: reference assembly used in VCF
188+
:return: ingestion status response
189+
"""
190+
temp_path: Path | None = None
191+
192+
try:
193+
# Validate filename extension
194+
if not file.filename:
195+
raise HTTPException(400, "Filename is required") # noqa: TRY301
196+
197+
try:
198+
validate_filename_extension(file.filename)
199+
except ValueError as e:
200+
raise HTTPException(400, str(e)) from e
201+
202+
# Validate content type (if provided)
203+
if file.content_type and file.content_type not in {
204+
"application/gzip",
205+
"application/x-gzip",
206+
"application/octet-stream",
207+
}:
208+
raise HTTPException( # noqa: TRY301
209+
400,
210+
f"Invalid content type: {file.content_type}",
211+
)
212+
213+
# Validate gzip magic bytes
214+
try:
215+
validate_gzip_magic_bytes(file.file)
216+
except ValueError as e:
217+
raise HTTPException(400, str(e)) from e
218+
219+
# Check file size
220+
file.file.seek(0, 2) # Seek to end
221+
file_size = file.file.tell()
222+
file.file.seek(0) # Reset
223+
224+
try:
225+
validate_file_size(file_size)
226+
except ValueError as e:
227+
raise HTTPException(400, str(e)) from e
228+
229+
# Save to temporary file
230+
_logger.info("Saving uploaded file %s (%d bytes)", file.filename, file_size)
231+
temp_path = await save_upload_file_temp(file)
232+
233+
# Validate VCF format and required fields
234+
try:
235+
validate_vcf_header(temp_path)
236+
except ValueError as e:
237+
raise HTTPException(
238+
422,
239+
f"VCF validation failed: {e!s}",
240+
) from e
241+
242+
# Process VCF
243+
anyvar_client = request.app.state.anyvar_client
244+
_logger.info("Starting VCF ingestion for %s", file.filename)
245+
246+
try:
247+
ingest_vcf_function(temp_path, anyvar_client, assembly)
248+
except VcfAfColumnsError as e:
249+
_logger.exception("VCF missing required INFO columns")
250+
raise HTTPException(422, f"VCF validation failed: {e}") from e
251+
except Exception as e:
252+
_logger.exception("VCF ingestion failed")
253+
raise HTTPException(500, f"Ingestion failed: {e}") from e
254+
255+
_logger.info("Successfully ingested VCF: %s", file.filename)
256+
return VcfIngestionResponse(
257+
status="success",
258+
message=f"Successfully ingested {file.filename}",
259+
)
260+
261+
except HTTPException:
262+
# Re-raise HTTP exceptions
263+
raise
264+
except Exception as e:
265+
_logger.exception("Unexpected error during VCF upload")
266+
raise HTTPException(500, f"Upload failed: {e}") from e
267+
finally:
268+
# Always cleanup temporary file
269+
if temp_path and temp_path.exists():
270+
_logger.debug("Cleaning up temporary file: %s", temp_path)
271+
temp_path.unlink()
32272

33273

34274
@app.get(

tests/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@
2020

2121
load_dotenv()
2222

23+
# Set required environment variables for tests if not already set
24+
environ.setdefault("BEACON_NODE_ID", "org.anyvlm.test")
25+
environ.setdefault("HANDOVER_TYPE_ID", "test-id")
26+
environ.setdefault("HANDOVER_TYPE_LABEL", "Test Label")
27+
environ.setdefault("BEACON_HANDOVER_URL", "https://test.example.com")
28+
2329

2430
@pytest.fixture(scope="session")
2531
def test_data_dir() -> Path:
268 Bytes
Binary file not shown.
2.88 KB
Binary file not shown.

tests/data/vcf/not_a_vcf.txt.gz

41 Bytes
Binary file not shown.

tests/data/vcf/valid_small.vcf.gz

2.92 KB
Binary file not shown.

0 commit comments

Comments
 (0)