Skip to content

Commit 43a3bb8

Browse files
committed
first drafty w tests
1 parent e577e12 commit 43a3bb8

File tree

6 files changed

+700
-9
lines changed

6 files changed

+700
-9
lines changed

src/anyvlm/restapi/vlm.py

Lines changed: 243 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,24 @@
11
"""Define route(s) for the variant-level matching (VLM) protocol"""
22

3+
import gzip
4+
import logging
5+
import tempfile
6+
import uuid
37
from pathlib import Path
4-
from typing import Annotated
8+
from typing import Annotated, BinaryIO, Literal
59

6-
from fastapi import Query, Request
10+
from anyvar.utils.liftover_utils import ReferenceAssembly
11+
from fastapi import HTTPException, Query, Request, UploadFile
712
from ga4gh.va_spec.base.core import CohortAlleleFrequencyStudyResult
13+
from pydantic import BaseModel
814

915
from anyvlm.anyvar.base_client import BaseAnyVarClient
1016
from anyvlm.functions.build_vlm_response import build_vlm_response_from_caf_data
1117
from anyvlm.functions.get_caf import get_caf
18+
from anyvlm.functions.ingest_vcf import VcfAfColumnsError
19+
from anyvlm.functions.ingest_vcf import ingest_vcf as ingest_vcf_function
1220
from anyvlm.main import app
13-
from anyvlm.schemas.vlm import (
14-
VlmResponse,
15-
)
21+
from anyvlm.schemas.vlm import VlmResponse
1622
from anyvlm.utils.types import (
1723
ChromosomeName,
1824
EndpointTag,
@@ -21,13 +27,241 @@
2127
UcscAssemblyBuild,
2228
)
2329

30+
# Create alias for easier mocking in tests
31+
ingest_vcf = ingest_vcf_function
32+
33+
_logger = logging.getLogger(__name__)
34+
35+
# Constants
36+
MAX_FILE_SIZE = 5 * 1024 * 1024 * 1024 # 5GB
37+
UPLOAD_CHUNK_SIZE = 1024 * 1024 # 1MB
38+
REQUIRED_INFO_FIELDS = {"AC", "AN", "AC_Het", "AC_Hom", "AC_Hemi"}
39+
40+
41+
# ====================
42+
# Response Models
43+
# ====================
44+
45+
46+
class VcfIngestionResponse(BaseModel):
47+
"""Response model for VCF ingestion endpoint."""
48+
49+
status: Literal["success", "error"]
50+
message: str
51+
details: str | None = None
52+
53+
54+
# ====================
55+
# Validation Helpers
56+
# ====================
57+
58+
59+
def validate_filename_extension(filename: str) -> None:
60+
"""Validate that filename has .vcf.gz extension.
61+
62+
:param filename: name of uploaded file
63+
:raise ValueError: if extension is not .vcf.gz
64+
"""
65+
if not filename.endswith(".vcf.gz"):
66+
raise ValueError("Only .vcf.gz files are accepted")
67+
68+
69+
def validate_gzip_magic_bytes(file_obj: BinaryIO) -> None:
70+
"""Validate that file has gzip magic bytes.
71+
72+
:param file_obj: file-like object to validate
73+
:raise ValueError: if file is not gzipped
74+
"""
75+
header = file_obj.read(2)
76+
file_obj.seek(0) # Reset file pointer
77+
78+
if header != b"\x1f\x8b":
79+
raise ValueError("File is not a valid gzip file")
80+
81+
82+
def validate_file_size(size: int) -> None:
83+
"""Validate that file size is within limits.
84+
85+
:param size: file size in bytes
86+
:raise ValueError: if file exceeds maximum size
87+
"""
88+
if size > MAX_FILE_SIZE:
89+
max_gb = MAX_FILE_SIZE / (1024**3)
90+
raise ValueError(f"File too large. Maximum size: {max_gb:.1f}GB")
91+
92+
93+
def validate_vcf_header(file_path: Path) -> None:
94+
"""Validate VCF file format and required INFO fields.
95+
96+
:param file_path: path to VCF file
97+
:raise ValueError: if VCF is malformed or missing required fields
98+
"""
99+
with gzip.open(file_path, "rt") as f:
100+
# Check first line is VCF format declaration
101+
first_line = f.readline().strip()
102+
if not first_line.startswith("##fileformat=VCF"):
103+
raise ValueError("Not a valid VCF file (missing format declaration)")
104+
105+
# Scan headers for required INFO fields
106+
found_fields = set()
24107

25-
def ingest_vcf(vcf_path: Path) -> None:
26-
"""Ingest variants and cohort allele frequency data from an input VCF
108+
for line in f:
109+
if line.startswith("##INFO=<ID="):
110+
# Extract field ID
111+
field_id = line.split("ID=")[1].split(",")[0]
112+
found_fields.add(field_id)
113+
elif line.startswith("#CHROM"):
114+
# End of headers
115+
break
27116

28-
:param vcf_path: VCF file location
117+
missing = REQUIRED_INFO_FIELDS - found_fields
118+
if missing:
119+
raise ValueError(
120+
f"VCF missing required INFO fields: {', '.join(sorted(missing))}"
121+
)
122+
123+
124+
# ====================
125+
# File Handling
126+
# ====================
127+
128+
129+
async def save_upload_file_temp(upload_file: UploadFile) -> Path:
130+
"""Save uploaded file to temporary location using streaming.
131+
132+
:param upload_file: FastAPI UploadFile object
133+
:return: path to saved temporary file
134+
:raise: Any exceptions during file operations (caller should handle cleanup)
135+
"""
136+
temp_dir = Path(tempfile.gettempdir())
137+
temp_path = temp_dir / f"anyvlm_{uuid.uuid4()}.vcf.gz"
138+
139+
try:
140+
# Stream upload to disk (memory efficient)
141+
with open(temp_path, "wb") as f:
142+
while chunk := await upload_file.read(UPLOAD_CHUNK_SIZE):
143+
f.write(chunk)
144+
return temp_path
145+
except Exception:
146+
# Cleanup on error
147+
if temp_path.exists():
148+
temp_path.unlink()
149+
raise
150+
151+
152+
# ====================
153+
# Endpoints
154+
# ====================
155+
156+
157+
@app.post(
158+
"/ingest_vcf",
159+
summary="Upload and ingest VCF file",
160+
description="Upload a compressed VCF file (.vcf.gz) to register variants and store allele frequency data",
161+
tags=[EndpointTag.SEARCH],
162+
response_model=VcfIngestionResponse,
163+
)
164+
async def ingest_vcf_endpoint(
165+
request: Request,
166+
file: UploadFile,
167+
assembly: Annotated[
168+
ReferenceAssembly,
169+
Query(..., description="Reference genome assembly (GRCh37 or GRCh38)"),
170+
],
171+
) -> VcfIngestionResponse:
172+
"""Upload and ingest a VCF file with allele frequency data.
173+
174+
:param request: FastAPI request object
175+
:param file: uploaded VCF file (must be .vcf.gz)
176+
:param assembly: reference assembly used in VCF
177+
:return: ingestion status response
29178
"""
30-
raise NotImplementedError
179+
temp_path: Path | None = None
180+
181+
try:
182+
# Validate filename extension
183+
if not file.filename:
184+
raise HTTPException(400, "Filename is required")
185+
186+
try:
187+
validate_filename_extension(file.filename)
188+
except ValueError as e:
189+
raise HTTPException(400, str(e)) from e
190+
191+
# Validate content type (if provided)
192+
if file.content_type and file.content_type not in {
193+
"application/gzip",
194+
"application/x-gzip",
195+
"application/octet-stream",
196+
}:
197+
raise HTTPException(
198+
400,
199+
f"Invalid content type: {file.content_type}",
200+
)
201+
202+
# Validate gzip magic bytes
203+
try:
204+
validate_gzip_magic_bytes(file.file)
205+
except ValueError as e:
206+
raise HTTPException(400, str(e)) from e
207+
208+
# Check file size
209+
file.file.seek(0, 2) # Seek to end
210+
file_size = file.file.tell()
211+
file.file.seek(0) # Reset
212+
213+
try:
214+
validate_file_size(file_size)
215+
except ValueError as e:
216+
raise HTTPException(400, str(e)) from e
217+
218+
# Save to temporary file
219+
_logger.info("Saving uploaded file %s (%d bytes)", file.filename, file_size)
220+
temp_path = await save_upload_file_temp(file)
221+
222+
# Validate VCF format and required fields
223+
try:
224+
validate_vcf_header(temp_path)
225+
except ValueError as e:
226+
raise HTTPException(
227+
422,
228+
f"VCF validation failed: {str(e)}",
229+
) from e
230+
231+
# Process VCF
232+
anyvar_client = request.app.state.anyvar_client
233+
_logger.info("Starting VCF ingestion for %s", file.filename)
234+
235+
try:
236+
ingest_vcf_function(temp_path, anyvar_client, assembly)
237+
except VcfAfColumnsError as e:
238+
_logger.exception("VCF missing required INFO columns")
239+
raise HTTPException(
240+
422, f"VCF validation failed: {e}"
241+
) from e
242+
except Exception as e:
243+
_logger.exception("VCF ingestion failed")
244+
raise HTTPException(
245+
500, f"Ingestion failed: {e}"
246+
) from e
247+
248+
_logger.info("Successfully ingested VCF: %s", file.filename)
249+
return VcfIngestionResponse(
250+
status="success",
251+
message=f"Successfully ingested {file.filename}",
252+
)
253+
254+
except HTTPException:
255+
# Re-raise HTTP exceptions
256+
raise
257+
except Exception as e:
258+
_logger.exception("Unexpected error during VCF upload")
259+
raise HTTPException(500, f"Upload failed: {e}") from e
260+
finally:
261+
# Always cleanup temporary file
262+
if temp_path and temp_path.exists():
263+
_logger.debug("Cleaning up temporary file: %s", temp_path)
264+
temp_path.unlink()
31265

32266

33267
@app.get(
268 Bytes
Binary file not shown.
2.88 KB
Binary file not shown.

tests/data/vcf/not_a_vcf.txt.gz

41 Bytes
Binary file not shown.

tests/data/vcf/valid_small.vcf.gz

2.92 KB
Binary file not shown.

0 commit comments

Comments
 (0)