
Commit 63d92ab

[Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)
Signed-off-by: Deven Labovitch <[email protected]>
1 parent 11599b0 commit 63d92ab

File tree: 3 files changed (+16 −5 lines)

docs/serving/openai_compatible_server.md
Lines changed: 5 additions & 0 deletions

@@ -351,6 +351,11 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
 <!-- TODO: api enforced limits + uploading audios -->
 
+#### API Enforced Limits
+
+Set the maximum audio file size (in MB) that VLLM will accept, via the
+`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. Default is 25 MB.
+
 #### Extra Parameters
 
 The following [sampling parameters][sampling-params] are supported.
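For context, a minimal client-side sketch of exercising this limit. Assumptions not taken from the commit: the server is started with the variable exported (e.g. VLLM_MAX_AUDIO_CLIP_FILESIZE_MB=50 vllm serve openai/whisper-large-v3), a local sample.wav exists, and the official OpenAI Python client is installed.

# Hypothetical usage sketch: the env var is read by the *server* process, so it
# must be set where `vllm serve` runs, e.g.
#   VLLM_MAX_AUDIO_CLIP_FILESIZE_MB=50 vllm serve openai/whisper-large-v3
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio_file:  # assumed local audio clip
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",
        file=audio_file,
    )
print(transcription.text)

Clips larger than the configured limit are rejected by the server before decoding; the repository's own example lives at examples/online_serving/openai_transcription_client.py.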

vllm/entrypoints/openai/speech_to_text.py
Lines changed: 4 additions & 5 deletions

@@ -11,6 +11,7 @@
 import numpy as np
 from fastapi import Request
 
+import vllm.envs as envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
@@ -38,10 +39,6 @@
 
 logger = init_logger(__name__)
 
-# As per https://platform.openai.com/docs/guides/speech-to-text#overview.
-# TODO configurable
-MAX_AUDIO_CLIP_FILESIZE_MB = 25
-
 
 class OpenAISpeechToText(OpenAIServing):
     """Base class for speech-to-text operations like transcription and
@@ -70,6 +67,8 @@ def __init__(
         self.asr_config = self.model_cls.get_speech_to_text_config(
             model_config, task_type)
 
+        self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
+
         if self.default_sampling_params:
             logger.info(
                 "Overwriting default completion sampling param with: %s",
@@ -93,7 +92,7 @@ async def _preprocess_speech_to_text(
         lang = request.language or "en"
         self.model_cls.validate_language(lang)
 
-        if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB:
+        if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
            raise ValueError("Maximum file size exceeded.")
 
        with io.BytesIO(audio_data) as bytes_:
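The rejection logic above reduces to a byte-count comparison against the configured limit. A self-contained sketch of that check (the function name and direct os.getenv call here are illustrative, not vLLM's own helpers):

import os

def check_audio_clip_size(audio_data: bytes) -> None:
    # Limit comes from the environment, mirroring envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB.
    max_mb = int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25"))
    # 1024**2 bytes per MiB; reject clips over the configured limit.
    if len(audio_data) / 1024**2 > max_mb:
        raise ValueError("Maximum file size exceeded.")

check_audio_clip_size(b"\x00" * 1024)  # 1 KiB clip, well under the 25 MB default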

vllm/envs.py
Lines changed: 7 additions & 0 deletions

@@ -61,6 +61,7 @@
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
     VLLM_AUDIO_FETCH_TIMEOUT: int = 10
+    VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
     VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
     VLLM_MM_INPUT_CACHE_GIB: int = 8
     VLLM_TARGET_DEVICE: str = "cuda"
@@ -519,6 +520,12 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_AUDIO_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
 
+    # Maximum filesize in MB for a single audio file when processing
+    # speech-to-text requests. Files larger than this will be rejected.
+    # Default is 25 MB
+    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
+    lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
+
     # Backend for Video IO
     # - "opencv": Default backend that uses OpenCV stream buffered backend.
     #
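The new entry follows the lazy-lookup pattern visible in the diff: the value is a lambda, so the environment is consulted when the attribute is read rather than at import time. A minimal sketch of that pattern (illustrative only; the real vllm/envs.py has more machinery than shown):

import os

environment_variables = {
    # The lambda defers the os.getenv call until the variable is actually read.
    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB":
    lambda: int(os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")),
}

def __getattr__(name: str):
    # PEP 562 module-level __getattr__: evaluate the lambda on each access, so
    # `envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` reflects the current environment.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module has no attribute {name!r}")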
