27 changes: 27 additions & 0 deletions tests/entrypoints/openai/conftest.py
@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm.assets.audio import AudioAsset


@pytest.fixture
def mary_had_lamb():
path = AudioAsset('mary_had_lamb').get_local_path()
with open(str(path), "rb") as f:
yield f


@pytest.fixture
def winning_call():
path = AudioAsset('winning_call').get_local_path()
with open(str(path), "rb") as f:
yield f


@pytest.fixture
def foscolo():
# Test translation it->en
path = AudioAsset('azacinto_foscolo').get_local_path()
with open(str(path), "rb") as f:
yield f
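
Moving these fixtures into conftest.py makes them available to every test module under tests/entrypoints/openai/ without explicit imports. A minimal sketch of how a test consumes one (hypothetical test, relying on pytest's standard fixture injection):

# hypothetical test module under tests/entrypoints/openai/
def test_audio_fixture_is_readable(mary_had_lamb):
    # pytest injects the fixture by name; it yields a file opened in "rb" mode
    header = mary_had_lamb.read(4)
    assert isinstance(header, bytes) and header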
35 changes: 19 additions & 16 deletions tests/entrypoints/openai/test_transcription_validation.py
@@ -12,8 +12,6 @@
import pytest_asyncio
import soundfile as sf

from vllm.assets.audio import AudioAsset

from ...utils import RemoteOpenAIServer

MODEL_NAME = "openai/whisper-large-v3-turbo"
@@ -24,20 +22,6 @@
]


@pytest.fixture
def mary_had_lamb():
path = AudioAsset('mary_had_lamb').get_local_path()
with open(str(path), "rb") as f:
yield f


@pytest.fixture
def winning_call():
path = AudioAsset('winning_call').get_local_path()
with open(str(path), "rb") as f:
yield f


@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
@@ -76,6 +60,25 @@ async def test_basic_audio(mary_had_lamb, model_name):
assert out_usage["seconds"] == 16, out_usage["seconds"]


@pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo):
# Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately.
model_name = "google/gemma-3n-E2B-it"
server_args = ["--enforce-eager"]

with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=model_name,
file=foscolo,
language="it",
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
assert "da cui vergine nacque Venere" in out


@pytest.mark.asyncio
async def test_non_asr_model(winning_call):
# text to text model
78 changes: 44 additions & 34 deletions tests/entrypoints/openai/test_translation_validation.py
@@ -12,32 +12,24 @@
import pytest_asyncio
import soundfile as sf

from vllm.assets.audio import AudioAsset

from ...utils import RemoteOpenAIServer

MODEL_NAME = "openai/whisper-small"
SERVER_ARGS = ["--enforce-eager"]


@pytest.fixture
def foscolo():
# Test translation it->en
path = AudioAsset('azacinto_foscolo').get_local_path()
with open(str(path), "rb") as f:
yield f


@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
yield remote_server
@pytest.fixture(scope="module",
params=["openai/whisper-small", "google/gemma-3n-E2B-it"])
def server(request):
# Parametrize over model name
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
yield remote_server, request.param


@pytest_asyncio.fixture
async def client(server):
async def client_and_model(server):
server, model_name = server
async with server.get_async_client() as async_client:
yield async_client
yield async_client, model_name


@pytest.mark.asyncio
@@ -56,27 +48,29 @@ async def test_non_asr_model(foscolo):

# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client):
async def test_basic_audio(foscolo, client_and_model):
client, model_name = client_and_model
translation = await client.audio.translations.create(
model=MODEL_NAME,
model=model_name,
file=foscolo,
response_format="text",
# TODO remove once language detection is implemented
extra_body=dict(language="it"),
# TODO remove `language="it"` once language detection is implemented
extra_body=dict(language="it", to_language="en"),
temperature=0.0)
out = json.loads(translation)['text'].strip().lower()
assert "greek sea" in out


@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client):
async def test_audio_prompt(foscolo, client_and_model):
client, model_name = client_and_model
# Condition whisper on starting text
prompt = "Nor have I ever"
transcription = await client.audio.translations.create(
model=MODEL_NAME,
model=model_name,
file=foscolo,
prompt=prompt,
extra_body=dict(language="it"),
extra_body=dict(language="it", to_language="en"),
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
@@ -85,22 +79,27 @@ async def test_audio_prompt(foscolo, client):


@pytest.mark.asyncio
async def test_streaming_response(foscolo, client, server):
async def test_streaming_response(foscolo, client_and_model, server):
client, model_name = client_and_model
translation = ""
res_no_stream = await client.audio.translations.create(
model=MODEL_NAME,
model=model_name,
file=foscolo,
response_format="json",
extra_body=dict(language="it"),
extra_body=dict(language="it", to_language="en", seed=42),
temperature=0.0)

# Stream via HTTPX since OpenAI translation client doesn't expose streaming
server, model_name = server
url = server.url_for("v1/audio/translations")
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
data = {
"model": MODEL_NAME,
"model": model_name,
"language": "it",
"to_language": "en",
"stream": True,
"temperature": 0.0,
"seed": 42,
}
foscolo.seek(0)
async with httpx.AsyncClient() as http_client:
@@ -121,16 +120,24 @@ async def test_streaming_response(foscolo, client, server):
text = chunk["choices"][0].get("delta", {}).get("content")
translation += text or ""

assert translation == res_no_stream.text
res_stream = translation.split()
# NOTE: There's a small non-determinism here, likely in the attention
# computation, which can cause a few tokens to differ while remaining
# semantically very close.
assert sum([
x == y for x, y in zip(res_stream, res_no_stream.text.split())
]) >= len(res_stream) * 0.9
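
The assertion above is a positional token-overlap check with a 0.9 threshold. Restated as a standalone helper for clarity (a sketch, not code from this PR):

def token_overlap_ratio(streamed: str, reference: str) -> float:
    # Compare whitespace-split tokens position by position, as the assertion does
    streamed_toks = streamed.split()
    reference_toks = reference.split()
    matches = sum(x == y for x, y in zip(streamed_toks, reference_toks))
    return matches / max(len(streamed_toks), 1)

# equivalent: assert token_overlap_ratio(translation, res_no_stream.text) >= 0.9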


@pytest.mark.asyncio
async def test_stream_options(foscolo, client, server):
async def test_stream_options(foscolo, server):
server, model_name = server
url = server.url_for("v1/audio/translations")
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
data = {
"model": MODEL_NAME,
"model": model_name,
"language": "it",
"to_language": "en",
"stream": True,
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
@@ -164,7 +171,10 @@ async def test_stream_options(foscolo, client, server):


@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client):
async def test_long_audio_request(foscolo, client_and_model):
client, model_name = client_and_model
if model_name == "google/gemma-3n-E2B-it":
pytest.skip("Gemma3n does not support long audio requests")
foscolo.seek(0)
audio, sr = librosa.load(foscolo)
repeated_audio = np.tile(audio, 2)
@@ -173,9 +183,9 @@ async def test_long_audio_request(foscolo, client):
sf.write(buffer, repeated_audio, sr, format='WAV')
buffer.seek(0)
translation = await client.audio.translations.create(
model=MODEL_NAME,
model=model_name,
file=buffer,
extra_body=dict(language="it"),
extra_body=dict(language="it", to_language="en"),
response_format="text",
temperature=0.0)
out = json.loads(translation)['text'].strip().lower()
19 changes: 19 additions & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -2135,6 +2135,13 @@ class TranscriptionRequest(OpenAIBaseModel):
)
# --8<-- [end:transcription-extra-params]

to_language: Optional[str] = None
"""The language of the output audio we transcribe to.

Please note that this is not currently used by supported models at this
time, but it is a placeholder for future use, matching translation api.
"""

# --8<-- [start:transcription-sampling-params]
temperature: float = Field(default=0.0)
"""The sampling temperature, between 0 and 1.
@@ -2368,6 +2375,9 @@ class TranslationRequest(OpenAIBaseModel):

# TODO support additional sampling parameters
# --8<-- [start:translation-sampling-params]
seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
"""The seed to use for sampling."""

temperature: float = Field(default=0.0)
"""The sampling temperature, between 0 and 1.

@@ -2387,6 +2397,14 @@
will improve accuracy.
"""

to_language: Optional[str] = None
"""The language of the input audio we translate to.

Please note that this is not supported by all models, refer to the specific
model documentation for more details.
For instance, Whisper only supports `to_language=en`.
"""

stream: Optional[bool] = False
"""Custom field not present in the original OpenAI definition. When set,
it will enable output to be streamed in a similar fashion as the Chat
@@ -2418,6 +2436,7 @@ def to_sampling_params(

return SamplingParams.from_optional(temperature=temperature,
max_tokens=max_tokens,
seed=self.seed,
output_kind=RequestOutputKind.DELTA
if self.stream \
else RequestOutputKind.FINAL_ONLY)
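With these protocol additions, clients can reach the new seed and to_language fields through the OpenAI SDK's extra_body, exactly as the tests above do. A minimal sketch (the base URL and API key are assumptions for a locally launched vLLM server):

import json

from openai import AsyncOpenAI

client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

async def translate_italian_audio(path: str) -> str:
    with open(path, "rb") as f:
        translation = await client.audio.translations.create(
            model="openai/whisper-small",
            file=f,
            response_format="text",
            # seed and to_language are vLLM extensions, hence extra_body
            extra_body=dict(language="it", to_language="en", seed=42),
            temperature=0.0)
    return json.loads(translation)['text']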
7 changes: 6 additions & 1 deletion vllm/entrypoints/openai/speech_to_text.py
@@ -89,6 +89,9 @@ async def _preprocess_speech_to_text(
) -> tuple[list[PromptType], float]:
# Validate request
language = self.model_cls.validate_language(request.language)
# Validate to_language only when set, to avoid extra logging for Whisper.
to_language = self.model_cls.validate_language(request.to_language) \
if request.to_language else None

if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
raise ValueError("Maximum file size exceeded.")
@@ -112,7 +115,9 @@
model_config=self.model_config,
language=language,
task_type=self.task_type,
request_prompt=request.prompt)
request_prompt=request.prompt,
to_language=to_language,
)
prompts.append(prompt)
return prompts, duration

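End to end, the streaming path exercised above can be driven with raw HTTP, since the OpenAI translation client doesn't expose streaming. A sketch mirroring the test payload (the URL, API key, and SSE framing are assumptions for a locally launched server):

import json

import httpx

async def stream_translation(audio_path: str) -> str:
    url = "http://localhost:8000/v1/audio/translations"  # assumed local server
    headers = {"Authorization": "Bearer EMPTY"}  # dummy key, as in the tests
    data = {
        "model": "openai/whisper-small",
        "language": "it",
        "to_language": "en",  # new target-language field
        "stream": True,
        "temperature": 0.0,
        "seed": 42,  # new seed field, for reproducible sampling
    }
    text = ""
    async with httpx.AsyncClient() as http_client:
        with open(audio_path, "rb") as f:
            async with http_client.stream("POST", url, headers=headers,
                                          data=data,
                                          files={"file": f}) as res:
                async for line in res.aiter_lines():
                    # assumed OpenAI-style SSE: "data: {...}", then "data: [DONE]"
                    if not line.startswith("data: "):
                        continue
                    payload = line.removeprefix("data: ").strip()
                    if payload == "[DONE]":
                        break
                    chunk = json.loads(payload)
                    delta = chunk["choices"][0].get("delta", {})
                    text += delta.get("content") or ""
    return text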