Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 28 additions & 33 deletions examples/online_serving/openai_transcription_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,8 @@
"""

import asyncio
import json

import httpx
from openai import OpenAI
from openai import AsyncOpenAI, OpenAI

from vllm.assets.audio import AudioAsset

Expand All @@ -47,37 +45,30 @@ def sync_openai(audio_path: str, client: OpenAI):
print("transcription result:", transcription.text)


async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
"""
Perform streaming transcription using vLLM's raw HTTP streaming API.
Perform asynchronous transcription using OpenAI-compatible API.
"""
data = {
"language": "en",
"stream": True,
"model": "openai/whisper-large-v3",
}
url = base_url + "/audio/transcriptions"
headers = {"Authorization": f"Bearer {api_key}"}
print("transcription result:", end=" ")
# OpenAI Transcription API client does not support streaming.
async with httpx.AsyncClient() as client:
with open(audio_path, "rb") as f:
async with client.stream(
"POST", url, files={"file": f}, data=data, headers=headers
) as response:
async for line in response.aiter_lines():
# Each line is a JSON object prefixed with 'data: '
if line:
if line.startswith("data: "):
line = line[len("data: ") :]
# Last chunk, stream ends
if line.strip() == "[DONE]":
break
# Parse the JSON response
chunk = json.loads(line)
# Extract and print the content
content = chunk["choices"][0].get("delta", {}).get("content")
print(content, end="")
print("\ntranscription result:", end=" ")
with open(audio_path, "rb") as f:
transcription = await client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
language="en",
response_format="json",
temperature=0.0,
# Additional sampling params not provided by OpenAI API.
extra_body=dict(
seed=420,
top_p=0.6,
),
stream=True,
)
async for chunk in transcription:
if chunk.choices:
content = chunk.choices[0].get("delta", {}).get("content")
print(content, end="", flush=True)

print() # Final newline after stream ends


Expand All @@ -95,7 +86,11 @@ def main():

sync_openai(mary_had_lamb, client)
# Run the asynchronous function
asyncio.run(stream_openai_response(winning_call, openai_api_base, openai_api_key))
client = AsyncOpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
asyncio.run(stream_openai_response(winning_call, client))


if __name__ == "__main__":
Expand Down
7 changes: 3 additions & 4 deletions vllm/entrypoints/openai/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -1750,12 +1750,11 @@ class TranscriptionRequest(OpenAIBaseModel):
timestamps incurs additional latency.
"""

# --8<-- [start:transcription-extra-params]
stream: Optional[bool] = False
"""Custom field not present in the original OpenAI definition. When set,
it will enable output to be streamed in a similar fashion as the Chat
Completion endpoint.
"""When set, it will enable output to be streamed in a similar fashion
as the Chat Completion endpoint.
"""
# --8<-- [start:transcription-extra-params]
# Flattened stream option to simplify form data.
stream_include_usage: Optional[bool] = False
stream_continuous_usage_stats: Optional[bool] = False
Expand Down