Skip to content

Commit 3c53805

Browse files
[Whisper] Add segment-level timestamp support (verbose_json)
- Accept `timestamp_granularities[]` and `response_format=verbose_json` in the `/v1/audio/transcriptions` endpoint - Switch decoder prompt from `<|notimestamps|>` to `<|0.00|>` when timestamps are requested so the model emits timestamp tokens - Parse timestamp tokens from output_ids into segments with start/end times in the serving layer - Add TranscriptionSegment and TranscriptionVerboseResponse protocol models matching the OpenAI API spec - Backward compatible: default behavior (json/text) unchanged
1 parent 721733c commit 3c53805

File tree

4 files changed

+162
-21
lines changed

4 files changed

+162
-21
lines changed

python/sglang/srt/entrypoints/http_server.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1480,11 +1480,18 @@ async def openai_v1_audio_transcriptions(
14801480
response_format: str = Form(default="json"),
14811481
temperature: float = Form(default=0.0),
14821482
stream: bool = Form(default=False),
1483+
timestamp_granularities: Optional[List[str]] = Form(
1484+
default=None, alias="timestamp_granularities[]"
1485+
),
14831486
):
14841487
"""OpenAI-compatible audio transcription endpoint."""
1485-
if response_format not in ["json", "text"]:
1488+
if response_format not in ["json", "text", "verbose_json"]:
14861489
return ORJSONResponse(
1487-
content={"error": {"message": "Only 'json' and 'text' formats supported"}},
1490+
content={
1491+
"error": {
1492+
"message": "Only 'json', 'text', and 'verbose_json' formats supported"
1493+
}
1494+
},
14881495
status_code=400,
14891496
)
14901497

@@ -1498,6 +1505,7 @@ async def openai_v1_audio_transcriptions(
14981505
response_format=response_format,
14991506
temperature=temperature,
15001507
stream=stream,
1508+
timestamp_granularities=timestamp_granularities,
15011509
raw_request=raw_request,
15021510
)
15031511
)

python/sglang/srt/entrypoints/openai/protocol.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,6 +1443,7 @@ class TranscriptionRequest(BaseModel):
14431443
language: Optional[str] = None
14441444
response_format: str = "json"
14451445
temperature: float = 0.0
1446+
timestamp_granularities: Optional[List[str]] = None
14461447
stream: bool = False
14471448
# Internal fields (not from API)
14481449
audio_data: Optional[bytes] = None
@@ -1463,6 +1464,26 @@ class TranscriptionResponse(BaseModel):
14631464
usage: Optional[TranscriptionUsage] = None
14641465

14651466

1467+
class TranscriptionSegment(BaseModel):
    """A segment with timestamp information."""

    # 0-based index of the segment within the transcription.
    id: int
    # Segment start time in seconds from the beginning of the audio.
    start: float
    # Segment end time in seconds.
    end: float
    # Transcribed text for this segment.
    text: str
1474+
1475+
1476+
class TranscriptionVerboseResponse(BaseModel):
    """Verbose transcription response with timestamps (OpenAI-compatible)."""

    # Fixed to "transcribe" for this endpoint, matching the OpenAI spec.
    task: str = "transcribe"
    # Language of the audio; presumably an ISO-639-1 code — set by the
    # serving layer from the request (defaults to "en" there).
    language: Optional[str] = None
    # Duration of the input audio in seconds, when it could be computed.
    duration: Optional[float] = None
    # Full transcription text.
    text: str
    # Timestamped segments. NOTE: pydantic deep-copies field defaults per
    # instance, so the mutable `[]` default is not shared across responses.
    segments: List[TranscriptionSegment] = []
    # Not part of the OpenAI spec; mirrors TranscriptionResponse.usage.
    usage: Optional[TranscriptionUsage] = None
1485+
1486+
14661487
class TranscriptionStreamChoice(BaseModel):
14671488
"""Delta content for streaming transcription."""
14681489

python/sglang/srt/entrypoints/openai/serving_transcription.py

Lines changed: 113 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import math
2323
import time
2424
import uuid
25-
from typing import TYPE_CHECKING, AsyncGenerator, Optional, Union
25+
from typing import TYPE_CHECKING, AsyncGenerator, List, Optional, Union
2626

2727
from fastapi import Request
2828
from fastapi.responses import ORJSONResponse, Response, StreamingResponse
@@ -32,9 +32,11 @@
3232
ErrorResponse,
3333
TranscriptionRequest,
3434
TranscriptionResponse,
35+
TranscriptionSegment,
3536
TranscriptionStreamChoice,
3637
TranscriptionStreamResponse,
3738
TranscriptionUsage,
39+
TranscriptionVerboseResponse,
3840
)
3941
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
4042
from sglang.srt.managers.io_struct import GenerateReqInput
@@ -44,6 +46,10 @@
4446

4547
logger = logging.getLogger(__name__)
4648

49+
# Whisper timestamp token constants.
# NOTE(review): 50365 is the ID of <|0.00|> for whisper-large-v3 (100
# languages); earlier multilingual checkpoints use 50364 — confirm against
# the served model's tokenizer.
TIMESTAMP_BASE_TOKEN_ID = 50365  # <|0.00|>
TIMESTAMP_BASE_OFFSET = 0.02  # Each token step = 0.02 seconds
52+
4753

4854
class OpenAIServingTranscription(OpenAIServingBase):
4955
"""Handler for /v1/audio/transcriptions requests"""
@@ -72,6 +78,9 @@ def _convert_to_internal_request(
7278
"language": request.language, # Pass to WhisperProcessor for language-specific decoding
7379
}
7480

81+
if request.timestamp_granularities:
82+
sampling_params["timestamp_granularities"] = request.timestamp_granularities
83+
7584
# For Whisper, we pass audio_data and let the processor handle it
7685
adapted_request = GenerateReqInput(
7786
text="", # Empty text - Whisper processor will set proper decoder tokens
def _get_audio_duration(self, audio_data: bytes) -> float:
    """Return the duration of the encoded audio in seconds.

    Uses soundfile's header-only ``info`` call (no full decode), so this is
    cheap even for long inputs. Returns 0.0 when the duration cannot be
    determined (unreadable/unsupported data, or soundfile not installed).
    """
    try:
        import soundfile as sf

        # sf.info parses only the file header — cheaper than sf.read,
        # which would decode the entire waveform just to measure it.
        info = sf.info(io.BytesIO(audio_data))
        return info.duration
    except Exception as e:
        # Best-effort: the duration feeds usage reporting only, so a
        # failure here must not fail the transcription request itself.
        logger.warning(f"Could not calculate audio duration: {e}")
        return 0.0
98106

107+
def _parse_segments(
108+
self, output_ids: List[int], tokenizer
109+
) -> tuple[str, List[TranscriptionSegment]]:
110+
"""Parse timestamp tokens from output_ids into segments.
111+
112+
The decoder prompt ends with <|0.00|>, so the first segment starts at
113+
t=0. The model then outputs:
114+
text_tokens <|end_ts|> [<|start_ts|> text_tokens <|end_ts|> ...]
115+
Each timestamp token marks the end of the current segment; its value
116+
also becomes the start of the next segment.
117+
"""
118+
# Token IDs for special tokens we want to strip from segment text
119+
eos_token_id = getattr(tokenizer, "eos_token_id", 50257)
120+
121+
segments = []
122+
full_text_parts = []
123+
current_text_tokens = []
124+
current_start = 0.0 # First segment starts at 0.0 (from prompt <|0.00|>)
125+
seg_id = 0
126+
127+
for token_id in output_ids:
128+
if token_id >= TIMESTAMP_BASE_TOKEN_ID:
129+
# This is a timestamp token — marks the end of current segment
130+
timestamp = (token_id - TIMESTAMP_BASE_TOKEN_ID) * TIMESTAMP_BASE_OFFSET
131+
132+
if current_text_tokens:
133+
text = tokenizer.decode(
134+
current_text_tokens, skip_special_tokens=True
135+
).strip()
136+
if text:
137+
segments.append(
138+
TranscriptionSegment(
139+
id=seg_id,
140+
start=round(current_start, 2),
141+
end=round(timestamp, 2),
142+
text=text,
143+
)
144+
)
145+
full_text_parts.append(text)
146+
seg_id += 1
147+
current_text_tokens = []
148+
149+
# Next segment starts at this timestamp
150+
current_start = timestamp
151+
152+
elif token_id == eos_token_id:
153+
# Skip end-of-text token
154+
continue
155+
else:
156+
# Regular text token
157+
current_text_tokens.append(token_id)
158+
159+
# Handle any trailing text tokens without a closing timestamp
160+
if current_text_tokens:
161+
text = tokenizer.decode(
162+
current_text_tokens, skip_special_tokens=True
163+
).strip()
164+
if text:
165+
segments.append(
166+
TranscriptionSegment(
167+
id=seg_id,
168+
start=round(current_start, 2),
169+
end=round(current_start, 2),
170+
text=text,
171+
)
172+
)
173+
full_text_parts.append(text)
174+
175+
full_text = " ".join(full_text_parts)
176+
return full_text, segments
177+
99178
async def create_transcription(
100179
self,
101180
audio_data: bytes,
@@ -105,7 +184,14 @@ async def create_transcription(
105184
temperature: float,
106185
stream: bool,
107186
raw_request: Request,
108-
) -> Union[TranscriptionResponse, StreamingResponse, Response, ORJSONResponse]:
187+
timestamp_granularities: Optional[List[str]] = None,
188+
) -> Union[
189+
TranscriptionResponse,
190+
TranscriptionVerboseResponse,
191+
StreamingResponse,
192+
Response,
193+
ORJSONResponse,
194+
]:
109195
"""Main entry point for transcription requests."""
110196
# Calculate audio duration for usage reporting
111197
audio_duration_s = self._get_audio_duration(audio_data)
@@ -117,6 +203,7 @@ async def create_transcription(
117203
language=language,
118204
response_format=response_format,
119205
temperature=temperature,
206+
timestamp_granularities=timestamp_granularities,
120207
stream=stream,
121208
audio_duration_s=audio_duration_s,
122209
)
@@ -129,7 +216,13 @@ async def _handle_non_streaming_request(
129216
adapted_request: GenerateReqInput,
130217
request: TranscriptionRequest,
131218
raw_request: Request,
132-
) -> Union[TranscriptionResponse, ErrorResponse, ORJSONResponse, Response]:
219+
) -> Union[
220+
TranscriptionResponse,
221+
TranscriptionVerboseResponse,
222+
ErrorResponse,
223+
ORJSONResponse,
224+
Response,
225+
]:
133226
"""Handle non-streaming transcription request."""
134227
try:
135228
ret = await self.tokenizer_manager.generate_request(
@@ -139,14 +232,26 @@ async def _handle_non_streaming_request(
139232
return self.create_error_response(str(e))
140233

141234
text = ret.get("text", "")
235+
usage = TranscriptionUsage(seconds=int(math.ceil(request.audio_duration_s)))
142236

143237
# Build response based on format
144238
if request.response_format == "text":
145239
return Response(content=text, media_type="text/plain")
146240

147-
# JSON format
148-
usage = TranscriptionUsage(seconds=int(math.ceil(request.audio_duration_s)))
241+
if request.response_format == "verbose_json":
242+
output_ids = ret.get("output_ids", [])
243+
tokenizer = self.tokenizer_manager.tokenizer
244+
parsed_text, segments = self._parse_segments(output_ids, tokenizer)
245+
246+
return TranscriptionVerboseResponse(
247+
language=request.language or "en",
248+
duration=round(request.audio_duration_s, 2),
249+
text=parsed_text or text,
250+
segments=segments,
251+
usage=usage,
252+
)
149253

254+
# Default JSON format
150255
return TranscriptionResponse(text=text, usage=usage)
151256

152257
async def _handle_streaming_request(

python/sglang/srt/multimodal/processors/whisper.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,9 @@ def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
115115
# Cache tokenizer for language token lookup
116116
self._tokenizer = getattr(self._processor, "tokenizer", None)
117117

118-
def _extract_language_from_request(self, request_obj) -> Optional[str]:
118+
def _pop_sampling_param(self, request_obj, key: str):
119119
sampling_params = getattr(request_obj, "sampling_params", None) or {}
120-
language = sampling_params.pop("language", None)
121-
return normalize_language_to_code(language)
120+
return sampling_params.pop(key, None)
122121

123122
def _get_language_token_id(self, language: Optional[str]) -> int:
124123
# Default to English if not specified
@@ -148,27 +147,35 @@ async def process_mm_data_async(
148147
# For Whisper, ALWAYS use the proper transcription token sequence
149148
# and IGNORE any text prompt - Whisper is a pure speech-to-text model
150149
# The decoder_start_token_id and forced_decoder_ids from generation config
151-
# set up: <|startoftranscript|> <|lang|> <|task|> [<|notimestamps|>]
150+
# set up: <|startoftranscript|> <|lang|> <|task|> [<|notimestamps|> or <|0.00|>]
152151

153-
# Extract language from request and get token ID
154-
language = self._extract_language_from_request(request_obj)
152+
language = normalize_language_to_code(
153+
self._pop_sampling_param(request_obj, "language")
154+
)
155155
language_token_id = self._get_language_token_id(language)
156+
timestamp_granularities = self._pop_sampling_param(
157+
request_obj, "timestamp_granularities"
158+
)
156159

157160
# Build decoder input tokens
158-
# <|startoftranscript|> + <|lang|> + <|transcribe|> + <|notimestamps|>
159161
decoder_start_token_id = getattr(
160162
self.hf_config, "decoder_start_token_id", 50258
161163
)
162164
transcribe_token_id = self._tokenizer.convert_tokens_to_ids("<|transcribe|>")
163-
notimestamps_token_id = self._tokenizer.convert_tokens_to_ids(
164-
"<|notimestamps|>"
165-
)
165+
166+
# Use <|0.00|> to enable timestamp generation, or <|notimestamps|> to disable
167+
if timestamp_granularities:
168+
timestamp_token_id = self._tokenizer.convert_tokens_to_ids("<|0.00|>")
169+
else:
170+
timestamp_token_id = self._tokenizer.convert_tokens_to_ids(
171+
"<|notimestamps|>"
172+
)
166173

167174
input_ids = [
168175
decoder_start_token_id,
169176
language_token_id,
170177
transcribe_token_id,
171-
notimestamps_token_id,
178+
timestamp_token_id,
172179
]
173180

174181
# Whisper expects input features padded to max_length (3000 frames = 30 seconds)

0 commit comments

Comments
 (0)