|
| 1 | +"""Pydantic models for transcription JSON output schema.""" |
| 2 | + |
| 3 | +from typing import Optional |
| 4 | + |
| 5 | +from pydantic import BaseModel, Field |
| 6 | + |
| 7 | + |
class TranscriptionSegmentModel(BaseModel):
    """Schema for one timed segment of a transcription.

    Both timestamps are given in seconds and must be non-negative. The
    model does not check that ``end`` is at or after ``start``. Keys other
    than the four declared fields are rejected.

    Attributes:
        start: Segment start time in seconds (>= 0).
        end: Segment end time in seconds (>= 0).
        text: Transcribed text for the segment.
        speaker: Speaker identifier, or None when the segment is not
            attributed to a speaker (used with diarization).
    """

    # Fail validation on unknown keys rather than silently dropping them.
    model_config = {"extra": "forbid"}

    start: float = Field(ge=0.0, description="Start time in seconds")
    end: float = Field(ge=0.0, description="End time in seconds")
    text: str = Field(description="Transcribed text for this segment")
    speaker: Optional[str] = Field(
        default=None,
        description="Optional speaker identifier",
    )
| 27 | + |
| 28 | + |
class TranscriptionJSONModel(BaseModel):
    """Schema for the complete transcription JSON document.

    Describes the top-level structure written by the JSONFormatWriter:
    the full text, summary statistics, speaker information, the timed
    segments and a free-form metadata mapping. Keys other than the
    declared fields are rejected.

    The summary fields (``word_count``, ``segment_count``,
    ``has_speakers``, ``speakers``) are validated individually only; this
    model does not cross-check them against ``segments``.

    Attributes:
        text: Full transcription text.
        language: Language code, e.g. "en" or "es".
        duration: Total audio duration in seconds (>= 0).
        word_count: Total number of words (>= 0).
        segment_count: Number of segments (>= 0).
        has_speakers: Whether speaker diarization is present.
        speakers: Unique speaker identifiers; defaults to an empty list.
        segments: Transcription segments with timing information.
        metadata: Additional metadata; defaults to an empty dict.
    """

    # Fail validation on unknown keys rather than silently dropping them.
    model_config = {"extra": "forbid"}

    text: str = Field(description="Full transcription text")
    language: str = Field(description="Language code (e.g., 'en', 'es')")
    duration: float = Field(
        ge=0.0,
        description="Total audio duration in seconds",
    )
    word_count: int = Field(ge=0, description="Total number of words")
    segment_count: int = Field(ge=0, description="Number of segments")
    has_speakers: bool = Field(
        description="Whether speaker diarization is present",
    )
    speakers: list[str] = Field(
        description="List of unique speaker identifiers",
        default_factory=list,
    )
    segments: list[TranscriptionSegmentModel] = Field(
        description="List of transcription segments",
    )
    metadata: dict = Field(
        description="Additional metadata",
        default_factory=dict,
    )
0 commit comments