|
68 | 68 | }, |
69 | 69 | { |
70 | 70 | "cell_type": "code", |
71 | | - "execution_count": 6, |
| 71 | + "execution_count": 18, |
72 | 72 | "id": "e4078915", |
73 | 73 | "metadata": {}, |
74 | 74 | "outputs": [ |
|
81 | 81 | } |
82 | 82 | ], |
83 | 83 | "source": [ |
84 | | - "import os, nest_asyncio, asyncio\n", |
85 | | - "from openai import OpenAI\n", |
| 84 | + "# ─── Standard Library ──────────────────────────────────────────────────────────\n", |
| 85 | + "import asyncio\n", |
| 86 | + "import base64 # encode raw PCM bytes → base64 before sending JSON\n", |
| 87 | + "import json # compose/parse WebSocket messages\n", |
| 88 | + "import os\n", |
86 | 89 | "import time\n", |
| 90 | + "from typing import List\n", |
| 91 | + "\n", |
| 92 | + "# ─── Third-Party ───────────────────────────────────────────────────────────────\n", |
| 93 | + "import nest_asyncio\n", |
| 94 | + "import numpy as np # efficient numeric processing for audio arrays\n", |
| 95 | + "from openai import OpenAI\n", |
| 96 | + "import resampy # high-quality sample-rate conversion\n", |
| 97 | + "import soundfile as sf # reads many audio formats into float32 arrays\n", |
| 98 | + "import websockets # asyncio-based WebSocket client\n", |
| 99 | + "from agents import Agent\n", |
| 100 | + "from agents.voice import (\n", |
| 101 | + " SingleAgentVoiceWorkflow,\n", |
| 102 | + " StreamedAudioInput,\n", |
| 103 | + " VoicePipeline,\n", |
| 104 | + " VoicePipelineConfig,\n", |
| 105 | + ")\n", |
| 106 | + "# ───────────────────────────────────────────────────────────────────────────────\n", |
87 | 107 | "nest_asyncio.apply()\n", |
88 | 108 | "\n", |
89 | 109 | "# ✏️ Put your key in an env-var or just replace the call below.\n", |
90 | 110 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", |
91 | 111 | "\n", |
92 | 112 | "client = OpenAI(api_key=OPENAI_API_KEY)\n", |
93 | | - "print('✅ OpenAI client ready')" |
| 113 | + "print(\"✅ OpenAI client ready\")" |
94 | 114 | ] |
95 | 115 | }, |
96 | 116 | { |
|
102 | 122 | "## 1 · Speech-to-Text with Audio File\n", |
103 | 123 | "\n", |
104 | 124 | "### When to use\n", |
105 | | - "* You have a completed audio file (up to 25 MB).\n", |
| 125 | + "* You have a completed audio file (up to 25 MB).The following input file types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.\n", |
106 | 126 | "* Suitable for batch processing tasks like podcasts, call-center recordings, or voice memos.\n", |
107 | 127 | "* Real-time feedback or partial results are not required." |
108 | 128 | ] |
|
114 | 134 | "source": [ |
115 | 135 | "### How it works\n", |
116 | 136 | "\n", |
117 | | - "```mermaid\n", |
118 | | - "flowchart LR\n", |
119 | | - " AudioFile[\"Audio file<br/>(WAV • MP3 • FLAC)\"] --> Upload[\"Binary upload\"]\n", |
120 | | - " Upload --> API[\"/v1/audio/transcriptions\"]\n", |
121 | | - " API --> JSONOutput[\"JSON transcription<br/>+ metadata\"]\n", |
122 | | - " JSONOutput --> App[\"Your application\"]\n", |
123 | | - "```\n", |
| 137 | + "Audio file (WAV · MP3 · FLAC · …) → binary upload → `/v1/audio/transcriptions` → JSON transcription + metadata → your application\n",
| 138 | + "\n",
124 | 139 | "\n", |
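| | + "A minimal sketch of the batch call (assuming `client` from the setup cell and a local file named `speech.mp3` under 25 MB):\n",
| | + "\n",
| | + "```python\n",
| | + "with open(\"speech.mp3\", \"rb\") as f:\n",
| | + "    transcript = client.audio.transcriptions.create(\n",
| | + "        model=\"gpt-4o-transcribe\",  # \"gpt-4o-mini-transcribe\" and \"whisper-1\" also work here\n",
| | + "        file=f,\n",
| | + "    )\n",
| | + "print(transcript.text)\n",
| | + "```\n",
| | + "\n",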
125 | 140 | "#### Benefits\n", |
126 | 141 | "\n", |
|
146 | 161 | }, |
147 | 162 | { |
148 | 163 | "cell_type": "code", |
149 | | - "execution_count": 2, |
| 164 | + "execution_count": 19, |
150 | 165 | "id": "ab545e4c", |
151 | 166 | "metadata": {}, |
152 | 167 | "outputs": [ |
|
180 | 195 | }, |
181 | 196 | { |
182 | 197 | "cell_type": "code", |
183 | | - "execution_count": 10, |
| 198 | + "execution_count": 20, |
184 | 199 | "id": "7ae4af8d", |
185 | 200 | "metadata": {}, |
186 | 201 | "outputs": [ |
|
213 | 228 | "id": "e6a5b89e", |
214 | 229 | "metadata": {}, |
215 | 230 | "source": [ |
216 | | - "We notice that the transcription is much smaller than expected.\n", |
217 | | - "\n", |
218 | | - "What you’re seeing is a known limitation of gpt-4o-mini-transcribe models when they’re used through the non-realtime /v1/audio/transcriptions endpoint:\n", |
| 231 | + "We notice that the transcription is much smaller than expected. What you’re seeing is a known limitation of gpt-4o-mini-transcribe models when they’re used through the non-realtime /v1/audio/transcriptions endpoint:\n", |
219 | 232 | "\n", |
220 | | - "The backend still runs its own voice-activity-detection (VAD) pass to break the file into “turns”.\n", |
| 233 | + ">The backend still runs its own voice-activity-detection (VAD) pass to break the file into “turns”.\n", |
221 | 234 | "\n", |
222 | | - "Only the first completed turn is copied into the text field of the JSON that the endpoint returns. Subsequent turns are silently discarded." |
| 235 | + ">Only the first completed turn is copied into the text field of the JSON that the endpoint returns. Subsequent turns are silently discarded." |
223 | 236 | ] |
224 | 237 | }, |
225 | 238 | { |
|
234 | 247 | "- You need immediate transcription results (partial or final) as they arrive. \n", |
235 | 248 | "- Scenarios where partial feedback improves UX, e.g., uploading a long voice memo.\n", |
236 | 249 | "\n", |
237 | | - "```mermaid\n", |
238 | | - "flowchart LR\n", |
239 | | - " A[\"Finished audio file<br/>(WAV • MP3 • FLAC • …)\"]\n", |
240 | | - " B[\"OpenAI STT engine<br/>(gpt-4o-transcribe)\"]\n", |
241 | | - " C[\"Your application / UI\"]\n", |
242 | | - "\n", |
243 | | - " A -->|HTTP POST<br/>/v1/audio/transcriptions<br/>stream=true| B\n", |
244 | | - " B -->|chunked HTTP response<br/>partial & final transcripts| C\n", |
245 | | - "```\n", |
| 250 | + "Finished audio file → HTTP POST `/v1/audio/transcriptions` with `stream=true` → chunked HTTP response (partial & final transcripts) → your application / UI\n",
246 | 251 | "\n", |
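| | + "A sketch of the streamed variant (same assumed `client` and `speech.mp3`; the event type names follow the streaming transcription API, so verify them against the current docs):\n",
| | + "\n",
| | + "```python\n",
| | + "with open(\"speech.mp3\", \"rb\") as f:\n",
| | + "    stream = client.audio.transcriptions.create(\n",
| | + "        model=\"gpt-4o-transcribe\",\n",
| | + "        file=f,\n",
| | + "        stream=True,  # chunked response instead of one final JSON body\n",
| | + "    )\n",
| | + "    for event in stream:\n",
| | + "        if event.type == \"transcript.text.delta\":\n",
| | + "            print(event.delta, end=\"\", flush=True)  # partial text as it arrives\n",
| | + "```\n",
| | + "\n",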
247 | 252 | "#### Benefits\n", |
248 | 253 | "- **Real-time feel:** Users see transcription updates almost immediately. \n", |
|
256 | 261 | }, |
257 | 262 | { |
258 | 263 | "cell_type": "code", |
259 | | - "execution_count": null, |
| 264 | + "execution_count": 25, |
260 | 265 | "id": "d027fdb9", |
261 | 266 | "metadata": {}, |
262 | 267 | "outputs": [ |
|
310 | 315 | "source": [ |
311 | 316 | "### How it works\n", |
312 | 317 | "\n", |
313 | | - "```mermaid\n", |
314 | | - "sequenceDiagram\n", |
315 | | - " participant Mic\n", |
316 | | - " participant App\n", |
317 | | - " participant WS as \"WebSocket\"\n", |
318 | | - " participant OAI as \"Realtime Server\"\n", |
319 | | - "\n", |
320 | | - " Mic ->> App: 20–40 ms PCM frames\n", |
321 | | - " App ->> WS: Base64-encoded chunks<br/>input_audio_buffer.append\n", |
322 | | - " WS ->> OAI: Audio stream\n", |
323 | | - " OAI -->> WS: JSON transcription events<br/>(partial & complete)\n", |
324 | | - " WS -->> App: Transcript updates\n", |
325 | | - "```\n", |
| 318 | + "Mic → app (20–40 ms PCM frames) → WebSocket (`input_audio_buffer.append` messages with base64 chunks) → Realtime server → JSON transcription events (partial & complete) back to the app\n",
326 | 319 | "\n", |
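| | + "The heart of the exchange is one JSON event per audio chunk. A sketch (assuming `ws` is an open `websockets` connection and `pcm_bytes` holds 24-kHz mono PCM16 audio):\n",
| | + "\n",
| | + "```python\n",
| | + "await ws.send(json.dumps({\n",
| | + "    \"type\": \"input_audio_buffer.append\",\n",
| | + "    \"audio\": base64.b64encode(pcm_bytes).decode(\"ascii\"),\n",
| | + "}))\n",
| | + "```\n",
| | + "\n",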
327 | 320 | "#### Benefits\n", |
328 | 321 | "- **Ultra-low latency:** Typically 300–800 ms, enabling near-instant transcription. \n", |
|
337 | 330 | }, |
338 | 331 | { |
339 | 332 | "cell_type": "code", |
340 | | - "execution_count": 14, |
| 333 | + "execution_count": 22, |
341 | 334 | "id": "2b2f2515", |
342 | 335 | "metadata": {}, |
343 | 336 | "outputs": [], |
344 | 337 | "source": [ |
345 | | - "import base64 # encode raw PCM bytes → base64 before sending JSON\n", |
346 | | - "import json # compose/parse WebSocket messages\n", |
347 | | - "\n", |
348 | | - "from typing import List\n", |
349 | | - "\n", |
350 | | - "import numpy as np # efficient numeric processing for audio arrays\n", |
351 | | - "import resampy # high-quality sample-rate conversion\n", |
352 | | - "import soundfile as sf # reads many audio formats into float32 arrays\n", |
353 | | - "import websockets # asyncio-based WebSocket client\n", |
354 | | - "\n", |
355 | | - "\n", |
356 | 338 | "TARGET_SR = 24_000 # Realtime STT expects 24-kHz mono input\n", |
357 | 339 | "PCM_SCALE = 32_767 # float32 (−1…1) → int16 conversion factor\n", |
358 | 340 | "DEFAULT_CHUNK = 3_072 # 3 072 / 24 000 ≈ 128 ms (server-VAD sweet-spot)\n", |
|
501 | 483 | }, |
502 | 484 | { |
503 | 485 | "cell_type": "code", |
504 | | - "execution_count": 16, |
| 486 | + "execution_count": 23, |
505 | 487 | "id": "d90de5b9", |
506 | 488 | "metadata": {}, |
507 | 489 | "outputs": [ |
|
518 | 500 | "'The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite.'" |
519 | 501 | ] |
520 | 502 | }, |
521 | | - "execution_count": 16, |
| 503 | + "execution_count": 23, |
522 | 504 | "metadata": {}, |
523 | 505 | "output_type": "execute_result" |
524 | 506 | } |
|
550 | 532 | "source": [ |
551 | 533 | "### How it works\n", |
552 | 534 | "\n", |
553 | | - "```{mermaid}\n", |
554 | | - "graph LR\n", |
555 | | - " Mic -- \"PCM frames\" --> VP[\"VoicePipeline\"]\n", |
556 | | - " VP -- \"VAD & resample\" --> Buf[\"Sentence buffer\"]\n", |
557 | | - " Buf --> GPT[\"gpt-4o-transcribe\"]\n", |
558 | | - " GPT --> Agent[\"Agent callbacks\"]\n", |
559 | | - " Agent -- \"print / reply\" --> App\n", |
560 | | - "```\n", |
| 535 | + "Mic (PCM frames) → `VoicePipeline` (VAD & resampling) → sentence buffer → `gpt-4o-transcribe` → agent callbacks → your application\n",
561 | 536 | "\n", |
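| | + "Minimal wiring looks roughly like this (a sketch only; the agent name and instructions are placeholders, and the full runnable cell below shows the real setup):\n",
| | + "\n",
| | + "```python\n",
| | + "agent = Agent(name=\"Assistant\", instructions=\"Reply briefly.\")\n",
| | + "pipeline = VoicePipeline(\n",
| | + "    workflow=SingleAgentVoiceWorkflow(agent),\n",
| | + "    config=VoicePipelineConfig(),\n",
| | + ")\n",
| | + "audio_input = StreamedAudioInput()\n",
| | + "result = await pipeline.run(audio_input)  # then push PCM frames into audio_input\n",
| | + "```\n",
| | + "\n",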
562 | 537 | "**Benefits**\n", |
563 | 538 | "\n", |
564 | 539 | "- **Minimal boilerplate:** `VoicePipeline` handles resampling, VAD, buffering, token auth, and reconnects. \n", |
565 | | - "- **Seamless agent integration: Enables direct interaction with GPT agents using real-time audio transcription.\n", |
| 540 | + "- **Seamless agent integration**: Enables direct interaction with GPT agents using real-time audio transcription.\n", |
566 | 541 | "\n", |
567 | 542 | "**Limitations**\n", |
568 | 543 | "\n", |
|
573 | 548 | }, |
574 | 549 | { |
575 | 550 | "cell_type": "code", |
576 | | - "execution_count": 13, |
| 551 | + "execution_count": 24, |
577 | 552 | "id": "754a846b", |
578 | 553 | "metadata": {}, |
579 | 554 | "outputs": [ |
|
585 | 560 | "[User]: The stale smell of old beer lingers.\n", |
586 | 561 | "[Assistant]: L'odeur rance de la vieille bière persiste.\n", |
587 | 562 | "[User]: A cold dip restores health and zest.\n", |
588 | | - "[Assistant]: Un bain froid restaure la santé et l'énergie.\n", |
| 563 | + "[Assistant]: Un bain froid restaure la santé et le tonus.\n", |
| 564 | + "[User]: It takes heat to bring out the odor.\n", |
| 565 | + "[Assistant]: Il faut de la chaleur pour faire ressortir l'odeur.\n", |
| 566 | + "[User]: A salt pickle tastes fine with ham.\n", |
| 567 | + "[Assistant]: Un cornichon au sel se marie bien avec le jambon.\n", |
589 | 568 | "[User]: Tacos al pastor are my favorite.\n", |
590 | | - "[Assistant]: Les tacos al pastor sont mes préférés." |
| 569 | + "[Assistant]: " |
591 | 570 | ] |
592 | 571 | }, |
593 | 572 | { |
594 | 573 | "name": "stdout", |
595 | 574 | "output_type": "stream", |
596 | 575 | "text": [ |
597 | | - "\n", |
| 576 | + "Les tacos al pastor sont mes préférés.\n", |
598 | 577 | "[User]: A zestful food is the hot cross bun.\n", |
599 | | - "[Assistant]: Un aliment plein de dynamisme est le petit pain de Pâques." |
| 578 | + "[Assistant]: Un aliment plein de saveur est le petit pain de Pâques." |
600 | 579 | ] |
601 | 580 | } |
602 | 581 | ], |
603 | 582 | "source": [ |
604 | | - "import asyncio, numpy as np, soundfile as sf, resampy\n", |
605 | | - "from agents import Agent\n", |
606 | | - "from agents.voice import (\n", |
607 | | - " StreamedAudioInput,\n", |
608 | | - " VoicePipeline,\n", |
609 | | - " VoicePipelineConfig,\n", |
610 | | - " SingleAgentVoiceWorkflow,\n", |
611 | | - ")\n", |
612 | | - "\n", |
613 | | - "\n", |
614 | 583 | "# ── 1 · agent that replies in French ---------------------------------------\n", |
615 | 584 | "fr_agent = Agent(\n", |
616 | 585 | " name=\"Assistant-FR\",\n", |
|