|
68 | 68 | }, |
69 | 69 | { |
70 | 70 | "cell_type": "code", |
71 | | - "execution_count": 6, |
| 71 | + "execution_count": 18, |
72 | 72 | "id": "e4078915", |
73 | 73 | "metadata": {}, |
74 | 74 | "outputs": [ |
|
81 | 81 | } |
82 | 82 | ], |
83 | 83 | "source": [ |
84 | | - "import os, nest_asyncio, asyncio\n", |
85 | | - "from openai import OpenAI\n", |
| 84 | + "# ─── Standard Library ──────────────────────────────────────────────────────────\n", |
| 85 | + "import asyncio\n", |
| 86 | + "import base64 # encode raw PCM bytes → base64 before sending JSON\n", |
| 87 | + "import json # compose/parse WebSocket messages\n", |
| 88 | + "import os\n", |
86 | 89 | "import time\n", |
| 90 | + "from typing import List\n", |
| 91 | + "\n", |
| 92 | + "# ─── Third-Party ───────────────────────────────────────────────────────────────\n", |
| 93 | + "import nest_asyncio\n", |
| 94 | + "import numpy as np # efficient numeric processing for audio arrays\n", |
| 95 | + "from openai import OpenAI\n", |
| 96 | + "import resampy # high-quality sample-rate conversion\n", |
| 97 | + "import soundfile as sf # reads many audio formats into float32 arrays\n", |
| 98 | + "import websockets # asyncio-based WebSocket client\n", |
| 99 | + "from agents import Agent\n", |
| 100 | + "from agents.voice import (\n", |
| 101 | + " SingleAgentVoiceWorkflow,\n", |
| 102 | + " StreamedAudioInput,\n", |
| 103 | + " VoicePipeline,\n", |
| 104 | + " VoicePipelineConfig,\n", |
| 105 | + ")\n", |
| 106 | + "# ───────────────────────────────────────────────────────────────────────────────\n", |
87 | 107 | "nest_asyncio.apply()\n", |
88 | 108 | "\n", |
89 | 109 | "# ✏️ Put your key in an env-var or just replace the call below.\n", |
90 | 110 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", |
91 | 111 | "\n", |
92 | 112 | "client = OpenAI(api_key=OPENAI_API_KEY)\n", |
93 | | - "print('✅ OpenAI client ready')" |
| 113 | + "print(\"✅ OpenAI client ready\")" |
94 | 114 | ] |
95 | 115 | }, |
96 | 116 | { |
|
102 | 122 | "## 1 · Speech-to-Text with Audio File\n", |
103 | 123 | "\n", |
104 | 124 | "### When to use\n", |
105 | | - "* You have a completed audio file (up to 25 MB).\n", |
| 125 | + "* You have a completed audio file (up to 25 MB).The following input file types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.\n", |
106 | 126 | "* Suitable for batch processing tasks like podcasts, call-center recordings, or voice memos.\n", |
107 | 127 | "* Real-time feedback or partial results are not required." |
108 | 128 | ] |
|
114 | 134 | "source": [ |
115 | 135 | "### How it works\n", |
116 | 136 | "\n", |
117 | | - "```mermaid\n", |
118 | | - "flowchart LR\n", |
119 | | - " AudioFile[\"Audio file<br/>(WAV • MP3 • FLAC)\"] --> Upload[\"Binary upload\"]\n", |
120 | | - " Upload --> API[\"/v1/audio/transcriptions\"]\n", |
121 | | - " API --> JSONOutput[\"JSON transcription<br/>+ metadata\"]\n", |
122 | | - " JSONOutput --> App[\"Your application\"]\n", |
123 | | - "```\n", |
| 137 | + "Audio file (WAV · MP3 · FLAC · …) → binary upload → `/v1/audio/transcriptions` → JSON transcription + metadata → your application\n",
| 138 | + "\n",
124 | 139 | "\n", |
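| | + "A minimal sketch of the batch call (assuming `client` from the setup cell and a local file named `speech.mp3` under 25 MB):\n",
| | + "\n",
| | + "```python\n",
| | + "with open(\"speech.mp3\", \"rb\") as f:\n",
| | + "    transcript = client.audio.transcriptions.create(\n",
| | + "        model=\"gpt-4o-transcribe\",  # \"gpt-4o-mini-transcribe\" and \"whisper-1\" also work here\n",
| | + "        file=f,\n",
| | + "    )\n",
| | + "print(transcript.text)\n",
| | + "```\n",
| | + "\n",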
125 | 140 | "#### Benefits\n", |
126 | 141 | "\n", |
|
146 | 161 | }, |
147 | 162 | { |
148 | 163 | "cell_type": "code", |
149 | | - "execution_count": 2, |
| 164 | + "execution_count": 19, |
150 | 165 | "id": "ab545e4c", |
151 | 166 | "metadata": {}, |
152 | 167 | "outputs": [ |
|
180 | 195 | }, |
181 | 196 | { |
182 | 197 | "cell_type": "code", |
183 | | - "execution_count": 10, |
| 198 | + "execution_count": 20, |
184 | 199 | "id": "7ae4af8d", |
185 | 200 | "metadata": {}, |
186 | 201 | "outputs": [ |
|
213 | 228 | "id": "e6a5b89e", |
214 | 229 | "metadata": {}, |
215 | 230 | "source": [ |
216 | | - "We notice that the transcription is much smaller than expected.\n", |
217 | | - "\n", |
218 | | - "What you’re seeing is a known limitation of gpt-4o-mini-transcribe models when they’re used through the non-realtime /v1/audio/transcriptions endpoint:\n", |
| 231 | + "We notice that the transcription is much smaller than expected. What you’re seeing is a known limitation of gpt-4o-mini-transcribe models when they’re used through the non-realtime /v1/audio/transcriptions endpoint:\n", |
219 | 232 | "\n", |
220 | | - "The backend still runs its own voice-activity-detection (VAD) pass to break the file into “turns”.\n", |
| 233 | + ">The backend still runs its own voice-activity-detection (VAD) pass to break the file into “turns”.\n", |
221 | 234 | "\n", |
222 | | - "Only the first completed turn is copied into the text field of the JSON that the endpoint returns. Subsequent turns are silently discarded." |
| 235 | + ">Only the first completed turn is copied into the text field of the JSON that the endpoint returns. Subsequent turns are silently discarded." |
223 | 236 | ] |
224 | 237 | }, |
225 | 238 | { |
|
234 | 247 | "- You need immediate transcription results (partial or final) as they arrive. \n", |
235 | 248 | "- Scenarios where partial feedback improves UX, e.g., uploading a long voice memo.\n", |
236 | 249 | "\n", |
237 | | - "```mermaid\n", |
238 | | - "flowchart LR\n", |
239 | | - " A[\"Finished audio file<br/>(WAV • MP3 • FLAC • …)\"]\n", |
240 | | - " B[\"OpenAI STT engine<br/>(gpt-4o-transcribe)\"]\n", |
241 | | - " C[\"Your application / UI\"]\n", |
242 | | - "\n", |
243 | | - " A -->|HTTP POST<br/>/v1/audio/transcriptions<br/>stream=true| B\n", |
244 | | - " B -->|chunked HTTP response<br/>partial & final transcripts| C\n", |
245 | | - "```\n", |
| 250 | + "Finished audio file → HTTP POST `/v1/audio/transcriptions` with `stream=true` → chunked HTTP response (partial & final transcripts) → your application / UI\n",
246 | 251 | "\n", |
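| | + "A sketch of the streamed variant (same assumed `client` and `speech.mp3`; the event type names follow the streaming transcription API, so verify them against the current docs):\n",
| | + "\n",
| | + "```python\n",
| | + "with open(\"speech.mp3\", \"rb\") as f:\n",
| | + "    stream = client.audio.transcriptions.create(\n",
| | + "        model=\"gpt-4o-transcribe\",\n",
| | + "        file=f,\n",
| | + "        stream=True,  # chunked response instead of one final JSON body\n",
| | + "    )\n",
| | + "    for event in stream:\n",
| | + "        if event.type == \"transcript.text.delta\":\n",
| | + "            print(event.delta, end=\"\", flush=True)  # partial text as it arrives\n",
| | + "```\n",
| | + "\n",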
247 | 252 | "#### Benefits\n", |
248 | 253 | "- **Real-time feel:** Users see transcription updates almost immediately. \n", |
|
256 | 261 | }, |
257 | 262 | { |
258 | 263 | "cell_type": "code", |
259 | | - "execution_count": null, |
| 264 | + "execution_count": 25, |
260 | 265 | "id": "d027fdb9", |
261 | 266 | "metadata": {}, |
262 | 267 | "outputs": [ |
|
310 | 315 | "source": [ |
311 | 316 | "### How it works\n", |
312 | 317 | "\n", |
313 | | - "```mermaid\n", |
314 | | - "sequenceDiagram\n", |
315 | | - " participant Mic\n", |
316 | | - " participant App\n", |
317 | | - " participant WS as \"WebSocket\"\n", |
318 | | - " participant OAI as \"Realtime Server\"\n", |
319 | | - "\n", |
320 | | - " Mic ->> App: 20–40 ms PCM frames\n", |
321 | | - " App ->> WS: Base64-encoded chunks<br/>input_audio_buffer.append\n", |
322 | | - " WS ->> OAI: Audio stream\n", |
323 | | - " OAI -->> WS: JSON transcription events<br/>(partial & complete)\n", |
324 | | - " WS -->> App: Transcript updates\n", |
325 | | - "```\n", |
| 318 | + "Mic → app (20–40 ms PCM frames) → WebSocket (`input_audio_buffer.append` messages with base64 chunks) → Realtime server → JSON transcription events (partial & complete) back to the app\n",
326 | 319 | "\n", |
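| | + "The heart of the exchange is one JSON event per audio chunk. A sketch (assuming `ws` is an open `websockets` connection and `pcm_bytes` holds 24-kHz mono PCM16 audio):\n",
| | + "\n",
| | + "```python\n",
| | + "await ws.send(json.dumps({\n",
| | + "    \"type\": \"input_audio_buffer.append\",\n",
| | + "    \"audio\": base64.b64encode(pcm_bytes).decode(\"ascii\"),\n",
| | + "}))\n",
| | + "```\n",
| | + "\n",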
327 | 320 | "#### Benefits\n", |
328 | 321 | "- **Ultra-low latency:** Typically 300–800 ms, enabling near-instant transcription. \n", |
|
337 | 330 | }, |
338 | 331 | { |
339 | 332 | "cell_type": "code", |
340 | | - "execution_count": 14, |
| 333 | + "execution_count": 22, |
341 | 334 | "id": "2b2f2515", |
342 | 335 | "metadata": {}, |
343 | 336 | "outputs": [], |
344 | 337 | "source": [ |
345 | | - "import base64 # encode raw PCM bytes → base64 before sending JSON\n", |
346 | | - "import json # compose/parse WebSocket messages\n", |
347 | | - "\n", |
348 | | - "from typing import List\n", |
349 | | - "\n", |
350 | | - "import numpy as np # efficient numeric processing for audio arrays\n", |
351 | | - "import resampy # high-quality sample-rate conversion\n", |
352 | | - "import soundfile as sf # reads many audio formats into float32 arrays\n", |
353 | | - "import websockets # asyncio-based WebSocket client\n", |
354 | | - "\n", |
355 | | - "\n", |
356 | 338 | "TARGET_SR = 24_000 # Realtime STT expects 24-kHz mono input\n", |
357 | 339 | "PCM_SCALE = 32_767 # float32 (−1…1) → int16 conversion factor\n", |
358 | 340 | "DEFAULT_CHUNK = 3_072 # 3 072 / 24 000 ≈ 128 ms (server-VAD sweet-spot)\n", |
|
501 | 483 | }, |
502 | 484 | { |
503 | 485 | "cell_type": "code", |
504 | | - "execution_count": 16, |
| 486 | + "execution_count": 23, |
505 | 487 | "id": "d90de5b9", |
506 | 488 | "metadata": {}, |
507 | 489 | "outputs": [ |
|
518 | 500 | "'The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite.'" |
519 | 501 | ] |
520 | 502 | }, |
521 | | - "execution_count": 16, |
| 503 | + "execution_count": 23, |
522 | 504 | "metadata": {}, |
523 | 505 | "output_type": "execute_result" |
524 | 506 | } |
|
550 | 532 | "source": [ |
551 | 533 | "### How it works\n", |
552 | 534 | "\n", |
553 | | - "```{mermaid}\n", |
554 | | - "graph LR\n", |
555 | | - " Mic -- \"PCM frames\" --> VP[\"VoicePipeline\"]\n", |
556 | | - " VP -- \"VAD & resample\" --> Buf[\"Sentence buffer\"]\n", |
557 | | - " Buf --> GPT[\"gpt-4o-transcribe\"]\n", |
558 | | - " GPT --> Agent[\"Agent callbacks\"]\n", |
559 | | - " Agent -- \"print / reply\" --> App\n", |
560 | | - "```\n", |
| 535 | + "Mic (PCM frames) → `VoicePipeline` (VAD & resampling) → sentence buffer → `gpt-4o-transcribe` → agent callbacks → your application\n",
561 | 536 | "\n", |
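| | + "Minimal wiring looks roughly like this (a sketch only; the agent name and instructions are placeholders, and the full runnable cell below shows the real setup):\n",
| | + "\n",
| | + "```python\n",
| | + "agent = Agent(name=\"Assistant\", instructions=\"Reply briefly.\")\n",
| | + "pipeline = VoicePipeline(\n",
| | + "    workflow=SingleAgentVoiceWorkflow(agent),\n",
| | + "    config=VoicePipelineConfig(),\n",
| | + ")\n",
| | + "audio_input = StreamedAudioInput()\n",
| | + "result = await pipeline.run(audio_input)  # then push PCM frames into audio_input\n",
| | + "```\n",
| | + "\n",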
562 | 537 | "**Benefits**\n", |
563 | 538 | "\n", |
564 | 539 | "- **Minimal boilerplate:** `VoicePipeline` handles resampling, VAD, buffering, token auth, and reconnects. \n", |
565 | | - "- **Seamless agent integration: Enables direct interaction with GPT agents using real-time audio transcription.\n", |
| 540 | + "- **Seamless agent integration**: Enables direct interaction with GPT agents using real-time audio transcription.\n", |
566 | 541 | "\n", |
567 | 542 | "**Limitations**\n", |
568 | 543 | "\n", |
|
573 | 548 | }, |
574 | 549 | { |
575 | 550 | "cell_type": "code", |
576 | | - "execution_count": 13, |
| 551 | + "execution_count": 24, |
577 | 552 | "id": "754a846b", |
578 | 553 | "metadata": {}, |
579 | 554 | "outputs": [ |
|
585 | 560 | "[User]: The stale smell of old beer lingers.\n", |
586 | 561 | "[Assistant]: L'odeur rance de la vieille bière persiste.\n", |
587 | 562 | "[User]: A cold dip restores health and zest.\n", |
588 | | - "[Assistant]: Un bain froid restaure la santé et l'énergie.\n", |
| 563 | + "[Assistant]: Un bain froid restaure la santé et le tonus.\n", |
| 564 | + "[User]: It takes heat to bring out the odor.\n", |
| 565 | + "[Assistant]: Il faut de la chaleur pour faire ressortir l'odeur.\n", |
| 566 | + "[User]: A salt pickle tastes fine with ham.\n", |
| 567 | + "[Assistant]: Un cornichon au sel se marie bien avec le jambon.\n", |
589 | 568 | "[User]: Tacos al pastor are my favorite.\n", |
590 | | - "[Assistant]: Les tacos al pastor sont mes préférés." |
| 569 | + "[Assistant]: " |
591 | 570 | ] |
592 | 571 | }, |
593 | 572 | { |
594 | 573 | "name": "stdout", |
595 | 574 | "output_type": "stream", |
596 | 575 | "text": [ |
597 | | - "\n", |
| 576 | + "Les tacos al pastor sont mes préférés.\n", |
598 | 577 | "[User]: A zestful food is the hot cross bun.\n", |
599 | | - "[Assistant]: Un aliment plein de dynamisme est le petit pain de Pâques." |
| 578 | + "[Assistant]: Un aliment plein de saveur est le petit pain de Pâques." |
600 | 579 | ] |
601 | 580 | } |
602 | 581 | ], |
603 | 582 | "source": [ |
604 | | - "import asyncio, numpy as np, soundfile as sf, resampy\n", |
605 | | - "from agents import Agent\n", |
606 | | - "from agents.voice import (\n", |
607 | | - " StreamedAudioInput,\n", |
608 | | - " VoicePipeline,\n", |
609 | | - " VoicePipelineConfig,\n", |
610 | | - " SingleAgentVoiceWorkflow,\n", |
611 | | - ")\n", |
612 | | - "\n", |
613 | | - "\n", |
614 | 583 | "# ── 1 · agent that replies in French ---------------------------------------\n", |
615 | 584 | "fr_agent = Agent(\n", |
616 | 585 | " name=\"Assistant-FR\",\n", |
|