Skip to content

Commit 32179a3

Browse files
committed
Fixed mermaid rendering in notebook
1 parent 32918fa commit 32179a3

9 files changed

+86
-80
lines changed

examples/Speech_transcription_methods.ipynb

Lines changed: 49 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
},
6969
{
7070
"cell_type": "code",
71-
"execution_count": 6,
71+
"execution_count": 18,
7272
"id": "e4078915",
7373
"metadata": {},
7474
"outputs": [
@@ -81,16 +81,36 @@
8181
}
8282
],
8383
"source": [
84-
"import os, nest_asyncio, asyncio\n",
85-
"from openai import OpenAI\n",
84+
"# ─── Standard Library ──────────────────────────────────────────────────────────\n",
85+
"import asyncio\n",
86+
"import base64 # encode raw PCM bytes → base64 before sending JSON\n",
87+
"import json # compose/parse WebSocket messages\n",
88+
"import os\n",
8689
"import time\n",
90+
"from typing import List\n",
91+
"\n",
92+
"# ─── Third-Party ───────────────────────────────────────────────────────────────\n",
93+
"import nest_asyncio\n",
94+
"import numpy as np # efficient numeric processing for audio arrays\n",
95+
"from openai import OpenAI\n",
96+
"import resampy # high-quality sample-rate conversion\n",
97+
"import soundfile as sf # reads many audio formats into float32 arrays\n",
98+
"import websockets # asyncio-based WebSocket client\n",
99+
"from agents import Agent\n",
100+
"from agents.voice import (\n",
101+
" SingleAgentVoiceWorkflow,\n",
102+
" StreamedAudioInput,\n",
103+
" VoicePipeline,\n",
104+
" VoicePipelineConfig,\n",
105+
")\n",
106+
"# ───────────────────────────────────────────────────────────────────────────────\n",
87107
"nest_asyncio.apply()\n",
88108
"\n",
89109
"# ✏️ Put your key in an env-var or just replace the call below.\n",
90110
"OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
91111
"\n",
92112
"client = OpenAI(api_key=OPENAI_API_KEY)\n",
93-
"print('✅ OpenAI client ready')"
113+
"print(\"✅ OpenAI client ready\")"
94114
]
95115
},
96116
{
@@ -102,7 +122,7 @@
102122
"## 1 · Speech-to-Text with Audio File\n",
103123
"\n",
104124
"### When to use\n",
105-
"* You have a completed audio file (up to 25 MB).\n",
125+
"* You have a completed audio file (up to 25 MB). The following input file types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.\n",
106126
"* Suitable for batch processing tasks like podcasts, call-center recordings, or voice memos.\n",
107127
"* Real-time feedback or partial results are not required."
108128
]
@@ -114,13 +134,8 @@
114134
"source": [
115135
"### How it works\n",
116136
"\n",
117-
"```mermaid\n",
118-
"flowchart LR\n",
119-
" AudioFile[\"Audio file<br/>(WAV • MP3 • FLAC)\"] --> Upload[\"Binary upload\"]\n",
120-
" Upload --> API[\"/v1/audio/transcriptions\"]\n",
121-
" API --> JSONOutput[\"JSON transcription<br/>+ metadata\"]\n",
122-
" JSONOutput --> App[\"Your application\"]\n",
123-
"```\n",
137+
"\n",
138+
"![STT Not Streaming Transcription flow](./imgs/speech-to-text-not-streaming.png)\n",
124139
"\n",
125140
"#### Benefits\n",
126141
"\n",
@@ -146,7 +161,7 @@
146161
},
147162
{
148163
"cell_type": "code",
149-
"execution_count": 2,
164+
"execution_count": 19,
150165
"id": "ab545e4c",
151166
"metadata": {},
152167
"outputs": [
@@ -180,7 +195,7 @@
180195
},
181196
{
182197
"cell_type": "code",
183-
"execution_count": 10,
198+
"execution_count": 20,
184199
"id": "7ae4af8d",
185200
"metadata": {},
186201
"outputs": [
@@ -213,13 +228,11 @@
213228
"id": "e6a5b89e",
214229
"metadata": {},
215230
"source": [
216-
"We notice that the transcription is much smaller than expected.\n",
217-
"\n",
218-
"What you’re seeing is a known limitation of gpt-4o-mini-transcribe models when they’re used through the non-realtime /v1/audio/transcriptions endpoint:\n",
231+
"We notice that the transcription is much smaller than expected. What you’re seeing is a known limitation of gpt-4o-mini-transcribe models when they’re used through the non-realtime /v1/audio/transcriptions endpoint:\n",
219232
"\n",
220-
"The backend still runs its own voice-activity-detection (VAD) pass to break the file into “turns”.\n",
233+
">The backend still runs its own voice-activity-detection (VAD) pass to break the file into “turns”.\n",
221234
"\n",
222-
"Only the first completed turn is copied into the text field of the JSON that the endpoint returns. Subsequent turns are silently discarded."
235+
">Only the first completed turn is copied into the text field of the JSON that the endpoint returns. Subsequent turns are silently discarded."
223236
]
224237
},
225238
{
@@ -234,15 +247,7 @@
234247
"- You need immediate transcription results (partial or final) as they arrive. \n",
235248
"- Scenarios where partial feedback improves UX, e.g., uploading a long voice memo.\n",
236249
"\n",
237-
"```mermaid\n",
238-
"flowchart LR\n",
239-
" A[\"Finished audio file<br/>(WAV • MP3 • FLAC • …)\"]\n",
240-
" B[\"OpenAI STT engine<br/>(gpt-4o-transcribe)\"]\n",
241-
" C[\"Your application / UI\"]\n",
242-
"\n",
243-
" A -->|HTTP POST<br/>/v1/audio/transcriptions<br/>stream=true| B\n",
244-
" B -->|chunked HTTP response<br/>partial & final transcripts| C\n",
245-
"```\n",
250+
"![STT Streaming Transcription flow](./imgs/speech-to-text-streaming.png)\n",
246251
"\n",
247252
"#### Benefits\n",
248253
"- **Real-time feel:** Users see transcription updates almost immediately. \n",
@@ -256,7 +261,7 @@
256261
},
257262
{
258263
"cell_type": "code",
259-
"execution_count": null,
264+
"execution_count": 25,
260265
"id": "d027fdb9",
261266
"metadata": {},
262267
"outputs": [
@@ -310,19 +315,7 @@
310315
"source": [
311316
"### How it works\n",
312317
"\n",
313-
"```mermaid\n",
314-
"sequenceDiagram\n",
315-
" participant Mic\n",
316-
" participant App\n",
317-
" participant WS as \"WebSocket\"\n",
318-
" participant OAI as \"Realtime Server\"\n",
319-
"\n",
320-
" Mic ->> App: 20–40 ms PCM frames\n",
321-
" App ->> WS: Base64-encoded chunks<br/>input_audio_buffer.append\n",
322-
" WS ->> OAI: Audio stream\n",
323-
" OAI -->> WS: JSON transcription events<br/>(partial & complete)\n",
324-
" WS -->> App: Transcript updates\n",
325-
"```\n",
318+
"![Realtime Transcription flow](./imgs/realtime_api_transcription.png)\n",
326319
"\n",
327320
"#### Benefits\n",
328321
"- **Ultra-low latency:** Typically 300–800 ms, enabling near-instant transcription. \n",
@@ -337,22 +330,11 @@
337330
},
338331
{
339332
"cell_type": "code",
340-
"execution_count": 14,
333+
"execution_count": 22,
341334
"id": "2b2f2515",
342335
"metadata": {},
343336
"outputs": [],
344337
"source": [
345-
"import base64 # encode raw PCM bytes → base64 before sending JSON\n",
346-
"import json # compose/parse WebSocket messages\n",
347-
"\n",
348-
"from typing import List\n",
349-
"\n",
350-
"import numpy as np # efficient numeric processing for audio arrays\n",
351-
"import resampy # high-quality sample-rate conversion\n",
352-
"import soundfile as sf # reads many audio formats into float32 arrays\n",
353-
"import websockets # asyncio-based WebSocket client\n",
354-
"\n",
355-
"\n",
356338
"TARGET_SR = 24_000 # Realtime STT expects 24-kHz mono input\n",
357339
"PCM_SCALE = 32_767 # float32 (−1…1) → int16 conversion factor\n",
358340
"DEFAULT_CHUNK = 3_072 # 3 072 / 24 000 ≈ 128 ms (server-VAD sweet-spot)\n",
@@ -501,7 +483,7 @@
501483
},
502484
{
503485
"cell_type": "code",
504-
"execution_count": 16,
486+
"execution_count": 23,
505487
"id": "d90de5b9",
506488
"metadata": {},
507489
"outputs": [
@@ -518,7 +500,7 @@
518500
"'The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite.'"
519501
]
520502
},
521-
"execution_count": 16,
503+
"execution_count": 23,
522504
"metadata": {},
523505
"output_type": "execute_result"
524506
}
@@ -550,19 +532,12 @@
550532
"source": [
551533
"### How it works\n",
552534
"\n",
553-
"```{mermaid}\n",
554-
"graph LR\n",
555-
" Mic -- \"PCM frames\" --> VP[\"VoicePipeline\"]\n",
556-
" VP -- \"VAD & resample\" --> Buf[\"Sentence buffer\"]\n",
557-
" Buf --> GPT[\"gpt-4o-transcribe\"]\n",
558-
" GPT --> Agent[\"Agent callbacks\"]\n",
559-
" Agent -- \"print / reply\" --> App\n",
560-
"```\n",
535+
"![Agents Transcription flow](./imgs/agents_sdk_transcription.png)\n",
561536
"\n",
562537
"**Benefits**\n",
563538
"\n",
564539
"- **Minimal boilerplate:** `VoicePipeline` handles resampling, VAD, buffering, token auth, and reconnects. \n",
565-
"- **Seamless agent integration: Enables direct interaction with GPT agents using real-time audio transcription.\n",
540+
"- **Seamless agent integration**: Enables direct interaction with GPT agents using real-time audio transcription.\n",
566541
"\n",
567542
"**Limitations**\n",
568543
"\n",
@@ -573,7 +548,7 @@
573548
},
574549
{
575550
"cell_type": "code",
576-
"execution_count": 13,
551+
"execution_count": 24,
577552
"id": "754a846b",
578553
"metadata": {},
579554
"outputs": [
@@ -585,32 +560,26 @@
585560
"[User]: The stale smell of old beer lingers.\n",
586561
"[Assistant]: L'odeur rance de la vieille bière persiste.\n",
587562
"[User]: A cold dip restores health and zest.\n",
588-
"[Assistant]: Un bain froid restaure la santé et l'énergie.\n",
563+
"[Assistant]: Un bain froid restaure la santé et le tonus.\n",
564+
"[User]: It takes heat to bring out the odor.\n",
565+
"[Assistant]: Il faut de la chaleur pour faire ressortir l'odeur.\n",
566+
"[User]: A salt pickle tastes fine with ham.\n",
567+
"[Assistant]: Un cornichon au sel se marie bien avec le jambon.\n",
589568
"[User]: Tacos al pastor are my favorite.\n",
590-
"[Assistant]: Les tacos al pastor sont mes préférés."
569+
"[Assistant]: "
591570
]
592571
},
593572
{
594573
"name": "stdout",
595574
"output_type": "stream",
596575
"text": [
597-
"\n",
576+
"Les tacos al pastor sont mes préférés.\n",
598577
"[User]: A zestful food is the hot cross bun.\n",
599-
"[Assistant]: Un aliment plein de dynamisme est le petit pain de Pâques."
578+
"[Assistant]: Un aliment plein de saveur est le petit pain de Pâques."
600579
]
601580
}
602581
],
603582
"source": [
604-
"import asyncio, numpy as np, soundfile as sf, resampy\n",
605-
"from agents import Agent\n",
606-
"from agents.voice import (\n",
607-
" StreamedAudioInput,\n",
608-
" VoicePipeline,\n",
609-
" VoicePipelineConfig,\n",
610-
" SingleAgentVoiceWorkflow,\n",
611-
")\n",
612-
"\n",
613-
"\n",
614583
"# ── 1 · agent that replies in French ---------------------------------------\n",
615584
"fr_agent = Agent(\n",
616585
" name=\"Assistant-FR\",\n",
15.2 KB
Loading
40.8 KB
Loading
16.7 KB
Loading
22.9 KB
Loading
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
```mermaid
2+
graph LR
3+
Mic -- "PCM frames" --> VP["VoicePipeline"]
4+
VP -- "VAD & resample" --> Buf["Sentence buffer"]
5+
Buf --> GPT["gpt-4o-transcribe"]
6+
GPT --> Agent["Agent callbacks"]
7+
Agent -- "print / reply" --> App
8+
```
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
```mermaid
2+
sequenceDiagram
3+
participant Mic
4+
participant App
5+
participant WS as "WebSocket"
6+
participant OAI as "Realtime Server"
7+
8+
Mic ->> App: 20–40 ms PCM frames
9+
App ->> WS: Base64-encoded chunks<br/>input_audio_buffer.append
10+
WS ->> OAI: Audio stream
11+
OAI -->> WS: JSON transcription events<br/>(partial & complete)
12+
WS -->> App: Transcript updates
13+
```
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
```mermaid
2+
flowchart LR
3+
AudioFile["Audio file<br/>(WAV • MP3 • FLAC)"] --> Upload["Binary upload"]
4+
Upload --> API["/v1/audio/transcriptions"]
5+
API --> JSONOutput["JSON transcription<br/>+ metadata"]
6+
JSONOutput --> App["Your application"]
7+
```
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
```mermaid
2+
flowchart LR
3+
A["Finished audio file<br/>(WAV • MP3 • FLAC • …)"]
4+
B["OpenAI STT engine<br/>(gpt-4o-transcribe)"]
5+
C["Your application / UI"]
6+
7+
A -->|HTTP POST<br/>/v1/audio/transcriptions<br/>stream=true| B
8+
B -->|chunked HTTP response<br/>partial & final transcripts| C
9+
```

0 commit comments

Comments
 (0)