openai · minh-hoque · May 7, 2025 · Apr 29, 2025 · Apr 29, 2025 · Apr 29, 2025
diff --git a/.gitignore b/.gitignore
@@ -140,6 +140,7 @@ examples/fine-tuned_qa/local_cache/*
 
 # PyCharm files
 .idea/
+.cursorignore
 
 # VS Code files
 .vscode/
diff --git a/authors.yaml b/authors.yaml
@@ -3,6 +3,11 @@
 # You can optionally customize how your information shows up cookbook.openai.com over here.
 # If your information is not present here, it will be pulled from your GitHub profile.
 
+minh-hoque:
+  name: "Minhajul Hoque"
+  website: "https://www.linkedin.com/in/minhajul-hoque-83242b163/"
+  avatar: "https://avatars.githubusercontent.com/u/84698472?v=4"
+
 shikhar-cyber:
   name: "Shikhar Kwatra"
   website: "https://www.linkedin.com/in/shikharkwatra/"
@@ -126,13 +131,13 @@ aaronwilkowitz-openai:
 charuj:
   name: "Charu Jaiswal"
   website: "https://www.linkedin.com/in/charu-j-8a866471"
-  avatar: "https://avatars.githubusercontent.com/u/18404643?v=4" 
+  avatar: "https://avatars.githubusercontent.com/u/18404643?v=4"
 
 rupert-openai:
   name: "Rupert Truman"
   website: "https://www.linkedin.com/in/rupert-truman/"
   avatar: "https://avatars.githubusercontent.com/u/171234447"
-  
+
 keelan-openai:
   name: "Keelan Schule"
   website: "https://www.linkedin.com/in/keelanschule/"
@@ -171,8 +176,8 @@ evanweiss-openai:
 girishd:
   name: "Girish Dusane"
   website: "https://www.linkedin.com/in/girishdusane/"
-  avatar: "https://avatars.githubusercontent.com/u/272708"   
-  
+  avatar: "https://avatars.githubusercontent.com/u/272708"
+
 lxing-oai:
   name: "Luke Xing"
   website: "https://www.linkedin.com/in/lukexing/"
@@ -227,7 +232,7 @@ erickgort:
   name: "Erick Gort"
   website: "https://www.linkedin.com/in/erick-gort-32ab1678/"
   avatar: "https://avatars.githubusercontent.com/u/189261906?v=4"
-  
+
 kylecote-tray:
   name: "Kyle Cote"
   website: "https://github.com/kylecote-tray"
@@ -297,7 +302,7 @@ rzhao-openai:
   name: "Randy Zhao"
   website: "https://www.linkedin.com/in/randy-zhao-27433616b"
   avatar: "https://avatars.githubusercontent.com/u/208724779?v=4"
-  
+
 brandonbaker-openai:
   name: "Brandon Baker"
   website: "https://www.linkedin.com/in/brandonbaker18"

diff --git a/examples/Speech_transcription_methods.ipynb b/examples/Speech_transcription_methods.ipynb
diff --git a/examples/data/sample_audio_files/18_sec_food_story.wav b/examples/data/sample_audio_files/18_sec_food_story.wav
diff --git a/examples/data/sample_audio_files/6_sec_female_speaker.wav b/examples/data/sample_audio_files/6_sec_female_speaker.wav
diff --git a/examples/data/sample_audio_files/lotsoftimes-78085.mp3 b/examples/data/sample_audio_files/lotsoftimes-78085.mp3
diff --git a/examples/imgs/agents_sdk_transcription.png b/examples/imgs/agents_sdk_transcription.png
diff --git a/examples/imgs/realtime_api_transcription.png b/examples/imgs/realtime_api_transcription.png
diff --git a/examples/imgs/speech-to-text-not-streaming.png b/examples/imgs/speech-to-text-not-streaming.png
diff --git a/examples/imgs/speech-to-text-streaming.png b/examples/imgs/speech-to-text-streaming.png
diff --git a/examples/mermaid/agents_sdk_transcription.mmd b/examples/mermaid/agents_sdk_transcription.mmd
@@ -0,0 +1,8 @@
+```mermaid
+graph LR
+    Mic  -- "PCM frames" --> VP["VoicePipeline"]
+    VP   -- "VAD & resample" --> Buf["Sentence buffer"]
+    Buf  --> GPT["gpt-4o-transcribe"]
+    GPT  --> Agent["Agent callbacks"]
+    Agent -- "print / reply" --> App
+```
diff --git a/examples/mermaid/realtime_api_transcription.mmd b/examples/mermaid/realtime_api_transcription.mmd
@@ -0,0 +1,13 @@
+```mermaid
+sequenceDiagram
+    participant Mic
+    participant App
+    participant WS as "WebSocket"
+    participant OAI as "Realtime Server"
+
+    Mic ->> App: 20–40 ms PCM frames
+    App ->> WS: Base64-encoded chunks<br/>input_audio_buffer.append
+    WS  ->> OAI: Audio stream
+    OAI -->> WS: JSON transcription events<br/>(partial & complete)
+    WS  -->> App: Transcript updates
+```
diff --git a/examples/mermaid/speech-to-text-not-streaming.mmd b/examples/mermaid/speech-to-text-not-streaming.mmd
@@ -0,0 +1,7 @@
+```mermaid
+flowchart LR
+    AudioFile["Audio file<br/>(WAV • MP3 • FLAC)"] --> Upload["Binary upload"]
+    Upload --> API["/v1/audio/transcriptions"]
+    API --> JSONOutput["JSON transcription<br/>+ metadata"]
+    JSONOutput --> App["Your application"]
+```
diff --git a/examples/mermaid/speech-to-text-streaming.mmd b/examples/mermaid/speech-to-text-streaming.mmd
@@ -0,0 +1,9 @@
+```mermaid
+flowchart LR
+    A["Finished audio file<br/>(WAV • MP3 • FLAC • …)"]
+    B["OpenAI STT engine<br/>(gpt-4o-transcribe)"]
+    C["Your application / UI"]
+
+    A -->|HTTP POST<br/>/v1/audio/transcriptions<br/>stream=true| B
+    B -->|chunked HTTP response<br/>partial & final transcripts| C
+```
diff --git a/registry.yaml b/registry.yaml
@@ -4,6 +4,16 @@
 # should build pages for, and indicates metadata such as tags, creation date and
 # authors for each page.
 
+- title: Methods of Speech-to-Text using OpenAI API & Agents SDK
+  path: examples/Speech_transcription_methods.ipynb
+  date: 2025-04-29
+  authors:
+    - minh-hoque
+  tags:
+    - audio
+    - speech
+    - agents-sdk
+
 - title: EvalsAPI Use-case - Detecting prompt regressions
   path: examples/evaluation/use-cases/regression.ipynb
   date: 2025-04-08