This repository was archived by the owner on May 20, 2025. It is now read-only.

Commit fc0ab2c

update guide with model download and extra route
1 parent 5e8fca7 commit fc0ab2c

1 file changed: +168 −37 lines

docs/guides/python/podcast-transcription.mdx

Lines changed: 168 additions & 37 deletions
@@ -35,7 +35,7 @@ Next, let's install our base dependencies, then add the `openai-whisper` library
# Install the base dependencies
uv sync
# Add OpenAI whisper dependency
-uv add openai-whisper --optional ml
+uv add openai-whisper librosa numpy --optional ml
```

<Note>
@@ -110,19 +110,19 @@ submittable_transcribe_job = transcribe_job.allow("submit")

@main_api.get("/podcast/:name")
async def get_podcast(ctx: HttpContext):
-    name = ctx.req.params['name']
+    name = ctx.req.params['name']

-    download_url = await readable_transcript_bucket.file(f"{name}-transcript.txt").download_url()
+    download_url = await readable_transcript_bucket.file(f"{name}-transcript.txt").download_url()

-    ctx.res.headers["Location"] = download_url
-    ctx.res.status = 303
+    ctx.res.headers["Location"] = download_url
+    ctx.res.status = 303

-    return ctx
+    return ctx

Nitric.run()
```

-We will add a storage listener which will be triggered by files being added to the `podcast_bucket`.
+We will then add a route that returns an upload URL for the bucket. Uploading directly to the bucket via this URL circumvents the request size limits of the API Gateway.

```python title:services/api.py
# !collapse(1:18) collapsed
@@ -134,39 +134,126 @@ from nitric.context import HttpContext
readable_transcript_bucket = transcript_bucket.allow("read")
submittable_transcribe_job = transcribe_job.allow("submit")

+@main_api.get("/podcast/:name")
+async def get_podcast(ctx: HttpContext):
+    name = ctx.req.params['name']
+
+    download_url = await readable_transcript_bucket.file(f"{name}-transcript.txt").download_url()
+
+    ctx.res.headers["Location"] = download_url
+    ctx.res.status = 303
+
+    return ctx
+
+@main_api.get("/audio-upload-url/:name")
+async def get_audio_upload_url(ctx: HttpContext):
+    name = ctx.req.params['name']
+
+    upload_url = await writable_podcast_bucket.file(name).upload_url()
+
+    ctx.res.body = upload_url
+
+Nitric.run()
+```
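
To see why the upload URL is useful, a client can request the URL from the new route and `PUT` the audio file straight to the bucket, bypassing the gateway entirely. This is only a sketch and not part of the commit — the `requests` library, the local API address, and the file name are assumptions:

```python
import requests

API_BASE = "http://localhost:4001"   # assumption: local address when running `nitric start`
podcast_name = "my-podcast.mp3"      # hypothetical podcast file

# 1. Ask the API for a presigned upload URL for this file
upload_url = requests.get(f"{API_BASE}/audio-upload-url/{podcast_name}").text

# 2. Upload the audio directly to the bucket, avoiding API Gateway request size limits
with open(podcast_name, "rb") as f:
    resp = requests.put(upload_url, data=f)
resp.raise_for_status()
```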
+
+We will add a storage listener which will be triggered by files being added to the `podcast_bucket`.
+
+```python title:services/api.py
+# !collapse(1:26) collapsed
+from common.resources import main_api, transcript_bucket, podcast_bucket, transcribe_job
+from nitric.application import Nitric
+from nitric.resources import BucketNotificationContext
+from nitric.context import HttpContext
+
+readable_transcript_bucket = transcript_bucket.allow("read")
+submittable_transcribe_job = transcribe_job.allow("submit")
+
@main_api.get("/transcript/:name")
async def get_podcast(ctx: HttpContext):
-    name = ctx.req.params['name']
+    name = ctx.req.params['name']

-    download_url = await readable_transcript_bucket.file(f"{name}-transcript.txt").download_url()
+    download_url = await readable_transcript_bucket.file(f"{name}-transcript.txt").download_url()

-    ctx.res.headers["Location"] = download_url
-    ctx.res.status = 303
+    ctx.res.headers["Location"] = download_url
+    ctx.res.status = 303

-    return ctx
+    return ctx
+
+@main_api.get("/podcast/:name")
+async def get_podcast(ctx: HttpContext):
+    name = ctx.req.params['name']
+
+    download_url = await readable_transcript_bucket.file(f"{name}-transcript.txt").download_url()
+
+    ctx.res.headers["Location"] = download_url
+    ctx.res.status = 303
+
+    return ctx

@podcast_bucket.on("write", "*")
async def on_add_podcast(ctx: BucketNotificationContext):
-    await submittable_transcribe_job.submit({ "podcast_name": ctx.req.key })
+    await submittable_transcribe_job.submit({ "podcast_name": ctx.req.key })

-    return ctx
+    return ctx

Nitric.run()
```

+## Downloading our model
+
+We can download our model and embed it into our container to reduce the startup time of our transcription. We'll create a script which can be triggered using `uv run download_model.py --model_name turbo`.
+
+```python title:download_model.py
+from whisper import _MODELS, _download
+import argparse
+import os
+
+default = os.path.join(os.path.expanduser("~"), ".cache")
+download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper")
+
+def download_whisper_model(model_name="base"):
+    print("downloading model...")
+    # download the checkpoint via whisper's default cache directory, keeping the bytes in memory
+    model = _download(_MODELS[model_name], root=download_root, in_memory=True)
+
+    # make sure the ./.model directory exists
+    os.makedirs("./.model", exist_ok=True)
+
+    # write the model to disk
+    save_path = "./.model/model.pt"
+    with open(save_path, "wb") as f:
+        f.write(model)
+
+    print(f"Model '{model_name}' has been downloaded and saved to './.model/model.pt'.")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download a Whisper model.")
+    parser.add_argument("--model_name", type=str, default="base", help="Name of the model to download.")
+
+    args = parser.parse_args()
+
+    download_whisper_model(model_name=args.model_name)
+```
+
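As a quick sanity check before baking the checkpoint into the image, `whisper.load_model` also accepts a path to a `.pt` file, so the saved model can be loaded locally. A minimal sketch, assuming the script above has written `./.model/model.pt`:

```python
import whisper

# Load the local checkpoint instead of downloading a model by name
model = whisper.load_model("./.model/model.pt")
print(f"Model loaded on device: {model.device}")
```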
## Add Transcribe Batch Job

-Start by adding our imports and adding permissions to the resources we defined earlier.
+Start by adding our imports and adding permissions to the resources we defined earlier. We'll also read the model's location from an environment variable, defaulting to `./.model/model.pt`.

```python title:batches/transcribe.py
import whisper
+import io
+import numpy as np
+import os
+import librosa
from common.resources import transcribe_job, transcript_bucket, podcast_bucket
from nitric.context import JobContext
from nitric.application import Nitric

writeable_transcript_bucket = transcript_bucket.allow("write")
readable_podcast_bucket = podcast_bucket.allow("read")

+MODEL = os.environ.get("MODEL", "./.model/model.pt")
+
Nitric.run()
```

@@ -181,74 +268,96 @@ We'll then create our Job and set the required memory to `12000`. This is a safe
| large | 1550 M | N/A | large | `~10 GB` | `1x` |

```python title:batches/transcribe.py
-# !collapse(1:7) collapsed
+# !collapse(1:13) collapsed
import whisper
+import io
+import numpy as np
+import os
+import librosa
from common.resources import transcribe_job, transcript_bucket, podcast_bucket
from nitric.context import JobContext
from nitric.application import Nitric

writeable_transcript_bucket = transcript_bucket.allow("write")
readable_podcast_bucket = podcast_bucket.allow("read")

+MODEL = os.environ.get("MODEL", "./.model/model.pt")
+
@transcribe_job(cpus=1, memory=12000, gpus=0)
async def transcribe_podcast(ctx: JobContext):
    return ctx

Nitric.run()
```

-We'll then read the audio file that is referenced in the `JobContext` data that was sent with the submit request. We'll write the podcast to a local file so that the model can read from it.
+We'll then read the audio file that is referenced in the `JobContext` data that was sent with the submit request. We'll load these bytes as a floating-point time series using `librosa` so that they can be converted to a `numpy` array for use by `whisper`.

```python title:batches/transcribe.py
-# !collapse(1:7) collapsed
+# !collapse(1:13) collapsed
import whisper
+import io
+import numpy as np
+import os
+import librosa
from common.resources import transcribe_job, transcript_bucket, podcast_bucket
from nitric.context import JobContext
from nitric.application import Nitric

writeable_transcript_bucket = transcript_bucket.allow("write")
readable_podcast_bucket = podcast_bucket.allow("read")

+MODEL = os.environ.get("MODEL", "./.model/model.pt")
+
@transcribe_job(cpus=1, memory=12000, gpus=0)
async def transcribe_podcast(ctx: JobContext):
    podcast_name = ctx.req.data["podcast_name"]
    print(f"Transcribing: {podcast_name}")

    podcast = await readable_podcast_bucket.file(podcast_name).read()

-    with open("local-podcast", "wb") as f:
-        f.write(podcast)
+    podcast_io = io.BytesIO(podcast)
+
+    y, sr = librosa.load(podcast_io)
+    audio_array = np.array(y)

    return ctx

Nitric.run()
```
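
One detail worth flagging here: `librosa.load` resamples audio to 22,050 Hz by default, while Whisper expects 16 kHz samples when it is handed a raw array. Passing an explicit sample rate keeps the two in agreement — a small variation on the snippet above, not part of the original commit:

```python
import io

import librosa
import numpy as np

# Whisper's feature extraction assumes 16 kHz input
WHISPER_SAMPLE_RATE = 16000

# `podcast` holds the bytes read from the podcast bucket above
podcast_io = io.BytesIO(podcast)

# Decode and resample to 16 kHz mono in one step
y, sr = librosa.load(podcast_io, sr=WHISPER_SAMPLE_RATE)
audio_array = np.array(y, dtype=np.float32)
```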

-We'll then load our model and transcribe the audio. This is where we can choose the model based on balancing speed, size, and accuracy. We can turn off `FP16` with `fp16=False` which will use `FP32` instead. This will depend on what is supported on your CPU when testing locally, however, `FP16` and `FP32` are supported on Lambda.
+We'll then load our model and transcribe the audio. We can turn off `FP16` with `fp16=False`, which will use `FP32` instead. Which of these you need will depend on what your CPU supports when testing locally; both `FP16` and `FP32` are supported on Lambda.

```python title:batches/transcribe.py
-# !collapse(1:7) collapsed
+# !collapse(1:13) collapsed
import whisper
+import io
+import numpy as np
+import os
+import librosa
from common.resources import transcribe_job, transcript_bucket, podcast_bucket
from nitric.context import JobContext
from nitric.application import Nitric

writeable_transcript_bucket = transcript_bucket.allow("write")
readable_podcast_bucket = podcast_bucket.allow("read")

+MODEL = os.environ.get("MODEL", "./.model/model.pt")
+
@transcribe_job(cpus=1, memory=12000, gpus=0)
-# !collapse(1:9) collapsed
+# !collapse(1:10) collapsed
async def transcribe_podcast(ctx: JobContext):
    podcast_name = ctx.req.data["podcast_name"]
    print(f"Transcribing: {podcast_name}")

    podcast = await readable_podcast_bucket.file(podcast_name).read()

-    with open("local-podcast", "wb") as f:
-        f.write(podcast)
+    podcast_io = io.BytesIO(podcast)
+
+    y, sr = librosa.load(podcast_io)
+    audio_array = np.array(y)

-    model = whisper.load_model("turbo")
-    result = model.transcribe("local-podcast", verbose=True, fp16=False)
+    model = whisper.load_model(MODEL)
+    result = model.transcribe(audio_array, verbose=True, fp16=False)

    return ctx

@@ -258,35 +367,44 @@ Nitric.run()
Finally, we'll take the outputted transcript and write that to the transcript bucket. This transcript is stored in `result["text"]`.

```python title:batches/transcribe.py
-# !collapse(1:7) collapsed
+# !collapse(1:13) collapsed
import whisper
+import io
+import numpy as np
+import os
+import librosa
from common.resources import transcribe_job, transcript_bucket, podcast_bucket
from nitric.context import JobContext
from nitric.application import Nitric

writeable_transcript_bucket = transcript_bucket.allow("write")
readable_podcast_bucket = podcast_bucket.allow("read")

+MODEL = os.environ.get("MODEL", "./.model/model.pt")
+
@transcribe_job(cpus=1, memory=12000, gpus=0)
-# !collapse(1:12) collapsed
+# !collapse(1:13) collapsed
async def transcribe_podcast(ctx: JobContext):
    podcast_name = ctx.req.data["podcast_name"]
    print(f"Transcribing: {podcast_name}")

    podcast = await readable_podcast_bucket.file(podcast_name).read()

-    with open("local-podcast", "wb") as f:
-        f.write(podcast)
+    podcast_io = io.BytesIO(podcast)

-    model = whisper.load_model("turbo")
-    result = model.transcribe("local-podcast", verbose=True, fp16=False)
+    y, sr = librosa.load(podcast_io)
+    audio_array = np.array(y)

-    transcript = result["text"].encode()
+    model = whisper.load_model(MODEL)
+    result = model.transcribe(audio_array, verbose=True, fp16=False)

-    print("Finished transcoding... Writing to Bucket")
-    await writeable_transcript_bucket.file(f"{podcast_name}-transcript.txt").write(transcript)
+    transcript = result["text"].encode()

-    return ctx
+    print("Finished transcribing... Writing to Bucket")
+    await writeable_transcript_bucket.file(f"{podcast_name}-transcript.txt").write(transcript)
+    print("Done!")
+
+    return ctx

Nitric.run()
```
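
Once the job finishes and the transcript lands in the bucket, the `/transcript/:name` route added earlier responds with a `303` redirect to a presigned download URL, so any HTTP client that follows redirects can fetch the text directly. A rough sketch, where the base URL and podcast name are placeholders rather than part of the guide:

```python
import requests

API_BASE = "http://localhost:4001"  # assumption: local address when running `nitric start`
podcast_name = "my-podcast.mp3"     # hypothetical podcast name

# GET follows the 303 redirect to the presigned bucket URL by default
resp = requests.get(f"{API_BASE}/transcript/{podcast_name}")
resp.raise_for_status()

print(resp.text)  # the transcript contents
```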
@@ -403,6 +521,19 @@ We'll add a `dockerignore` to help reduce the size of the Docker Image that is b
.mypy_cache/
.nitric/
.venv/
+nitric-spec.json
+nitric.yaml
+README.md
+```
+
+And add `.model/` to the python dockerignore.
+
+```text title:python.dockerignore
+.mypy_cache/
+.nitric/
+.venv/
+.model/
+nitric-spec.json
nitric.yaml
README.md
```

0 commit comments