In this guide we'll build the first of many parts of a fully autonomous AI podcast.

This part will focus on generating long form audio content from text using a [Nitric Batch service](/batch).

By the end of this guide we'll have a project that can produce audio content from text input.

Here is a sample of what we'll be able to produce:

<audio controls>
  <source src="/docs/audio/dead-internet-podcast.m4a" type="audio/x-m4a" />
</audio>

## Prerequisites

- _(optional)_ Your choice of an [AWS](https://aws.amazon.com) or [GCP](https://cloud.google.com) account

## Getting started

We'll start by creating a new project for our AI podcast.

```bash
nitric new ai-podcast py-starter
cd ai-podcast
```

Next we'll install our base dependencies:

```bash
pipenv install --dev
```

Then we'll install the dependencies we need for this project:

```bash
pipenv install --categories="ml" torch transformers scipy
```
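
The `--categories="ml"` flag keeps these heavier ML dependencies in their own section of the `Pipfile`, separate from the base dependencies. The result should contain a section along these lines (illustrative; versions omitted):

```toml
[ml]
torch = "*"
transformers = "*"
scipy = "*"
```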

<Note>
  We'll be using the `transformers` library from Hugging Face to generate the
  audio content. Specifically we'll be using the `suno/bark` model for this
  project.
</Note>
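
If you'd like to verify the model runs on your machine before wiring it into the project, a quick standalone check might look like the following sketch. It uses the same `transformers` calls we'll use later; the input text and output filename are arbitrary:

```python
from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile

# Load the Bark model and its processor from Hugging Face
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")

# Generate a short voice line and write it out as a WAV file
inputs = processor("Hello from my AI podcast.", voice_preset="v2/en_speaker_6")
audio = model.generate(**inputs, pad_token_id=0)
scipy.io.wavfile.write(
    "sample.wav",
    rate=model.generation_config.sample_rate,
    data=audio.cpu().numpy().squeeze(),
)
```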

## Designing our project

We'll start off by creating a new module that will help us manage our cloud resources for this project.

We'll create this as `common/resources.py` in our project.

```python
from nitric.resources import api, bucket, job

# Our main API for invoking our project
main_api = api("main")

# A job for generating our audio content
gen_audio_job = job("audio")
# A job for managing our model downloads
download_audio_model_job = job("download-audio-model")

# A bucket for storing our audio clips
clips_bucket = bucket("clips")
# And another bucket for storing our models
models_bucket = bucket("models")
```

## Creating our first batch job

Next we'll create the beginnings of our audio generation job, in a new file called `batches/podcast.py`.

```python
from common.resources import gen_audio_job, clips_bucket
from nitric.context import JobContext
from nitric.application import Nitric
from transformers import AutoProcessor, BarkModel

import scipy.io.wavfile
import io
import torch
import numpy as np
import requests

# Give this job permission to write to the clips bucket
clips = clips_bucket.allow("write")

@gen_audio_job(cpus=4, memory=12000, gpus=1)
async def do_generate_audio(ctx: JobContext):
    file = ctx.req.data["file"]
    voice_preset = ctx.req.data["preset"]
    text: str = ctx.req.data["text"]

    print("Loading model")
    model = BarkModel.from_pretrained("suno/bark")
    processor = AutoProcessor.from_pretrained("suno/bark")
    print("Model loaded")

    if torch.cuda.is_available():
        model.to("cuda")
    else:
        print("CUDA unavailable, defaulting to CPU. This may take a while.")

    # Split the text by sentences and chain the audio clips together
    # We do this because the model can only reliably generate a certain amount of audio at a time
    sentences = text.split(".")
    sentences = [sentence for sentence in sentences if sentence.strip() != ""]

    audio_arrays = []
    # for each sentence, generate the audio clip
    for index, sentence in enumerate(sentences):
        # Insert pauses between sentences to prevent clips from running together
        inputs = processor(f"{sentence}...", voice_preset=voice_preset)

        if torch.cuda.is_available():
            inputs = inputs.to("cuda")

        print(f"Generating clip {index + 1}/{len(sentences)}")
        audio_array = model.generate(**inputs, pad_token_id=0)
        audio_array = audio_array.cpu().numpy().squeeze()

        audio_arrays.append(audio_array)

    final_array = np.concatenate(audio_arrays)

    buffer = io.BytesIO()
    print("Encoding clip")
    sample_rate = model.generation_config.sample_rate
    scipy.io.wavfile.write(buffer, rate=sample_rate, data=final_array)

    print("Uploading clip")
    upload_url = await clips.file(f"{file}.wav").upload_url()

    # make a put request to the upload url
    requests.put(upload_url, data=buffer.getvalue(), headers={"Content-Type": "audio/wav"}, timeout=600)

    print("Done!")

Nitric.run()
```

## Creating our API

First we'll remove our starter API and replace it with our own.

```bash
rm services/hello.py
touch services/api.py
```

Then we'll create an API endpoint in `services/api.py` that will allow us to call the job we defined in the first step. We'll also accept an optional `model` query parameter, which the job will use as a `model_id` when we add model caching later in this guide.

```python
from common.resources import main_api, gen_audio_job
from nitric.application import Nitric
from nitric.context import HttpContext

# Give this service permission to submit the gen_audio_job
gen_audio = gen_audio_job.allow("submit")

default_voice_preset = "v2/en_speaker_6"
default_model_id = "suno/bark"

# Generate a sample voice line
@main_api.post("/audio/:filename")
async def submit_audio(ctx: HttpContext):
    name = ctx.req.params["filename"]
    preset = ctx.req.query.get("preset", default_voice_preset)
    model_id = ctx.req.query.get("model", default_model_id)

    if isinstance(model_id, list):
        model_id = model_id[0]

    if isinstance(preset, list):
        preset = preset[0]

    body = ctx.req.data
    if body is None:
        ctx.res.status = 400
        return

    print(f"using preset {preset}")

    await gen_audio.submit({"file": name, "text": body.decode("utf-8"), "preset": preset, "model_id": model_id})

Nitric.run()
```

## Updating the nitric.yaml

Finally we'll update our `nitric.yaml` to include the batch service we created and add the preview flag for batch.

```yaml
name: ai-podcast
services:
  - match: services/*.py
    start: pipenv run dev $SERVICE_PATH
batch-services:
  - match: batches/*.py
    start: pipenv run dev $SERVICE_PATH

preview:
  - batch-services
```

## Running our project

We can start our project by running:

```bash
nitric start
```

Once it's up and running we can test out our API, for example with curl (the first API is typically served on port 4001 locally; check the output of `nitric start` for the exact URL):

```bash
curl -X POST "http://localhost:4001/audio/first-clip" -d "Welcome to my AI podcast."
```

Or you can use your favorite API client to test it out.

<Note>
  If you're running without a GPU it can take some time for the audio content
  to generate.
</Note>

Once the generation is complete you should have something like this:

<audio controls></audio>

Feel free to play around with it a bit more before continuing on. It can be fun to experiment with different text inputs and see what the model generates.
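
You can also try different voices via the `preset` query parameter our API accepts. For example (assuming the same local port as above; `v2/en_speaker_0` through `v2/en_speaker_9` are built-in Bark voice presets):

```bash
curl -X POST "http://localhost:4001/audio/another-clip?preset=v2/en_speaker_3" -d "Trying out a different voice."
```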

## Preparing to deploy to the cloud

Before we can deploy our project to the cloud we need to make a few changes to our project.

First, we want to be able to cache models between runs, without having to pull them from the Hugging Face Hub each time. This is what we added the models bucket and download job for.

Let's update our `batches/podcast.py` to include the download job.

```python
from common.resources import gen_audio_job, clips_bucket, models_bucket, download_audio_model_job
from nitric.context import JobContext
from nitric.application import Nitric
from transformers import AutoProcessor, BarkModel

import scipy.io.wavfile
import io
import torch
import numpy as np
import requests
import zipfile
import os

clips = clips_bucket.allow("write")
models = models_bucket.allow("read", "write")

model_dir = "./.model"

# Download the model and save it to a nitric bucket
@download_audio_model_job(cpus=4, memory=12000)
async def do_download_audio_model(ctx: JobContext):
    model_id = ctx.req.data["model_id"]

    print("Downloading models - this may take several minutes")
    processor = AutoProcessor.from_pretrained(model_id)
    model = BarkModel.from_pretrained(model_id)

    processor.save_pretrained(f"{model_dir}/processor")
    model.save_pretrained(f"{model_dir}/audio")

    print("Compressing models")
    zip_path = "model.zip"

    # zip the model directory (ZIP_STORED archives without compression)
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_STORED) as zip_file:
        for root, dirs, files in os.walk(model_dir):
            for file in files:
                file_path = os.path.join(root, file)
                archive_name = os.path.relpath(file_path, start=model_dir)
                print(f"Adding {file_path} to zip as {archive_name}")
                zip_file.write(file_path, archive_name)

    print("Storing models in bucket")
    # push the archive to the models bucket
    model_url = await models.file(f"{model_id}.zip").upload_url()
    with open(zip_path, "rb") as f:
        requests.put(model_url, data=f, timeout=6000)
    print("Done!")
```

We'll also update our audio generation job to download the model from the bucket before processing the audio.

```python
@gen_audio_job(cpus=4, memory=12000, gpus=1)
async def do_generate_audio(ctx: JobContext):
    file = ctx.req.data["file"]
    voice_preset = ctx.req.data["preset"]
    text: str = ctx.req.data["text"]
    model_id = ctx.req.data["model_id"]

    # Copy the model from the nitric bucket to local storage
    if not os.path.exists(model_dir):
        print("Downloading model")
        download_url = await models.file(f"{model_id}.zip").download_url()
        response = requests.get(download_url, allow_redirects=True, timeout=600)
        # save the zip file
        with open("model.zip", "wb") as f:
            f.write(response.content)
        print("Unzipping model")
        with zipfile.ZipFile("model.zip", "r") as zip_ref:
            zip_ref.extractall(model_dir)

        # cleanup zip file
        print("Cleaning up")
        os.remove("model.zip")

    print("Loading model")
    model = BarkModel.from_pretrained(f"{model_dir}/audio")
    processor = AutoProcessor.from_pretrained(f"{model_dir}/processor")
    print("Model loaded")

    print(f"Using voice preset {voice_preset}")

    if torch.cuda.is_available():
        model.to("cuda")
    else:
        print("CUDA unavailable, defaulting to CPU. This may take a while.")

    # Split the text by sentences and chain the audio clips together
    sentences = text.split(".")
    sentences = [sentence for sentence in sentences if sentence.strip() != ""]

    audio_arrays = []
    # for each sentence, generate the audio clip
    for index, sentence in enumerate(sentences):
        # Insert pauses between sentences to prevent clips from running together
        inputs = processor(f"{sentence}...", voice_preset=voice_preset)

        if torch.cuda.is_available():
            inputs = inputs.to("cuda")

        print(f"Generating clip {index + 1}/{len(sentences)}")
        audio_array = model.generate(**inputs, pad_token_id=0)
        audio_array = audio_array.cpu().numpy().squeeze()

        audio_arrays.append(audio_array)

    final_array = np.concatenate(audio_arrays)

    buffer = io.BytesIO()
    print("Encoding clip")
    sample_rate = model.generation_config.sample_rate
    scipy.io.wavfile.write(buffer, rate=sample_rate, data=final_array)

    print("Uploading clip")
    upload_url = await clips.file(f"{file}.wav").upload_url()

    # upload the encoded clip to the bucket with a put request
    requests.put(upload_url, data=buffer.getvalue(), headers={"Content-Type": "audio/wav"}, timeout=600)

    print("Done!")

Nitric.run()
```

Then we can add an API endpoint to trigger the download job. This will allow us to prefetch models before we need them.

<Note>
  If you like, the download/cache step can also be rolled into the audio
  generation job.
</Note>
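
A minimal sketch of such an endpoint, added to `services/api.py` and assuming the resources from `common/resources.py` (the route and the `model` query parameter name here are illustrative):

```python
from common.resources import main_api, download_audio_model_job
from nitric.context import HttpContext

# Give this service permission to submit the download job
download_audio_model = download_audio_model_job.allow("submit")

default_model_id = "suno/bark"

# Prefetch a model into the models bucket
@main_api.post("/download-model")
async def download_model(ctx: HttpContext):
    model_id = ctx.req.query.get("model", default_model_id)

    if isinstance(model_id, list):
        model_id = model_id[0]

    await download_audio_model.submit({"model_id": model_id})
```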