Skip to content

Commit 1743dce

Browse files
authored
adding files for xtts streaming (#241)
1 parent 2001aee commit 1743dce

File tree

5 files changed

+227
-0
lines changed

5 files changed

+227
-0
lines changed

xtts-streaming/README.md

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# XTTS Streaming
2+
3+
This repository packages [TTS](https://github.com/coqui-ai/TTS) as a [Truss](https://truss.baseten.co/) with streaming audio output.
4+
5+
TTS is a generative audio model for text-to-speech generation. This model takes in text and a speaker's voice as input and converts the text to speech in the voice of the speaker.
6+
7+
## Deploying XTTS
8+
9+
First, clone this repository:
10+
11+
```sh
12+
git clone https://github.com/basetenlabs/truss-examples/
13+
cd truss-examples/xtts-streaming
14+
```
15+
16+
Before deployment:
17+
18+
1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
19+
2. Install the latest version of Truss: `pip install --upgrade truss`
20+
21+
With `xtts-streaming` as your working directory, you can deploy the model with:
22+
23+
```sh
24+
truss push
25+
```
26+
27+
Paste your Baseten API key if prompted.
28+
29+
For more information, see [Truss documentation](https://truss.baseten.co).
30+
31+
## Invoking the model
32+
33+
The model accepts the following inputs:
34+
1. `text`: The text that needs to be converted into speech
35+
2. `language`: Language for the text (defaults to `en`)
36+
3. `chunk_size`: Integer size of each chunk being streamed (defaults to `150`)
37+
38+
Here are two examples of streaming the audio. The first example writes all of the streamed chunks to an audio file.
39+
40+
```python
41+
import wave

import requests

# WAV parameters for the raw PCM stream this example expects from the model.
channels = 1  # mono=1, stereo=2
sampwidth = 2  # Sample width in bytes, typical values: 2 for 16-bit audio, 1 for 8-bit audio
framerate = 24000  # Sampling rate, in samples per second (Hz)

resp = requests.post(
    "https://model-<model-id>.api.baseten.co/development/predict",
    headers={"Authorization": "Api-Key BASETEN-API-KEY"},
    json={"text": "Kurt watched the incoming Pelicans. The blocky jet-powered craft were so distant they were only specks against the setting sun. He hit the magnification on his faceplate and saw lines of fire tracing their reentry vectors. They would touch down in three minutes."},
    stream=True,
)
# Fail fast on an HTTP error so an error body is never written out as audio.
resp.raise_for_status()

with wave.open("dat2-wav.wav", "wb") as wav_file:
    wav_file.setnchannels(channels)
    wav_file.setsampwidth(sampwidth)
    wav_file.setframerate(framerate)

    # Iterate through streamed content and write audio chunks directly
    for chunk in resp.iter_content(chunk_size=None):  # Use server's chunk size
        if chunk:
            wav_file.writeframes(chunk)
65+
```
66+
67+
If you want to play the audio directly as it is generated, here is another option:
68+
69+
```python
70+
import pyaudio
import requests

FORMAT = pyaudio.paInt16  # Audio format (16-bit PCM)
CHANNELS = 1  # Number of audio channels
RATE = 24000  # Sample rate

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open a stream for audio playback
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, output=True)

original_text = "Kurt watched the incoming Pelicans. The blocky jet-powered craft were so distant they were only specks against the setting sun. He hit the magnification on his faceplate and saw lines of fire tracing their reentry vectors. They would touch down in three minutes."

# Number of bytes to accumulate before handing audio to the sound card.
# At 16-bit mono 24 kHz (48,000 bytes/s) 2**20 bytes is roughly 22 seconds of
# audio; lower this value to start playback sooner.
buffer_size_threshold = 2**20

try:
    # Make a streaming HTTP request to the server
    resp = requests.post(
        "https://model-<model-id>.api.baseten.co/development/predict",
        headers={"Authorization": "Api-Key BASETEN-API-KEY"},
        json={"text": original_text},
        stream=True,
    )
    # Fail fast on an HTTP error so an error body is never played as audio.
    resp.raise_for_status()

    # Create a buffer to hold multiple chunks
    buffer = bytearray()

    # Stream and play the audio data as it's received
    for chunk in resp.iter_content(chunk_size=4096):
        if chunk:
            buffer.extend(chunk)
            if len(buffer) >= buffer_size_threshold:
                print(f"Writing buffer of size: {len(buffer)}")
                stream.write(bytes(buffer))
                buffer.clear()  # Clear the buffer

    # Play whatever is left once the response has been fully consumed
    if buffer:
        print(f"Writing final buffer of size: {len(buffer)}")
        stream.write(bytes(buffer))
finally:
    # Close and terminate the stream and PyAudio, even if streaming failed
    stream.stop_stream()
    stream.close()
    p.terminate()
115+
```

xtts-streaming/config.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
environment_variables:
2+
COQUI_TOS_AGREED: '1'
3+
external_package_dirs: []
4+
model_metadata: {}
5+
model_name: XTTS Streaming
6+
python_version: py310
7+
requirements_file: ./requirements.txt
8+
resources:
9+
accelerator: T4
10+
cpu: '3'
11+
memory: 10Gi
12+
use_gpu: true
13+
secrets: {}
14+
system_packages: []

xtts-streaming/model/__init__.py

Whitespace-only changes.

xtts-streaming/model/model.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import base64
2+
import io
3+
import logging
4+
import os
5+
import wave
6+
7+
import numpy as np
8+
import torch
9+
from TTS.tts.configs.xtts_config import XttsConfig
10+
from TTS.tts.models.xtts import Xtts
11+
from TTS.utils.generic_utils import get_user_data_dir
12+
from TTS.utils.manage import ModelManager
13+
14+
# This is one of the speaker voices that comes with xtts
15+
SPEAKER_NAME = "Claribel Dervla"
16+
17+
18+
class Model:
    """Truss model that streams XTTS v2 speech as raw 16-bit PCM bytes.

    ``load`` downloads and initialises the XTTS v2 checkpoint and caches the
    conditioning latents of the built-in ``SPEAKER_NAME`` voice; ``predict``
    is a generator that yields one ``bytes`` object per synthesised chunk.
    """

    def __init__(self, **kwargs):
        # Both attributes are populated by load().
        self.model = None
        self.speaker = None

    def load(self):
        """Download the XTTS v2 checkpoint, load it and cache the speaker latents."""
        device = "cuda"
        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
        logging.info("⏳Downloading model")
        ModelManager().download_model(model_name)
        # The downloaded model lives in the TTS user data dir, with "/" in
        # the model name replaced by "--".
        model_path = os.path.join(
            get_user_data_dir("tts"), model_name.replace("/", "--")
        )

        config = XttsConfig()
        config.load_json(os.path.join(model_path, "config.json"))
        self.model = Xtts.init_from_config(config)
        self.model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
        self.model.to(device)

        # Keep the speaker conditioning as plain nested lists (half precision
        # values, moved to CPU, singleton dimensions squeezed out).
        speaker = self.model.speaker_manager.speakers[SPEAKER_NAME]
        self.speaker = {
            key: speaker[key].cpu().squeeze().half().tolist()
            for key in ("speaker_embedding", "gpt_cond_latent")
        }
        logging.info("🔥Model Loaded")

    def wav_postprocess(self, wav):
        """Convert a float waveform to 16-bit integer samples.

        Args:
            wav: A tensor, or a list of tensors that is concatenated along
                dimension 0 first.

        Returns:
            A NumPy ``int16`` array: samples are clipped to ``[-1, 1]`` and
            scaled by 32767.
        """
        if isinstance(wav, list):
            wav = torch.cat(wav, dim=0)
        samples = wav.clone().detach().cpu().numpy()
        samples = np.clip(samples, -1, 1)
        return (samples * 32767).astype(np.int16)

    def predict(self, model_input):
        """Stream synthesised speech for ``model_input["text"]``.

        Args:
            model_input: Mapping with the keys
                ``text`` (required string to synthesise),
                ``language`` (optional, defaults to ``"en"``) and
                ``chunk_size`` (optional, defaults to ``150``; converted with
                ``int`` and passed on as ``stream_chunk_size``).

        Yields:
            ``bytes`` holding the ``int16`` samples of each generated chunk.

        Raises:
            ValueError: If ``text`` is missing or is not a string. Because
                this is a generator, the error surfaces when the stream is
                first iterated.
        """
        text = model_input.get("text")
        if not isinstance(text, str):
            raise ValueError("`text` is required and must be a string")
        language = model_input.get("language", "en")
        chunk_size = int(
            model_input.get("chunk_size", 150)
        )  # Ensure chunk_size is an integer

        # Rebuild tensors from the cached lists: the embedding gets a leading
        # and a trailing singleton dimension, the latent becomes (1, N, 1024).
        speaker_embedding = (
            torch.tensor(self.speaker.get("speaker_embedding"))
            .unsqueeze(0)
            .unsqueeze(-1)
        )
        gpt_cond_latent = (
            torch.tensor(self.speaker.get("gpt_cond_latent"))
            .reshape((-1, 1024))
            .unsqueeze(0)
        )

        streamer = self.model.inference_stream(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            stream_chunk_size=chunk_size,
            enable_text_splitting=True,
        )

        for chunk in streamer:
            yield self.wav_postprocess(chunk).tobytes()

xtts-streaming/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62

0 commit comments

Comments
 (0)