adding files for xtts streaming (#241)

htrivedi99 · web-flow · commit 1743dce6a4a5 · 2024-03-19T14:15:38.000-07:00
diff --git a/xtts-streaming/README.md b/xtts-streaming/README.md
@@ -0,0 +1,115 @@
+# XTTS Streaming
+
+This repository packages [TTS](https://github.com/coqui-ai/TTS) as a [Truss](https://truss.baseten.co/) but with streaming.
+
+TTS is a generative audio model for text-to-speech generation. This model takes in text and a speaker's voice as input and converts the text to speech in the voice of the speaker.
+
+## Deploying XTTS
+
+First, clone this repository:
+
+```sh
+git clone https://github.com/basetenlabs/truss-examples/
+cd truss-examples/xtts-streaming
+```
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+
+With `xtts-streaming` as your working directory, you can deploy the model with:
+
+```sh
+truss push
+```
+
+Paste your Baseten API key if prompted.
+
+For more information, see [Truss documentation](https://truss.baseten.co).
+
+## Invoking the model
+
+The model accepts the following inputs:
+1. `text`: The text to convert into speech
+2. `language`: Language of the text (defaults to `"en"`)
+3. `chunk_size`: Integer size of each chunk being streamed (defaults to `150`)
+
+Here are two examples of streaming the audio. The first example writes all of the streamed chunks to an audio file.
+
+```python
+import wave
+
+import requests
+
+# The server streams raw 16-bit PCM samples with no WAV header, so the audio
+# format has to be declared on the client side before writing frames.
+channels = 1  # mono=1, stereo=2
+sampwidth = 2  # Sample width in bytes, typical values: 2 for 16-bit audio, 1 for 8-bit audio
+framerate = 24000  # Sampling rate, in samples per second (Hz)
+
+
+resp = requests.post(
+    "https://model-<model-id>.api.baseten.co/development/predict",
+    headers={"Authorization": "Api-Key BASETEN-API-KEY"},
+    json={"text": "Kurt watched the incoming Pelicans. The blocky jet-powered craft were so distant they were only specks against the setting sun. He hit the magnification on his faceplate and saw lines of fire tracing their reentry vectors. They would touch down in three minutes."},
+    stream=True,
+)
+# Fail fast on an HTTP error instead of writing the error body into the WAV file.
+resp.raise_for_status()
+
+with wave.open("dat2-wav.wav", 'wb') as wav_file:
+    wav_file.setnchannels(channels)
+    wav_file.setsampwidth(sampwidth)
+    wav_file.setframerate(framerate)
+
+    # Iterate through streamed content and write audio chunks directly
+    for chunk in resp.iter_content(chunk_size=None):  # Use server's chunk size
+        if chunk:
+            wav_file.writeframes(chunk)
+```
+
+If you want to play the audio directly as it is generated, here is another option:
+
+```python
+import pyaudio
+import requests
+
+FORMAT = pyaudio.paInt16  # Audio format (16-bit PCM)
+CHANNELS = 1              # Number of audio channels
+RATE = 24000              # Sample rate
+SAMPLE_WIDTH = 2          # Bytes per sample for 16-bit audio
+
+# Initialize PyAudio
+p = pyaudio.PyAudio()
+
+# Open a stream for audio playback
+stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, output=True)
+
+original_text = "Kurt watched the incoming Pelicans. The blocky jet-powered craft were so distant they were only specks against the setting sun. He hit the magnification on his faceplate and saw lines of fire tracing their reentry vectors. They would touch down in three minutes."
+
+# Play back roughly half a second of audio at a time: enough to smooth over
+# small network hiccups without holding playback until most of the clip has
+# already arrived (a 1 MiB threshold is ~21 s of audio at this format).
+buffer_size_threshold = RATE * CHANNELS * SAMPLE_WIDTH // 2
+
+try:
+    # Make a streaming HTTP request to the server
+    resp = requests.post(
+        "https://model-<model-id>.api.baseten.co/development/predict",
+        headers={"Authorization": "Api-Key BASETEN-API-KEY"},
+        json={"text": original_text},
+        stream=True,
+    )
+    # Fail fast on an HTTP error instead of playing the error body as audio.
+    resp.raise_for_status()
+
+    # Create a buffer to hold multiple chunks
+    buffer = bytearray()
+
+    # Stream and play the audio data as it's received
+    for chunk in resp.iter_content(chunk_size=4096):
+        if chunk:
+            buffer += chunk
+            if len(buffer) >= buffer_size_threshold:
+                print(f"Writing buffer of size: {len(buffer)}")
+                stream.write(bytes(buffer))
+                buffer.clear()  # Clear the buffer
+
+    if buffer:
+        print(f"Writing final buffer of size: {len(buffer)}")
+        stream.write(bytes(buffer))
+finally:
+    # Close and terminate the stream and PyAudio, even if the request fails
+    stream.stop_stream()
+    stream.close()
+    p.terminate()
+```
diff --git a/xtts-streaming/config.yaml b/xtts-streaming/config.yaml
@@ -0,0 +1,14 @@
+environment_variables:
+  COQUI_TOS_AGREED: '1'
+external_package_dirs: []
+model_metadata: {}
+model_name: XTTS Streaming
+python_version: py310
+requirements_file: ./requirements.txt
+resources:
+  accelerator: T4
+  cpu: '3'
+  memory: 10Gi
+  use_gpu: true
+secrets: {}
+system_packages: []
diff --git a/xtts-streaming/model/__init__.py b/xtts-streaming/model/__init__.py
diff --git a/xtts-streaming/model/model.py b/xtts-streaming/model/model.py
@@ -0,0 +1,97 @@
+import base64
+import io
+import logging
+import os
+import wave
+
+import numpy as np
+import torch
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from TTS.utils.generic_utils import get_user_data_dir
+from TTS.utils.manage import ModelManager
+
+# One of the speaker voices that come bundled with XTTS; it is used as the
+# key into the model's speaker manager when the model is loaded.
+SPEAKER_NAME = "Claribel Dervla"
+
+
+class Model:
+    """Truss model that synthesizes speech with XTTS v2 and streams raw PCM.
+
+    Every request is voiced with the speaker named by ``SPEAKER_NAME``.
+    """
+
+    def __init__(self, **kwargs):
+        # Both attributes are populated by load().
+        self.model = None
+        self.speaker = None
+
+    def load(self):
+        """Download the XTTS v2 checkpoint, move it to CUDA and cache the speaker."""
+        device = "cuda"
+        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+        logging.info("⏳Downloading model")
+        ModelManager().download_model(model_name)
+        # The checkpoint directory is the model name with "/" replaced by "--"
+        # inside the TTS user data directory.
+        model_path = os.path.join(
+            get_user_data_dir("tts"), model_name.replace("/", "--")
+        )
+
+        config = XttsConfig()
+        config.load_json(os.path.join(model_path, "config.json"))
+        self.model = Xtts.init_from_config(config)
+        self.model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
+        self.model.to(device)
+
+        # Look the speaker up once and keep both conditioning tensors as
+        # plain Python lists (moved to CPU, squeezed, rounded through fp16).
+        speaker = self.model.speaker_manager.speakers[SPEAKER_NAME]
+        self.speaker = {
+            key: speaker[key].cpu().squeeze().half().tolist()
+            for key in ("speaker_embedding", "gpt_cond_latent")
+        }
+        logging.info("🔥Model Loaded")
+
+    def wav_postprocess(self, wav):
+        """Convert a waveform tensor (or list of tensors) to an int16 array.
+
+        A list is concatenated along dim 0 first. Samples are clipped to
+        [-1, 1] and scaled by 32767 before the cast to ``np.int16``.
+        """
+        if isinstance(wav, list):
+            wav = torch.cat(wav, dim=0)
+        wav = wav.clone().detach().cpu().numpy()
+        wav = np.clip(wav, -1, 1)
+        wav = (wav * 32767).astype(np.int16)
+        return wav
+
+    def predict(self, model_input):
+        """Stream synthesized speech for ``model_input["text"]``.
+
+        Args:
+            model_input: Mapping with ``text`` (required), ``language``
+                (default ``"en"``) and ``chunk_size`` (default ``150``,
+                coerced to ``int``).
+
+        Yields:
+            ``bytes`` holding the int16 samples of one chunk produced by
+            ``inference_stream``. No WAV header is emitted.
+
+        Raises:
+            ValueError: If ``text`` is missing from ``model_input``.
+        """
+        text = model_input.get("text")
+        if text is None:
+            # Fail with a clear message instead of an opaque error from
+            # inside the TTS library.
+            raise ValueError("model_input must include a 'text' field")
+        language = model_input.get("language", "en")
+        # Ensure chunk_size is an integer
+        chunk_size = int(model_input.get("chunk_size", 150))
+
+        # Rebuild the conditioning tensors from the lists cached by load().
+        speaker_embedding = (
+            torch.tensor(self.speaker.get("speaker_embedding"))
+            .unsqueeze(0)
+            .unsqueeze(-1)
+        )
+        gpt_cond_latent = (
+            torch.tensor(self.speaker.get("gpt_cond_latent"))
+            .reshape((-1, 1024))
+            .unsqueeze(0)
+        )
+
+        streamer = self.model.inference_stream(
+            text,
+            language,
+            gpt_cond_latent,
+            speaker_embedding,
+            stream_chunk_size=chunk_size,
+            enable_text_splitting=True,
+        )
+
+        for chunk in streamer:
+            yield self.wav_postprocess(chunk).tobytes()
diff --git a/xtts-streaming/requirements.txt b/xtts-streaming/requirements.txt
@@ -0,0 +1 @@
+git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62`