Commit 717c1fa

feat: add orpheus tts with streaming (#422)
Hello guys, as discussed privately, we're adding Orpheus TTS to your truss-examples repo. It's important to note that we originally tried a sync/HTTP approach, but the response time was very long. This PR contains your implementation with some minor changes, described in README.md. We also added a `request_id` variable, since running two or more requests in parallel appeared to conflict without it. Example code (call.py) that streams the audio to your device is attached, along with an audio sample.
1 parent 7cbfbb4 commit 717c1fa

File tree

7 files changed: +186 -0

orpheus-tts-streaming/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
.idea
.venv
data/
.DS_Store

orpheus-tts-streaming/README.md

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# ai-orpheus-tts

Source Code:
- https://huggingface.co/canopylabs/orpheus-3b-0.1-ft
- https://github.com/canopyai/Orpheus-TTS/tree/main
- https://github.com/canopyai/Orpheus-Speech-PyPi/blob/main/orpheus_tts/engine_class.py
- https://huggingface.co/spaces/MohamedRashad/Orpheus-TTS

# Voices

`["zoe", "zac", "jess", "leo", "mia", "julia", "leah"]`

# Performance

- Use A100 or H100+ GPUs, as they give the optimal tokens/second performance
- `dtype` overridden from `dtype=torch.bfloat16` to `dtype=torch.float16`
- According to the [creators of Orpheus](https://github.com/canopyai/Orpheus-TTS/issues/53#issuecomment-2749433171): "The required generation speed for streaming is 83 toks/s as that is the number of tokens needed for 1s of audio. It seems like the A100 is generating faster than the necessary speed (~110 tok/s) as recorded in the logs (and the audio is generated in less time than its duration)."
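
A quick sanity check of that throughput claim (an editor's sketch, not part of the commit; the 83 tok/s and ~110 tok/s figures come from the linked issue):

# Real-time factor: tokens generated per second vs. tokens consumed per
# second of audio. A value above 1.0 means generation outpaces playback.
TOKENS_PER_SECOND_OF_AUDIO = 83   # from the Orpheus maintainers
OBSERVED_TOKENS_PER_SECOND = 110  # approximate A100 rate from the logs

rtf = OBSERVED_TOKENS_PER_SECOND / TOKENS_PER_SECOND_OF_AUDIO
print(f"real-time factor: {rtf:.2f}x")  # ~1.33x, so streaming keeps up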

orpheus-tts-streaming/call.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
import time

import pyaudio
import requests

BASETEN_HOST = "<ENTER_PREDICT_URL>"
BASETEN_API_KEY = "<ENTER_API_KEY>"
FORMAT = pyaudio.paInt16  # Audio format (16-bit PCM)
CHANNELS = 1  # Number of audio channels
RATE = 24000  # Sample rate in Hz

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open a stream for audio playback
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, output=True)

# Make a streaming HTTP request to the server
start_time = time.time()
resp = requests.post(
    BASETEN_HOST,
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "text": "Absolutely! Let's explore together. I'll help you with this. The concept of making inferences can be very useful when you encounter new words. Inference means using clues from the text to guess the meaning of a word or phrase. For example, if I say The library is open from 8 AM to 10 PM, and you see the word library, you might guess it’s a place where people read or borrow books because of its context. Now, let’s try this with a word from our text! Here’s one: “borrow.” What do you think “borrow” means based on how it's used in the sentence?",
        "max_tokens": 10000,
        "voice": "tara",
    },
    stream=True,
)

# Create a buffer to hold multiple chunks before playback
buffer = b""
buffer_size_threshold = 2**2  # in bytes; this low value flushes on nearly every chunk

# Stream and play the audio data as it's received
for chunk in resp.iter_content(chunk_size=4096):
    if chunk:
        now = time.time()
        execution_time_ms = (now - start_time) * 1000
        print(f"Received chunk after {execution_time_ms:.2f}ms: {len(chunk)} bytes")
        buffer += chunk
        if len(buffer) >= buffer_size_threshold:
            print(f"Writing buffer of size: {len(buffer)}")
            stream.write(buffer)
            buffer = b""  # Clear the buffer

if buffer:
    print(f"Writing final buffer of size: {len(buffer)}")
    stream.write(buffer)

# Close and terminate the stream and PyAudio
stream.stop_stream()
stream.close()
p.terminate()
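
Not part of the commit: a minimal sketch of the parallel-request scenario the commit message mentions, passing a unique `request_id` per call so concurrent generations don't collide (the host and API-key placeholders are the same assumptions as in call.py):

import threading
import uuid

import requests

BASETEN_HOST = "<ENTER_PREDICT_URL>"
BASETEN_API_KEY = "<ENTER_API_KEY>"

def synthesize(text):
    # A unique request_id per request, per the commit message.
    resp = requests.post(
        BASETEN_HOST,
        headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
        json={"text": text, "voice": "tara", "request_id": str(uuid.uuid4())},
        stream=True,
    )
    audio = b"".join(chunk for chunk in resp.iter_content(chunk_size=4096) if chunk)
    print(f"received {len(audio)} bytes")

threads = [
    threading.Thread(target=synthesize, args=(t,))
    for t in ("First parallel request.", "Second parallel request.")
]
for t in threads:
    t.start()
for t in threads:
    t.join()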

orpheus-tts-streaming/config.yaml

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
environment_variables: {}
model_metadata:
  example_model_input: {"text": "Hello! What's new in your world?", "voice": "tara"}
model_name: orpheus-tts
python_version: py310
requirements:
  - orpheus-speech
  - vllm==0.7.3
  - soundfile
  - huggingface_hub[hf_transfer]
  - hf_transfer==0.1.9
resources:
  accelerator: A100
  # accelerator: H100_40GB
  use_gpu: true
runtime:
  predict_concurrency: 16
secrets:
  hf_access_token: null
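
Not part of the commit: if live playback isn't needed, a minimal sketch that writes the streamed response straight to disk, using the `example_model_input` from the config above (the host and API-key placeholders are assumptions, as in call.py):

import requests

BASETEN_HOST = "<ENTER_PREDICT_URL>"
BASETEN_API_KEY = "<ENTER_API_KEY>"

resp = requests.post(
    BASETEN_HOST,
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={"text": "Hello! What's new in your world?", "voice": "tara"},
    stream=True,
)
resp.raise_for_status()

# The server yields a WAV header first, then raw PCM chunks, so the bytes
# can be written verbatim as a playable .wav file.
with open("output.wav", "wb") as f:
    for chunk in resp.iter_content(chunk_size=4096):
        if chunk:
            f.write(chunk)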

orpheus-tts-streaming/model/__init__.py

Whitespace-only changes.
orpheus-tts-streaming/model/model.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
import logging
import os
import struct

import torch
from fastapi.responses import StreamingResponse

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
# os.environ["VLLM_USE_V1"] = "1"
# os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"

from orpheus_tts.engine_class import OrpheusModel

logger = logging.getLogger(__name__)


class Model:
    def __init__(self, **kwargs):
        # Truss passes the data directory and secrets in via kwargs.
        self._data_dir = kwargs["data_dir"]
        self.model = None
        self._secrets = kwargs["secrets"]
        os.environ["HF_TOKEN"] = self._secrets["hf_access_token"]

    def load(self):
        # Default dtype is torch.bfloat16; overridden to torch.float16 here.
        # https://github.com/canopyai/Orpheus-Speech-PyPi/blob/main/orpheus_tts/engine_class.py#L10
        self.model = OrpheusModel(
            model_name="canopylabs/orpheus-tts-0.1-finetune-prod", dtype=torch.float16
        )

    def create_wav_header(self, sample_rate=24000, bits_per_sample=16, channels=1):
        byte_rate = sample_rate * channels * bits_per_sample // 8
        block_align = channels * bits_per_sample // 8

        # Total data size is unknown when streaming, so write 0; most players
        # treat a zero-length data chunk as "read until the stream ends".
        data_size = 0

        header = struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF",
            36 + data_size,
            b"WAVE",
            b"fmt ",
            16,  # fmt chunk size
            1,  # PCM format
            channels,
            sample_rate,
            byte_rate,
            block_align,
            bits_per_sample,
            b"data",
            data_size,
        )
        return header

    def predict(self, model_input):
        text = str(model_input.get("text", "Hi, I'm the Orpheus model"))
        voice = str(model_input.get("voice", "tara"))
        request_id = str(model_input.get("request_id", "req-001"))
        repetition_penalty = model_input.get("repetition_penalty", 1.1)
        max_tokens = int(model_input.get("max_tokens", 10000))
        temperature = model_input.get("temperature", 0.4)
        top_p = model_input.get("top_p", 0.9)

        logger.info(
            f"Generating audio from processed text ({len(text)} chars, voice {voice}): {text}"
        )

        def generate_audio_stream():
            # Emit the WAV header first so clients can start playback immediately.
            yield self.create_wav_header()

            audio_generator = self.model.generate_speech(
                prompt=text,
                voice=voice,
                request_id=request_id,
                repetition_penalty=repetition_penalty,
                stop_token_ids=[128258],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            for chunk in audio_generator:
                yield chunk

        return StreamingResponse(generate_audio_stream(), media_type="audio/wav")
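
An editor's aside (not part of the commit): `create_wav_header` leaves `data_size = 0` because the total length is unknown when streaming, so players read PCM data until the stream ends. A stdlib-only check of the 44-byte layout, using the same defaults (24 kHz, 16-bit, mono):

import struct

header = struct.pack(
    "<4sI4s4sIHHIIHH4sI",
    b"RIFF", 36, b"WAVE", b"fmt ", 16, 1, 1, 24000, 48000, 2, 16, b"data", 0,
)
fields = struct.unpack("<4sI4s4sIHHIIHH4sI", header)
assert len(header) == 44  # standard PCM WAV header size
assert fields[0] == b"RIFF" and fields[2] == b"WAVE"
assert fields[7] == 24000  # sample rate
assert fields[8] == 48000  # byte rate = 24000 * 1 * 16 // 8
assert fields[12] == 0  # data size left at 0 for streaming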

orpheus-tts-streaming/sample.wav

1.57 MB
Binary file not shown.
