Commit 717c1fa

feat: add orpheus tts with streaming (#422)
Hello guys, as discussed privately, we're adding Orpheus TTS to your truss-examples repo. It's important to note that we originally tried a sync/HTTP approach, but the response time was very long. This PR contains your implementation with some minor changes, described in README.md. We also added a `request_id` variable, since running two or more requests in parallel appeared to conflict without it. Example code (call.py) that streams the audio to your device is attached, along with an audio sample.
1 parent 7cbfbb4 commit 717c1fa

File tree

7 files changed: +186 -0

orpheus-tts-streaming/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
.idea
.venv
data/
.DS_Store

orpheus-tts-streaming/README.md

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# ai-orpheus-tts

Source Code:
- https://huggingface.co/canopylabs/orpheus-3b-0.1-ft
- https://github.com/canopyai/Orpheus-TTS/tree/main
- https://github.com/canopyai/Orpheus-Speech-PyPi/blob/main/orpheus_tts/engine_class.py
- https://huggingface.co/spaces/MohamedRashad/Orpheus-TTS

# Voices

`["zoe", "zac", "jess", "leo", "mia", "julia", "leah"]`

# Performance

- Use A100 or H100+ GPUs, as they give the optimal tokens/second performance
- `dtype` overridden from `dtype=torch.bfloat16` to `dtype=torch.float16`
- According to the [creators of Orpheus](https://github.com/canopyai/Orpheus-TTS/issues/53#issuecomment-2749433171): "The required generation speed for streaming is 83 toks/s as that is the number of tokens needed for 1s of audio. It seems like the A100 is generating faster than the necessary speed (~110 tok/s) as recorded in the logs (and the audio is generated in less time than its duration)."
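
A quick sanity check of that throughput claim (an editor's sketch, not part of the commit; the 83 tok/s and ~110 tok/s figures come from the linked issue):

# Real-time factor: tokens generated per second vs. tokens consumed per
# second of audio. A value above 1.0 means generation outpaces playback.
TOKENS_PER_SECOND_OF_AUDIO = 83   # from the Orpheus maintainers
OBSERVED_TOKENS_PER_SECOND = 110  # approximate A100 rate from the logs

rtf = OBSERVED_TOKENS_PER_SECOND / TOKENS_PER_SECOND_OF_AUDIO
print(f"real-time factor: {rtf:.2f}x")  # ~1.33x, so streaming keeps up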

orpheus-tts-streaming/call.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
import time

import pyaudio
import requests

BASETEN_HOST = "<ENTER_PREDICT_URL>"
BASETEN_API_KEY = "<ENTER_API_KEY>"
FORMAT = pyaudio.paInt16  # Audio format (16-bit PCM)
CHANNELS = 1  # Number of audio channels
RATE = 24000  # Sample rate in Hz

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open a stream for audio playback
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, output=True)

# Make a streaming HTTP request to the server
start_time = time.time()
resp = requests.post(
    BASETEN_HOST,
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={
        "text": "Absolutely! Let's explore together. I'll help you with this. The concept of making inferences can be very useful when you encounter new words. Inference means using clues from the text to guess the meaning of a word or phrase. For example, if I say The library is open from 8 AM to 10 PM, and you see the word library, you might guess it’s a place where people read or borrow books because of its context. Now, let’s try this with a word from our text! Here’s one: “borrow.” What do you think “borrow” means based on how it's used in the sentence?",
        "max_tokens": 10000,
        "voice": "tara",
    },
    stream=True,
)

# Create a buffer to hold multiple chunks before playback
buffer = b""
buffer_size_threshold = 2**2  # in bytes; this low value flushes on nearly every chunk

# Stream and play the audio data as it's received
for chunk in resp.iter_content(chunk_size=4096):
    if chunk:
        now = time.time()
        execution_time_ms = (now - start_time) * 1000
        print(f"Received chunk after {execution_time_ms:.2f}ms: {len(chunk)} bytes")
        buffer += chunk
        if len(buffer) >= buffer_size_threshold:
            print(f"Writing buffer of size: {len(buffer)}")
            stream.write(buffer)
            buffer = b""  # Clear the buffer

if buffer:
    print(f"Writing final buffer of size: {len(buffer)}")
    stream.write(buffer)

# Close and terminate the stream and PyAudio
stream.stop_stream()
stream.close()
p.terminate()
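
Not part of the commit: a minimal sketch of the parallel-request scenario the commit message mentions, passing a unique `request_id` per call so concurrent generations don't collide (the host and API-key placeholders are the same assumptions as in call.py):

import threading
import uuid

import requests

BASETEN_HOST = "<ENTER_PREDICT_URL>"
BASETEN_API_KEY = "<ENTER_API_KEY>"

def synthesize(text):
    # A unique request_id per request, per the commit message.
    resp = requests.post(
        BASETEN_HOST,
        headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
        json={"text": text, "voice": "tara", "request_id": str(uuid.uuid4())},
        stream=True,
    )
    audio = b"".join(chunk for chunk in resp.iter_content(chunk_size=4096) if chunk)
    print(f"received {len(audio)} bytes")

threads = [
    threading.Thread(target=synthesize, args=(t,))
    for t in ("First parallel request.", "Second parallel request.")
]
for t in threads:
    t.start()
for t in threads:
    t.join()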

orpheus-tts-streaming/config.yaml

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
environment_variables: {}
model_metadata:
  example_model_input: {"text": "Hello! What's new in your world?", "voice": "tara"}
model_name: orpheus-tts
python_version: py310
requirements:
  - orpheus-speech
  - vllm==0.7.3
  - soundfile
  - huggingface_hub[hf_transfer]
  - hf_transfer==0.1.9
resources:
  accelerator: A100
  # accelerator: H100_40GB
  use_gpu: true
runtime:
  predict_concurrency: 16
secrets:
  hf_access_token: null
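
Not part of the commit: if live playback isn't needed, a minimal sketch that writes the streamed response straight to disk, using the `example_model_input` from the config above (the host and API-key placeholders are assumptions, as in call.py):

import requests

BASETEN_HOST = "<ENTER_PREDICT_URL>"
BASETEN_API_KEY = "<ENTER_API_KEY>"

resp = requests.post(
    BASETEN_HOST,
    headers={"Authorization": f"Api-Key {BASETEN_API_KEY}"},
    json={"text": "Hello! What's new in your world?", "voice": "tara"},
    stream=True,
)
resp.raise_for_status()

# The server yields a WAV header first, then raw PCM chunks, so the bytes
# can be written verbatim as a playable .wav file.
with open("output.wav", "wb") as f:
    for chunk in resp.iter_content(chunk_size=4096):
        if chunk:
            f.write(chunk)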

orpheus-tts-streaming/model/__init__.py

Whitespace-only changes.
orpheus-tts-streaming/model/model.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
import logging
import os
import struct

import torch
from fastapi.responses import StreamingResponse

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
# os.environ["VLLM_USE_V1"] = "1"
# os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"

from orpheus_tts.engine_class import OrpheusModel

logger = logging.getLogger(__name__)


class Model:
    def __init__(self, **kwargs):
        # Truss passes the data directory and secrets in via kwargs.
        self._data_dir = kwargs["data_dir"]
        self.model = None
        self._secrets = kwargs["secrets"]
        os.environ["HF_TOKEN"] = self._secrets["hf_access_token"]

    def load(self):
        # Default dtype is torch.bfloat16; overridden to torch.float16 here.
        # https://github.com/canopyai/Orpheus-Speech-PyPi/blob/main/orpheus_tts/engine_class.py#L10
        self.model = OrpheusModel(
            model_name="canopylabs/orpheus-tts-0.1-finetune-prod", dtype=torch.float16
        )

    def create_wav_header(self, sample_rate=24000, bits_per_sample=16, channels=1):
        byte_rate = sample_rate * channels * bits_per_sample // 8
        block_align = channels * bits_per_sample // 8

        # Total data size is unknown when streaming, so write 0; most players
        # treat a zero-length data chunk as "read until the stream ends".
        data_size = 0

        header = struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF",
            36 + data_size,
            b"WAVE",
            b"fmt ",
            16,  # fmt chunk size
            1,  # PCM format
            channels,
            sample_rate,
            byte_rate,
            block_align,
            bits_per_sample,
            b"data",
            data_size,
        )
        return header

    def predict(self, model_input):
        text = str(model_input.get("text", "Hi, I'm the Orpheus model"))
        voice = str(model_input.get("voice", "tara"))
        request_id = str(model_input.get("request_id", "req-001"))
        repetition_penalty = model_input.get("repetition_penalty", 1.1)
        max_tokens = int(model_input.get("max_tokens", 10000))
        temperature = model_input.get("temperature", 0.4)
        top_p = model_input.get("top_p", 0.9)

        logger.info(
            f"Generating audio from processed text ({len(text)} chars, voice {voice}): {text}"
        )

        def generate_audio_stream():
            # Emit the WAV header first so clients can start playback immediately.
            yield self.create_wav_header()

            audio_generator = self.model.generate_speech(
                prompt=text,
                voice=voice,
                request_id=request_id,
                repetition_penalty=repetition_penalty,
                stop_token_ids=[128258],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            for chunk in audio_generator:
                yield chunk

        return StreamingResponse(generate_audio_stream(), media_type="audio/wav")
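
An editor's aside (not part of the commit): `create_wav_header` leaves `data_size = 0` because the total length is unknown when streaming, so players read PCM data until the stream ends. A stdlib-only check of the 44-byte layout, using the same defaults (24 kHz, 16-bit, mono):

import struct

header = struct.pack(
    "<4sI4s4sIHHIIHH4sI",
    b"RIFF", 36, b"WAVE", b"fmt ", 16, 1, 1, 24000, 48000, 2, 16, b"data", 0,
)
fields = struct.unpack("<4sI4s4sIHHIIHH4sI", header)
assert len(header) == 44  # standard PCM WAV header size
assert fields[0] == b"RIFF" and fields[2] == b"WAVE"
assert fields[7] == 24000  # sample rate
assert fields[8] == 48000  # byte rate = 24000 * 1 * 16 // 8
assert fields[12] == 0  # data size left at 0 for streaming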

orpheus-tts-streaming/sample.wav

1.57 MB
Binary file not shown.
