Skip to content

Commit 43f25fa

Browse files
committed
Add xAI HTTP TTS service
Reworks the xAI TTS integration from #4031 with consistency fixes: - Rename to XAIHttpTTSService (leaves room for future WebSocket service) - Add proper language map with all 20 supported xAI languages - Remove unnecessary deprecated InputParams/params (new service, nothing to deprecate) - Add encoding as a constructor parameter - Use Language.EN enum instead of string for default language - Linting fixes
1 parent a16fe9f commit 43f25fa

File tree

7 files changed

+284
-144
lines changed

7 files changed

+284
-144
lines changed

changelog/4031.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Added `XAIHttpTTSService` for text-to-speech using xAI's HTTP TTS API.
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#
2+
# Copyright (c) 2024-2026, Daily
3+
#
4+
# SPDX-License-Identifier: BSD 2-Clause License
5+
#
6+
7+
import os
8+
9+
import aiohttp
10+
from dotenv import load_dotenv
11+
from loguru import logger
12+
13+
from pipecat.audio.vad.silero import SileroVADAnalyzer
14+
from pipecat.frames.frames import LLMRunFrame
15+
from pipecat.pipeline.pipeline import Pipeline
16+
from pipecat.pipeline.runner import PipelineRunner
17+
from pipecat.pipeline.task import PipelineParams, PipelineTask
18+
from pipecat.processors.aggregators.llm_context import LLMContext
19+
from pipecat.processors.aggregators.llm_response_universal import (
20+
LLMContextAggregatorPair,
21+
LLMUserAggregatorParams,
22+
)
23+
from pipecat.runner.types import RunnerArguments
24+
from pipecat.runner.utils import create_transport
25+
from pipecat.services.deepgram.stt import DeepgramSTTService
26+
from pipecat.services.grok.llm import GrokLLMService
27+
from pipecat.services.xai.tts import XAIHttpTTSService
28+
from pipecat.transports.base_transport import BaseTransport, TransportParams
29+
from pipecat.transports.daily.transport import DailyParams
30+
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
31+
32+
load_dotenv(override=True)

# Every transport flavor uses the same audio I/O settings.
_AUDIO_IO = {
    "audio_in_enabled": True,
    "audio_out_enabled": True,
}

# Transport parameter factories. Lambdas defer parameter construction
# until the transport type is selected at runtime.
transport_params = {
    "daily": lambda: DailyParams(**_AUDIO_IO),
    "twilio": lambda: FastAPIWebsocketParams(**_AUDIO_IO),
    "webrtc": lambda: TransportParams(**_AUDIO_IO),
}
50+
51+
52+
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run the xAI TTS voice bot pipeline on the given transport.

    Wires Deepgram STT, a Grok LLM, and the xAI HTTP TTS service into a
    Pipecat pipeline and runs it until the client disconnects or the task
    is cancelled.

    Args:
        transport: Connected transport providing audio input/output.
        runner_args: Runner configuration (idle timeout, SIGINT handling).
    """
    logger.info("Starting bot")  # plain string: no placeholders (fixes F541)

    # XAIHttpTTSService issues HTTP requests through this shared aiohttp
    # session, so the whole pipeline runs inside the session's lifetime.
    async with aiohttp.ClientSession() as session:
        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = XAIHttpTTSService(
            api_key=os.getenv("GROK_API_KEY"),  # xAI TTS shares the Grok key
            aiohttp_session=session,
            settings=XAIHttpTTSService.Settings(
                voice="eve",
            ),
        )

        llm = GrokLLMService(
            api_key=os.getenv("GROK_API_KEY"),
            settings=GrokLLMService.Settings(
                system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
            ),
        )

        context = LLMContext()
        user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
            context,
            user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
        )

        pipeline = Pipeline(
            [
                transport.input(),  # Transport user input
                stt,
                user_aggregator,  # User responses
                llm,  # LLM
                tts,  # TTS
                transport.output(),  # Transport bot output
                assistant_aggregator,  # Assistant spoken responses
            ]
        )

        task = PipelineTask(
            pipeline,
            params=PipelineParams(
                enable_metrics=True,
                enable_usage_metrics=True,
                # NOTE(review): 8 kHz matches telephony (Twilio); confirm it
                # is also intended for the Daily/WebRTC transports.
                audio_out_sample_rate=8000,
            ),
            idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
        )

        @transport.event_handler("on_client_connected")
        async def on_client_connected(transport, client):
            logger.info("Client connected")
            # Kick off the conversation.
            context.add_message(
                {"role": "user", "content": "Please introduce yourself to the user."}
            )
            await task.queue_frames([LLMRunFrame()])

        @transport.event_handler("on_client_disconnected")
        async def on_client_disconnected(transport, client):
            logger.info("Client disconnected")
            await task.cancel()

        runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

        await runner.run(task)
118+
119+
120+
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud."""
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)
124+
125+
126+
if __name__ == "__main__":
    # Local/dev entry point: the Pipecat runner parses CLI arguments and
    # invokes bot() with the selected transport.
    from pipecat.runner.run import main

    main()

examples/foundational/14g-function-calling-grok.py

Lines changed: 83 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import os
99

10+
import aiohttp
1011
from dotenv import load_dotenv
1112
from loguru import logger
1213

@@ -24,10 +25,10 @@
2425
)
2526
from pipecat.runner.types import RunnerArguments
2627
from pipecat.runner.utils import create_transport
27-
from pipecat.services.cartesia.tts import CartesiaTTSService
2828
from pipecat.services.deepgram.stt import DeepgramSTTService
2929
from pipecat.services.grok.llm import GrokLLMService
3030
from pipecat.services.llm_service import FunctionCallParams
31+
from pipecat.services.xai.tts import XAIHttpTTSService
3132
from pipecat.transports.base_transport import BaseTransport, TransportParams
3233
from pipecat.transports.daily.transport import DailyParams
3334
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -60,83 +61,88 @@ async def fetch_weather_from_api(params: FunctionCallParams):
6061
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run the function-calling Grok bot on the given transport.

    Wires Deepgram STT, a Grok LLM with a registered weather tool, and the
    xAI HTTP TTS service into a Pipecat pipeline and runs it until the
    client disconnects or the task is cancelled.

    Args:
        transport: Connected transport providing audio input/output.
        runner_args: Runner configuration (idle timeout, SIGINT handling).
    """
    logger.info("Starting bot")  # plain string: no placeholders (fixes F541)

    # XAIHttpTTSService issues HTTP requests through this shared aiohttp
    # session, so the whole pipeline runs inside the session's lifetime.
    async with aiohttp.ClientSession() as session:
        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = XAIHttpTTSService(
            api_key=os.getenv("GROK_API_KEY"),  # xAI TTS shares the Grok key
            aiohttp_session=session,
            settings=XAIHttpTTSService.Settings(
                voice="eve",
            ),
        )

        llm = GrokLLMService(
            api_key=os.getenv("GROK_API_KEY"),
            settings=GrokLLMService.Settings(
                system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
            ),
        )
        # You can also register a function_name of None to get all functions
        # sent to the same callback with an additional function_name parameter.
        llm.register_function("get_current_weather", fetch_weather_from_api)

        weather_function = FunctionSchema(
            name="get_current_weather",
            description="Get the current weather",
            properties={
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "format": {
                    "type": "string",
                    "enum": ["celsius", "fahrenheit"],
                    "description": "The temperature unit to use. Infer this from the user's location.",
                },
            },
            required=["location", "format"],
        )
        tools = ToolsSchema(standard_tools=[weather_function])
        context = LLMContext(tools=tools)
        user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
            context,
            user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
        )

        pipeline = Pipeline(
            [
                transport.input(),
                stt,
                user_aggregator,
                llm,
                tts,
                transport.output(),
                assistant_aggregator,
            ]
        )

        task = PipelineTask(
            pipeline,
            params=PipelineParams(
                enable_metrics=True,
                enable_usage_metrics=True,
            ),
            idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
        )

        @transport.event_handler("on_client_connected")
        async def on_client_connected(transport, client):
            logger.info("Client connected")
            # Kick off the conversation.
            context.add_message(
                {"role": "user", "content": "Please introduce yourself to the user."}
            )
            await task.queue_frames([LLMRunFrame()])

        @transport.event_handler("on_client_disconnected")
        async def on_client_disconnected(transport, client):
            logger.info("Client disconnected")
            await task.cancel()

        runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

        await runner.run(task)
140146

141147

142148
async def bot(runner_args: RunnerArguments):

scripts/evals/run-release-evals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
108108
("07c-interruptible-deepgram-http.py", EVAL_SIMPLE_MATH),
109109
("07d-interruptible-elevenlabs.py", EVAL_SIMPLE_MATH),
110110
("07d-interruptible-elevenlabs-http.py", EVAL_SIMPLE_MATH),
111+
("07e-interruptible-xai.py", EVAL_SIMPLE_MATH),
111112
("07f-interruptible-azure.py", EVAL_SIMPLE_MATH),
112113
("07f-interruptible-azure-http.py", EVAL_SIMPLE_MATH),
113114
("07g-interruptible-openai.py", EVAL_SIMPLE_MATH),

0 commit comments

Comments
 (0)