Skip to content

Commit d9218f0

Browse files
committed
feat(platform|agent): add support for Speech
1 parent 3fd26c2 commit d9218f0

34 files changed

+1419
-15
lines changed

demo/tests/Blog/Command/StreamCommandTest.php

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,13 @@
1515
use PHPUnit\Framework\TestCase;
1616
use Symfony\AI\Agent\AgentInterface;
1717
use Symfony\AI\Platform\Metadata\Metadata;
18+
use Symfony\AI\Platform\PlainConverter;
19+
use Symfony\AI\Platform\Result\DeferredResult;
20+
use Symfony\AI\Platform\Result\InMemoryRawResult;
1821
use Symfony\AI\Platform\Result\RawResultInterface;
1922
use Symfony\AI\Platform\Result\ResultInterface;
23+
use Symfony\AI\Platform\Result\TextResult;
24+
use Symfony\AI\Platform\Speech\Speech;
2025
use Symfony\Component\Console\Input\ArrayInput;
2126
use Symfony\Component\Console\Output\BufferedOutput;
2227
use Symfony\Component\Console\Style\SymfonyStyle;
@@ -50,6 +55,15 @@ public function getRawResult(): ?RawResultInterface
5055
public function setRawResult(RawResultInterface $rawResult): void
5156
{
5257
}
58+
59+
public function addSpeech(?Speech $speech = null): void
60+
{
61+
}
62+
63+
public function getSpeech(): Speech
64+
{
65+
return new Speech(new DeferredResult(new PlainConverter(new TextResult('foo')), new InMemoryRawResult()));
66+
}
5367
});
5468

5569
$input = new ArrayInput([]);

docs/bundles/ai-bundle.rst

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,40 @@ Chats are defined in the ``chat`` section of your configuration:
11741174
agent: 'ai.agent.youtube'
11751175
message_store: 'ai.message_store.cache.youtube'
11761176
1177+
Speech
1178+
------
1179+
1180+
Speech can be used to create `stt` / `tts` / `sts` pipelines, leading to a more "human-like" conversation flow;
1181+
the main drawback is the latency that it can introduce due to network calls.
1182+
1183+
Configuring speech
1184+
~~~~~~~~~~~~~~~~~~
1185+
1186+
When using the bundle, the configuration allows you to configure models and voices::
1187+
1188+
ai:
1189+
platform:
1190+
elevenlabs:
1191+
api_key: '%env(ELEVEN_LABS_API_KEY)%'
1192+
openai:
1193+
api_key: '%env(OPENAI_API_KEY)%'
1194+
1195+
agent:
1196+
sts_openai:
1197+
platform: ai.platform.openai
1198+
model: gpt-4o
1199+
speech:
1200+
platform: 'ai.platform.elevenlabs'
1201+
tts_model: '%env(ELEVEN_LABS_TTS_MODEL)%'
1202+
tts_options:
1203+
voice: '%env(ELEVEN_LABS_VOICE_IDENTIFIER)%'
1204+
stt_model: '%env(ELEVEN_LABS_STT_MODEL)%'
1205+
1206+
.. note::
1207+
1208+
The current example is built for a "TTS / STT sandwich", a pattern that handles both input and output as audio;
1209+
both STT and TTS can be enabled independently.
1210+
11771211
.. _`Symfony AI Agent`: https://github.com/symfony/ai-agent
11781212
.. _`Symfony AI Chat`: https://github.com/symfony/ai-chat
11791213
.. _`Symfony AI Platform`: https://github.com/symfony/ai-platform

docs/components/agent.rst

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,116 @@ Testing a service that uses an agent::
731731
The ``MockAgent`` provides all the benefits of traditional mocks while offering a more intuitive API for AI agent testing,
732732
making your tests more reliable and easier to maintain.
733733

734+
Speech support
735+
~~~~~~~~~~~~~~
736+
737+
Using speech to send messages / receive answers as audio is a common use case when integrating agents and/or chats,
738+
this approach allows you to either send audio and expect text output, or send text and receive audio content.
739+
740+
Another approach is to use stt / tts together to enable a full audio pipeline; this approach introduces some latency
741+
(as both input/output must be processed) but allows to create a more natural and "human-like" conversation flow.
742+
743+
Speech support can be enabled using :class:`Symfony\\AI\\Agent\\InputProcessor\\SpeechProcessor` (for `text-to-speech` in this example)::
744+
745+
use Symfony\AI\Agent\Agent;
746+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
747+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
748+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
749+
use Symfony\AI\Platform\Message\Message;
750+
use Symfony\AI\Platform\Message\MessageBag;
751+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
752+
use Symfony\Component\HttpClient\HttpClient;
753+
754+
require_once dirname(__DIR__).'/bootstrap.php';
755+
756+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
757+
758+
$agent = new Agent($platform, 'gpt-4o', outputProcessors: [
759+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
760+
apiKey: 'key',
761+
httpClient: http_client()
762+
), new SpeechConfiguration([
763+
'tts_model' => 'eleven_multilingual_v2',
764+
'tts_options' => [
765+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
766+
],
767+
])),
768+
]);
769+
$answer = $agent->call(new MessageBag(
770+
Message::ofUser('Tina has one brother and one sister. How many sisters do Tina\'s siblings have?'),
771+
));
772+
773+
echo $answer->getSpeech()->asBinary();
774+
775+
When handling `speech-to-speech`, the process is still the same but requires a :class:`Symfony\\AI\\Platform\\Message\\Content\\Audio` as an input::
776+
777+
use Symfony\AI\Agent\Agent;
778+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
779+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
780+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
781+
use Symfony\AI\Platform\Message\Content\Audio;
782+
use Symfony\AI\Platform\Message\Message;
783+
use Symfony\AI\Platform\Message\MessageBag;
784+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
785+
use Symfony\Component\HttpClient\HttpClient;
786+
787+
require_once dirname(__DIR__).'/bootstrap.php';
788+
789+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
790+
791+
$agent = new Agent($platform, 'gpt-4o', [
792+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
793+
apiKey: 'key',
794+
httpClient: http_client(),
795+
), new SpeechConfiguration([
796+
'stt_model' => 'scribe_v1',
797+
]))
798+
]);
799+
$answer = $agent->call(new MessageBag(
800+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
801+
));
802+
803+
echo $answer->getContent();
804+
805+
A "STT / TTS sandwich" can be created using the :class:`Symfony\\AI\\Agent\\InputProcessor\\SpeechProcessor` as input and output processor::
806+
807+
use Symfony\AI\Agent\Agent;
808+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
809+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
810+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
811+
use Symfony\AI\Platform\Message\Content\Audio;
812+
use Symfony\AI\Platform\Message\Message;
813+
use Symfony\AI\Platform\Message\MessageBag;
814+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
815+
use Symfony\Component\HttpClient\HttpClient;
816+
817+
require_once dirname(__DIR__).'/bootstrap.php';
818+
819+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
820+
821+
$speechProcessor = new SpeechProcessor(ElevenLabsPlatformFactory::create(
822+
apiKey: 'key',
823+
httpClient: http_client(),
824+
), new SpeechConfiguration([
825+
'tts_model' => 'eleven_multilingual_v2',
826+
'tts_options' => [
827+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
828+
],
829+
'stt_model' => 'scribe_v1',
830+
]));
831+
832+
$agent = new Agent($platform, 'gpt-4o', [$speechProcessor], [$speechProcessor]);
833+
834+
$answer = $agent->call(new MessageBag(
835+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
836+
));
837+
838+
echo $answer->getSpeech()->asBinary();
839+
840+
.. note::
841+
842+
Handling both `text-to-speech` and `speech-to-text` introduces latency as most of the process is synchronous.
843+
734844
Code Examples
735845
~~~~~~~~~~~~~
736846

examples/speech/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Speech Examples
2+
3+
Speech is mainly used to transform text to audio and vice versa; it can also be used to create an audio-to-audio pipeline.
4+
5+
To run the examples, you can use additional tools such as [mpg123](https://www.mpg123.de/):
6+
7+
```bash
8+
php speech/agent-eleven-labs-speech-tts.php | mpg123 -
9+
php speech/agent-eleven-labs-speech-sts.php | mpg123 -
10+
```
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
14+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
15+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
16+
use Symfony\AI\Platform\Message\Content\Audio;
17+
use Symfony\AI\Platform\Message\Message;
18+
use Symfony\AI\Platform\Message\MessageBag;
19+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
20+
21+
require_once dirname(__DIR__).'/bootstrap.php';
22+
23+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
24+
25+
$speechProcessor = new SpeechProcessor(ElevenLabsPlatformFactory::create(
26+
apiKey: env('ELEVEN_LABS_API_KEY'),
27+
httpClient: http_client(),
28+
), new SpeechConfiguration([
29+
'tts_model' => 'eleven_multilingual_v2',
30+
'tts_options' => [
31+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
32+
],
33+
'stt_model' => 'scribe_v1',
34+
]));
35+
36+
$agent = new Agent($platform, 'gpt-4o', [$speechProcessor], [$speechProcessor]);
37+
38+
$answer = $agent->call(new MessageBag(
39+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
40+
));
41+
42+
echo $answer->getSpeech()->asBinary();
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
14+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
15+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
16+
use Symfony\AI\Platform\Message\Content\Audio;
17+
use Symfony\AI\Platform\Message\Message;
18+
use Symfony\AI\Platform\Message\MessageBag;
19+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
20+
21+
require_once dirname(__DIR__).'/bootstrap.php';
22+
23+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
24+
25+
$agent = new Agent($platform, 'gpt-4o', [
26+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
27+
apiKey: env('ELEVEN_LABS_API_KEY'),
28+
httpClient: http_client(),
29+
), new SpeechConfiguration([
30+
'stt_model' => 'scribe_v1',
31+
])),
32+
]);
33+
$answer = $agent->call(new MessageBag(
34+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
35+
));
36+
37+
echo $answer->getContent();
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
14+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
15+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
16+
use Symfony\AI\Platform\Message\Message;
17+
use Symfony\AI\Platform\Message\MessageBag;
18+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
19+
20+
require_once dirname(__DIR__).'/bootstrap.php';
21+
22+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
23+
24+
$agent = new Agent($platform, 'gpt-4o', outputProcessors: [
25+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
26+
env('ELEVEN_LABS_API_KEY'),
27+
httpClient: http_client()
28+
), new SpeechConfiguration([
29+
'tts_model' => 'eleven_multilingual_v2',
30+
'tts_options' => [
31+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
32+
],
33+
])),
34+
]);
35+
$answer = $agent->call(new MessageBag(
36+
Message::ofUser('Tina has one brother and one sister. How many sisters do Tina\'s siblings have?'),
37+
));
38+
39+
echo $answer->getSpeech()->asBinary();

src/agent/composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
"phpunit/phpunit": "^11.5.53",
4545
"symfony/ai-store": "^0.5",
4646
"symfony/event-dispatcher": "^7.3|^8.0",
47+
"symfony/options-resolver": "^7.3|^8.0",
4748
"symfony/translation": "^7.3|^8.0",
4849
"symfony/translation-contracts": "^3.6"
4950
},

0 commit comments

Comments
 (0)