Skip to content

Commit a3fec4f

Browse files
committed
refactor(platform|agent): support for speech moved to processors
1 parent 191e3a5 commit a3fec4f

File tree

15 files changed

+848
-241
lines changed

15 files changed

+848
-241
lines changed

docs/bundles/ai-bundle.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,12 +1189,25 @@ When using the bundle, the configuration allows to configure models and voices::
11891189
platform:
11901190
elevenlabs:
11911191
api_key: '%env(ELEVEN_LABS_API_KEY)%'
1192+
openai:
1193+
api_key: '%env(OPENAI_API_KEY)%'
1194+
1195+
agent:
1196+
assistant_vocal:
1197+
platform: ai.platform.openai
1198+
model: gpt-4o
11921199
speech:
1200+
platform: 'ai.platform.elevenlabs'
11931201
tts_model: '%env(ELEVEN_LABS_TTS_MODEL)%'
11941202
tts_options:
11951203
voice: '%env(ELEVEN_LABS_VOICE_IDENTIFIER)%'
11961204
stt_model: '%env(ELEVEN_LABS_STT_MODEL)%'
11971205

1206+
.. note::
1207+
1208+
The current example is built for a "TTS / STT sandwich", a pattern that handles both input and output as audio;
1209+
both STT and TTS can be enabled independently.
1210+
11981211
.. _`Symfony AI Agent`: https://github.com/symfony/ai-agent
11991212
.. _`Symfony AI Chat`: https://github.com/symfony/ai-chat
12001213
.. _`Symfony AI Platform`: https://github.com/symfony/ai-platform

docs/components/agent.rst

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,116 @@ Testing a service that uses an agent::
731731
The ``MockAgent`` provides all the benefits of traditional mocks while offering a more intuitive API for AI agent testing,
732732
making your tests more reliable and easier to maintain.
733733

734+
Speech support
735+
~~~~~~~~~~~~~~
736+
737+
Using speech to send messages / receive answers as audio is a common use case when integrating agents and/or chats;
738+
this approach allows you to either send audio and expect text output, or send text and receive audio content.
739+
740+
Another approach is to use STT / TTS together to enable a full audio pipeline; this approach introduces some latency
741+
(as both input and output must be processed) but allows you to create a more natural and "human-like" conversation flow.
742+
743+
Speech support can be enabled using :class:`Symfony\\AI\\Agent\\InputProcessor\\SpeechProcessor` (for `text-to-speech` in this example)::
744+
745+
use Symfony\AI\Agent\Agent;
746+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
747+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
748+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
749+
use Symfony\AI\Platform\Message\Message;
750+
use Symfony\AI\Platform\Message\MessageBag;
751+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
752+
use Symfony\Component\HttpClient\HttpClient;
753+
754+
require_once dirname(__DIR__).'/bootstrap.php';
755+
756+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
757+
758+
$agent = new Agent($platform, 'gpt-4o', outputProcessors: [
759+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
760+
apiKey: 'key',
761+
httpClient: http_client()
762+
), new SpeechConfiguration([
763+
'tts_model' => 'eleven_multilingual_v2',
764+
'tts_options' => [
765+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
766+
],
767+
])),
768+
]);
769+
$answer = $agent->call(new MessageBag(
770+
Message::ofUser('Tina has one brother and one sister. How many sisters do Tina\'s siblings have?'),
771+
));
772+
773+
echo $answer->getSpeech()->asBinary();
774+
775+
When handling `speech-to-speech`, the process is still the same but requires a :class:`Symfony\\AI\\Platform\\Message\\Content\\Audio` as an input::
776+
777+
use Symfony\AI\Agent\Agent;
778+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
779+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
780+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
781+
use Symfony\AI\Platform\Message\Content\Audio;
782+
use Symfony\AI\Platform\Message\Message;
783+
use Symfony\AI\Platform\Message\MessageBag;
784+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
785+
use Symfony\Component\HttpClient\HttpClient;
786+
787+
require_once dirname(__DIR__).'/bootstrap.php';
788+
789+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
790+
791+
$agent = new Agent($platform, 'gpt-4o', [
792+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
793+
apiKey: 'key',
794+
httpClient: http_client(),
795+
), new SpeechConfiguration([
796+
'stt_model' => 'scribe_v1',
797+
]))
798+
]);
799+
$answer = $agent->call(new MessageBag(
800+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
801+
));
802+
803+
echo $answer->getContent();
804+
805+
A "STT / TTS sandwich" can be created using the :class:`Symfony\\AI\\Agent\\InputProcessor\\SpeechProcessor` as input and output processor::
806+
807+
use Symfony\AI\Agent\Agent;
808+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
809+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
810+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
811+
use Symfony\AI\Platform\Message\Content\Audio;
812+
use Symfony\AI\Platform\Message\Message;
813+
use Symfony\AI\Platform\Message\MessageBag;
814+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
815+
use Symfony\Component\HttpClient\HttpClient;
816+
817+
require_once dirname(__DIR__).'/bootstrap.php';
818+
819+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
820+
821+
$speechProcessor = new SpeechProcessor(ElevenLabsPlatformFactory::create(
822+
apiKey: 'key',
823+
httpClient: http_client(),
824+
), new SpeechConfiguration([
825+
'tts_model' => 'eleven_multilingual_v2',
826+
'tts_options' => [
827+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
828+
],
829+
'stt_model' => 'scribe_v1',
830+
]));
831+
832+
$agent = new Agent($platform, 'gpt-4o', [$speechProcessor], [$speechProcessor]);
833+
834+
$answer = $agent->call(new MessageBag(
835+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
836+
));
837+
838+
echo $answer->getSpeech()->asBinary();
839+
840+
.. note::
841+
842+
Handling both `text-to-speech` and `speech-to-text` introduces latency, as most of the process is synchronous.
843+
734844
Code Examples
735845
~~~~~~~~~~~~~
736846

docs/components/platform.rst

Lines changed: 0 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -687,92 +687,6 @@ This allows fast and isolated testing of AI-powered features without relying on
687687

688688
This requires `cURL` and the `ext-curl` extension to be installed.
689689

690-
Speech support
691-
~~~~~~~~~~~~~~
692-
693-
Using speech to send messages / receive answers as audio is a common use case when integrating agents and/or chats,
694-
this approach allows to either send audio and expect text output or send a text and receive an audio content.
695-
696-
Another approach is to use stt / tts together to enable a full audio pipeline, this approach introduce some latency
697-
(as both input/output must be processed) but allows to create a more natural and "human-like" conversation flow.
698-
699-
Speech support can be enabled using :class:`Symfony\\AI\\Platform\\Speech\\SpeechListener` (for `text-to-speech` in this example)::
700-
701-
use Symfony\AI\Agent\Agent;
702-
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
703-
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
704-
use Symfony\AI\Platform\Message\Message;
705-
use Symfony\AI\Platform\Message\MessageBag;
706-
use Symfony\AI\Platform\Speech\SpeechConfiguration;
707-
use Symfony\AI\Platform\Speech\SpeechListener;
708-
use Symfony\Component\EventDispatcher\EventDispatcher;
709-
use Symfony\Component\HttpClient\HttpClient;
710-
711-
$eventDispatcher = new EventDispatcher();
712-
$eventDispatcher->addSubscriber(new SpeechListener([
713-
'elevenlabs' => ElevenLabsPlatformFactory::create(
714-
env('ELEVEN_LABS_API_KEY'),
715-
httpClient: HttpClient::create(),
716-
),
717-
], [
718-
'elevenlabs' => new SpeechConfiguration([
719-
'tts_model' => 'eleven_multilingual_v2',
720-
'tts_options' => [
721-
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
722-
],
723-
]),
724-
]));
725-
726-
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: HttpClient::create(), eventDispatcher: $eventDispatcher);
727-
728-
$agent = new Agent($platform, 'gpt-4o');
729-
$answer = $agent->call(new MessageBag(
730-
Message::ofUser('Tina has one brother and one sister. How many sisters do Tina\'s siblings have?'),
731-
));
732-
733-
echo $answer->getSpeech()->asBinary();
734-
735-
When handling `speech-to-speech`, the process still the same but requires a :class:`Symfony\\AI\\Platform\\Message\\Content\\Audio` as an input::
736-
737-
use Symfony\AI\Agent\Agent;
738-
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
739-
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
740-
use Symfony\AI\Platform\Message\Content\Audio;
741-
use Symfony\AI\Platform\Message\Message;
742-
use Symfony\AI\Platform\Message\MessageBag;
743-
use Symfony\AI\Platform\Speech\SpeechConfiguration;
744-
use Symfony\AI\Platform\Speech\SpeechListener;
745-
use Symfony\Component\EventDispatcher\EventDispatcher;
746-
use Symfony\Component\HttpClient\HttpClient;
747-
748-
$eventDispatcher = new EventDispatcher();
749-
$eventDispatcher->addSubscriber(new SpeechListener([
750-
'elevenlabs' => ElevenLabsPlatformFactory::create(
751-
env('ELEVEN_LABS_API_KEY'),
752-
httpClient: HttpClient::create(),
753-
),
754-
], [
755-
'elevenlabs' => new SpeechConfiguration([
756-
'tts_model' => 'eleven_multilingual_v2',
757-
'tts_options' => [
758-
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
759-
],
760-
]),
761-
]));
762-
763-
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: HttpClient::create(), eventDispatcher: $eventDispatcher);
764-
765-
$agent = new Agent($platform, 'gpt-4o');
766-
$answer = $agent->call(new MessageBag(
767-
Message::ofUser(Audio::fromFile(...)
768-
));
769-
770-
echo $answer->getSpeech()->asBinary();
771-
772-
.. note::
773-
774-
Handling `speech-to-speech` introduce latency as most of the process is synchronous.
775-
776690
Code Examples
777691
~~~~~~~~~~~~~
778692

examples/speech/agent-eleven-labs-speech-sts.php

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,36 +10,31 @@
1010
*/
1111

1212
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
1314
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
1415
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
1516
use Symfony\AI\Platform\Message\Content\Audio;
1617
use Symfony\AI\Platform\Message\Message;
1718
use Symfony\AI\Platform\Message\MessageBag;
1819
use Symfony\AI\Platform\Speech\SpeechConfiguration;
19-
use Symfony\AI\Platform\Speech\SpeechListener;
20-
use Symfony\Component\EventDispatcher\EventDispatcher;
2120

2221
require_once dirname(__DIR__).'/bootstrap.php';
2322

24-
$eventDispatcher = new EventDispatcher();
25-
$eventDispatcher->addSubscriber(new SpeechListener([
26-
'elevenlabs' => ElevenLabsPlatformFactory::create(
27-
apiKey: env('ELEVEN_LABS_API_KEY'),
28-
httpClient: http_client(),
29-
),
30-
], [
31-
'elevenlabs' => new SpeechConfiguration([
32-
'tts_model' => 'eleven_multilingual_v2',
33-
'tts_options' => [
34-
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
35-
],
36-
'stt_model' => 'scribe_v1',
37-
]),
23+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
24+
25+
$speechProcessor = new SpeechProcessor(ElevenLabsPlatformFactory::create(
26+
apiKey: env('ELEVEN_LABS_API_KEY'),
27+
httpClient: http_client(),
28+
), new SpeechConfiguration([
29+
'tts_model' => 'eleven_multilingual_v2',
30+
'tts_options' => [
31+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
32+
],
33+
'stt_model' => 'scribe_v1',
3834
]));
3935

40-
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client(), eventDispatcher: $eventDispatcher);
36+
$agent = new Agent($platform, 'gpt-4o', [$speechProcessor], [$speechProcessor]);
4137

42-
$agent = new Agent($platform, 'gpt-4o');
4338
$answer = $agent->call(new MessageBag(
4439
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
4540
));

examples/speech/agent-eleven-labs-speech-stt.php

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,26 @@
1010
*/
1111

1212
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
1314
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
1415
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
1516
use Symfony\AI\Platform\Message\Content\Audio;
1617
use Symfony\AI\Platform\Message\Message;
1718
use Symfony\AI\Platform\Message\MessageBag;
1819
use Symfony\AI\Platform\Speech\SpeechConfiguration;
19-
use Symfony\AI\Platform\Speech\SpeechListener;
20-
use Symfony\Component\EventDispatcher\EventDispatcher;
2120

2221
require_once dirname(__DIR__).'/bootstrap.php';
2322

24-
$eventDispatcher = new EventDispatcher();
25-
$eventDispatcher->addSubscriber(new SpeechListener([
26-
'elevenlabs' => ElevenLabsPlatformFactory::create(
23+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
24+
25+
$agent = new Agent($platform, 'gpt-4o', [
26+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
2727
apiKey: env('ELEVEN_LABS_API_KEY'),
2828
httpClient: http_client(),
29-
),
30-
], [
31-
'elevenlabs' => new SpeechConfiguration([
29+
), new SpeechConfiguration([
3230
'stt_model' => 'scribe_v1',
33-
]),
34-
]));
35-
36-
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client(), eventDispatcher: $eventDispatcher);
37-
38-
$agent = new Agent($platform, 'gpt-4o');
31+
])),
32+
]);
3933
$answer = $agent->call(new MessageBag(
4034
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
4135
));

examples/speech/agent-eleven-labs-speech-tts.php

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,35 +10,28 @@
1010
*/
1111

1212
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
1314
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
1415
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
1516
use Symfony\AI\Platform\Message\Message;
1617
use Symfony\AI\Platform\Message\MessageBag;
1718
use Symfony\AI\Platform\Speech\SpeechConfiguration;
18-
use Symfony\AI\Platform\Speech\SpeechListener;
19-
use Symfony\Component\EventDispatcher\EventDispatcher;
2019

2120
require_once dirname(__DIR__).'/bootstrap.php';
2221

23-
$eventDispatcher = new EventDispatcher();
24-
$eventDispatcher->addSubscriber(new SpeechListener([
25-
'elevenlabs' => ElevenLabsPlatformFactory::create(
26-
apiKey: env('ELEVEN_LABS_API_KEY'),
27-
httpClient: http_client(),
28-
),
29-
], [
30-
'elevenlabs' => new SpeechConfiguration([
22+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
23+
24+
$agent = new Agent($platform, 'gpt-4o', outputProcessors: [
25+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
26+
env('ELEVEN_LABS_API_KEY'),
27+
httpClient: http_client()
28+
), new SpeechConfiguration([
3129
'tts_model' => 'eleven_multilingual_v2',
3230
'tts_options' => [
3331
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
3432
],
35-
'stt_model' => 'scribe_v1',
36-
]),
37-
]));
38-
39-
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client(), eventDispatcher: $eventDispatcher);
40-
41-
$agent = new Agent($platform, 'gpt-4o');
33+
])),
34+
]);
4235
$answer = $agent->call(new MessageBag(
4336
Message::ofUser('Tina has one brother and one sister. How many sisters do Tina\'s siblings have?'),
4437
));

src/agent/composer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"phpdocumentor/reflection-docblock": "^5.4",
2929
"phpstan/phpdoc-parser": "^2.1",
3030
"psr/log": "^3.0",
31-
"symfony/ai-platform": "^0.3",
31+
"symfony/ai-platform": "^0.4",
3232
"symfony/clock": "^7.3|^8.0",
3333
"symfony/http-client": "^7.3|^8.0",
3434
"symfony/polyfill-php85": "^1.33",

0 commit comments

Comments
 (0)