@@ -731,6 +731,116 @@ Testing a service that uses an agent::
731731The ``MockAgent`` provides all the benefits of traditional mocks while offering a more intuitive API for AI agent testing,
732732making your tests more reliable and easier to maintain.
733733
734+ Speech support
735+ ~~~~~~~~~~~~~~
736+
737+ Using speech to send messages or receive answers as audio is a common use case when integrating agents and/or chats.
738+ This approach allows you to either send audio and expect a text output, or send text and receive audio content.
739+
740+ Another approach is to use STT and TTS together to enable a full audio pipeline. This approach introduces some latency
741+ (as both input and output must be processed) but allows you to create a more natural and "human-like" conversation flow.
742+
743+ Speech support can be enabled using :class:`Symfony\\AI\\Agent\\InputProcessor\\SpeechProcessor` (for `text-to-speech` in this example)::
744+
745+ use Symfony\AI\Agent\Agent;
746+ use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
747+ use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
748+ use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
749+ use Symfony\AI\Platform\Message\Message;
750+ use Symfony\AI\Platform\Message\MessageBag;
751+ use Symfony\AI\Platform\Speech\SpeechConfiguration;
752+ use Symfony\Component\HttpClient\HttpClient;
753+
754+ require_once dirname(__DIR__).'/bootstrap.php';
755+
756+ $platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
757+
758+ $agent = new Agent($platform, 'gpt-4o', outputProcessors: [
759+ new SpeechProcessor(ElevenLabsPlatformFactory::create(
760+ apiKey: 'key',
761+ httpClient: http_client()
762+ ), new SpeechConfiguration([
763+ 'tts_model' => 'eleven_multilingual_v2',
764+ 'tts_options' => [
765+ 'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
766+ ],
767+ ])),
768+ ]);
769+ $answer = $agent->call(new MessageBag(
770+ Message::ofUser('Tina has one brother and one sister. How many sisters do Tina\'s siblings have?'),
771+ ));
772+
773+ echo $answer->getSpeech()->asBinary();
774+
775+ When handling `speech-to-text`, the process is still the same but requires a :class:`Symfony\\AI\\Platform\\Message\\Content\\Audio` as input::
776+
777+ use Symfony\AI\Agent\Agent;
778+ use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
779+ use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
780+ use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
781+ use Symfony\AI\Platform\Message\Content\Audio;
782+ use Symfony\AI\Platform\Message\Message;
783+ use Symfony\AI\Platform\Message\MessageBag;
784+ use Symfony\AI\Platform\Speech\SpeechConfiguration;
785+ use Symfony\Component\HttpClient\HttpClient;
786+
787+ require_once dirname(__DIR__).'/bootstrap.php';
788+
789+ $platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
790+
791+ $agent = new Agent($platform, 'gpt-4o', [
792+ new SpeechProcessor(ElevenLabsPlatformFactory::create(
793+ apiKey: 'key',
794+ httpClient: http_client(),
795+ ), new SpeechConfiguration([
796+ 'stt_model' => 'scribe_v1',
797+ ]))
798+ ]);
799+ $answer = $agent->call(new MessageBag(
800+ Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
801+ ));
802+
803+ echo $answer->getContent();
804+
805+ A "STT / TTS sandwich" can be created using the :class:`Symfony\\AI\\Agent\\InputProcessor\\SpeechProcessor` as both input and output processor::
806+
807+ use Symfony\AI\Agent\Agent;
808+ use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
809+ use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
810+ use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
811+ use Symfony\AI\Platform\Message\Content\Audio;
812+ use Symfony\AI\Platform\Message\Message;
813+ use Symfony\AI\Platform\Message\MessageBag;
814+ use Symfony\AI\Platform\Speech\SpeechConfiguration;
815+ use Symfony\Component\HttpClient\HttpClient;
816+
817+ require_once dirname(__DIR__).'/bootstrap.php';
818+
819+ $platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
820+
821+ $speechProcessor = new SpeechProcessor(ElevenLabsPlatformFactory::create(
822+ apiKey: 'key',
823+ httpClient: http_client(),
824+ ), new SpeechConfiguration([
825+ 'tts_model' => 'eleven_multilingual_v2',
826+ 'tts_options' => [
827+ 'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
828+ ],
829+ 'stt_model' => 'scribe_v1',
830+ ]));
831+
832+ $agent = new Agent($platform, 'gpt-4o', [$speechProcessor], [$speechProcessor]);
833+
834+ $answer = $agent->call(new MessageBag(
835+ Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
836+ ));
837+
838+ echo $answer->getSpeech()->asBinary();
839+
840+ .. note::
841+
842+ Handling both `text-to-speech` and `speech-to-text` introduces latency as most of the process is synchronous.
843+
734844Code Examples
735845~~~~~~~~~~~~~
736846
0 commit comments