Skip to content

Commit d9218f0

Browse files
committed
feat(platform|agent): add support for Speech
1 parent 3fd26c2 commit d9218f0

34 files changed

+1419
-15
lines changed

demo/tests/Blog/Command/StreamCommandTest.php

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,13 @@
1515
use PHPUnit\Framework\TestCase;
1616
use Symfony\AI\Agent\AgentInterface;
1717
use Symfony\AI\Platform\Metadata\Metadata;
18+
use Symfony\AI\Platform\PlainConverter;
19+
use Symfony\AI\Platform\Result\DeferredResult;
20+
use Symfony\AI\Platform\Result\InMemoryRawResult;
1821
use Symfony\AI\Platform\Result\RawResultInterface;
1922
use Symfony\AI\Platform\Result\ResultInterface;
23+
use Symfony\AI\Platform\Result\TextResult;
24+
use Symfony\AI\Platform\Speech\Speech;
2025
use Symfony\Component\Console\Input\ArrayInput;
2126
use Symfony\Component\Console\Output\BufferedOutput;
2227
use Symfony\Component\Console\Style\SymfonyStyle;
@@ -50,6 +55,15 @@ public function getRawResult(): ?RawResultInterface
5055
public function setRawResult(RawResultInterface $rawResult): void
5156
{
5257
}
58+
59+
public function addSpeech(?Speech $speech = null): void
60+
{
61+
}
62+
63+
public function getSpeech(): Speech
64+
{
65+
return new Speech(new DeferredResult(new PlainConverter(new TextResult('foo')), new InMemoryRawResult()));
66+
}
5367
});
5468

5569
$input = new ArrayInput([]);

docs/bundles/ai-bundle.rst

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,40 @@ Chats are defined in the ``chat`` section of your configuration:
11741174
agent: 'ai.agent.youtube'
11751175
message_store: 'ai.message_store.cache.youtube'
11761176
1177+
Speech
1178+
------
1179+
1180+
Speech can be used to create `stt` / `tts` / `sts` pipelines, leading to a more "human-like" conversation flow;
1181+
the main drawback is the latency that it can introduce due to network calls.
1182+
1183+
Configuring speech
1184+
~~~~~~~~~~~~~~~~~~
1185+
1186+
When using the bundle, the configuration allows you to configure models and voices::
1187+
1188+
ai:
1189+
platform:
1190+
elevenlabs:
1191+
api_key: '%env(ELEVEN_LABS_API_KEY)%'
1192+
openai:
1193+
api_key: '%env(OPENAI_API_KEY)%'
1194+
1195+
agent:
1196+
sts_openai:
1197+
platform: ai.platform.openai
1198+
model: gpt-4o
1199+
speech:
1200+
platform: 'ai.platform.elevenlabs'
1201+
tts_model: '%env(ELEVEN_LABS_TTS_MODEL)%'
1202+
tts_options:
1203+
voice: '%env(ELEVEN_LABS_VOICE_IDENTIFIER)%'
1204+
stt_model: '%env(ELEVEN_LABS_STT_MODEL)%'
1205+
1206+
.. note::
1207+
1208+
The current example is built for a "TTS / STT sandwich", a pattern that handles both input and output as audio;
1209+
both STT and TTS can be enabled independently.
1210+
11771211
.. _`Symfony AI Agent`: https://github.com/symfony/ai-agent
11781212
.. _`Symfony AI Chat`: https://github.com/symfony/ai-chat
11791213
.. _`Symfony AI Platform`: https://github.com/symfony/ai-platform

docs/components/agent.rst

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,116 @@ Testing a service that uses an agent::
731731
The ``MockAgent`` provides all the benefits of traditional mocks while offering a more intuitive API for AI agent testing,
732732
making your tests more reliable and easier to maintain.
733733

734+
Speech support
735+
~~~~~~~~~~~~~~
736+
737+
Using speech to send messages / receive answers as audio is a common use case when integrating agents and/or chats,
738+
this approach allows you to either send audio and expect text output, or send text and receive audio content.
739+
740+
Another approach is to use stt / tts together to enable a full audio pipeline; this approach introduces some latency
741+
(as both input/output must be processed) but allows to create a more natural and "human-like" conversation flow.
742+
743+
Speech support can be enabled using :class:`Symfony\\AI\\Agent\\InputProcessor\\SpeechProcessor` (for `text-to-speech` in this example)::
744+
745+
use Symfony\AI\Agent\Agent;
746+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
747+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
748+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
749+
use Symfony\AI\Platform\Message\Message;
750+
use Symfony\AI\Platform\Message\MessageBag;
751+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
752+
use Symfony\Component\HttpClient\HttpClient;
753+
754+
require_once dirname(__DIR__).'/bootstrap.php';
755+
756+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
757+
758+
$agent = new Agent($platform, 'gpt-4o', outputProcessors: [
759+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
760+
apiKey: 'key',
761+
httpClient: http_client()
762+
), new SpeechConfiguration([
763+
'tts_model' => 'eleven_multilingual_v2',
764+
'tts_options' => [
765+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
766+
],
767+
])),
768+
]);
769+
$answer = $agent->call(new MessageBag(
770+
Message::ofUser('Tina has one brother and one sister. How many sisters do Tina\'s siblings have?'),
771+
));
772+
773+
echo $answer->getSpeech()->asBinary();
774+
775+
When handling `speech-to-speech`, the process is still the same but requires a :class:`Symfony\\AI\\Platform\\Message\\Content\\Audio` as an input::
776+
777+
use Symfony\AI\Agent\Agent;
778+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
779+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
780+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
781+
use Symfony\AI\Platform\Message\Content\Audio;
782+
use Symfony\AI\Platform\Message\Message;
783+
use Symfony\AI\Platform\Message\MessageBag;
784+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
785+
use Symfony\Component\HttpClient\HttpClient;
786+
787+
require_once dirname(__DIR__).'/bootstrap.php';
788+
789+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
790+
791+
$agent = new Agent($platform, 'gpt-4o', [
792+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
793+
apiKey: 'key',
794+
httpClient: http_client(),
795+
), new SpeechConfiguration([
796+
'stt_model' => 'scribe_v1',
797+
]))
798+
]);
799+
$answer = $agent->call(new MessageBag(
800+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
801+
));
802+
803+
echo $answer->getContent();
804+
805+
A "STT / TTS sandwich" can be created using the :class:`Symfony\\AI\\Agent\\InputProcessor\\SpeechProcessor` as input and output processor::
806+
807+
use Symfony\AI\Agent\Agent;
808+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
809+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
810+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
811+
use Symfony\AI\Platform\Message\Content\Audio;
812+
use Symfony\AI\Platform\Message\Message;
813+
use Symfony\AI\Platform\Message\MessageBag;
814+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
815+
use Symfony\Component\HttpClient\HttpClient;
816+
817+
require_once dirname(__DIR__).'/bootstrap.php';
818+
819+
$platform = OpenAiPlatformFactory::create('key', httpClient: HttpClient::create());
820+
821+
$speechProcessor = new SpeechProcessor(ElevenLabsPlatformFactory::create(
822+
apiKey: 'key',
823+
httpClient: http_client(),
824+
), new SpeechConfiguration([
825+
'tts_model' => 'eleven_multilingual_v2',
826+
'tts_options' => [
827+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
828+
],
829+
'stt_model' => 'scribe_v1',
830+
]));
831+
832+
$agent = new Agent($platform, 'gpt-4o', [$speechProcessor], [$speechProcessor]);
833+
834+
$answer = $agent->call(new MessageBag(
835+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
836+
));
837+
838+
echo $answer->getSpeech()->asBinary();
839+
840+
.. note::
841+
842+
Handling both `text-to-speech` and `speech-to-text` introduces latency as most of the process is synchronous.
843+
734844
Code Examples
735845
~~~~~~~~~~~~~
736846

examples/speech/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Speech Examples
2+
3+
Speech is mainly used to transform text to audio and vice versa; it can also be used to create an audio-to-audio pipeline.
4+
5+
To run the examples, you can use additional tools such as [mpg123](https://www.mpg123.de/):
6+
7+
```bash
8+
php speech/agent-eleven-labs-speech-tts.php | mpg123 -
9+
php speech/agent-eleven-labs-speech-sts.php | mpg123 -
10+
```
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
14+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
15+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
16+
use Symfony\AI\Platform\Message\Content\Audio;
17+
use Symfony\AI\Platform\Message\Message;
18+
use Symfony\AI\Platform\Message\MessageBag;
19+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
20+
21+
require_once dirname(__DIR__).'/bootstrap.php';
22+
23+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
24+
25+
$speechProcessor = new SpeechProcessor(ElevenLabsPlatformFactory::create(
26+
apiKey: env('ELEVEN_LABS_API_KEY'),
27+
httpClient: http_client(),
28+
), new SpeechConfiguration([
29+
'tts_model' => 'eleven_multilingual_v2',
30+
'tts_options' => [
31+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
32+
],
33+
'stt_model' => 'scribe_v1',
34+
]));
35+
36+
$agent = new Agent($platform, 'gpt-4o', [$speechProcessor], [$speechProcessor]);
37+
38+
$answer = $agent->call(new MessageBag(
39+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
40+
));
41+
42+
echo $answer->getSpeech()->asBinary();
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
14+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
15+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
16+
use Symfony\AI\Platform\Message\Content\Audio;
17+
use Symfony\AI\Platform\Message\Message;
18+
use Symfony\AI\Platform\Message\MessageBag;
19+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
20+
21+
require_once dirname(__DIR__).'/bootstrap.php';
22+
23+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
24+
25+
$agent = new Agent($platform, 'gpt-4o', [
26+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
27+
apiKey: env('ELEVEN_LABS_API_KEY'),
28+
httpClient: http_client(),
29+
), new SpeechConfiguration([
30+
'stt_model' => 'scribe_v1',
31+
])),
32+
]);
33+
$answer = $agent->call(new MessageBag(
34+
Message::ofUser(Audio::fromFile(dirname(__DIR__, 2).'/fixtures/audio.mp3'))
35+
));
36+
37+
echo $answer->getContent();
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Symfony\AI\Agent\Agent;
13+
use Symfony\AI\Agent\InputProcessor\SpeechProcessor;
14+
use Symfony\AI\Platform\Bridge\ElevenLabs\PlatformFactory as ElevenLabsPlatformFactory;
15+
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory as OpenAiPlatformFactory;
16+
use Symfony\AI\Platform\Message\Message;
17+
use Symfony\AI\Platform\Message\MessageBag;
18+
use Symfony\AI\Platform\Speech\SpeechConfiguration;
19+
20+
require_once dirname(__DIR__).'/bootstrap.php';
21+
22+
$platform = OpenAiPlatformFactory::create(env('OPENAI_API_KEY'), httpClient: http_client());
23+
24+
$agent = new Agent($platform, 'gpt-4o', outputProcessors: [
25+
new SpeechProcessor(ElevenLabsPlatformFactory::create(
26+
env('ELEVEN_LABS_API_KEY'),
27+
httpClient: http_client()
28+
), new SpeechConfiguration([
29+
'tts_model' => 'eleven_multilingual_v2',
30+
'tts_options' => [
31+
'voice' => 'Dslrhjl3ZpzrctukrQSN', // Brad (https://elevenlabs.io/app/voice-library?voiceId=Dslrhjl3ZpzrctukrQSN)
32+
],
33+
])),
34+
]);
35+
$answer = $agent->call(new MessageBag(
36+
Message::ofUser('Tina has one brother and one sister. How many sisters do Tina\'s siblings have?'),
37+
));
38+
39+
echo $answer->getSpeech()->asBinary();

src/agent/composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
"phpunit/phpunit": "^11.5.53",
4545
"symfony/ai-store": "^0.5",
4646
"symfony/event-dispatcher": "^7.3|^8.0",
47+
"symfony/options-resolver": "^7.3|^8.0",
4748
"symfony/translation": "^7.3|^8.0",
4849
"symfony/translation-contracts": "^3.6"
4950
},

0 commit comments

Comments
 (0)