diff --git a/src/Providers/Gemini/Gemini.php b/src/Providers/Gemini/Gemini.php index 2e31640ac..4552759b1 100644 --- a/src/Providers/Gemini/Gemini.php +++ b/src/Providers/Gemini/Gemini.php @@ -5,30 +5,33 @@ namespace Prism\Prism\Providers\Gemini; use Generator; -use Illuminate\Http\Client\PendingRequest; -use Illuminate\Http\Client\RequestException; -use Prism\Prism\Concerns\InitializesClient; use Prism\Prism\Contracts\Message; -use Prism\Prism\Embeddings\Request as EmbeddingRequest; -use Prism\Prism\Embeddings\Response as EmbeddingResponse; +use Prism\Prism\Providers\Provider; +use Prism\Prism\Audio\AudioResponse as TextToSpeechResponse; +use Illuminate\Http\Client\PendingRequest; +use Prism\Prism\Audio\TextToSpeechRequest; use Prism\Prism\Exceptions\PrismException; -use Prism\Prism\Exceptions\PrismProviderOverloadedException; -use Prism\Prism\Exceptions\PrismRateLimitedException; +use Prism\Prism\Concerns\InitializesClient; +use Illuminate\Http\Client\RequestException; +use Prism\Prism\Text\Request as TextRequest; +use Prism\Prism\Text\Response as TextResponse; +use Prism\Prism\Providers\Gemini\Handlers\Text; use Prism\Prism\Images\Request as ImagesRequest; -use Prism\Prism\Images\Response as ImagesResponse; +use Prism\Prism\Providers\Gemini\Handlers\Audio; use Prism\Prism\Providers\Gemini\Handlers\Cache; -use Prism\Prism\Providers\Gemini\Handlers\Embeddings; use Prism\Prism\Providers\Gemini\Handlers\Images; use Prism\Prism\Providers\Gemini\Handlers\Stream; +use Prism\Prism\Images\Response as ImagesResponse; +use Prism\Prism\ValueObjects\Messages\SystemMessage; +use Prism\Prism\Exceptions\PrismRateLimitedException; +use Prism\Prism\Providers\Gemini\Handlers\Embeddings; use Prism\Prism\Providers\Gemini\Handlers\Structured; -use Prism\Prism\Providers\Gemini\Handlers\Text; -use Prism\Prism\Providers\Gemini\ValueObjects\GeminiCachedObject; -use Prism\Prism\Providers\Provider; +use Prism\Prism\Embeddings\Request as EmbeddingRequest; use Prism\Prism\Structured\Request as StructuredRequest; +use Prism\Prism\Embeddings\Response as EmbeddingResponse; use Prism\Prism\Structured\Response as StructuredResponse; -use Prism\Prism\Text\Request as TextRequest; -use Prism\Prism\Text\Response as TextResponse; -use Prism\Prism\ValueObjects\Messages\SystemMessage; +use Prism\Prism\Exceptions\PrismProviderOverloadedException; +use Prism\Prism\Providers\Gemini\ValueObjects\GeminiCachedObject; class Gemini extends Provider { @@ -83,6 +86,17 @@ public function images(ImagesRequest $request): ImagesResponse return $handler->handle($request); } + #[\Override] + public function textToSpeech(TextToSpeechRequest $request): TextToSpeechResponse + { + $handler = new Audio($this->client( + $request->clientOptions(), + $request->clientRetry() + )); + + return $handler->handleTextToSpeech($request); + } + #[\Override] public function stream(TextRequest $request): Generator { diff --git a/src/Providers/Gemini/Handlers/Audio.php b/src/Providers/Gemini/Handlers/Audio.php new file mode 100644 index 000000000..60e673b3b --- /dev/null +++ b/src/Providers/Gemini/Handlers/Audio.php @@ -0,0 +1,39 @@ +client->post("{$request->model()}:generateContent", $mapper->toPayload()); + + if (! $response->successful()) { + throw new Exception('Failed to generate audio: '.$response->body()); + } + + $data = $response->json(); + + $base64Audio = $data['candidates'][0]['content']['parts'][0]['inlineData']['data'] + ?? throw new Exception('No audio data returned from TTS API'); + + return new AudioResponse( + audio: new GeneratedAudio( + base64: $base64Audio, + ), + ); + } +} \ No newline at end of file diff --git a/src/Providers/Gemini/Handlers/TextToSpeechRequestMapper.php b/src/Providers/Gemini/Handlers/TextToSpeechRequestMapper.php new file mode 100644 index 000000000..abe57c08b --- /dev/null +++ b/src/Providers/Gemini/Handlers/TextToSpeechRequestMapper.php @@ -0,0 +1,122 @@ + + */ + public function toPayload(): array + { + $providerOptions = $this->request->providerOptions(); + + $contents = [ + 'parts' => [ + [ + 'text' => $this->request->input(), + ], + ], + ]; + + $baseData = [ + 'model' => $this->request->model(), + 'contents' => [$contents], + ]; + + $speechConfig = $this->buildSpeechConfig($providerOptions); + + $generationConfig = Arr::whereNotNull([ + 'responseModalities' => $providerOptions['responseModalities'] ?? ['AUDIO'], + 'speechConfig' => $speechConfig !== [] ? $speechConfig : null, + ]); + + $supportedOptions = Arr::whereNotNull([ + 'generationConfig' => $generationConfig !== [] ? $generationConfig : null, + ]); + + return array_merge( + $baseData, + $supportedOptions, + ); + } + + + /** + * @param array $providerOptions + * @return array + */ + protected function buildSpeechConfig(array $providerOptions): array + { + if (isset($providerOptions['multiSpeaker']) && is_array($providerOptions['multiSpeaker'])) { + $multiSpeakerConfig = $this->buildMultiSpeakerConfig($providerOptions['multiSpeaker']); + + if ($multiSpeakerConfig !== []) { + return $multiSpeakerConfig; + } + } + + if ($this->request->voice()) { + return $this->buildSingleVoiceConfig($this->request->voice()); + } + + return []; + } + + /** + * @return array>> + */ + protected function buildSingleVoiceConfig(string $voiceName): array + { + return [ + 'voiceConfig' => [ + 'prebuiltVoiceConfig' => [ + 'voiceName' => $voiceName, + ], + ], + ]; + } + + /** + * @param array $speakers + * @return array + */ + protected function buildMultiSpeakerConfig(array $speakers): array + { + $speakerVoiceConfigs = []; + + foreach ($speakers as $speaker) { + if (!isset($speaker['speaker']) || !isset($speaker['voiceName'])) { + continue; + } + + $speakerVoiceConfigs[] = [ + 'speaker' => $speaker['speaker'], + 'voiceConfig' => [ + 'prebuiltVoiceConfig' => [ + 'voiceName' => $speaker['voiceName'], + ], + ], + ]; + } + + return $speakerVoiceConfigs !== [] ? [ + 'multiSpeakerVoiceConfig' => [ + 'speakerVoiceConfigs' => $speakerVoiceConfigs, + ], + ] : []; + } + + protected function provider(): string|Provider + { + return Provider::Gemini; + } +} \ No newline at end of file diff --git a/tests/Fixtures/gemini/tts-flash-1-1.json b/tests/Fixtures/gemini/tts-flash-1-1.json new file mode 100644 index 000000000..af63a966e --- /dev/null +++ b/tests/Fixtures/gemini/tts-flash-1-1.json @@ -0,0 +1,16 @@ +{ + "candidates": [ + { + "content": { + "parts": [ + { + "inlineData": { + "mimeType": "audio/wav", + "data": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/Fixtures/gemini/tts-flash-multi-speaker-1.json b/tests/Fixtures/gemini/tts-flash-multi-speaker-1.json new file mode 100644 index 000000000..af63a966e --- /dev/null +++ b/tests/Fixtures/gemini/tts-flash-multi-speaker-1.json @@ -0,0 +1,16 @@ +{ + "candidates": [ + { + "content": { + "parts": [ + { + "inlineData": { + "mimeType": "audio/wav", + "data": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/Fixtures/gemini/tts-flash-voice-option-1.json b/tests/Fixtures/gemini/tts-flash-voice-option-1.json new file mode 100644 index 000000000..af63a966e --- /dev/null +++ b/tests/Fixtures/gemini/tts-flash-voice-option-1.json @@ -0,0 +1,16 @@ +{ + "candidates": [ + { + "content": { + "parts": [ + { + "inlineData": { + "mimeType": "audio/wav", + "data": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/Fixtures/gemini/tts-pro-1-1.json b/tests/Fixtures/gemini/tts-pro-1-1.json new file mode 100644 index 000000000..af63a966e --- /dev/null +++ b/tests/Fixtures/gemini/tts-pro-1-1.json @@ -0,0 +1,16 @@ +{ + "candidates": [ + { + "content": { + "parts": [ + { + "inlineData": { + "mimeType": "audio/wav", + "data": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/Fixtures/gemini/tts-pro-multi-speaker-1.json b/tests/Fixtures/gemini/tts-pro-multi-speaker-1.json new file mode 100644 index 000000000..af63a966e --- /dev/null +++ b/tests/Fixtures/gemini/tts-pro-multi-speaker-1.json @@ -0,0 +1,16 @@ +{ + "candidates": [ + { + "content": { + "parts": [ + { + "inlineData": { + "mimeType": "audio/wav", + "data": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/Fixtures/gemini/tts-pro-voice-option-1.json b/tests/Fixtures/gemini/tts-pro-voice-option-1.json new file mode 100644 index 000000000..af63a966e --- /dev/null +++ b/tests/Fixtures/gemini/tts-pro-voice-option-1.json @@ -0,0 +1,16 @@ +{ + "candidates": [ + { + "content": { + "parts": [ + { + "inlineData": { + "mimeType": "audio/wav", + "data": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/Providers/Gemini/AudioTest.php b/tests/Providers/Gemini/AudioTest.php new file mode 100644 index 000000000..2819a6b89 --- /dev/null +++ b/tests/Providers/Gemini/AudioTest.php @@ -0,0 +1,363 @@ +set('prism.providers.gemini.api_key', env('GEMINI_API_KEY', 'test-api-key')); +}); + +describe('Text-to-Speech', function (): void { + it('can generate audio with gemini-2.5-flash-preview-tts model', function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-flash-preview-tts:generateContent', + 'gemini/tts-flash-1' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-flash-preview-tts') + ->withInput('Hello, world!') + ->withVoice('Kore') + ->asAudio(); + + expect($response->audio)->not->toBeNull(); + expect($response->audio->hasBase64())->toBeTrue(); + expect($response->audio->base64)->not->toBeEmpty(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + return $request->url() === 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent' && + $data['model'] === 'gemini-2.5-flash-preview-tts' && + $data['contents'][0]['parts'][0]['text'] === 'Hello, world!'; + }); + }); + + it('can generate audio with gemini-2.5-pro-preview-tts model', function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-pro-preview-tts:generateContent', + 'gemini/tts-pro-1' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-pro-preview-tts') + ->withInput('Hello, world!') + ->withVoice('Kore') + ->asAudio(); + + expect($response->audio)->not->toBeNull(); + expect($response->audio->hasBase64())->toBeTrue(); + expect($response->audio->base64)->not->toBeEmpty(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + return $request->url() === 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro-preview-tts:generateContent' && + $data['model'] === 'gemini-2.5-pro-preview-tts' && + $data['contents'][0]['parts'][0]['text'] === 'Hello, world!'; + }); + }); + + it("supports different voice options for gemini-2.5-pro-preview-tts model", function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-pro-preview-tts:generateContent', + 'gemini/tts-pro-voice-option' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-pro-preview-tts') + ->withInput('Hello, world!') + ->withVoice('Enceladus') + ->asAudio(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + return $data['generationConfig']['speechConfig']['voiceConfig']['prebuiltVoiceConfig']['voiceName'] === 'Enceladus'; + }); + }); + + it("supports different voice options for gemini-2.5-flash-preview-tts model", function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-flash-preview-tts:generateContent', + 'gemini/tts-flash-voice-option' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-flash-preview-tts') + ->withInput('Hello, world!') + ->withVoice('Enceladus') + ->asAudio(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + return $data['generationConfig']['speechConfig']['voiceConfig']['prebuiltVoiceConfig']['voiceName'] === 'Enceladus'; + }); + }); + + it("supports multi-speaker voice configuration for gemini-2.5-pro-preview-tts model", function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-pro-preview-tts:generateContent', + 'gemini/tts-pro-voice-option' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-pro-preview-tts') + ->withInput('TTS the following conversation between Joe and Jane: + Joe: Hows it going today Jane? + Jane: Not too bad, how about you?') + ->withVoice('Enceladus') + ->withProviderOptions([ + 'multiSpeaker' => [ + [ + 'speaker' => 'Joe', + 'voiceName' => 'Kore', + ], + [ + 'speaker' => 'Jane', + 'voiceName' => 'Puck', + ], + ], + ]) + ->asAudio(); + + expect($response->audio)->not->toBeNull(); + expect($response->audio->hasBase64())->toBeTrue(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + if (!isset($data['generationConfig']['speechConfig']['multiSpeakerVoiceConfig'])) { + return false; + } + + $speakerConfigs = $data['generationConfig']['speechConfig']['multiSpeakerVoiceConfig']['speakerVoiceConfigs']; + + if (count($speakerConfigs) !== 2) { + return false; + } + + $joe = collect($speakerConfigs)->firstWhere('speaker', 'Joe'); + if (!$joe || $joe['voiceConfig']['prebuiltVoiceConfig']['voiceName'] !== 'Kore') { + return false; + } + + $jane = collect($speakerConfigs)->firstWhere('speaker', 'Jane'); + if (!$jane || $jane['voiceConfig']['prebuiltVoiceConfig']['voiceName'] !== 'Puck') { + return false; + } + + return true; + }); + }); + + it("supports multi-speaker voice configuration for gemini-2.5-flash-preview-tts model", function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-flash-preview-tts:generateContent', + 'gemini/tts-flash-voice-option' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-flash-preview-tts') + ->withInput('TTS the following conversation between Joe and Jane: + Joe: Hows it going today Jane? + Jane: Not too bad, how about you?') + ->withVoice('Enceladus') + ->withProviderOptions([ + 'multiSpeaker' => [ + [ + 'speaker' => 'Joe', + 'voiceName' => 'Kore', + ], + [ + 'speaker' => 'Jane', + 'voiceName' => 'Puck', + ], + ], + ]) + ->asAudio(); + + expect($response->audio)->not->toBeNull(); + expect($response->audio->hasBase64())->toBeTrue(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + if (!isset($data['generationConfig']['speechConfig']['multiSpeakerVoiceConfig'])) { + return false; + } + + $speakerConfigs = $data['generationConfig']['speechConfig']['multiSpeakerVoiceConfig']['speakerVoiceConfigs']; + + if (count($speakerConfigs) !== 2) { + return false; + } + + $joe = collect($speakerConfigs)->firstWhere('speaker', 'Joe'); + if (!$joe || $joe['voiceConfig']['prebuiltVoiceConfig']['voiceName'] !== 'Kore') { + return false; + } + + $jane = collect($speakerConfigs)->firstWhere('speaker', 'Jane'); + if (!$jane || $jane['voiceConfig']['prebuiltVoiceConfig']['voiceName'] !== 'Puck') { + return false; + } + + return true; + }); + }); + + it("prioritizes multi-speaker config over single voice for gemini-2.5-pro-preview-tts model", function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-pro-preview-tts:generateContent', + 'gemini/tts-pro-multi-speaker' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-pro-preview-tts') + ->withInput('Conversation test') + ->withVoice('Enceladus') + ->withProviderOptions([ + 'multiSpeaker' => [ + [ + 'speaker' => 'Speaker1', + 'voiceName' => 'Kore', + ], + ], + ]) + ->asAudio(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + return isset($data['generationConfig']['speechConfig']['multiSpeakerVoiceConfig']) && + !isset($data['generationConfig']['speechConfig']['voiceConfig']); + }); + }); + + it("prioritizes multi-speaker config over single voice for gemini-2.5-flash-preview-tts model", function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-flash-preview-tts:generateContent', + 'gemini/tts-flash-multi-speaker' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-flash-preview-tts') + ->withInput('Conversation test') + ->withVoice('Enceladus') + ->withProviderOptions([ + 'multiSpeaker' => [ + [ + 'speaker' => 'Speaker1', + 'voiceName' => 'Kore', + ], + ], + ]) + ->asAudio(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + return isset($data['generationConfig']['speechConfig']['multiSpeakerVoiceConfig']) && + !isset($data['generationConfig']['speechConfig']['voiceConfig']); + }); + }); + + + it("handles invalid multi-speaker configurations gracefully for gemini-2.5-pro-preview-tts model", function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-pro-preview-tts:generateContent', + 'gemini/tts-pro-multi-speaker' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-pro-preview-tts') + ->withInput('Test') + ->withVoice('Enceladus') + ->withProviderOptions([ + 'multiSpeaker' => [ + ['speaker' => 'Joe'], + ['voiceName' => 'Kore'], + ], + ]) + ->asAudio(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + return isset($data['generationConfig']['speechConfig']['voiceConfig']) && + $data['generationConfig']['speechConfig']['voiceConfig']['prebuiltVoiceConfig']['voiceName'] === 'Enceladus'; + }); + }); + + it("handles invalid multi-speaker configurations gracefully for gemini-2.5-flash-preview-tts model", function (): void { + FixtureResponse::fakeResponseSequence( + '/gemini-2.5-flash-preview-tts:generateContent', + 'gemini/tts-flash-multi-speaker' + ); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-flash-preview-tts') + ->withInput('Test') + ->withVoice('Enceladus') + ->withProviderOptions([ + 'multiSpeaker' => [ + ['speaker' => 'Joe'], + ['voiceName' => 'Kore'], + ], + ]) + ->asAudio(); + + Http::assertSent(function (Request $request): bool { + $data = $request->data(); + + return isset($data['generationConfig']['speechConfig']['voiceConfig']) && + $data['generationConfig']['speechConfig']['voiceConfig']['prebuiltVoiceConfig']['voiceName'] === 'Enceladus'; + }); + }); +}); + + +describe('GeneratedAudio Value Object', function (): void { + it('can check if audio has base64 data', function (): void { + Http::fake([ + 'generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro-preview-tts:generateContent' => Http::response( + [ + 'candidates' => [ + [ + 'content' => [ + 'parts' => [ + [ + 'inlineData' => [ + 'mimeType' => 'audio/wav', + 'data' => 'UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=' + ] + ] + ] + ] + ] + ] + ], + 200, + ['Content-Type' => 'application/json'] + ), + ]); + + $response = Prism::audio() + ->using(Provider::Gemini, 'gemini-2.5-pro-preview-tts') + ->withInput('Test audio generation') + ->withVoice('Enceladus') + ->asAudio(); + + expect($response->audio->hasBase64())->toBeTrue(); + }); +}); \ No newline at end of file