Skip to content

Commit 288796e

Browse files
committed
feat: Implement ElevenLabs Text-to-Speech
This commit introduces support for ElevenLabs Text-to-Speech (TTS) service within the Spring AI framework. **Key Changes:** - **New Model Module:** Added `spring-ai-elevenlabs` module for ElevenLabs integration. - **Core Classes:** - `ElevenLabsTextToSpeechModel`: Implements `TextToSpeechModel` and `StreamingTextToSpeechModel` for interacting with the ElevenLabs API. - `ElevenLabsTextToSpeechOptions`: Configuration options for the ElevenLabs TTS service. - `ElevenLabsApi`: Low-level client for interacting with the ElevenLabs API. - `ElevenLabsVoicesApi`: Client for the ElevenLabs Voices API. - `Speech`, `TextToSpeechMessage`, `TextToSpeechPrompt`, `TextToSpeechResponse`: Data transfer objects. - **Auto-configuration:** - `ElevenLabsAutoConfiguration`: Spring Boot auto-configuration for easy setup. - `ElevenLabsConnectionProperties`: Configuration properties for ElevenLabs connection. - `ElevenLabsSpeechProperties`: Configuration properties for default TTS settings. - **API Clients:** Provides `ElevenLabsApi` for direct interaction with the ElevenLabs API. Also provides an `ElevenLabsVoicesApi`. - **Tests:** Includes comprehensive unit and integration tests. - **Documentation:** Added documentation to the Spring AI reference guide, including examples. **Functionality:** - **Text-to-Speech Conversion:** Allows users to convert text input into audio using ElevenLabs' high-quality voices. - **Streaming Support:** Supports real-time audio streaming, enabling immediate playback as audio is generated. - **Configurable Options:** Provides flexible configuration options for voice selection, output format, speed, stability, and more. - **Spring Boot Starter:** Includes a Spring Boot starter (`spring-ai-elevenlabs-spring-boot-starter`) for simplified dependency management and auto-configuration. **Notes:** - The classes defined in the `tts` package will be moved to the core package, along with any refactoring needed to support the OpenAI speech API. 
Signed-off-by: Alexandros Pappas <[email protected]>
1 parent 6b25b62 commit 288796e

File tree

34 files changed

+5216
-2
lines changed

34 files changed

+5216
-2
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Spring AI - ElevenLabs Text-to-Speech
2+
3+
[ElevenLabs Text-to-Speech Documentation](https://docs.spring.io/spring-ai/reference/api/audio/speech/elevenlabs-speech.html)
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Spring AI ElevenLabs Text-to-Speech model module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.ai</groupId>
        <artifactId>spring-ai</artifactId>
        <version>1.0.0-SNAPSHOT</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>

    <artifactId>spring-ai-elevenlabs</artifactId>
    <packaging>jar</packaging>
    <name>Spring AI Model - ElevenLabs</name>
    <description>ElevenLabs Text-to-Speech model support</description>
    <url>https://github.com/spring-projects/spring-ai</url>

    <scm>
        <url>https://github.com/spring-projects/spring-ai</url>
        <connection>git://github.com/spring-projects/spring-ai.git</connection>
        <developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
    </scm>

    <properties>
        <!-- ElevenLabs-specific properties here, if needed -->
    </properties>

    <dependencies>

        <!-- production dependencies -->
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-core</artifactId>
            <version>${project.parent.version}</version>
        </dependency>

        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-retry</artifactId>
            <version>${project.parent.version}</version>
        </dependency>

        <dependency>
            <groupId>io.rest-assured</groupId>
            <artifactId>json-path</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </dependency>

        <!-- test dependencies -->
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-test</artifactId>
            <version>${project.version}</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>io.micrometer</groupId>
            <artifactId>micrometer-observation-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!--
            No explicit version: like the other third-party dependencies in this
            module, the version is expected to come from the inherited dependency
            management so that all Jackson artifacts on the test classpath stay on
            the same release line.
            NOTE(review): confirm the parent manages jackson-dataformat-xml.
        -->
        <dependency>
            <groupId>com.fasterxml.jackson.dataformat</groupId>
            <artifactId>jackson-dataformat-xml</artifactId>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>io.projectreactor</groupId>
            <artifactId>reactor-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
/*
2+
* Copyright 2025-2025 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.springframework.ai.elevenlabs;
18+
19+
import java.util.List;
20+
21+
import org.slf4j.Logger;
22+
import org.slf4j.LoggerFactory;
23+
import reactor.core.publisher.Flux;
24+
25+
import org.springframework.ai.elevenlabs.api.ElevenLabsApi;
26+
import org.springframework.ai.elevenlabs.tts.Speech;
27+
import org.springframework.ai.elevenlabs.tts.StreamingTextToSpeechModel;
28+
import org.springframework.ai.elevenlabs.tts.TextToSpeechModel;
29+
import org.springframework.ai.elevenlabs.tts.TextToSpeechPrompt;
30+
import org.springframework.ai.elevenlabs.tts.TextToSpeechResponse;
31+
import org.springframework.ai.retry.RetryUtils;
32+
import org.springframework.retry.support.RetryTemplate;
33+
import org.springframework.util.Assert;
34+
import org.springframework.util.LinkedMultiValueMap;
35+
import org.springframework.util.MultiValueMap;
36+
37+
/**
38+
* Implementation of the {@link TextToSpeechModel} and {@link StreamingTextToSpeechModel}
39+
* interfaces
40+
*
41+
* @author Alexandros Pappas
42+
*/
43+
public class ElevenLabsTextToSpeechModel implements TextToSpeechModel, StreamingTextToSpeechModel {
44+
45+
private final Logger logger = LoggerFactory.getLogger(getClass());
46+
47+
private final ElevenLabsApi elevenLabsApi;
48+
49+
private final RetryTemplate retryTemplate;
50+
51+
private final ElevenLabsTextToSpeechOptions defaultOptions;
52+
53+
public ElevenLabsTextToSpeechModel(ElevenLabsApi elevenLabsApi, ElevenLabsTextToSpeechOptions defaultOptions) {
54+
this(elevenLabsApi, defaultOptions, RetryUtils.DEFAULT_RETRY_TEMPLATE);
55+
}
56+
57+
public ElevenLabsTextToSpeechModel(ElevenLabsApi elevenLabsApi, ElevenLabsTextToSpeechOptions defaultOptions,
58+
RetryTemplate retryTemplate) {
59+
Assert.notNull(elevenLabsApi, "ElevenLabsApi must not be null");
60+
Assert.notNull(defaultOptions, "ElevenLabsSpeechOptions must not be null");
61+
Assert.notNull(retryTemplate, "RetryTemplate must not be null");
62+
63+
this.elevenLabsApi = elevenLabsApi;
64+
this.defaultOptions = defaultOptions;
65+
this.retryTemplate = retryTemplate;
66+
}
67+
68+
public static Builder builder() {
69+
return new Builder();
70+
}
71+
72+
@Override
73+
public TextToSpeechResponse call(TextToSpeechPrompt prompt) {
74+
ElevenLabsApi.SpeechRequest request = createRequest(prompt);
75+
String voiceId = getOptions(prompt).getVoice();
76+
77+
MultiValueMap<String, String> queryParameters = new LinkedMultiValueMap<>();
78+
if (getOptions(prompt).getEnableLogging() != null) {
79+
queryParameters.add("enable_logging", getOptions(prompt).getEnableLogging().toString());
80+
}
81+
if (getOptions(prompt).getFormat() != null) {
82+
queryParameters.add("output_format", getOptions(prompt).getFormat());
83+
}
84+
85+
byte[] audioData = retryTemplate.execute(context -> {
86+
var response = elevenLabsApi.textToSpeech(request, voiceId, queryParameters);
87+
if (response.getBody() == null) {
88+
logger.warn("No speech response returned for request: {}", request);
89+
return new byte[0];
90+
}
91+
return response.getBody();
92+
});
93+
94+
return new TextToSpeechResponse(List.of(new Speech(audioData)));
95+
}
96+
97+
@Override
98+
public Flux<TextToSpeechResponse> stream(TextToSpeechPrompt prompt) {
99+
ElevenLabsApi.SpeechRequest request = createRequest(prompt);
100+
String voiceId = getOptions(prompt).getVoice();
101+
102+
MultiValueMap<String, String> queryParameters = new LinkedMultiValueMap<>();
103+
if (getOptions(prompt).getEnableLogging() != null) {
104+
queryParameters.add("enable_logging", getOptions(prompt).getEnableLogging().toString());
105+
}
106+
if (getOptions(prompt).getFormat() != null) {
107+
queryParameters.add("output_format", getOptions(prompt).getFormat());
108+
}
109+
110+
return retryTemplate.execute(context -> elevenLabsApi.textToSpeechStream(request, voiceId, queryParameters)
111+
.map(entity -> new TextToSpeechResponse(List.of(new Speech(entity.getBody())))));
112+
}
113+
114+
private ElevenLabsApi.SpeechRequest createRequest(TextToSpeechPrompt prompt) {
115+
ElevenLabsTextToSpeechOptions options = getOptions(prompt);
116+
117+
String voiceId = options.getVoice();
118+
Assert.notNull(voiceId, "A voiceId must be specified in the ElevenLabsSpeechOptions.");
119+
120+
String text = prompt.getInstructions().getText();
121+
Assert.hasText(text, "Prompt must contain text to convert to speech.");
122+
123+
return ElevenLabsApi.SpeechRequest.builder()
124+
.text(text)
125+
.modelId(options.getModelId())
126+
.voiceSettings(options.getVoiceSettings())
127+
.languageCode(options.getLanguageCode())
128+
.pronunciationDictionaryLocators(options.getPronunciationDictionaryLocators())
129+
.seed(options.getSeed())
130+
.previousText(options.getPreviousText())
131+
.nextText(options.getNextText())
132+
.previousRequestIds(options.getPreviousRequestIds())
133+
.nextRequestIds(options.getNextRequestIds())
134+
.usePvcAsIvc(options.getUsePvcAsIvc())
135+
.applyTextNormalization(options.getApplyTextNormalization())
136+
.build();
137+
}
138+
139+
private ElevenLabsTextToSpeechOptions getOptions(TextToSpeechPrompt prompt) {
140+
ElevenLabsTextToSpeechOptions runtimeOptions = (prompt
141+
.getOptions() instanceof ElevenLabsTextToSpeechOptions elevenLabsSpeechOptions) ? elevenLabsSpeechOptions
142+
: null;
143+
return (runtimeOptions != null) ? merge(runtimeOptions, this.defaultOptions) : this.defaultOptions;
144+
}
145+
146+
private ElevenLabsTextToSpeechOptions merge(ElevenLabsTextToSpeechOptions runtimeOptions,
147+
ElevenLabsTextToSpeechOptions defaultOptions) {
148+
return ElevenLabsTextToSpeechOptions.builder()
149+
.modelId(getOrDefault(runtimeOptions.getModelId(), defaultOptions.getModelId()))
150+
.voice(getOrDefault(runtimeOptions.getVoice(), defaultOptions.getVoice()))
151+
.voiceId(getOrDefault(runtimeOptions.getVoiceId(), defaultOptions.getVoiceId()))
152+
.format(getOrDefault(runtimeOptions.getFormat(), defaultOptions.getFormat()))
153+
.outputFormat(getOrDefault(runtimeOptions.getOutputFormat(), defaultOptions.getOutputFormat()))
154+
.voiceSettings(getOrDefault(runtimeOptions.getVoiceSettings(), defaultOptions.getVoiceSettings()))
155+
.languageCode(getOrDefault(runtimeOptions.getLanguageCode(), defaultOptions.getLanguageCode()))
156+
.pronunciationDictionaryLocators(getOrDefault(runtimeOptions.getPronunciationDictionaryLocators(),
157+
defaultOptions.getPronunciationDictionaryLocators()))
158+
.seed(getOrDefault(runtimeOptions.getSeed(), defaultOptions.getSeed()))
159+
.previousText(getOrDefault(runtimeOptions.getPreviousText(), defaultOptions.getPreviousText()))
160+
.nextText(getOrDefault(runtimeOptions.getNextText(), defaultOptions.getNextText()))
161+
.previousRequestIds(
162+
getOrDefault(runtimeOptions.getPreviousRequestIds(), defaultOptions.getPreviousRequestIds()))
163+
.nextRequestIds(getOrDefault(runtimeOptions.getNextRequestIds(), defaultOptions.getNextRequestIds()))
164+
.usePvcAsIvc(getOrDefault(runtimeOptions.getUsePvcAsIvc(), defaultOptions.getUsePvcAsIvc()))
165+
.applyTextNormalization(getOrDefault(runtimeOptions.getApplyTextNormalization(),
166+
defaultOptions.getApplyTextNormalization()))
167+
.build();
168+
}
169+
170+
private <T> T getOrDefault(T runtimeValue, T defaultValue) {
171+
return runtimeValue != null ? runtimeValue : defaultValue;
172+
}
173+
174+
@Override
175+
public ElevenLabsTextToSpeechOptions getDefaultOptions() {
176+
return this.defaultOptions;
177+
}
178+
179+
public static class Builder {
180+
181+
private ElevenLabsApi elevenLabsApi;
182+
183+
private RetryTemplate retryTemplate = RetryUtils.DEFAULT_RETRY_TEMPLATE;
184+
185+
private ElevenLabsTextToSpeechOptions defaultOptions = ElevenLabsTextToSpeechOptions.builder().build();
186+
187+
public Builder elevenLabsApi(ElevenLabsApi elevenLabsApi) {
188+
this.elevenLabsApi = elevenLabsApi;
189+
return this;
190+
}
191+
192+
public Builder retryTemplate(RetryTemplate retryTemplate) {
193+
this.retryTemplate = retryTemplate;
194+
return this;
195+
}
196+
197+
public Builder defaultOptions(ElevenLabsTextToSpeechOptions defaultOptions) {
198+
this.defaultOptions = defaultOptions;
199+
return this;
200+
}
201+
202+
public ElevenLabsTextToSpeechModel build() {
203+
Assert.notNull(elevenLabsApi, "ElevenLabsApi must not be null");
204+
Assert.notNull(defaultOptions, "ElevenLabsSpeechOptions must not be null");
205+
return new ElevenLabsTextToSpeechModel(elevenLabsApi, defaultOptions, retryTemplate);
206+
}
207+
208+
}
209+
210+
}

0 commit comments

Comments
 (0)