// Source: elevenlabs-go/speechtospeech.go (plexusone/elevenlabs-go, main branch).

package elevenlabs

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"net/url"
)

// SpeechToSpeechService handles voice conversion operations.
//
// Its methods build requests against the speech-to-speech endpoints using the
// base URL and API key held by the wrapped Client.
type SpeechToSpeechService struct {
	// client supplies the baseURL and apiKey used to build and authenticate requests.
	client *Client
}

// SpeechToSpeechRequest is a request to convert speech to a different voice.
//
// VoiceID and Audio are required (see Validate); every other field is optional.
type SpeechToSpeechRequest struct {
	// VoiceID is the target voice to convert to. It is placed in the request
	// path, and Validate returns ErrEmptyVoiceID when it is empty.
	VoiceID string

	// Audio is the source audio data to convert. Convert and ConvertStream read
	// it to EOF into an in-memory multipart form before sending.
	Audio io.Reader

	// AudioFilename is the filename reported for the uploaded audio part
	// (optional). When empty, "audio.mp3" is used.
	AudioFilename string

	// ModelID is the model to use. Defaults to "eleven_english_sts_v2" when empty.
	ModelID string

	// VoiceSettings configures the voice parameters. When non-nil it is checked
	// by Validate and sent as individual form fields: stability and
	// similarity_boost always, style only when greater than zero, and
	// use_speaker_boost only when true.
	VoiceSettings *VoiceSettings

	// OutputFormat specifies the audio output format. When non-empty it is sent
	// as the output_format query parameter.
	// Examples: "mp3_44100_128", "pcm_16000", "pcm_22050"
	OutputFormat string

	// RemoveBackgroundNoise requests removal of background noise from the source
	// audio. The remove_background_noise form field is only sent when this is true.
	RemoveBackgroundNoise bool

	// SeedAudio is optional seed audio to influence the conversion. Convert
	// uploads it as the seed_audio form file; ConvertStream does not send it.
	SeedAudio io.Reader

	// SeedAudioFilename is the filename for the seed audio. When empty,
	// "seed.mp3" is used.
	SeedAudioFilename string
}

// Validate validates the speech-to-speech request.
//
// It returns:
//   - an *APIError when the receiver is nil (instead of panicking),
//   - ErrEmptyVoiceID when VoiceID is empty,
//   - an *APIError when Audio is nil,
//   - whatever VoiceSettings.Validate reports when VoiceSettings is set,
//   - nil otherwise.
func (r *SpeechToSpeechRequest) Validate() error {
	// Guard the nil receiver: the field reads below would otherwise panic.
	if r == nil {
		return &APIError{Message: "request is required"}
	}
	if r.VoiceID == "" {
		return ErrEmptyVoiceID
	}
	if r.Audio == nil {
		return &APIError{Message: "audio is required"}
	}
	// Voice settings are optional; only validate them when supplied.
	if r.VoiceSettings != nil {
		if err := r.VoiceSettings.Validate(); err != nil {
			return err
		}
	}
	return nil
}

// SpeechToSpeechResponse contains the converted audio.
type SpeechToSpeechResponse struct {
	// Audio is the converted audio data. Convert and ConvertStream set it to the
	// HTTP response body without closing it, so the stream is read lazily by the
	// caller. The underlying value also implements io.Closer; callers that want
	// to release the connection can type-assert and close it when done.
	Audio io.Reader
}

// Convert converts speech from one voice to another.
//
// The request is validated, then the source audio, model ID, optional voice
// settings, background-noise flag and optional seed audio are written to an
// in-memory multipart form and POSTed to
// {baseURL}/v1/speech-to-speech/{VoiceID}. VoiceID is path-escaped and
// OutputFormat is query-escaped so that neither value can alter the shape of
// the request URL.
//
// On HTTP 200 the response body is returned unread as
// SpeechToSpeechResponse.Audio; it is not closed here, so the caller owns it.
// Any other status yields an *APIError carrying the status code and the
// response body text. A nil req yields an *APIError instead of a panic.
func (s *SpeechToSpeechService) Convert(ctx context.Context, req *SpeechToSpeechRequest) (*SpeechToSpeechResponse, error) {
	if req == nil {
		return nil, &APIError{Message: "request is required"}
	}
	if err := req.Validate(); err != nil {
		return nil, err
	}

	// Build the multipart form in memory; the input readers are consumed to EOF here.
	var buf bytes.Buffer
	writer := multipart.NewWriter(&buf)

	// writeField adds a plain form value and labels any failure with the field name.
	writeField := func(name, value string) error {
		if err := writer.WriteField(name, value); err != nil {
			return fmt.Errorf("failed to write %s: %w", name, err)
		}
		return nil
	}

	// Add audio file
	audioFilename := req.AudioFilename
	if audioFilename == "" {
		audioFilename = "audio.mp3"
	}
	audioWriter, err := writer.CreateFormFile("audio", audioFilename)
	if err != nil {
		return nil, fmt.Errorf("failed to create audio form field: %w", err)
	}
	if _, err := io.Copy(audioWriter, req.Audio); err != nil {
		return nil, fmt.Errorf("failed to write audio: %w", err)
	}

	// Add model ID
	modelID := req.ModelID
	if modelID == "" {
		modelID = "eleven_english_sts_v2"
	}
	if err := writeField("model_id", modelID); err != nil {
		return nil, err
	}

	// Add voice settings if provided. style and use_speaker_boost are only
	// sent when they carry a non-zero value.
	if vs := req.VoiceSettings; vs != nil {
		if err := writeField("stability", fmt.Sprintf("%.2f", vs.Stability)); err != nil {
			return nil, err
		}
		if err := writeField("similarity_boost", fmt.Sprintf("%.2f", vs.SimilarityBoost)); err != nil {
			return nil, err
		}
		if vs.Style > 0 {
			if err := writeField("style", fmt.Sprintf("%.2f", vs.Style)); err != nil {
				return nil, err
			}
		}
		if vs.UseSpeakerBoost {
			if err := writeField("use_speaker_boost", "true"); err != nil {
				return nil, err
			}
		}
	}

	// Add remove background noise option
	if req.RemoveBackgroundNoise {
		if err := writeField("remove_background_noise", "true"); err != nil {
			return nil, err
		}
	}

	// Add seed audio if provided
	if req.SeedAudio != nil {
		seedFilename := req.SeedAudioFilename
		if seedFilename == "" {
			seedFilename = "seed.mp3"
		}
		seedWriter, err := writer.CreateFormFile("seed_audio", seedFilename)
		if err != nil {
			return nil, fmt.Errorf("failed to create seed_audio form field: %w", err)
		}
		if _, err := io.Copy(seedWriter, req.SeedAudio); err != nil {
			return nil, fmt.Errorf("failed to write seed audio: %w", err)
		}
	}

	// Close writes the terminating boundary; the body is incomplete without it.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
	}

	// Build URL. Caller-supplied values are escaped so characters such as
	// '/', '?', '#' or '&' cannot change the path or inject query parameters.
	endpoint := fmt.Sprintf("%s/v1/speech-to-speech/%s", s.client.baseURL, url.PathEscape(req.VoiceID))
	if req.OutputFormat != "" {
		endpoint += "?output_format=" + url.QueryEscape(req.OutputFormat)
	}

	// Make request
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, &buf)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	httpReq.Header.Set("Content-Type", writer.FormDataContentType())
	httpReq.Header.Set("xi-api-key", s.client.apiKey)

	resp, err := http.DefaultClient.Do(httpReq) //nolint:gosec // G704: API client; host comes from the configured base URL
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}

	if resp.StatusCode != http.StatusOK {
		defer resp.Body.Close()
		// Best effort: a failed read still reports the status code, just with
		// a shorter (possibly empty) message.
		respBody, _ := io.ReadAll(resp.Body)
		return nil, &APIError{
			StatusCode: resp.StatusCode,
			Message:    string(respBody),
		}
	}

	// Success: hand the open body to the caller so the audio can be streamed.
	return &SpeechToSpeechResponse{Audio: resp.Body}, nil
}

// ConvertStream converts speech with streaming response.
//
// It mirrors Convert but targets
// {baseURL}/v1/speech-to-speech/{VoiceID}/stream. The source audio, model ID,
// optional voice settings and background-noise flag are sent as an in-memory
// multipart form. SeedAudio and SeedAudioFilename are not sent by this method.
// VoiceID is path-escaped and OutputFormat is query-escaped so that neither
// value can alter the shape of the request URL.
//
// On HTTP 200 the response body is returned unread as
// SpeechToSpeechResponse.Audio; it is not closed here, so the caller owns it.
// Any other status yields an *APIError carrying the status code and the
// response body text. A nil req yields an *APIError instead of a panic.
func (s *SpeechToSpeechService) ConvertStream(ctx context.Context, req *SpeechToSpeechRequest) (*SpeechToSpeechResponse, error) {
	if req == nil {
		return nil, &APIError{Message: "request is required"}
	}
	if err := req.Validate(); err != nil {
		return nil, err
	}

	// Build the multipart form in memory; the audio reader is consumed to EOF here.
	var buf bytes.Buffer
	writer := multipart.NewWriter(&buf)

	// writeField adds a plain form value and labels any failure with the field name.
	writeField := func(name, value string) error {
		if err := writer.WriteField(name, value); err != nil {
			return fmt.Errorf("failed to write %s: %w", name, err)
		}
		return nil
	}

	// Add audio file
	audioFilename := req.AudioFilename
	if audioFilename == "" {
		audioFilename = "audio.mp3"
	}
	audioWriter, err := writer.CreateFormFile("audio", audioFilename)
	if err != nil {
		return nil, fmt.Errorf("failed to create audio form field: %w", err)
	}
	if _, err := io.Copy(audioWriter, req.Audio); err != nil {
		return nil, fmt.Errorf("failed to write audio: %w", err)
	}

	// Add model ID
	modelID := req.ModelID
	if modelID == "" {
		modelID = "eleven_english_sts_v2"
	}
	if err := writeField("model_id", modelID); err != nil {
		return nil, err
	}

	// Add voice settings if provided. style and use_speaker_boost are only
	// sent when they carry a non-zero value.
	if vs := req.VoiceSettings; vs != nil {
		if err := writeField("stability", fmt.Sprintf("%.2f", vs.Stability)); err != nil {
			return nil, err
		}
		if err := writeField("similarity_boost", fmt.Sprintf("%.2f", vs.SimilarityBoost)); err != nil {
			return nil, err
		}
		if vs.Style > 0 {
			if err := writeField("style", fmt.Sprintf("%.2f", vs.Style)); err != nil {
				return nil, err
			}
		}
		if vs.UseSpeakerBoost {
			if err := writeField("use_speaker_boost", "true"); err != nil {
				return nil, err
			}
		}
	}

	// Add remove background noise option
	if req.RemoveBackgroundNoise {
		if err := writeField("remove_background_noise", "true"); err != nil {
			return nil, err
		}
	}

	// Close writes the terminating boundary; the body is incomplete without it.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
	}

	// Build URL for streaming endpoint. Caller-supplied values are escaped so
	// characters such as '/', '?', '#' or '&' cannot change the path or inject
	// query parameters.
	endpoint := fmt.Sprintf("%s/v1/speech-to-speech/%s/stream", s.client.baseURL, url.PathEscape(req.VoiceID))
	if req.OutputFormat != "" {
		endpoint += "?output_format=" + url.QueryEscape(req.OutputFormat)
	}

	// Make request
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, &buf)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	httpReq.Header.Set("Content-Type", writer.FormDataContentType())
	httpReq.Header.Set("xi-api-key", s.client.apiKey)

	resp, err := http.DefaultClient.Do(httpReq) //nolint:gosec // G704: API client; host comes from the configured base URL
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}

	if resp.StatusCode != http.StatusOK {
		defer resp.Body.Close()
		// Best effort: a failed read still reports the status code, just with
		// a shorter (possibly empty) message.
		respBody, _ := io.ReadAll(resp.Body)
		return nil, &APIError{
			StatusCode: resp.StatusCode,
			Message:    string(respBody),
		}
	}

	// Success: hand the open body to the caller so the audio can be streamed.
	return &SpeechToSpeechResponse{Audio: resp.Body}, nil
}

// Simple performs a basic voice conversion with default voice settings.
//
// It wraps Convert with a request containing only the target voice, the source
// audio and DefaultVoiceSettings(), and returns the converted audio stream
// from the response. Any error from Convert is returned unchanged.
func (s *SpeechToSpeechService) Simple(ctx context.Context, voiceID string, audio io.Reader) (io.Reader, error) {
	request := &SpeechToSpeechRequest{
		VoiceID:       voiceID,
		Audio:         audio,
		VoiceSettings: DefaultVoiceSettings(),
	}

	converted, err := s.Convert(ctx, request)
	if err != nil {
		return nil, err
	}

	return converted.Audio, nil
}