ElevenLabs Go SDK

Go SDK for the ElevenLabs API.

Features

🗣️ Text-to-Speech: Convert text to realistic speech with multiple voices and models
📝 Speech-to-Text: Transcribe audio with speaker diarization support
🎙️ Speech-to-Speech: Voice conversion - transform speech to a different voice
🔊 Sound Effects: Generate sound effects from text descriptions
🎨 Voice Design: Create custom AI voices with specific characteristics
🎵 Music Composition: Generate music from text prompts
🎙️ Audio Isolation: Extract vocals/speech from audio
⏱️ Forced Alignment: Get word-level timestamps for audio
💬 Text-to-Dialogue: Generate multi-speaker conversations
🌍 Dubbing: Translate and dub video/audio content
📚 Projects: Manage long-form audio content (audiobooks, podcasts)
📖 Pronunciation Dictionaries: Control pronunciation of specific terms

Real-Time Services

⚡ WebSocket TTS: Low-latency text-to-speech streaming for real-time voice synthesis
⚡ WebSocket STT: Real-time speech-to-text with partial results
📞 Twilio Integration: Phone call integration for conversational AI agents
📱 Phone Numbers: Manage phone numbers for voice agents

Command Line Interface

🖥️ elevenlabs tts: Generate speech from text files with YAML config support
📜 elevenlabs ttsscript: Batch TTS from JSON scripts with per-slide output
🎛️ Presets: Built-in configurations for oratory, podcast, audiobook styles

OmniVoice Integration

🔌 OmniVoice Providers: Use ElevenLabs as a drop-in backend for the vendor-agnostic OmniVoice interface
🔄 Portable Code: Swap voice providers (ElevenLabs, OpenAI, Google) without changing application logic
🧪 TTS, STT, Agent: Full provider implementations for text-to-speech, speech-to-text, and voice agents

Installation

go get github.com/agentplexus/go-elevenlabs

CLI Installation

go install github.com/agentplexus/go-elevenlabs/cmd/elevenlabs@latest

Quick Start

Basic Text-to-Speech

package main

import (
    "context"
    "io"
    "log"
    "os"

    elevenlabs "github.com/agentplexus/go-elevenlabs"
)

// main lists the available voices and, when at least one exists, generates a
// short greeting with the first voice and writes the audio to hello.mp3.
// Any failure along the way is logged and terminates the program.
func main() {
    // Create client (uses ELEVENLABS_API_KEY env var)
    client, err := elevenlabs.NewClient()
    if err != nil {
        log.Fatal(err)
    }

    ctx := context.Background()

    // List available voices
    voices, err := client.Voices().List(ctx)
    if err != nil {
        log.Fatal(err)
    }
    log.Printf("Found %d voices", len(voices))

    // Nothing to synthesize with if no voices are available.
    if len(voices) == 0 {
        return
    }

    // Generate speech
    audio, err := client.TextToSpeech().Simple(ctx,
        voices[0].VoiceID,
        "Hello from the ElevenLabs Go SDK!")
    if err != nil {
        log.Fatal(err)
    }

    // Save to file
    f, err := os.Create("hello.mp3")
    if err != nil {
        log.Fatal(err)
    }
    // log.Fatal exits without running deferred calls, so the file is closed
    // explicitly on each path instead of with defer.
    if _, err := io.Copy(f, audio); err != nil {
        f.Close()
        log.Fatal(err)
    }
    // A failed Close can mean the data was not fully written; report it.
    if err := f.Close(); err != nil {
        log.Fatal(err)
    }
}

With Custom Options

client, err := elevenlabs.NewClient(
    elevenlabs.WithAPIKey("your-api-key"),
    elevenlabs.WithTimeout(5 * time.Minute),
)

Services

Text-to-Speech

// Simple generation
audio, err := client.TextToSpeech().Simple(ctx, voiceID, "Hello world")

// With full options
resp, err := client.TextToSpeech().Generate(ctx, &elevenlabs.TTSRequest{
    VoiceID: "21m00Tcm4TlvDq8ikWAM",
    Text:    "Hello with custom settings!",
    ModelID: "eleven_multilingual_v2",
    VoiceSettings: &elevenlabs.VoiceSettings{
        Stability:       0.6,
        SimilarityBoost: 0.8,
        Style:           0.1,
        SpeakerBoost:    true,
    },
    OutputFormat: "mp3_44100_192",
})

Speech-to-Text

// Transcribe from URL
result, err := client.SpeechToText().TranscribeURL(ctx, "https://example.com/audio.mp3")
fmt.Printf("Text: %s\n", result.Text)
fmt.Printf("Language: %s\n", result.LanguageCode)

// With speaker diarization
result, err := client.SpeechToText().TranscribeWithDiarization(ctx, audioURL)
for _, word := range result.Words {
    fmt.Printf("[%s] %s (%.2fs - %.2fs)\n", word.Speaker, word.Text, word.Start, word.End)
}

Sound Effects

// Simple sound effect
audio, err := client.SoundEffects().Simple(ctx, "thunder and rain storm")

// With options
sfx, err := client.SoundEffects().Generate(ctx, &elevenlabs.SoundEffectRequest{
    Text:            "spaceship engine humming",
    DurationSeconds: 10,
    PromptInfluence: 0.5,
})

Music Composition

// Generate music from prompt
resp, err := client.Music().Generate(ctx, &elevenlabs.MusicRequest{
    Prompt:     "upbeat electronic music for a tech video",
    DurationMs: 30000,
})

// Instrumental only
audio, err := client.Music().GenerateInstrumental(ctx, "calm piano melody", 60000)

// Generate with composition plan for fine-grained control
plan, _ := client.Music().GeneratePlan(ctx, &elevenlabs.CompositionPlanRequest{
    Prompt:     "pop song about summer",
    DurationMs: 180000,
})
resp, err := client.Music().GenerateDetailed(ctx, &elevenlabs.MusicDetailedRequest{
    CompositionPlan: plan,
})

// Separate stems (vocals, drums, bass, etc.)
f, _ := os.Open("song.mp3")
stems, err := client.Music().SeparateStems(ctx, &elevenlabs.StemSeparationRequest{
    File:     f,
    Filename: "song.mp3",
})

Audio Isolation

// Extract vocals from audio file
f, _ := os.Open("mixed_audio.mp3")
isolated, err := client.AudioIsolation().IsolateFile(ctx, f, "mixed_audio.mp3")

Forced Alignment

// Get word-level timestamps
f, _ := os.Open("speech.mp3")
result, err := client.ForcedAlignment().AlignFile(ctx, f, "speech.mp3",
    "The text that was spoken in the audio")

for _, word := range result.Words {
    fmt.Printf("%s: %.2fs - %.2fs\n", word.Text, word.Start, word.End)
}

Text-to-Dialogue

// Generate multi-speaker dialogue
audio, err := client.TextToDialogue().Simple(ctx, []elevenlabs.DialogueInput{
    {Text: "Hello, how are you?", VoiceID: "voice1"},
    {Text: "I'm doing great, thanks!", VoiceID: "voice2"},
})

Voice Design

// Generate a custom voice
resp, err := client.VoiceDesign().GeneratePreview(ctx, &elevenlabs.VoiceDesignRequest{
    Gender:         elevenlabs.VoiceGenderFemale,
    Age:            elevenlabs.VoiceAgeYoung,
    Accent:         elevenlabs.VoiceAccentAmerican,
    AccentStrength: 1.0,
    Text:           "This is a preview of the generated voice. It should be at least one hundred characters long for best results.",
})

Pronunciation Dictionaries

// Create from a map
dict, err := client.Pronunciation().CreateFromMap(ctx, "Tech Terms", map[string]string{
    "API":     "A P I",
    "kubectl": "kube control",
    "nginx":   "engine X",
})

// Create from JSON file
dict, err := client.Pronunciation().CreateFromJSON(ctx, "Terms", "pronunciation.json")

Dubbing

// Create dubbing job
dub, err := client.Dubbing().Create(ctx, &elevenlabs.DubbingRequest{
    SourceURL:      "https://example.com/video.mp4",
    TargetLanguage: "es",
    Name:           "Video - Spanish",
})

// Check status
status, err := client.Dubbing().GetStatus(ctx, dub.DubbingID)

Projects (Studio)

// Create a project for long-form content
project, err := client.Projects().Create(ctx, &elevenlabs.CreateProjectRequest{
    Name:                    "My Audiobook",
    DefaultModelID:          "eleven_multilingual_v2",
    DefaultParagraphVoiceID: voiceID,
})

// Convert to audio
err = client.Projects().Convert(ctx, project.ProjectID)

Speech-to-Speech (Voice Conversion)

// Convert speech from one voice to another
f, _ := os.Open("input.mp3")
resp, err := client.SpeechToSpeech().Convert(ctx, &elevenlabs.SpeechToSpeechRequest{
    VoiceID: targetVoiceID,
    Audio:   f,
})

// Simple conversion
output, err := client.SpeechToSpeech().Simple(ctx, targetVoiceID, audioReader)

WebSocket TTS (Real-Time Streaming)

// Connect for low-latency TTS (ideal for LLM output)
conn, err := client.WebSocketTTS().Connect(ctx, voiceID, &elevenlabs.WebSocketTTSOptions{
    ModelID:                  "eleven_turbo_v2_5",
    OutputFormat:             "pcm_16000",
    OptimizeStreamingLatency: 3,
})
defer conn.Close()

// Stream text as it arrives (e.g., from LLM)
for text := range llmOutputStream {
    conn.SendText(text)
}
conn.Flush()

// Receive audio chunks
for audio := range conn.Audio() {
    // Play or save audio chunks
}

WebSocket STT (Real-Time Transcription)

// Connect for live transcription
conn, err := client.WebSocketSTT().Connect(ctx, &elevenlabs.WebSocketSTTOptions{
    SampleRate:     16000,
    EnablePartials: true,
})
defer conn.Close()

// Send audio chunks
go func() {
    for audioChunk := range microphoneInput {
        conn.SendAudio(audioChunk)
    }
    conn.EndStream()
}()

// Receive transcripts
for transcript := range conn.Transcripts() {
    if transcript.IsFinal {
        fmt.Println("Final:", transcript.Text)
    } else {
        fmt.Println("Partial:", transcript.Text)
    }
}

Twilio Integration (Phone Calls)

// Register incoming Twilio call with an ElevenLabs agent
resp, err := client.Twilio().RegisterCall(ctx, &elevenlabs.TwilioRegisterCallRequest{
    AgentID: "your-agent-id",
})
// Return resp.TwiML to Twilio webhook

// Make outbound call
call, err := client.Twilio().OutboundCall(ctx, &elevenlabs.TwilioOutboundCallRequest{
    AgentID:            "your-agent-id",
    AgentPhoneNumberID: "phone-number-id",
    ToNumber:           "+1234567890",
})

// List phone numbers
numbers, err := client.PhoneNumbers().List(ctx)

Examples

See the examples/ directory for runnable examples:

| Example | Description |
|---------|-------------|
| `basic/` | Common SDK operations |
| `websocket-tts/` | Real-time TTS streaming for LLM integration |
| `websocket-stt/` | Live transcription with partial results |
| `speech-to-speech/` | Voice conversion |
| `twilio/` | Phone call integration with Twilio |
| `ttsscript/` | Multi-voice script authoring |
| `retryhttp/` | Retry-capable HTTP transport |

export ELEVENLABS_API_KEY="your-api-key"
go run examples/basic/main.go

Command Line Interface

The elevenlabs CLI provides text-to-speech generation from the command line.

Basic Usage

# Generate speech from a text file
elevenlabs tts -v <voice-id> speech.txt

# Use a preset (oratory, podcast, audiobook)
elevenlabs tts -v <voice-id> --preset oratory speech.txt

# High-quality PCM output
elevenlabs tts -v <voice-id> -f pcm_48000 -o output.wav speech.txt

# Estimate credits without calling API
elevenlabs tts -v <voice-id> --estimate speech.txt

Configuration Files

Save and reuse TTS settings with YAML config files:

# Use config file
elevenlabs tts --config tts-config.yaml speech.txt

# Save current settings to config
elevenlabs tts -v <voice-id> --preset oratory --save-config my-config.yaml speech.txt

Example config file:

voice_id: IT8nQhZJj9jzRwmC46Ko
model_id: eleven_v3
output_format: pcm_48000

voice_settings:
  stability: 0.4        # Lower = more expressive
  similarity_boost: 0.75
  style: 0.3            # Higher = more dramatic
  speed: 0.95           # Slightly slower for gravitas

Presets

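| Preset | Stability | Style | Speed | Format | Use Case |
|--------|-----------|-------|-------|--------|----------|
| `oratory` | 0.4 | 0.3 | 0.95 | pcm_48000 | Speeches, presentations |
| `podcast` | 0.5 | 0.0 | 1.0 | mp3_44100_128 | Conversational content |
| `audiobook` | 0.6 | 0.1 | 0.95 | pcm_48000 | Long-form narration |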

Input Format

Text files support ElevenLabs formatting:

[calm] <break time="1s"/>
There are moments in history when humanity TRANSFORMS.
<break time="0.5s"/>
[excited] This is AMAZING news!

- SSML <break> tags for pauses
- Emotion tags ([calm], [excited], [firm]) for v3 model
- CAPITALIZED words for emphasis

Error Handling

audio, err := client.TextToSpeech().Simple(ctx, voiceID, text)
if err != nil {
    if elevenlabs.IsRateLimitError(err) {
        log.Println("Rate limited, waiting...")
        time.Sleep(time.Minute)
    } else if elevenlabs.IsUnauthorizedError(err) {
        log.Fatal("Invalid API key")
    } else if elevenlabs.IsNotFoundError(err) {
        log.Fatal("Voice not found")
    } else {
        log.Fatalf("Error: %v", err)
    }
}

Environment Variables

ELEVENLABS_API_KEY: Your ElevenLabs API key (used automatically if not provided via WithAPIKey)

Name		Name	Last commit message	Last commit date
Latest commit History 94 Commits
.github		.github
cmd		cmd
docs		docs
examples		examples
internal/api		internal/api
omnivoice		omnivoice
openapi		openapi
ttsconfig		ttsconfig
ttsscript		ttsscript
voices		voices
.gitignore		.gitignore
.golangci.yaml		.golangci.yaml
CHANGELOG.json		CHANGELOG.json
CHANGELOG.md		CHANGELOG.md
LICENSE		LICENSE
PRESENTATION.md		PRESENTATION.md
README.md		README.md
README_AGENT.md		README_AGENT.md
README_AGENT_ROADMAP.md		README_AGENT_ROADMAP.md
TRD_STT.md		TRD_STT.md
audio.go		audio.go
audio_test.go		audio_test.go
audioisolation.go		audioisolation.go
audioisolation_test.go		audioisolation_test.go
client.go		client.go
client_test.go		client_test.go
dubbing.go		dubbing.go
errors.go		errors.go
errors_test.go		errors_test.go
forcedalignment.go		forcedalignment.go
forcedalignment_test.go		forcedalignment_test.go
generate.sh		generate.sh
go.mod		go.mod
go.sum		go.sum
history.go		history.go
history_test.go		history_test.go
integration_test.go		integration_test.go
mkdocs.yml		mkdocs.yml
models.go		models.go
models_test.go		models_test.go
music.go		music.go
music_test.go		music_test.go
ogen-fixnull		ogen-fixnull
ogen.yml		ogen.yml
projects.go		projects.go
projects_test.go		projects_test.go
pronunciation.go		pronunciation.go
pronunciation_rules.go		pronunciation_rules.go
pronunciation_rules_test.go		pronunciation_rules_test.go
pronunciation_test.go		pronunciation_test.go
soundeffects.go		soundeffects.go
soundeffects_test.go		soundeffects_test.go
speechtospeech.go		speechtospeech.go
speechtotext.go		speechtotext.go
speechtotext_test.go		speechtotext_test.go
texttodialogue.go		texttodialogue.go
texttodialogue_test.go		texttodialogue_test.go
texttospeech.go		texttospeech.go
texttospeech_test.go		texttospeech_test.go
twilio.go		twilio.go
user.go		user.go
user_test.go		user_test.go
voicedesign.go		voicedesign.go
voicedesign_test.go		voicedesign_test.go
voices.go		voices.go
voices_test.go		voices_test.go
voicesettings.go		voicesettings.go
websocketstt.go		websocketstt.go
websockettts.go		websockettts.go

Folders and files

Latest commit

History

Repository files navigation

ElevenLabs Go SDK

Features

Real-Time Services

Command Line Interface

OmniVoice Integration

Installation

CLI Installation

Quick Start

Basic Text-to-Speech

With Custom Options

Services

Text-to-Speech

Speech-to-Text

Sound Effects

Music Composition

Audio Isolation

Forced Alignment

Text-to-Dialogue

Voice Design

Pronunciation Dictionaries

Dubbing

Projects (Studio)

Speech-to-Speech (Voice Conversion)

WebSocket TTS (Real-Time Streaming)

WebSocket STT (Real-Time Transcription)

Twilio Integration (Phone Calls)

Examples

Command Line Interface

Basic Usage

Configuration Files

Presets

Input Format

Error Handling

Environment Variables

Documentation

Contributing

License

About

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases 12

Contributors

Uh oh!

Languages