Skip to content

Mp3 container support#15

Open
vchulski wants to merge 2 commits into cartesia-ai:main from
vchulski:mp3-container-support
Open

Mp3 container support#15
vchulski wants to merge 2 commits into cartesia-ai:main from
vchulski:mp3-container-support

Conversation

@vchulski
Copy link

@vchulski vchulski commented Dec 10, 2024

Mp3 Container implementation

Added mp3 container support, following the Cartesia API documentation.

Changes

Added a new container to the dict inside OutputFormatMapping, modified the get_output_format function inside the TTS class, and updated the tests accordingly.

@chongzluong
Copy link
Contributor

@vchulski hey, thanks for chatting this morning. Could you give it a go on the latest version of the SDK? We believe MP3 output should already be supported there. Here's an example file that creates an MP3:

#!/usr/bin/env python3
"""
Standalone test for MP3 format with Cartesia TTS bytes endpoint.

This tests if client.tts.bytes() properly supports MP3 output format.

Usage:
    export CARTESIA_API_KEY=your_api_key_here
    python test_tts_mp3_standalone.py
"""

import os
import sys
import logging

import cartesia
from cartesia import Cartesia

# Configure the root logger once at import time: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)

def _id3v2_tag_length(data: bytes) -> int:
    """Return the total byte length of a leading ID3v2 tag, or 0 if none is present."""
    if len(data) < 10 or data[:3] != b"ID3":
        return 0
    size_bytes = data[6:10]
    # The tag size is stored "synchsafe": four bytes with the high bit clear.
    # A set high bit means this is not a genuine ID3v2 header.
    if any(byte & 0x80 for byte in size_bytes):
        return 0
    size = 0
    for byte in size_bytes:
        size = (size << 7) | byte
    # The stored size excludes the 10-byte tag header itself.
    return 10 + size


def _is_mpeg_audio_frame_header(data: bytes, pos: int) -> bool:
    """Return True if the bytes at *pos* form a plausible MPEG audio frame header.

    The caller must guarantee that ``pos + 2`` is a valid index into *data*.
    """
    b0, b1, b2 = data[pos], data[pos + 1], data[pos + 2]
    if b0 != 0xFF or (b1 & 0xE0) != 0xE0:
        return False  # 11-bit frame sync not present
    if ((b1 >> 3) & 0x03) == 0x01:
        return False  # reserved MPEG version ID
    if ((b1 >> 1) & 0x03) == 0x00:
        return False  # reserved layer description
    if (b2 >> 4) == 0x0F:
        return False  # forbidden bitrate index
    if ((b2 >> 2) & 0x03) == 0x03:
        return False  # reserved sampling-rate index
    return True


def validate_mp3_response(data: bytes) -> bool:
    """Check whether *data* looks like MP3 audio.

    The payload must be at least 128 bytes long and contain a plausible MPEG
    audio frame header within 1024 bytes of the start of the audio stream.
    A leading ID3v2 tag, if present, is skipped before scanning so that large
    metadata blocks do not hide the first frame.

    Checking only the 11 sync bits is not enough: raw PCM with small negative
    samples (``FF FF``, ``FF F0`` ...) satisfies that test, so the reserved
    version, layer, bitrate and sampling-rate values are rejected as well.

    Args:
        data: The complete audio payload returned by the TTS endpoint.

    Returns:
        True if a plausible frame header was found, False otherwise. The
        outcome is also reported through the module-level ``logger``.
    """
    if len(data) < 128:
        logger.error("MP3 data too short")
        return False

    scan_start = _id3v2_tag_length(data)
    # Three header bytes are inspected per position, so stop two bytes early.
    scan_stop = min(len(data) - 2, scan_start + 1024)
    for i in range(scan_start, scan_stop):
        if _is_mpeg_audio_frame_header(data, i):
            logger.info("Found valid MP3 frame sync at byte %d", i)
            return True

    logger.error("No valid MP3 frame sync found")
    return False

def test_tts_bytes_mp3():
    """Request MP3 audio from ``client.tts.bytes()`` and verify the result.

    Reads the API key from the ``CARTESIA_API_KEY`` environment variable,
    streams the synthesized audio chunk by chunk, validates the joined bytes
    with ``validate_mp3_response`` and, on success, writes them to
    ``test_tts_mp3_output.mp3`` in the current directory.

    Returns:
        True if valid MP3 audio was received and saved; False if the API key
        is missing, validation fails, or any exception is raised.
    """
    try:
        cartesia_key = os.getenv('CARTESIA_API_KEY')
        if not cartesia_key:
            logger.error("CARTESIA_API_KEY environment variable not set!")
            logger.info("Please set your API key: export CARTESIA_API_KEY=your_api_key_here")
            return False

        logger.info("Creating Cartesia client...")
        tts_client = Cartesia(api_key=cartesia_key)

        model_name = "sonic-2"
        text = "Hello, world! This is a test of MP3 output from TTS bytes endpoint."
        speaker_id = "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"

        logger.info("Testing TTS bytes with MP3 format...")
        logger.info(f"Parameters: {model_name}, '{text}', MP3 44100Hz 128kbps")

        # Structured output format, the same shape TTS requests normally use.
        voice_spec = {"mode": "id", "id": speaker_id}
        mp3_format = {
            "container": "mp3",
            "sample_rate": 44100,
            "bit_rate": 128000,
        }
        stream = tts_client.tts.bytes(
            model_id=model_name,
            transcript=text,
            voice=voice_spec,
            output_format=mp3_format,
        )

        received = []
        for index, piece in enumerate(stream, start=1):
            received.append(piece)
            logger.info(f"Received chunk {index}: {len(piece)} bytes")

        mp3_bytes = b"".join(received)
        logger.info(f"Total audio data received: {len(mp3_bytes)} bytes")

        if not validate_mp3_response(mp3_bytes):
            logger.error("❌ TTS bytes MP3 test FAILED - Invalid MP3 output")
            return False

        logger.info("✅ TTS bytes MP3 test PASSED - Valid MP3 audio generated!")

        # Keep a copy on disk so the result can be listened to afterwards.
        destination = "test_tts_mp3_output.mp3"
        with open(destination, "wb") as out:
            out.write(mp3_bytes)
        logger.info(f"💾 Saved output to: {destination}")

        return True

    except Exception as err:
        logger.error(f"TTS bytes MP3 test failed with error: {err}")
        logger.exception("Full traceback:")
        return False

def test_tts_bytes_wav_comparison():
    """Request WAV audio from ``client.tts.bytes()`` as a control test.

    Reads the API key from the ``CARTESIA_API_KEY`` environment variable,
    collects the streamed audio and checks that it carries a RIFF/WAVE
    header (``RIFF`` at offset 0 and ``WAVE`` at offset 8).

    Returns:
        True if the response has a RIFF/WAVE header; False if the API key is
        missing, the header check fails, or any exception is raised.
    """
    try:
        api_key = os.getenv('CARTESIA_API_KEY')
        if not api_key:
            # Report the cause instead of failing silently, so the summary's
            # "FAILED" line for the control test is explained in the log.
            logger.error("CARTESIA_API_KEY environment variable not set!")
            return False

        client = Cartesia(api_key=api_key)

        logger.info("Testing TTS bytes with WAV format (for comparison)...")

        audio_data = b"".join(
            client.tts.bytes(
                model_id="sonic-2",
                transcript="This is a WAV test.",
                voice={"mode": "id", "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"},
                output_format={
                    "container": "wav",
                    "encoding": "pcm_s16le",
                    "sample_rate": 44100,
                },
            )
        )

        # Validate WAV
        if audio_data.startswith(b"RIFF") and audio_data[8:12] == b"WAVE":
            logger.info("✅ TTS bytes WAV test PASSED - Valid WAV audio generated!")
            return True

        logger.error("❌ TTS bytes WAV test FAILED - Invalid WAV output")
        return False

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which the plain error message alone would lose.
        logger.exception("TTS bytes WAV test failed: %s", e)
        return False

def main():
    """Run the WAV control test and the MP3 test, then exit with a status code.

    Logs a banner, runs both tests, logs a per-test summary, and terminates
    the process via ``sys.exit``: status 0 when both tests pass, status 1
    otherwise.
    """
    separator = "=" * 60

    def verdict(passed):
        # Label used in the summary lines below.
        return '✅ PASSED' if passed else '❌ FAILED'

    logger.info(separator)
    logger.info("🎵 Cartesia TTS MP3 Test Suite")
    logger.info(separator)

    wav_ok = test_tts_bytes_wav_comparison()  # Control test
    mp3_ok = test_tts_bytes_mp3()  # Main test

    logger.info(separator)
    logger.info("📊 Test Results Summary:")
    logger.info(f"  TTS bytes WAV (control): {verdict(wav_ok)}")
    logger.info(f"  TTS bytes MP3 (main): {verdict(mp3_ok)}")

    if wav_ok and mp3_ok:
        logger.info("🎉 All tests PASSED! TTS bytes MP3 is working correctly.")
        sys.exit(0)

    # sys.exit above raises SystemExit, so this point is reached only on failure.
    logger.error("💥 Some tests FAILED! Check the logs above.")
    if not mp3_ok:
        logger.error("❗ MP3 format issue confirmed with TTS bytes endpoint!")
    sys.exit(1)

# Run the test suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

@aleixlahozt
Copy link

@chongzluong

Hi! Thanks for the quick follow-up and the example code.

Together with @vchulski we tested your script and confirmed that native MP3 generation works well with the latest SDK. However, we discovered a few important limitations during our testing:

1. SSE endpoint doesn't support MP3 format

When we try to use MP3 with the sse() endpoint, we get this error:

async for chunk in self.client.tts.sse(
    model_id=model,
    transcript=text,
    voice={"mode": "id", "id": voice_id},
    output_format={
        "container": "mp3",
        "sample_rate": 44100,
        "bit_rate": 192000,
    },
):

Error: status_code: 400, body: only 'raw' container is supported for this endpoint

2. Audio quality difference

We compared the audio quality between:

  • SSE + raw + our lameenc conversion
  • bytes() + native MP3

The SSE approach produces noticeably better audio quality in terms of voice clarity and background noise reduction.

3. Latency advantage of SSE

Our latency testing shows that SSE provides significantly better first-chunk latency (~200ms faster) compared to the bytes() endpoint, which is critical for our telephony applications.

Feature Request:

Would it be possible to add MP3 container support to the SSE endpoint? This would give us the best of both worlds: the lower latency of SSE together with the convenience of native MP3 encoding. This would be valuable for our use case.

Thanks again for your help.

@vchulski
Copy link
Author

Hi, @chongzluong,

Any news on the mp3 support for SSE?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants