
Commit d33f22d

add sesame-csm-1b example
1 parent fc07ffb commit d33f22d

File tree

5 files changed: +283 −22 lines changed

sesame-csm-1b/config.yaml
sesame-csm-1b/model/__init__.py
sesame-csm-1b/model/model.py
xtts-streaming/model/model.py
xtts-streaming/test.py


sesame-csm-1b/config.yaml

Lines changed: 28 additions & 0 deletions (new file)

model_name: sesame-csm-1b
python_version: py310
model_metadata:
  example_model_input:
    text: "Hello from Sesame."
    speaker: 0
requirements:
  - torch==2.4.0
  - torchaudio==2.4.0
  - tokenizers==0.21.0
  - transformers==4.49.0
  - huggingface_hub==0.28.1
  - moshi==0.2.2
  - torchtune==0.4.0
  - torchao==0.9.0
  - silentcipher @ git+https://github.com/SesameAILabs/silentcipher@master
  - ffmpeg
  - git+https://github.com/veerbia/csm.git
resources:
  accelerator: T4
  cpu: '1'
  memory: 10Gi
  use_gpu: true
secrets:
  hf_access_token: null
system_packages: []
environment_variables: {}
external_package_dirs: []
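
Once deployed, the example_model_input above is exactly the JSON body a predict call accepts. A minimal client sketch, assuming a hypothetical Baseten model ID and API key (both are placeholders, not values from this commit):

import requests

# Placeholder model ID and API key; substitute your own deployment's values.
resp = requests.post(
    "https://model-XXXXXXX.api.baseten.co/production/predict",
    headers={"Authorization": "Api-Key YOUR_API_KEY"},
    json={"text": "Hello from Sesame.", "speaker": 0},
)
resp.raise_for_status()
result = resp.json()  # {"output": "<base64-encoded WAV>"} per model/model.py below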

sesame-csm-1b/model/__init__.py

Whitespace-only changes.

sesame-csm-1b/model/model.py

Lines changed: 32 additions & 0 deletions (new file)

import base64
from io import BytesIO
from huggingface_hub import hf_hub_download
from generator import load_csm_1b
import torchaudio
import torch


class Model:
    def __init__(self, **kwargs):
        self.generator = None
        self._secrets = kwargs["secrets"]

    def load(self):
        model_path = hf_hub_download(
            repo_id="sesame/csm-1b",
            filename="ckpt.pt",
            token=self._secrets["hf_access_token"],
        )
        self.generator = load_csm_1b(model_path, "cuda", self._secrets["hf_access_token"])

    def wav_to_base64(self, wav_tensor):
        buffer = BytesIO()
        torchaudio.save(
            buffer, wav_tensor.unsqueeze(0).cpu(), self.generator.sample_rate, format="wav"
        )
        buffer.seek(0)
        return base64.b64encode(buffer.read()).decode("utf-8")

    def predict(self, model_input):
        text = model_input.get("text", "Hello from Sesame.")
        speaker = model_input.get("speaker", 0)
        audio = self.generator.generate(
            text=text,
            speaker=speaker,
            context=[],
            max_audio_length_ms=10_000,
        )
        return {"output": self.wav_to_base64(audio)}

xtts-streaming/model/model.py

Lines changed: 55 additions & 22 deletions
@@ -4,13 +4,15 @@
 import os
 import time
 import wave
+import json

 import numpy as np
 import torch
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
 from TTS.utils.manage import ModelManager
+import fastapi

 # This is one of the speaker voices that comes with xtts
 SPEAKER_NAME = "Claribel Dervla"

@@ -33,12 +35,10 @@ def load(self):
         config = XttsConfig()
         config.load_json(os.path.join(model_path, "config.json"))
         self.model = Xtts.init_from_config(config)
-        # self.model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
         self.model.load_checkpoint(
             config, checkpoint_dir=model_path, eval=True, use_deepspeed=True
         )
         self.model.to(device)
-        # self.compiled_model = torch.compile(self.model.inference_stream)

         self.speaker = {
             "speaker_embedding": self.model.speaker_manager.speakers[SPEAKER_NAME][

@@ -78,25 +78,58 @@ def wav_postprocess(self, wav):
         wav = (wav * 32767).astype(np.int16)
         return wav

-    def predict(self, model_input):
-        text = model_input.get("text")
-        language = model_input.get("language", "en")
-        chunk_size = int(
-            model_input.get("chunk_size", 20)
-        )  # Ensure chunk_size is an integer
-        add_wav_header = False

-        streamer = self.model.inference_stream(
-            text,
-            language,
-            self.gpt_cond_latent,
-            self.speaker_embedding,
-            stream_chunk_size=chunk_size,
-            enable_text_splitting=True,
-            temperature=0.2,
-        )
+    async def websocket(self, websocket: fastapi.WebSocket):
+        """Handle WebSocket connections for text-to-speech requests"""
+        print("WebSocket connected")
+        try:
+            while True:
+                data = await websocket.receive_text()
+
+                try:
+                    # Parse JSON input if provided
+                    input_data = json.loads(data)
+                except json.JSONDecodeError:
+                    # If not JSON, assume it's just text
+                    input_data = {"text": data, "language": "en", "chunk_size": 20}
+
+                text = input_data.get("text")
+                language = input_data.get("language", "en")
+                chunk_size = int(input_data.get("chunk_size", 20))
+
+                # Process the text to speech using the logic from the original predict method
+                streamer = self.model.inference_stream(
+                    text,
+                    language,
+                    self.gpt_cond_latent,
+                    self.speaker_embedding,
+                    stream_chunk_size=chunk_size,
+                    enable_text_splitting=True,
+                    temperature=0.2,
+                )

-        for chunk in streamer:
-            processed_chunk = self.wav_postprocess(chunk)
-            processed_bytes = processed_chunk.tobytes()
-            yield processed_bytes
+                for chunk in streamer:
+                    processed_chunk = self.wav_postprocess(chunk)
+                    processed_bytes = processed_chunk.tobytes()
+                    encoded_chunk = base64.b64encode(processed_bytes).decode('utf-8')
+                    await websocket.send_json({
+                        "type": "chunk",
+                        "data": encoded_chunk
+                    })
+
+                await websocket.send_json({
+                    "type": "complete",
+                    "message": f"Processed '{text}'"
+                })
+
+        except fastapi.WebSocketDisconnect:
+            print("WebSocket disconnected")
+        except Exception as e:
+            print(f"WebSocket error: {str(e)}")
+            try:
+                await websocket.send_json({
+                    "type": "error",
+                    "message": str(e)
+                })
+            except:
+                pass
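
Each "chunk" message carries base64-encoded raw PCM produced by wav_postprocess: 16-bit samples, and, judging by the WAV parameters used in test.py below, mono at XTTS's 24 kHz output rate. Under those assumptions, playback duration follows directly from the byte count, a small sketch:

# Assumes audio_chunks holds the decoded chunk payloads, as collected by
# a client like xtts-streaming/test.py below.
pcm = b"".join(audio_chunks)
sample_rate, bytes_per_sample = 24000, 2  # 24 kHz, 16-bit mono (assumed)
duration_s = len(pcm) / (sample_rate * bytes_per_sample)
print(f"Received {duration_s:.2f}s of audio")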

xtts-streaming/test.py

Lines changed: 168 additions & 0 deletions (new file)

import asyncio
import websockets
import json
import base64
import wave
import os


def wav_to_base64(wav_path):
    """Convert a WAV file to a base64-encoded string"""
    with open(wav_path, "rb") as wav_file:
        return base64.b64encode(wav_file.read()).decode('utf-8')


async def send_websocket_data():
    # Connection details
    uri = "wss://model-rwn1jgd3.api.baseten.co/v1/websocket"
    headers = {"Authorization": "Api-Key vVolDAU0.Mbynm8M7VGnaGqLbW9pwfWxFePNrGw8G"}

    async with websockets.connect(uri, extra_headers=headers) as websocket:
        # For the TTS model, we send text instead of audio
        text_data = {
            "text": "Hello, this is a test of the text to speech websocket API.",
            "language": "en",
            "chunk_size": 20
        }

        # Send the text data as JSON
        await websocket.send(json.dumps(text_data))
        print(f"Sent text: {text_data['text']}")

        # Collect audio chunks
        audio_chunks = []

        # Process responses
        while True:
            try:
                response = await websocket.recv()

                # Try to parse as JSON
                try:
                    data = json.loads(response)
                    print(f"Received response: {data.get('type', 'unknown')}")

                    if data.get("type") == "chunk":
                        # Decode and save the audio chunk
                        audio_chunk = base64.b64decode(data["data"])
                        audio_chunks.append(audio_chunk)
                        print("Saved audio chunk")

                    elif data.get("type") == "complete":
                        print(f"Processing complete: {data.get('message')}")
                        break

                    elif data.get("type") == "error":
                        print(f"Error: {data.get('message')}")
                        break

                except json.JSONDecodeError:
                    # Not JSON, print the first part
                    print(f"Received non-JSON response: {response[:50]}...")
                    break

            except Exception as e:
                print(f"Error receiving data: {str(e)}")
                break

        # Save the audio to a WAV file if we received chunks
        if audio_chunks:
            output_file = "tts_output.wav"
            with wave.open(output_file, 'wb') as wf:
                wf.setnchannels(1)  # Mono
                wf.setsampwidth(2)  # 16-bit
                wf.setframerate(24000)  # XTTS default sample rate
                wf.writeframes(b''.join(audio_chunks))

            print(f"Audio saved to {output_file}")
            print(f"Full path: {os.path.abspath(output_file)}")
        else:
            print("No audio data received")


async def test_multiple_concurrent_requests():
    """Test sending multiple concurrent requests to the TTS websocket API"""

    async def single_request(idx):
        """Handle a single request with unique text and output file"""
        output_file = f"tts_output_{idx}.wav"
        text = f"This is concurrent test number {idx}."

        try:
            # Connection details
            uri = "wss://model-rwn1jgd3.api.baseten.co/v1/websocket"
            headers = {"Authorization": "Api-Key vVolDAU0.Mbynm8M7VGnaGqLbW9pwfWxFePNrGw8G"}

            async with websockets.connect(uri, extra_headers=headers) as websocket:
                # Send text data as JSON
                text_data = {
                    "text": text,
                    "language": "en",
                    "chunk_size": 20
                }

                await websocket.send(json.dumps(text_data))
                print(f"Request {idx}: Sent text: {text}")

                # Collect audio chunks
                audio_chunks = []

                # Process responses
                while True:
                    try:
                        response = await websocket.recv()

                        # Try to parse as JSON
                        try:
                            data = json.loads(response)

                            if data.get("type") == "chunk":
                                # Decode and save the audio chunk
                                audio_chunk = base64.b64decode(data["data"])
                                audio_chunks.append(audio_chunk)

                            elif data.get("type") == "complete":
                                print(f"Request {idx}: Processing complete")
                                break

                            elif data.get("type") == "error":
                                print(f"Request {idx}: Error: {data.get('message')}")
                                return False

                        except json.JSONDecodeError:
                            print(f"Request {idx}: Received non-JSON response")
                            return False

                    except Exception as e:
                        print(f"Request {idx}: Error receiving data: {str(e)}")
                        return False

                # Save the audio to a WAV file if we received chunks
                if audio_chunks:
                    with wave.open(output_file, 'wb') as wf:
                        wf.setnchannels(1)  # Mono
                        wf.setsampwidth(2)  # 16-bit
                        wf.setframerate(24000)  # XTTS default sample rate
                        wf.writeframes(b''.join(audio_chunks))

                    print(f"Request {idx}: Audio saved to {output_file}")
                    return True
                else:
                    print(f"Request {idx}: No audio data received")
                    return False

        except Exception as e:
            print(f"Request {idx}: Failed with exception: {str(e)}")
            return False

    num_requests = 4

    print(f"Starting {num_requests} concurrent requests...")
    results = await asyncio.gather(*[single_request(i + 1) for i in range(num_requests)])

    successful = results.count(True)
    print(f"Completed {successful} out of {num_requests} requests successfully")
    return successful == num_requests


# Run the tests
if __name__ == "__main__":
    asyncio.run(send_websocket_data())
    print("\n--- Testing multiple concurrent requests ---\n")
    asyncio.run(test_multiple_concurrent_requests())
