pronunciation dict updates (#57)

noahlt · bpanahij · web-flow · commit 9f3fac378bff · 2025-11-12T17:58:19.000-08:00
Takes the changes from #56 and adds `pronunciation_dict_id` support to the bytes method and the WebSocket `send` wrapper methods. --------- Co-authored-by: Brian Johnson <brian@pjohnson.info>
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,6 @@
 [project]
 name = "cartesia"
+version = "2.0.15"
 
 [tool.poetry]
 name = "cartesia"
diff --git a/src/cartesia/tts/_async_websocket.py b/src/cartesia/tts/_async_websocket.py
@@ -72,6 +72,7 @@ async def send(
         use_original_timestamps: bool = False,
         continue_: bool = False,
         max_buffer_delay_ms: Optional[int] = None,
+        pronunciation_dict_id: Optional[str] = None,
         flush: bool = False,
     ) -> None:
         """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
@@ -116,6 +117,8 @@ async def send(
             request_body["max_buffer_delay_ms"] = max_buffer_delay_ms
         if flush:
             request_body["flush"] = flush
+        if pronunciation_dict_id:
+            request_body["pronunciation_dict_id"] = pronunciation_dict_id
 
         if generation_config is not None:
             if isinstance(generation_config, dict):
@@ -383,6 +386,7 @@ async def send(
         add_timestamps: bool = False,
         add_phoneme_timestamps: bool = False,
         use_original_timestamps: bool = False,
+        pronunciation_dict_id: Optional[str] = None,
     ):
         """See :meth:`_WebSocket.send` for details."""
         if context_id is None:
@@ -403,6 +407,7 @@ async def send(
             add_timestamps=add_timestamps,
             add_phoneme_timestamps=add_phoneme_timestamps,
             use_original_timestamps=use_original_timestamps,
+            pronunciation_dict_id=pronunciation_dict_id,
         )
 
         generator = ctx.receive()
diff --git a/src/cartesia/tts/_websocket.py b/src/cartesia/tts/_websocket.py
@@ -70,6 +70,7 @@ def send(
         add_timestamps: bool = False,
         add_phoneme_timestamps: bool = False,
         use_original_timestamps: bool = False,
+        pronunciation_dict_id: Optional[str] = None
     ) -> Generator[bytes, None, None]:
         """Send audio generation requests to the WebSocket and yield responses.
 
@@ -111,6 +112,8 @@ def send(
             request_body["use_original_timestamps"] = use_original_timestamps
         if max_buffer_delay_ms:
             request_body["max_buffer_delay_ms"] = max_buffer_delay_ms
+        if pronunciation_dict_id:
+            request_body["pronunciation_dict_id"] = pronunciation_dict_id
 
         if generation_config is not None:
             if isinstance(generation_config, dict):
@@ -370,6 +373,7 @@ def send(
         add_timestamps: bool = False,
         add_phoneme_timestamps: bool = False,
         use_original_timestamps: bool = False,
+        pronunciation_dict_id: Optional[str] = None,
     ):
         """Send a request to the WebSocket to generate audio.
 
@@ -402,6 +406,7 @@ def send(
             "add_timestamps": add_timestamps,
             "add_phoneme_timestamps": add_phoneme_timestamps,
             "use_original_timestamps": use_original_timestamps,
+            "pronunciation_dict_id": pronunciation_dict_id,
         }
         generator = self._websocket_generator(request_body)
 
diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py
@@ -431,6 +431,29 @@ def test_sse_err():
         pass
 
 
+def test_sse_pronunciation_dict(resources: _Resources):
+    logger.info("Testing SSE with pronunciation_dict_id parameter")
+    client = resources.client
+    transcript = SAMPLE_TRANSCRIPT
+    # Smoke test: client.tts.sse must accept the pronunciation_dict_id keyword and still stream chunks.
+    output_generate = client.tts.sse(
+        transcript=transcript,
+        voice={"mode": "id", "id": SAMPLE_VOICE_ID},
+        output_format=DEFAULT_OUTPUT_FORMAT_PARAMS,
+        model_id=DEFAULT_MODEL_ID,
+        pronunciation_dict_id=None,  # None only checks that the keyword is accepted; no real dictionary is exercised
+    )
+    # Every streamed item must be a WebSocketResponse_Chunk; its `data` field is base64-decoded into bytes.
+    chunks = []
+    for response in output_generate:
+        assert isinstance(response, WebSocketResponse_Chunk)
+        audio_bytes = base64.b64decode(response.data)
+        chunks.append(audio_bytes)
+    # Hand the concatenated bytes to _validate_audio_response together with the requested output format.
+    data = b"".join(chunks)
+    _validate_audio_response(data, DEFAULT_OUTPUT_FORMAT_PARAMS)
+
+
 @pytest.mark.parametrize("output_format", TEST_RAW_OUTPUT_FORMATS)
 @pytest.mark.parametrize("stream", [True, False])
 def test_ws_sync(resources: _Resources, output_format: OutputFormatParams, stream: bool):
@@ -584,6 +607,40 @@ async def test_ws_timestamps(use_original_timestamps: bool):
     await async_client.close()
 
 
+@pytest.mark.asyncio
+async def test_ws_pronunciation_dict():
+    logger.info("Testing WebSocket with pronunciation_dict_id parameter")
+    transcript = SAMPLE_TRANSCRIPT
+    # Smoke test: the async WebSocket `send` must accept the pronunciation_dict_id keyword and still stream outputs.
+    async_client = create_async_client()
+    ws = await async_client.tts.websocket()
+
+    # Only checks that the pronunciation_dict_id keyword can be passed to ws.send.
+    # Using None as we don't have a real pronunciation dict ID for testing, so no dictionary is exercised.
+    output_generate = await ws.send(
+        transcript=transcript,
+        voice={"mode": "id", "id": SAMPLE_VOICE_ID},
+        output_format=DEFAULT_OUTPUT_FORMAT_PARAMS,
+        model_id=DEFAULT_MODEL_ID,
+        pronunciation_dict_id=None,  # Test with None to verify parameter acceptance
+        stream=True,
+    )
+    # Run _validate_schema on every streamed output and collect audio; outputs whose `audio` is None are skipped.
+    chunks = []
+    async for out in output_generate:
+        _validate_schema(out)
+        if out.audio is not None:
+            chunks.append(out.audio)
+
+    # Pass the concatenated audio to _validate_audio_response with the requested output format
+    audio = b"".join(chunks)
+    _validate_audio_response(audio, DEFAULT_OUTPUT_FORMAT_PARAMS)
+    # NOTE(review): the cleanup below is not in try/finally, so it is skipped if anything above raises.
+    # Close the websocket, then the client that created it
+    await ws.close()
+    await async_client.close()
+
+
 def chunk_generator(transcripts):
     for transcript in transcripts:
         if transcript.endswith(" "):
@@ -1364,6 +1421,7 @@ def test_ws_phoneme_timestamps():
         output_format=DEFAULT_OUTPUT_FORMAT_PARAMS,
         model_id=DEFAULT_MODEL_ID,
         add_phoneme_timestamps=True,
+        add_timestamps=True, # workaround, currently you need both add_timestamps and add_phoneme_timestamps to get phoneme timestamps
         stream=True,
     )
     has_phoneme_timestamps = False
@@ -1407,6 +1465,7 @@ def test_continuation_phoneme_timestamps():
         voice={"mode": "id", "id": SAMPLE_VOICE_ID},
         output_format=DEFAULT_OUTPUT_FORMAT_PARAMS,
         add_phoneme_timestamps=True,
+        add_timestamps=True, # workaround, currently you need both add_timestamps and add_phoneme_timestamps to get phoneme timestamps
     )
 
     has_phoneme_timestamps = False
@@ -1445,6 +1504,7 @@ async def test_ws_phoneme_timestamps_async():
         output_format=DEFAULT_OUTPUT_FORMAT_PARAMS,
         model_id=DEFAULT_MODEL_ID,
         add_phoneme_timestamps=True,
+        add_timestamps=True, # workaround, currently you need both add_timestamps and add_phoneme_timestamps to get phoneme timestamps
         stream=True,
     )
     has_phoneme_timestamps = False
@@ -1491,6 +1551,7 @@ async def test_continuation_phoneme_timestamps_async():
             voice={"mode": "id", "id": SAMPLE_VOICE_ID},
             output_format=DEFAULT_OUTPUT_FORMAT_PARAMS,
             add_phoneme_timestamps=True,
+            add_timestamps=True, # workaround, currently you need both add_timestamps and add_phoneme_timestamps to get phoneme timestamps
             continue_=True,
         )