Commit 1821b49

Add py.typed for type hints and realtime TTS implementation

1 parent fefa4c9 commit 1821b49

File tree

2 files changed, +143 -0 lines changed

src/neuralaudio/py.typed

Whitespace-only changes.
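
The empty py.typed file is the PEP 561 marker that tells type checkers such as mypy and pyright that the neuralaudio package ships inline type annotations. The marker only takes effect for installed copies if it is bundled into the distribution; a minimal setuptools sketch of how that is typically done (this build configuration is illustrative and not part of the commit):

from setuptools import find_packages, setup

setup(
    name="neuralaudio",
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    # Ship the PEP 561 marker so installed copies advertise their inline types.
    package_data={"neuralaudio": ["py.typed"]},
)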

src/neuralaudio/realtime_tts.py

Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
# This file was auto-generated by Fern from our API Definition.


import typing
import urllib.parse
import json
import base64
import websockets

from websockets.sync.client import connect

from .core.api_error import ApiError
from .core.client_wrapper import SyncClientWrapper
from .core.jsonable_encoder import jsonable_encoder
from .core.remove_none_from_dict import remove_none_from_dict
from .core.request_options import RequestOptions
from .types.voice_settings import VoiceSettings
from .text_to_speech.client import TextToSpeechClient
from .types import OutputFormat

# This sentinel is used as the default value for optional parameters.
OMIT = typing.cast(typing.Any, ...)

def text_chunker(chunks: typing.Iterator[str]) -> typing.Iterator[str]:
    """Used during input streaming to chunk text blocks and set the last char to a space."""
    splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
    buffer = ""
    for text in chunks:
        if buffer.endswith(splitters):
            # The buffer already ends on a natural break; flush it, space-terminated.
            yield buffer if buffer.endswith(" ") else buffer + " "
            buffer = text
        elif text.startswith(splitters):
            # The incoming text starts with a break character; flush the buffer
            # up to and including that character.
            output = buffer + text[0]
            yield output if output.endswith(" ") else output + " "
            buffer = text[1:]
        else:
            buffer += text
    if buffer != "":
        yield buffer + " "

class RealtimeTextToSpeechClient(TextToSpeechClient):
    def __init__(self, *, client_wrapper: SyncClientWrapper):
        super().__init__(client_wrapper=client_wrapper)
        # Derive the websocket endpoint from the configured HTTP base URL.
        self._ws_base_url = (
            urllib.parse.urlparse(self._client_wrapper.get_base_url())._replace(scheme="wss").geturl()
        )

    def convert_realtime(
        self,
        voice_id: str,
        *,
        text: typing.Iterator[str],
        model_id: typing.Optional[str] = OMIT,
        output_format: typing.Optional[OutputFormat] = "mp3_44100_128",
        voice_settings: typing.Optional[VoiceSettings] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[bytes]:
        """
        Converts text into speech using a voice of your choice and returns audio.

        Parameters:
        - voice_id: str. Voice ID to be used. You can use https://api.neuralaudio.solutions/v1/voices to list all the available voices.

        - text: typing.Iterator[str]. The text that will get converted into speech.

        - model_id: typing.Optional[str]. Identifier of the model to be used. You can query available models using GET /v1/models. The model needs to support text to speech; you can check this using the can_do_text_to_speech property.

        - output_format: typing.Optional[OutputFormat]. The output audio format. Defaults to "mp3_44100_128".

        - voice_settings: typing.Optional[VoiceSettings]. Voice settings overriding the stored settings for the given voice. They are applied only to the given request.

        - request_options: typing.Optional[RequestOptions]. Request-specific configuration.
        ---
        import typing

        from neuralaudio import VoiceSettings
        from neuralaudio.client import NeuralAudio

        def get_text() -> typing.Iterator[str]:
            yield "Hello, how are you?"
            yield "I am fine, thank you."

        client = NeuralAudio(
            api_key="YOUR_API_KEY",
        )
        client.text_to_speech.convert_realtime(
            voice_id="string",
            text=get_text(),
            model_id="string",
            voice_settings=VoiceSettings(
                stability=1.1,
                similarity_boost=1.1,
                style=1.1,
                use_speaker_boost=True,
            ),
        )
        """
        # Build the query string explicitly so that OMIT (Ellipsis) never leaks
        # into the URL when model_id is not provided.
        query = urllib.parse.urlencode(
            remove_none_from_dict(
                {
                    "model_id": model_id if model_id is not OMIT else None,
                    "output_format": output_format,
                }
            )
        )
        with connect(
            urllib.parse.urljoin(
                self._ws_base_url,
                f"v1/text-to-speech/{jsonable_encoder(voice_id)}/stream-input?{query}",
            ),
            additional_headers=jsonable_encoder(
                remove_none_from_dict(
                    {
                        **self._client_wrapper.get_headers(),
                        **(request_options.get("additional_headers", {}) if request_options is not None else {}),
                    }
                )
            ),
        ) as socket:
            # Open the stream: the first message must contain a single space as
            # text and carries the voice settings and generation config.
            try:
                socket.send(json.dumps(
                    dict(
                        text=" ",
                        try_trigger_generation=True,
                        # OMIT (Ellipsis) is truthy, so check the type explicitly
                        # instead of relying on truthiness.
                        voice_settings=voice_settings.dict() if isinstance(voice_settings, VoiceSettings) else None,
                        generation_config=dict(
                            chunk_length_schedule=[50],
                        ),
                    )
                ))
            except websockets.exceptions.ConnectionClosedError as ce:
                raise ApiError(body=ce.reason, status_code=ce.code)

            # Keep the last server payload around so a close frame can be mapped
            # to an API error (and the handler below never sees an unbound name).
            data: typing.Any = {}
            try:
                for text_chunk in text_chunker(text):
                    socket.send(json.dumps(dict(text=text_chunk, try_trigger_generation=True)))
                    # Poll briefly (10 ms) for audio that may already be available.
                    try:
                        data = json.loads(socket.recv(1e-2))
                        if "audio" in data and data["audio"]:
                            yield base64.b64decode(data["audio"])  # type: ignore
                    except TimeoutError:
                        pass

                # An empty text message signals the end of the input stream.
                socket.send(json.dumps(dict(text="")))

                # Drain the remaining audio until the server closes the connection.
                while True:
                    data = json.loads(socket.recv())
                    if "audio" in data and data["audio"]:
                        yield base64.b64decode(data["audio"])  # type: ignore
            except websockets.exceptions.ConnectionClosed as ce:
                if isinstance(data, dict) and "message" in data:
                    raise ApiError(body=data, status_code=ce.code)
                elif ce.code != 1000:
                    # 1000 is normal closure; anything else is surfaced as an error.
                    raise ApiError(body=ce.reason, status_code=ce.code)
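
For reference, these are the message shapes in the exchange above, reconstructed from the code itself (no field names beyond those used by convert_realtime are assumed):

# Client -> server, in the order convert_realtime sends them:
open_msg = {
    "text": " ",  # a single space opens the stream
    "try_trigger_generation": True,
    "voice_settings": None,  # or VoiceSettings(...).dict()
    "generation_config": {"chunk_length_schedule": [50]},
}
chunk_msg = {"text": "a space-terminated chunk ", "try_trigger_generation": True}
close_msg = {"text": ""}  # empty text signals the end of input

# Server -> client: JSON objects; audio arrives base64-encoded under "audio",
# and error payloads carry a "message" field.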

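Since convert_realtime is a generator, nothing is sent over the websocket until the returned iterator is consumed. A minimal consumption sketch, following the docstring's NeuralAudio client setup (the voice ID and output path are placeholders):

import typing

from neuralaudio.client import NeuralAudio

def sentences() -> typing.Iterator[str]:
    # Any iterator of strings works, e.g. tokens from an LLM stream.
    yield "Realtime synthesis can start "
    yield "before the full text is known."

client = NeuralAudio(api_key="YOUR_API_KEY")

# Chunks arrive as raw MP3 bytes with the default output_format.
with open("output.mp3", "wb") as f:
    for chunk in client.text_to_speech.convert_realtime(
        voice_id="YOUR_VOICE_ID",
        text=sentences(),
    ):
        f.write(chunk)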