# This file was auto-generated by Fern from our API Definition.

import typing
import urllib.parse
import json
import base64
import websockets

from websockets.sync.client import connect

from .core.api_error import ApiError
from .core.client_wrapper import SyncClientWrapper
from .core.jsonable_encoder import jsonable_encoder
from .core.remove_none_from_dict import remove_none_from_dict
from .core.request_options import RequestOptions
from .types.voice_settings import VoiceSettings
from .text_to_speech.client import TextToSpeechClient
from .types import OutputFormat

# this is used as the default value for optional parameters
OMIT = typing.cast(typing.Any, ...)


def text_chunker(chunks: typing.Iterator[str]) -> typing.Iterator[str]:
    """Used during input streaming: regroups incoming text at natural split points and ensures each yielded chunk ends with a space."""
    splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
    buffer = ""
    for text in chunks:
        if buffer.endswith(splitters):
            # The buffer ends at a natural break; flush it with a trailing space.
            yield buffer if buffer.endswith(" ") else buffer + " "
            buffer = text
        elif text.startswith(splitters):
            # The incoming text starts at a break; flush the buffer up to and
            # including the splitter character.
            output = buffer + text[0]
            yield output if output.endswith(" ") else output + " "
            buffer = text[1:]
        else:
            buffer += text
    if buffer != "":
        yield buffer + " "
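
# Illustrative behaviour of text_chunker (the inputs and outputs below are an
# example, not part of the generated client):
#     list(text_chunker(iter(["Hello, ", "how are", " you?"])))
#     -> ['Hello, ', 'how are ', 'you? ']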


class RealtimeTextToSpeechClient(TextToSpeechClient):
    def __init__(self, *, client_wrapper: SyncClientWrapper):
        super().__init__(client_wrapper=client_wrapper)
        # Reuse the configured HTTP base URL for the websocket endpoint, swapping the scheme for wss.
        self._ws_base_url = urllib.parse.urlparse(self._client_wrapper.get_base_url())._replace(scheme="wss").geturl()

    def convert_realtime(
        self,
        voice_id: str,
        *,
        text: typing.Iterator[str],
        model_id: typing.Optional[str] = OMIT,
        output_format: typing.Optional[OutputFormat] = "mp3_44100_128",
        voice_settings: typing.Optional[VoiceSettings] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[bytes]:
        """
        Converts text into speech using a voice of your choice and returns audio.

        Parameters:
            - voice_id: str. Voice ID to be used. You can use https://api.neuralaudio.solutions/v1/voices to list all the available voices.

            - text: typing.Iterator[str]. The text that will get converted into speech.

            - model_id: typing.Optional[str]. Identifier of the model that will be used. You can query available models using GET /v1/models. The model needs to have support for text to speech; you can check this using the can_do_text_to_speech property.

            - output_format: typing.Optional[OutputFormat]. Output format of the generated audio. Defaults to mp3_44100_128.

            - voice_settings: typing.Optional[VoiceSettings]. Voice settings overriding stored settings for the given voice. They are applied only on the given request.

            - request_options: typing.Optional[RequestOptions]. Request-specific configuration.
        ---
        import typing

        from neuralaudio import VoiceSettings
        from neuralaudio.client import NeuralAudio


        def get_text() -> typing.Iterator[str]:
            yield "Hello, how are you?"
            yield "I am fine, thank you."


        client = NeuralAudio(
            api_key="YOUR_API_KEY",
        )
        audio = client.text_to_speech.convert_realtime(
            voice_id="string",
            text=get_text(),
            model_id="string",
            voice_settings=VoiceSettings(
                stability=1.1,
                similarity_boost=1.1,
                style=1.1,
                use_speaker_boost=True,
            ),
        )
        # convert_realtime returns a generator; the request only runs as it is consumed.
        for chunk in audio:
            ...
        """
        # Build the stream-input URL, including only query parameters that were
        # actually provided (the OMIT sentinel would otherwise leak into the query string).
        query_params: typing.Dict[str, typing.Any] = {}
        if model_id is not None and model_id is not OMIT:
            query_params["model_id"] = model_id
        if output_format is not None:
            query_params["output_format"] = output_format
        with connect(
            urllib.parse.urljoin(
                self._ws_base_url,
                f"v1/text-to-speech/{jsonable_encoder(voice_id)}/stream-input?{urllib.parse.urlencode(query_params)}",
            ),
            additional_headers=jsonable_encoder(
                remove_none_from_dict(
                    {
                        **self._client_wrapper.get_headers(),
                        **(request_options.get("additional_headers", {}) if request_options is not None else {}),
                    }
                )
            ),
        ) as socket:
            try:
                # Begin-of-stream message: a single space opens the stream and carries
                # the per-connection voice settings and generation config.
                socket.send(
                    json.dumps(
                        dict(
                            text=" ",
                            try_trigger_generation=True,
                            voice_settings=voice_settings.dict()
                            if voice_settings is not None and voice_settings is not OMIT
                            else None,
                            generation_config=dict(
                                chunk_length_schedule=[50],
                            ),
                        )
                    )
                )
            except websockets.exceptions.ConnectionClosedError as ce:
                raise ApiError(body=ce.reason, status_code=ce.code)

            # Keep the last received message around so the error handler below can
            # surface server-reported failures.
            data: typing.Any = {}
            try:
                for text_chunk in text_chunker(text):
                    socket.send(json.dumps(dict(text=text_chunk, try_trigger_generation=True)))
                    try:
                        # Opportunistically pick up audio that is already available,
                        # waiting at most 10 ms so input streaming is not stalled.
                        data = json.loads(socket.recv(timeout=1e-2))
                        if "audio" in data and data["audio"]:
                            yield base64.b64decode(data["audio"])  # type: ignore
                    except TimeoutError:
                        pass

                # End-of-stream message: an empty string signals that no more text is coming.
                socket.send(json.dumps(dict(text="")))

                # Drain the remaining audio until the server closes the connection.
                while True:
                    data = json.loads(socket.recv())
                    if "audio" in data and data["audio"]:
                        yield base64.b64decode(data["audio"])  # type: ignore
            except websockets.exceptions.ConnectionClosed as ce:
                # A close code of 1000 is a normal shutdown; anything else, or an
                # explicit error message from the server, is surfaced as an ApiError.
                if isinstance(data, dict) and "message" in data:
                    raise ApiError(body=data, status_code=ce.code)
                elif ce.code != 1000:
                    raise ApiError(body=ce.reason, status_code=ce.code)
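

# A minimal end-to-end sketch (illustrative; assumes the client is exposed as in
# the docstring above and that the default mp3_44100_128 output format is used):
#
#     from neuralaudio.client import NeuralAudio
#
#     client = NeuralAudio(api_key="YOUR_API_KEY")
#     audio = client.text_to_speech.convert_realtime(
#         voice_id="your-voice-id",          # hypothetical voice ID
#         text=iter(["Hello, ", "world."]),
#     )
#     with open("output.mp3", "wb") as f:
#         for chunk in audio:                # consuming the generator drives the websocket
#             f.write(chunk)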