diff --git a/examples/audio.py b/examples/audio.py index 85f47bfb06..af41fe601b 100755 --- a/examples/audio.py +++ b/examples/audio.py @@ -1,6 +1,5 @@ #!/usr/bin/env rye run python -import time from pathlib import Path from openai import OpenAI @@ -12,8 +11,6 @@ def main() -> None: - stream_to_speakers() - # Create text-to-speech audio file with openai.audio.speech.with_streaming_response.create( model="tts-1", @@ -37,28 +34,5 @@ def main() -> None: print(translation.text) -def stream_to_speakers() -> None: - import pyaudio - - player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True) - - start_time = time.time() - - with openai.audio.speech.with_streaming_response.create( - model="tts-1", - voice="alloy", - response_format="pcm", # similar to WAV, but without a header chunk at the start. - input="""I see skies of blue and clouds of white - The bright blessed days, the dark sacred nights - And I think to myself - What a wonderful world""", - ) as response: - print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms") - for chunk in response.iter_bytes(chunk_size=1024): - player_stream.write(chunk) - - print(f"Done in {int((time.time() - start_time) * 1000)}ms.") - - if __name__ == "__main__": main() diff --git a/examples/speech_to_text.py b/examples/speech_to_text.py new file mode 100644 index 0000000000..cc3f56b424 --- /dev/null +++ b/examples/speech_to_text.py @@ -0,0 +1,25 @@ +#!/usr/bin/env rye run python + +import asyncio + +from openai import AsyncOpenAI +from openai.helpers import Microphone + +# gets OPENAI_API_KEY from your environment variables +openai = AsyncOpenAI() + + +async def main() -> None: + print("Recording for the next 10 seconds...") + recording = await Microphone(timeout=10).record() + print("Recording complete") + transcription = await openai.audio.transcriptions.create( + model="whisper-1", + file=recording, + ) + + print(transcription.text) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/text_to_speech.py b/examples/text_to_speech.py new file mode 100644 index 0000000000..ac8b12b0ab --- /dev/null +++ b/examples/text_to_speech.py @@ -0,0 +1,31 @@ +#!/usr/bin/env rye run python + +import time +import asyncio + +from openai import AsyncOpenAI +from openai.helpers import LocalAudioPlayer + +# gets OPENAI_API_KEY from your environment variables +openai = AsyncOpenAI() + + +async def main() -> None: + start_time = time.time() + + async with openai.audio.speech.with_streaming_response.create( + model="tts-1", + voice="alloy", + response_format="pcm", # similar to WAV, but without a header chunk at the start. + input="""I see skies of blue and clouds of white + The bright blessed days, the dark sacred nights + And I think to myself + What a wonderful world""", + ) as response: + print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms") + await LocalAudioPlayer().play(response) + print(f"Time to play: {int((time.time() - start_time) * 1000)}ms") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 3088eb2fb2..fcd893d4a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,8 @@ dependencies = [ "sniffio", "tqdm > 4", "jiter>=0.4.0, <1", + "sounddevice>=0.5.1", + "numpy>=2.0.2", ] requires-python = ">= 3.8" classifiers = [ diff --git a/requirements-dev.lock b/requirements-dev.lock index 5599057b66..0755ddb3c5 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -7,6 +7,7 @@ # all-features: true # with-sources: false # generate-hashes: false +# universal: false -e file:. annotated-types==0.6.0 @@ -32,6 +33,7 @@ certifi==2023.7.22 # via requests cffi==1.16.0 # via cryptography + # via sounddevice charset-normalizer==3.3.2 # via requests click==8.1.7 @@ -91,7 +93,7 @@ nest-asyncio==1.6.0 nodeenv==1.8.0 # via pyright nox==2023.4.22 -numpy==1.26.3 +numpy==2.0.2 # via openai # via pandas # via pandas-stubs @@ -101,7 +103,7 @@ packaging==23.2 # via black # via nox # via pytest -pandas==2.1.4 +pandas==2.2.3 # via openai pandas-stubs==2.1.4.231227 # via openai @@ -153,6 +155,8 @@ sniffio==1.3.0 # via trio sortedcontainers==2.4.0 # via trio +sounddevice==0.5.1 + # via openai time-machine==2.9.0 toml==0.10.2 # via inline-snapshot diff --git a/requirements.lock b/requirements.lock index cbdff94fa3..fa88e26c0f 100644 --- a/requirements.lock +++ b/requirements.lock @@ -7,6 +7,7 @@ # all-features: true # with-sources: false # generate-hashes: false +# universal: false -e file:. annotated-types==0.6.0 @@ -17,6 +18,8 @@ anyio==4.1.0 certifi==2023.7.22 # via httpcore # via httpx +cffi==1.17.1 + # via sounddevice distro==1.8.0 # via openai exceptiongroup==1.2.2 @@ -40,6 +43,8 @@ pandas==2.2.3 # via openai pandas-stubs==2.2.2.240807 # via openai +pycparser==2.22 + # via cffi pydantic==2.10.3 # via openai pydantic-core==2.27.1 @@ -53,6 +58,8 @@ six==1.16.0 sniffio==1.3.0 # via anyio # via openai +sounddevice==0.5.1 + # via openai tqdm==4.66.5 # via openai types-pytz==2024.2.0.20241003 diff --git a/src/openai/helpers.py b/src/openai/helpers.py new file mode 100644 index 0000000000..7cfc708c79 --- /dev/null +++ b/src/openai/helpers.py @@ -0,0 +1,3 @@ +from .lib.helpers import Microphone, LocalAudioPlayer + +__all__ = ["LocalAudioPlayer", "Microphone"] diff --git a/src/openai/lib/helpers/__init__.py b/src/openai/lib/helpers/__init__.py new file mode 100644 index 0000000000..ab3044da59 --- /dev/null +++ b/src/openai/lib/helpers/__init__.py @@ -0,0 +1,4 @@ +from .microphone import Microphone +from .local_audio_player import LocalAudioPlayer + +__all__ = ["Microphone", "LocalAudioPlayer"] diff --git a/src/openai/lib/helpers/local_audio_player.py b/src/openai/lib/helpers/local_audio_player.py new file mode 100644 index 0000000000..8d891c42eb --- /dev/null +++ b/src/openai/lib/helpers/local_audio_player.py @@ -0,0 +1,161 @@ +import queue +import asyncio +from typing import Any, Union, Callable, AsyncGenerator, cast + +import numpy as np +import sounddevice as sd +import numpy.typing as npt + +from ... import _legacy_response +from ..._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse + +SAMPLE_RATE = 24000 + + +class LocalAudioPlayer: + def __init__( + self, + should_stop: Union[Callable[[], bool], None] = None, + ): + self.channels = 1 + self.dtype = np.float32 + self.should_stop = should_stop + + async def _tts_response_to_buffer( + self, + response: Union[ + _legacy_response.HttpxBinaryResponseContent, + AsyncStreamedBinaryAPIResponse, + StreamedBinaryAPIResponse, + ], + ) -> npt.NDArray[np.float32]: + chunks: list[bytes] = [] + if isinstance(response, _legacy_response.HttpxBinaryResponseContent) or isinstance( + response, StreamedBinaryAPIResponse + ): + for chunk in response.iter_bytes(chunk_size=1024): + if chunk: + chunks.append(chunk) + else: + async for chunk in response.iter_bytes(chunk_size=1024): + if chunk: + chunks.append(chunk) + + audio_bytes = b"".join(chunks) + audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0 + audio_np = audio_np.reshape(-1, 1) + return audio_np + + async def play( + self, + input: Union[ + npt.NDArray[np.int16], + npt.NDArray[np.float32], + _legacy_response.HttpxBinaryResponseContent, + AsyncStreamedBinaryAPIResponse, + StreamedBinaryAPIResponse, + ], + ) -> None: + audio_content: npt.NDArray[np.float32] + if isinstance(input, np.ndarray): + if input.dtype == np.int16 and self.dtype == np.float32: + audio_content = (input.astype(np.float32) / 32767.0).reshape(-1, self.channels) + elif input.dtype == np.float32: + audio_content = cast(npt.NDArray[np.float32], input) + else: + raise ValueError(f"Unsupported dtype: {input.dtype}") + else: + audio_content = await self._tts_response_to_buffer(input) + + loop = asyncio.get_event_loop() + event = asyncio.Event() + idx = 0 + + def callback( + outdata: npt.NDArray[np.float32], + frame_count: int, + _time_info: Any, + _status: Any, + ): + nonlocal idx + + remainder = len(audio_content) - idx + if remainder == 0 or (callable(self.should_stop) and self.should_stop()): + loop.call_soon_threadsafe(event.set) + raise sd.CallbackStop + valid_frames = frame_count if remainder >= frame_count else remainder + outdata[:valid_frames] = audio_content[idx : idx + valid_frames] + outdata[valid_frames:] = 0 + idx += valid_frames + + stream = sd.OutputStream( + samplerate=SAMPLE_RATE, + callback=callback, + dtype=audio_content.dtype, + channels=audio_content.shape[1], + ) + with stream: + await event.wait() + + async def play_stream( + self, + buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None], + ) -> None: + loop = asyncio.get_event_loop() + event = asyncio.Event() + buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50) + + async def buffer_producer(): + async for buffer in buffer_stream: + if buffer is None: + break + await loop.run_in_executor(None, buffer_queue.put, buffer) + await loop.run_in_executor(None, buffer_queue.put, None) # Signal completion + + def callback( + outdata: npt.NDArray[np.float32], + frame_count: int, + _time_info: Any, + _status: Any, + ): + nonlocal current_buffer, buffer_pos + + frames_written = 0 + while frames_written < frame_count: + if current_buffer is None or buffer_pos >= len(current_buffer): + try: + current_buffer = buffer_queue.get(timeout=0.1) + if current_buffer is None: + loop.call_soon_threadsafe(event.set) + raise sd.CallbackStop + buffer_pos = 0 + + if current_buffer.dtype == np.int16 and self.dtype == np.float32: + current_buffer = (current_buffer.astype(np.float32) / 32767.0).reshape(-1, self.channels) + + except queue.Empty: + outdata[frames_written:] = 0 + return + + remaining_frames = len(current_buffer) - buffer_pos + frames_to_write = min(frame_count - frames_written, remaining_frames) + outdata[frames_written : frames_written + frames_to_write] = current_buffer[ + buffer_pos : buffer_pos + frames_to_write + ] + buffer_pos += frames_to_write + frames_written += frames_to_write + + current_buffer = None + buffer_pos = 0 + + producer_task = asyncio.create_task(buffer_producer()) + + with sd.OutputStream( + samplerate=SAMPLE_RATE, + channels=self.channels, + dtype=self.dtype, + callback=callback, + ): + await event.wait() + + await producer_task diff --git a/src/openai/lib/helpers/microphone.py b/src/openai/lib/helpers/microphone.py new file mode 100644 index 0000000000..bbe30735ec --- /dev/null +++ b/src/openai/lib/helpers/microphone.py @@ -0,0 +1,97 @@ +import io +import time +import wave +import asyncio +from typing import Any, Type, Union, Generic, TypeVar, Callable, overload +from typing_extensions import Literal + +import numpy as np +import sounddevice as sd +import numpy.typing as npt + +from openai._types import FileTypes, FileContent + +SAMPLE_RATE = 24000 + +DType = TypeVar("DType", bound=np.generic) + + +class Microphone(Generic[DType]): + def __init__( + self, + channels: int = 1, + dtype: Type[DType] = np.int16, + should_record: Union[Callable[[], bool], None] = None, + timeout: Union[float, None] = None, + ): + self.channels = channels + self.dtype = dtype + self.should_record = should_record + self.buffer_chunks = [] + self.timeout = timeout + self.has_record_function = callable(should_record) + + def _ndarray_to_wav(self, audio_data: npt.NDArray[DType]) -> FileTypes: + buffer: FileContent = io.BytesIO() + with wave.open(buffer, "w") as wav_file: + wav_file.setnchannels(self.channels) + wav_file.setsampwidth(np.dtype(self.dtype).itemsize) + wav_file.setframerate(SAMPLE_RATE) + wav_file.writeframes(audio_data.tobytes()) + buffer.seek(0) + return ("audio.wav", buffer, "audio/wav") + + @overload + async def record(self, return_ndarray: Literal[True]) -> npt.NDArray[DType]: ... + + @overload + async def record(self, return_ndarray: Literal[False]) -> FileTypes: ... + + @overload + async def record(self, return_ndarray: None = ...) -> FileTypes: ... + + async def record(self, return_ndarray: Union[bool, None] = False) -> Union[npt.NDArray[DType], FileTypes]: + loop = asyncio.get_event_loop() + event = asyncio.Event() + self.buffer_chunks: list[npt.NDArray[DType]] = [] + start_time = time.perf_counter() + + def callback( + indata: npt.NDArray[DType], + _frame_count: int, + _time_info: Any, + _status: Any, + ): + execution_time = time.perf_counter() - start_time + reached_recording_timeout = execution_time > self.timeout if self.timeout is not None else False + if reached_recording_timeout: + loop.call_soon_threadsafe(event.set) + raise sd.CallbackStop + + should_be_recording = self.should_record() if callable(self.should_record) else True + if not should_be_recording: + loop.call_soon_threadsafe(event.set) + raise sd.CallbackStop + + self.buffer_chunks.append(indata.copy()) + + stream = sd.InputStream( + callback=callback, + dtype=self.dtype, + samplerate=SAMPLE_RATE, + channels=self.channels, + ) + with stream: + await event.wait() + + # Concatenate all chunks into a single buffer, handle empty case + concatenated_chunks: npt.NDArray[DType] = ( + np.concatenate(self.buffer_chunks, axis=0) + if len(self.buffer_chunks) > 0 + else np.array([], dtype=self.dtype) + ) + + if return_ndarray: + return concatenated_chunks + else: + return self._ndarray_to_wav(concatenated_chunks)