From 5cbdcd7ed5c10896a32c77efc2df20feb6bf4eb2 Mon Sep 17 00:00:00 2001 From: Rishabh Bhargava Date: Thu, 10 Jul 2025 01:35:07 -0700 Subject: [PATCH 1/8] Adding support for Speech to text --- src/together/client.py | 2 + src/together/resources/audio/__init__.py | 18 ++ .../resources/audio/transcriptions.py | 234 ++++++++++++++++++ src/together/resources/audio/translations.py | 234 ++++++++++++++++++ src/together/types/__init__.py | 16 ++ src/together/types/audio_speech.py | 81 +++++- 6 files changed, 584 insertions(+), 1 deletion(-) create mode 100644 src/together/resources/audio/transcriptions.py create mode 100644 src/together/resources/audio/translations.py diff --git a/src/together/client.py b/src/together/client.py index a1784529..47a43c24 100644 --- a/src/together/client.py +++ b/src/together/client.py @@ -103,6 +103,7 @@ class AsyncTogether: models: resources.AsyncModels fine_tuning: resources.AsyncFineTuning rerank: resources.AsyncRerank + audio: resources.AsyncAudio code_interpreter: CodeInterpreter batches: resources.AsyncBatches # client options @@ -167,6 +168,7 @@ def __init__( self.models = resources.AsyncModels(self.client) self.fine_tuning = resources.AsyncFineTuning(self.client) self.rerank = resources.AsyncRerank(self.client) + self.audio = resources.AsyncAudio(self.client) self.code_interpreter = CodeInterpreter(self.client) self.batches = resources.AsyncBatches(self.client) diff --git a/src/together/resources/audio/__init__.py b/src/together/resources/audio/__init__.py index 5703d348..5c8abc43 100644 --- a/src/together/resources/audio/__init__.py +++ b/src/together/resources/audio/__init__.py @@ -1,6 +1,8 @@ from functools import cached_property from together.resources.audio.speech import AsyncSpeech, Speech +from together.resources.audio.transcriptions import AsyncTranscriptions, Transcriptions +from together.resources.audio.translations import AsyncTranslations, Translations from together.types import ( TogetherClient, ) @@ -14,6 +16,14 @@ def __init__(self, client: TogetherClient) -> None: def speech(self) -> Speech: return Speech(self._client) + @cached_property + def transcriptions(self) -> Transcriptions: + return Transcriptions(self._client) + + @cached_property + def translations(self) -> Translations: + return Translations(self._client) + class AsyncAudio: def __init__(self, client: TogetherClient) -> None: @@ -22,3 +32,11 @@ def __init__(self, client: TogetherClient) -> None: @cached_property def speech(self) -> AsyncSpeech: return AsyncSpeech(self._client) + + @cached_property + def transcriptions(self) -> AsyncTranscriptions: + return AsyncTranscriptions(self._client) + + @cached_property + def translations(self) -> AsyncTranslations: + return AsyncTranslations(self._client) diff --git a/src/together/resources/audio/transcriptions.py b/src/together/resources/audio/transcriptions.py new file mode 100644 index 00000000..a574a81d --- /dev/null +++ b/src/together/resources/audio/transcriptions.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +from typing import Any, Union, BinaryIO, Optional +from pathlib import Path + +from together.abstract import api_requestor +from together.together_response import TogetherResponse +from together.types import ( + AudioTranscriptionRequest, + AudioTranscriptionResponse, + AudioTranscriptionVerboseResponse, + AudioTranscriptionResponseFormat, + AudioTimestampGranularities, + TogetherClient, + TogetherRequest, +) + + +class Transcriptions: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + 
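+    # Usage sketch (illustrative; assumes TOGETHER_API_KEY is exported and
+    # `sample.wav` is a hypothetical local file):
+    #
+    #     from together import Together
+    #
+    #     client = Together()
+    #     resp = client.audio.transcriptions.create(
+    #         file="sample.wav",
+    #         model="openai/whisper-large-v3",
+    #     )
+    #     print(resp.text)
+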
def create( + self, + *, + file: Union[str, BinaryIO, Path], + model: str = "openai/whisper-large-v3", + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Union[str, AudioTranscriptionResponseFormat] = "json", + temperature: float = 0.0, + timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + **kwargs: Any, + ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: + """ + Transcribes audio into the input language. + + Args: + file: The audio file object (not file name) to transcribe, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + Can be a file path (str/Path), file object (BinaryIO), or URL (str). + model: ID of the model to use. Defaults to "openai/whisper-large-v3". + language: The language of the input audio. Supplying the input language in + ISO-639-1 format will improve accuracy and latency. + prompt: An optional text to guide the model's style or continue a previous + audio segment. The prompt should match the audio language. + response_format: The format of the transcript output, in one of these options: + json, verbose_json. + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 + will make the output more random, while lower values like 0.2 will make it + more focused and deterministic. + timestamp_granularities: The timestamp granularities to populate for this + transcription. response_format must be set verbose_json to use timestamp + granularities. Either or both of these options are supported: word, or segment. + + Returns: + The transcribed text in the requested format. + """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Handle file input - could be a path, URL, or file object + files_data = {} + params_data = {} + + if isinstance(file, (str, Path)): + if isinstance(file, str) and file.startswith(('http://', 'https://')): + # URL string - send as multipart field + files_data["file"] = (None, file) + else: + # Local file path + file_path = Path(file) + files_data["file"] = open(file_path, "rb") + else: + # File object + files_data["file"] = file + + # Build request parameters + params_data.update({ + "model": model, + "response_format": response_format if isinstance(response_format, str) else response_format.value, + "temperature": temperature, + }) + + if language is not None: + params_data["language"] = language + + if prompt is not None: + params_data["prompt"] = prompt + + if timestamp_granularities is not None: + params_data["timestamp_granularities"] = ( + timestamp_granularities if isinstance(timestamp_granularities, str) + else timestamp_granularities.value + ) + + # Add any additional kwargs + params_data.update(kwargs) + + try: + response, _, _ = requestor.request( + options=TogetherRequest( + method="POST", + url="audio/transcriptions", + params=params_data, + files=files_data, + ), + ) + finally: + # Close file if we opened it + if files_data and "file" in files_data: + try: + # Only close if it's a file object (not a tuple for URL) + if hasattr(files_data["file"], 'close'): + files_data["file"].close() + except: + pass + + # Parse response based on format + if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + return AudioTranscriptionVerboseResponse(**response.data) + else: + return AudioTranscriptionResponse(**response.data) + + +class AsyncTranscriptions: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + 
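+    # Async usage sketch (illustrative; assumes TOGETHER_API_KEY is exported
+    # and `sample.wav` is a hypothetical local file):
+    #
+    #     import asyncio
+    #
+    #     from together import AsyncTogether
+    #
+    #     async def main() -> None:
+    #         client = AsyncTogether()
+    #         resp = await client.audio.transcriptions.create(file="sample.wav")
+    #         print(resp.text)
+    #
+    #     asyncio.run(main())
+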
async def create( + self, + *, + file: Union[str, BinaryIO, Path], + model: str = "openai/whisper-large-v3", + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Union[str, AudioTranscriptionResponseFormat] = "json", + temperature: float = 0.0, + timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + **kwargs: Any, + ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: + """ + Async version of transcribe audio into the input language. + + Args: + file: The audio file object (not file name) to transcribe, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + Can be a file path (str/Path), file object (BinaryIO), or URL (str). + model: ID of the model to use. Defaults to "openai/whisper-large-v3". + language: The language of the input audio. Supplying the input language in + ISO-639-1 format will improve accuracy and latency. + prompt: An optional text to guide the model's style or continue a previous + audio segment. The prompt should match the audio language. + response_format: The format of the transcript output, in one of these options: + json, verbose_json. + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 + will make the output more random, while lower values like 0.2 will make it + more focused and deterministic. + timestamp_granularities: The timestamp granularities to populate for this + transcription. response_format must be set verbose_json to use timestamp + granularities. Either or both of these options are supported: word, or segment. + + Returns: + The transcribed text in the requested format. + """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Handle file input - could be a path, URL, or file object + files_data = {} + params_data = {} + + if isinstance(file, (str, Path)): + if isinstance(file, str) and file.startswith(('http://', 'https://')): + # URL string - send as multipart field + files_data["file"] = (None, file) + else: + # Local file path + file_path = Path(file) + files_data["file"] = open(file_path, "rb") + else: + # File object + files_data["file"] = file + + # Build request parameters + params_data.update({ + "model": model, + "response_format": response_format if isinstance(response_format, str) else response_format.value, + "temperature": temperature, + }) + + if language is not None: + params_data["language"] = language + + if prompt is not None: + params_data["prompt"] = prompt + + if timestamp_granularities is not None: + params_data["timestamp_granularities"] = ( + timestamp_granularities if isinstance(timestamp_granularities, str) + else timestamp_granularities.value + ) + + # Add any additional kwargs + params_data.update(kwargs) + + try: + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="POST", + url="audio/transcriptions", + params=params_data, + files=files_data, + ), + ) + finally: + # Close file if we opened it + if files_data and "file" in files_data: + try: + # Only close if it's a file object (not a tuple for URL) + if hasattr(files_data["file"], 'close'): + files_data["file"].close() + except: + pass + + # Parse response based on format + if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + return AudioTranscriptionVerboseResponse(**response.data) + else: + return AudioTranscriptionResponse(**response.data) \ No newline at end of file diff --git a/src/together/resources/audio/translations.py 
b/src/together/resources/audio/translations.py new file mode 100644 index 00000000..7b31f4ae --- /dev/null +++ b/src/together/resources/audio/translations.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +from typing import Any, Union, BinaryIO, Optional +from pathlib import Path + +from together.abstract import api_requestor +from together.together_response import TogetherResponse +from together.types import ( + AudioTranslationRequest, + AudioTranslationResponse, + AudioTranslationVerboseResponse, + AudioTranscriptionResponseFormat, + AudioTimestampGranularities, + TogetherClient, + TogetherRequest, +) + + +class Translations: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + def create( + self, + *, + file: Union[str, BinaryIO, Path], + model: str = "openai/whisper-large-v3", + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Union[str, AudioTranscriptionResponseFormat] = "json", + temperature: float = 0.0, + timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + **kwargs: Any, + ) -> Union[AudioTranslationResponse, AudioTranslationVerboseResponse]: + """ + Translates audio into English. + + Args: + file: The audio file object (not file name) to translate, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + Can be a file path (str/Path), file object (BinaryIO), or URL (str). + model: ID of the model to use. Defaults to "openai/whisper-large-v3". + language: The language of the input audio. Optional ISO-639-1 language code. + If omitted, language is set to English. + prompt: An optional text to guide the model's style or continue a previous + audio segment. The prompt should be in English. + response_format: The format of the transcript output, in one of these options: + json, verbose_json. + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 + will make the output more random, while lower values like 0.2 will make it + more focused and deterministic. + timestamp_granularities: The timestamp granularities to populate for this + translation. response_format must be set verbose_json to use timestamp + granularities. Either or both of these options are supported: word, or segment. + + Returns: + The translated text in the requested format. 
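+
+        Example (a sketch; `speech_de.mp3` is a hypothetical German recording):
+            >>> resp = client.audio.translations.create(file="speech_de.mp3")
+            >>> print(resp.text)  # the English translation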
+ """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Handle file input - could be a path, URL, or file object + files_data = {} + params_data = {} + + if isinstance(file, (str, Path)): + if isinstance(file, str) and file.startswith(('http://', 'https://')): + # URL string - send as multipart field + files_data["file"] = (None, file) + else: + # Local file path + file_path = Path(file) + files_data["file"] = open(file_path, "rb") + else: + # File object + files_data["file"] = file + + # Build request parameters + params_data.update({ + "model": model, + "response_format": response_format if isinstance(response_format, str) else response_format.value, + "temperature": temperature, + }) + + if language is not None: + params_data["language"] = language + + if prompt is not None: + params_data["prompt"] = prompt + + if timestamp_granularities is not None: + params_data["timestamp_granularities"] = ( + timestamp_granularities if isinstance(timestamp_granularities, str) + else timestamp_granularities.value + ) + + # Add any additional kwargs + params_data.update(kwargs) + + try: + response, _, _ = requestor.request( + options=TogetherRequest( + method="POST", + url="audio/translations", + params=params_data, + files=files_data, + ), + ) + finally: + # Close file if we opened it + if files_data and "file" in files_data: + try: + # Only close if it's a file object (not a tuple for URL) + if hasattr(files_data["file"], 'close'): + files_data["file"].close() + except: + pass + + # Parse response based on format + if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + return AudioTranslationVerboseResponse(**response.data) + else: + return AudioTranslationResponse(**response.data) + + +class AsyncTranslations: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + async def create( + self, + *, + file: Union[str, BinaryIO, Path], + model: str = "openai/whisper-large-v3", + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Union[str, AudioTranscriptionResponseFormat] = "json", + temperature: float = 0.0, + timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + **kwargs: Any, + ) -> Union[AudioTranslationResponse, AudioTranslationVerboseResponse]: + """ + Async version of translate audio into English. + + Args: + file: The audio file object (not file name) to translate, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + Can be a file path (str/Path), file object (BinaryIO), or URL (str). + model: ID of the model to use. Defaults to "openai/whisper-large-v3". + language: The language of the input audio. Optional ISO-639-1 language code. + If omitted, language is set to English. + prompt: An optional text to guide the model's style or continue a previous + audio segment. The prompt should be in English. + response_format: The format of the transcript output, in one of these options: + json, verbose_json. + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 + will make the output more random, while lower values like 0.2 will make it + more focused and deterministic. + timestamp_granularities: The timestamp granularities to populate for this + translation. response_format must be set verbose_json to use timestamp + granularities. Either or both of these options are supported: word, or segment. + + Returns: + The translated text in the requested format. 
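+
+        Example (a sketch, inside an async context; URLs are also accepted
+        and forwarded as a multipart field):
+            >>> resp = await client.audio.translations.create(
+            ...     file="https://example.com/speech_de.mp3",  # hypothetical URL
+            ... )
+            >>> print(resp.text)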
+ """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Handle file input - could be a path, URL, or file object + files_data = {} + params_data = {} + + if isinstance(file, (str, Path)): + if isinstance(file, str) and file.startswith(('http://', 'https://')): + # URL string - send as multipart field + files_data["file"] = (None, file) + else: + # Local file path + file_path = Path(file) + files_data["file"] = open(file_path, "rb") + else: + # File object + files_data["file"] = file + + # Build request parameters + params_data.update({ + "model": model, + "response_format": response_format if isinstance(response_format, str) else response_format.value, + "temperature": temperature, + }) + + if language is not None: + params_data["language"] = language + + if prompt is not None: + params_data["prompt"] = prompt + + if timestamp_granularities is not None: + params_data["timestamp_granularities"] = ( + timestamp_granularities if isinstance(timestamp_granularities, str) + else timestamp_granularities.value + ) + + # Add any additional kwargs + params_data.update(kwargs) + + try: + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="POST", + url="audio/translations", + params=params_data, + files=files_data, + ), + ) + finally: + # Close file if we opened it + if files_data and "file" in files_data: + try: + # Only close if it's a file object (not a tuple for URL) + if hasattr(files_data["file"], 'close'): + files_data["file"].close() + except: + pass + + # Parse response based on format + if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + return AudioTranslationVerboseResponse(**response.data) + else: + return AudioTranslationResponse(**response.data) \ No newline at end of file diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py index e218a02e..6e89baff 100644 --- a/src/together/types/__init__.py +++ b/src/together/types/__init__.py @@ -7,6 +7,14 @@ AudioSpeechStreamChunk, AudioSpeechStreamEvent, AudioSpeechStreamResponse, + AudioTranscriptionRequest, + AudioTranslationRequest, + AudioTranscriptionResponse, + AudioTranscriptionVerboseResponse, + AudioTranslationResponse, + AudioTranslationVerboseResponse, + AudioTranscriptionResponseFormat, + AudioTimestampGranularities, ) from together.types.chat_completions import ( ChatCompletionChunk, @@ -102,6 +110,14 @@ "AudioSpeechStreamChunk", "AudioSpeechStreamEvent", "AudioSpeechStreamResponse", + "AudioTranscriptionRequest", + "AudioTranslationRequest", + "AudioTranscriptionResponse", + "AudioTranscriptionVerboseResponse", + "AudioTranslationResponse", + "AudioTranslationVerboseResponse", + "AudioTranscriptionResponseFormat", + "AudioTimestampGranularities", "DedicatedEndpoint", "ListEndpoint", "Autoscaling", diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py index fb0cf786..a00b8fbd 100644 --- a/src/together/types/audio_speech.py +++ b/src/together/types/audio_speech.py @@ -1,7 +1,7 @@ from __future__ import annotations from enum import Enum -from typing import Iterator +from typing import Iterator, Union, BinaryIO, Optional, List import threading from pydantic import BaseModel, ConfigDict @@ -108,3 +108,82 @@ def stream_to_file(self, file_path: str) -> None: audio = base64.b64decode(stream_event_response.response.data.b64) f.write(audio) + + +class AudioTranscriptionResponseFormat(str, Enum): + JSON = "json" + VERBOSE_JSON = "verbose_json" + + +class 
AudioTimestampGranularities(str, Enum): + SEGMENT = "segment" + WORD = "word" + + +class AudioTranscriptionRequest(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + file: Union[str, BinaryIO] + model: str = "openai/whisper-large-v3" + language: Optional[str] = None + prompt: Optional[str] = None + response_format: AudioTranscriptionResponseFormat = AudioTranscriptionResponseFormat.JSON + temperature: float = 0.0 + timestamp_granularities: Optional[AudioTimestampGranularities] = AudioTimestampGranularities.SEGMENT + + +class AudioTranslationRequest(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + file: Union[str, BinaryIO] + model: str = "openai/whisper-large-v3" + language: Optional[str] = None + prompt: Optional[str] = None + response_format: AudioTranscriptionResponseFormat = AudioTranscriptionResponseFormat.JSON + temperature: float = 0.0 + timestamp_granularities: Optional[AudioTimestampGranularities] = AudioTimestampGranularities.SEGMENT + + +class AudioTranscriptionSegment(BaseModel): + id: int + seek: Optional[int] = None + start: float + end: float + text: str + tokens: Optional[List[int]] = None + temperature: Optional[float] = None + avg_logprob: Optional[float] = None + compression_ratio: Optional[float] = None + no_speech_prob: Optional[float] = None + + +class AudioTranscriptionWord(BaseModel): + word: str + start: float + end: float + + +class AudioTranscriptionResponse(BaseModel): + text: str + + +class AudioTranscriptionVerboseResponse(BaseModel): + task: Optional[str] = None + language: Optional[str] = None + duration: Optional[float] = None + text: str + segments: Optional[List[AudioTranscriptionSegment]] = None + words: Optional[List[AudioTranscriptionWord]] = None + + +class AudioTranslationResponse(BaseModel): + text: str + + +class AudioTranslationVerboseResponse(BaseModel): + task: Optional[str] = None + language: Optional[str] = None + duration: Optional[float] = None + text: str + segments: Optional[List[AudioTranscriptionSegment]] = None + words: Optional[List[AudioTranscriptionWord]] = None From 0e4d164d880c5204195a0db019a82ef830431b01 Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 10:39:51 -0700 Subject: [PATCH 2/8] lint fixes --- .../resources/audio/transcriptions.py | 118 ++++++++++------- src/together/resources/audio/translations.py | 120 ++++++++++++------ src/together/types/audio_speech.py | 20 ++- 3 files changed, 170 insertions(+), 88 deletions(-) diff --git a/src/together/resources/audio/transcriptions.py b/src/together/resources/audio/transcriptions.py index a574a81d..766d4175 100644 --- a/src/together/resources/audio/transcriptions.py +++ b/src/together/resources/audio/transcriptions.py @@ -1,16 +1,14 @@ from __future__ import annotations -from typing import Any, Union, BinaryIO, Optional from pathlib import Path +from typing import Any, BinaryIO, Dict, Optional, Tuple, Union from together.abstract import api_requestor -from together.together_response import TogetherResponse from together.types import ( - AudioTranscriptionRequest, + AudioTimestampGranularities, AudioTranscriptionResponse, - AudioTranscriptionVerboseResponse, AudioTranscriptionResponseFormat, - AudioTimestampGranularities, + AudioTranscriptionVerboseResponse, TogetherClient, TogetherRequest, ) @@ -29,7 +27,9 @@ def create( prompt: Optional[str] = None, response_format: Union[str, AudioTranscriptionResponseFormat] = "json", temperature: float = 0.0, - timestamp_granularities: Optional[Union[str, 
AudioTimestampGranularities]] = None, + timestamp_granularities: Optional[ + Union[str, AudioTimestampGranularities] + ] = None, **kwargs: Any, ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: """ @@ -56,17 +56,17 @@ def create( Returns: The transcribed text in the requested format. """ - + requestor = api_requestor.APIRequestor( client=self._client, ) # Handle file input - could be a path, URL, or file object - files_data = {} + files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {} params_data = {} - + if isinstance(file, (str, Path)): - if isinstance(file, str) and file.startswith(('http://', 'https://')): + if isinstance(file, str) and file.startswith(("http://", "https://")): # URL string - send as multipart field files_data["file"] = (None, file) else: @@ -78,22 +78,29 @@ def create( files_data["file"] = file # Build request parameters - params_data.update({ - "model": model, - "response_format": response_format if isinstance(response_format, str) else response_format.value, - "temperature": temperature, - }) - + params_data.update( + { + "model": model, + "response_format": ( + response_format.value + if hasattr(response_format, "value") + else response_format + ), + "temperature": temperature, + } + ) + if language is not None: params_data["language"] = language - + if prompt is not None: params_data["prompt"] = prompt - + if timestamp_granularities is not None: params_data["timestamp_granularities"] = ( - timestamp_granularities if isinstance(timestamp_granularities, str) - else timestamp_granularities.value + timestamp_granularities.value + if hasattr(timestamp_granularities, "value") + else timestamp_granularities ) # Add any additional kwargs @@ -113,13 +120,17 @@ def create( if files_data and "file" in files_data: try: # Only close if it's a file object (not a tuple for URL) - if hasattr(files_data["file"], 'close'): - files_data["file"].close() + file_obj = files_data["file"] + if hasattr(file_obj, "close") and not isinstance(file_obj, tuple): + file_obj.close() except: pass # Parse response based on format - if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + if ( + response_format == "verbose_json" + or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + ): return AudioTranscriptionVerboseResponse(**response.data) else: return AudioTranscriptionResponse(**response.data) @@ -133,12 +144,14 @@ async def create( self, *, file: Union[str, BinaryIO, Path], - model: str = "openai/whisper-large-v3", + model: str = "openai/whisper-large-v3", language: Optional[str] = None, prompt: Optional[str] = None, response_format: Union[str, AudioTranscriptionResponseFormat] = "json", temperature: float = 0.0, - timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + timestamp_granularities: Optional[ + Union[str, AudioTimestampGranularities] + ] = None, **kwargs: Any, ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: """ @@ -165,17 +178,17 @@ async def create( Returns: The transcribed text in the requested format. 
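
        Example (a sketch, inside an async context; word timestamps require
        response_format="verbose_json"):
            >>> resp = await client.audio.transcriptions.create(
            ...     file="sample.wav",  # hypothetical local file
            ...     response_format="verbose_json",
            ...     timestamp_granularities="word",
            ... )
            >>> resp.words[0].word if resp.words else None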
""" - + requestor = api_requestor.APIRequestor( client=self._client, ) # Handle file input - could be a path, URL, or file object - files_data = {} + files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {} params_data = {} - + if isinstance(file, (str, Path)): - if isinstance(file, str) and file.startswith(('http://', 'https://')): + if isinstance(file, str) and file.startswith(("http://", "https://")): # URL string - send as multipart field files_data["file"] = (None, file) else: @@ -187,22 +200,37 @@ async def create( files_data["file"] = file # Build request parameters - params_data.update({ - "model": model, - "response_format": response_format if isinstance(response_format, str) else response_format.value, - "temperature": temperature, - }) - + params_data.update( + { + "model": model, + "response_format": ( + response_format + if isinstance(response_format, str) + else ( + response_format.value + if hasattr(response_format, "value") + else response_format + ) + ), + "temperature": temperature, + } + ) + if language is not None: params_data["language"] = language - + if prompt is not None: params_data["prompt"] = prompt - + if timestamp_granularities is not None: params_data["timestamp_granularities"] = ( - timestamp_granularities if isinstance(timestamp_granularities, str) - else timestamp_granularities.value + timestamp_granularities + if isinstance(timestamp_granularities, str) + else ( + timestamp_granularities.value + if hasattr(timestamp_granularities, "value") + else timestamp_granularities + ) ) # Add any additional kwargs @@ -222,13 +250,17 @@ async def create( if files_data and "file" in files_data: try: # Only close if it's a file object (not a tuple for URL) - if hasattr(files_data["file"], 'close'): - files_data["file"].close() + file_obj = files_data["file"] + if hasattr(file_obj, "close") and not isinstance(file_obj, tuple): + file_obj.close() except: pass # Parse response based on format - if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + if ( + response_format == "verbose_json" + or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + ): return AudioTranscriptionVerboseResponse(**response.data) else: - return AudioTranscriptionResponse(**response.data) \ No newline at end of file + return AudioTranscriptionResponse(**response.data) diff --git a/src/together/resources/audio/translations.py b/src/together/resources/audio/translations.py index 7b31f4ae..4b5cc59b 100644 --- a/src/together/resources/audio/translations.py +++ b/src/together/resources/audio/translations.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union, BinaryIO, Optional +from typing import Any, Union, BinaryIO, Optional, Dict, Tuple from pathlib import Path from together.abstract import api_requestor @@ -29,7 +29,9 @@ def create( prompt: Optional[str] = None, response_format: Union[str, AudioTranscriptionResponseFormat] = "json", temperature: float = 0.0, - timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + timestamp_granularities: Optional[ + Union[str, AudioTimestampGranularities] + ] = None, **kwargs: Any, ) -> Union[AudioTranslationResponse, AudioTranslationVerboseResponse]: """ @@ -56,17 +58,17 @@ def create( Returns: The translated text in the requested format. 
""" - + requestor = api_requestor.APIRequestor( client=self._client, ) # Handle file input - could be a path, URL, or file object - files_data = {} + files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {} params_data = {} - + if isinstance(file, (str, Path)): - if isinstance(file, str) and file.startswith(('http://', 'https://')): + if isinstance(file, str) and file.startswith(("http://", "https://")): # URL string - send as multipart field files_data["file"] = (None, file) else: @@ -78,22 +80,37 @@ def create( files_data["file"] = file # Build request parameters - params_data.update({ - "model": model, - "response_format": response_format if isinstance(response_format, str) else response_format.value, - "temperature": temperature, - }) - + params_data.update( + { + "model": model, + "response_format": ( + response_format + if isinstance(response_format, str) + else ( + response_format.value + if hasattr(response_format, "value") + else response_format + ) + ), + "temperature": temperature, + } + ) + if language is not None: params_data["language"] = language - + if prompt is not None: params_data["prompt"] = prompt - + if timestamp_granularities is not None: params_data["timestamp_granularities"] = ( - timestamp_granularities if isinstance(timestamp_granularities, str) - else timestamp_granularities.value + timestamp_granularities + if isinstance(timestamp_granularities, str) + else ( + timestamp_granularities.value + if hasattr(timestamp_granularities, "value") + else timestamp_granularities + ) ) # Add any additional kwargs @@ -113,13 +130,17 @@ def create( if files_data and "file" in files_data: try: # Only close if it's a file object (not a tuple for URL) - if hasattr(files_data["file"], 'close'): - files_data["file"].close() + file_obj = files_data["file"] + if hasattr(file_obj, "close") and not isinstance(file_obj, tuple): + file_obj.close() except: pass # Parse response based on format - if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + if ( + response_format == "verbose_json" + or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + ): return AudioTranslationVerboseResponse(**response.data) else: return AudioTranslationResponse(**response.data) @@ -133,12 +154,14 @@ async def create( self, *, file: Union[str, BinaryIO, Path], - model: str = "openai/whisper-large-v3", + model: str = "openai/whisper-large-v3", language: Optional[str] = None, prompt: Optional[str] = None, response_format: Union[str, AudioTranscriptionResponseFormat] = "json", temperature: float = 0.0, - timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + timestamp_granularities: Optional[ + Union[str, AudioTimestampGranularities] + ] = None, **kwargs: Any, ) -> Union[AudioTranslationResponse, AudioTranslationVerboseResponse]: """ @@ -165,17 +188,17 @@ async def create( Returns: The translated text in the requested format. 
""" - + requestor = api_requestor.APIRequestor( client=self._client, ) # Handle file input - could be a path, URL, or file object - files_data = {} + files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {} params_data = {} - + if isinstance(file, (str, Path)): - if isinstance(file, str) and file.startswith(('http://', 'https://')): + if isinstance(file, str) and file.startswith(("http://", "https://")): # URL string - send as multipart field files_data["file"] = (None, file) else: @@ -187,22 +210,37 @@ async def create( files_data["file"] = file # Build request parameters - params_data.update({ - "model": model, - "response_format": response_format if isinstance(response_format, str) else response_format.value, - "temperature": temperature, - }) - + params_data.update( + { + "model": model, + "response_format": ( + response_format + if isinstance(response_format, str) + else ( + response_format.value + if hasattr(response_format, "value") + else response_format + ) + ), + "temperature": temperature, + } + ) + if language is not None: params_data["language"] = language - + if prompt is not None: params_data["prompt"] = prompt - + if timestamp_granularities is not None: params_data["timestamp_granularities"] = ( - timestamp_granularities if isinstance(timestamp_granularities, str) - else timestamp_granularities.value + timestamp_granularities + if isinstance(timestamp_granularities, str) + else ( + timestamp_granularities.value + if hasattr(timestamp_granularities, "value") + else timestamp_granularities + ) ) # Add any additional kwargs @@ -222,13 +260,17 @@ async def create( if files_data and "file" in files_data: try: # Only close if it's a file object (not a tuple for URL) - if hasattr(files_data["file"], 'close'): - files_data["file"].close() + file_obj = files_data["file"] + if hasattr(file_obj, "close") and not isinstance(file_obj, tuple): + file_obj.close() except: pass # Parse response based on format - if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + if ( + response_format == "verbose_json" + or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + ): return AudioTranslationVerboseResponse(**response.data) else: - return AudioTranslationResponse(**response.data) \ No newline at end of file + return AudioTranslationResponse(**response.data) diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py index a00b8fbd..0f7de7dd 100644 --- a/src/together/types/audio_speech.py +++ b/src/together/types/audio_speech.py @@ -122,26 +122,34 @@ class AudioTimestampGranularities(str, Enum): class AudioTranscriptionRequest(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - + file: Union[str, BinaryIO] model: str = "openai/whisper-large-v3" language: Optional[str] = None prompt: Optional[str] = None - response_format: AudioTranscriptionResponseFormat = AudioTranscriptionResponseFormat.JSON + response_format: AudioTranscriptionResponseFormat = ( + AudioTranscriptionResponseFormat.JSON + ) temperature: float = 0.0 - timestamp_granularities: Optional[AudioTimestampGranularities] = AudioTimestampGranularities.SEGMENT + timestamp_granularities: Optional[AudioTimestampGranularities] = ( + AudioTimestampGranularities.SEGMENT + ) class AudioTranslationRequest(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - + file: Union[str, BinaryIO] model: str = "openai/whisper-large-v3" language: Optional[str] = None prompt: Optional[str] = None - response_format: 
AudioTranscriptionResponseFormat = AudioTranscriptionResponseFormat.JSON + response_format: AudioTranscriptionResponseFormat = ( + AudioTranscriptionResponseFormat.JSON + ) temperature: float = 0.0 - timestamp_granularities: Optional[AudioTimestampGranularities] = AudioTimestampGranularities.SEGMENT + timestamp_granularities: Optional[AudioTimestampGranularities] = ( + AudioTimestampGranularities.SEGMENT + ) class AudioTranscriptionSegment(BaseModel): From a5fd61be8abb2b23c82bdb59d7808bfeee78f62a Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 11:26:20 -0700 Subject: [PATCH 3/8] remove unsupported fields from response --- src/together/types/audio_speech.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py index 0f7de7dd..f078dd5d 100644 --- a/src/together/types/audio_speech.py +++ b/src/together/types/audio_speech.py @@ -1,13 +1,12 @@ from __future__ import annotations +import base64 from enum import Enum -from typing import Iterator, Union, BinaryIO, Optional, List -import threading +from typing import BinaryIO, Iterator, List, Optional, Union from pydantic import BaseModel, ConfigDict from together.together_response import TogetherResponse -import base64 class AudioResponseFormat(str, Enum): @@ -79,23 +78,19 @@ class AudioSpeechStreamEventResponse(BaseModel): class AudioSpeechStreamResponse(BaseModel): - response: TogetherResponse | Iterator[TogetherResponse] model_config = ConfigDict(arbitrary_types_allowed=True) def stream_to_file(self, file_path: str) -> None: - if isinstance(self.response, TogetherResponse): # save response to file with open(file_path, "wb") as f: f.write(self.response.data) elif isinstance(self.response, Iterator): - with open(file_path, "wb") as f: for chunk in self.response: - # Try to parse as stream chunk stream_event_response = AudioSpeechStreamEventResponse( response={"data": chunk.data} @@ -154,15 +149,9 @@ class AudioTranslationRequest(BaseModel): class AudioTranscriptionSegment(BaseModel): id: int - seek: Optional[int] = None start: float end: float text: str - tokens: Optional[List[int]] = None - temperature: Optional[float] = None - avg_logprob: Optional[float] = None - compression_ratio: Optional[float] = None - no_speech_prob: Optional[float] = None class AudioTranscriptionWord(BaseModel): From 0b9e19685e670d8046e6650e926d23a102c40638 Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 12:11:01 -0700 Subject: [PATCH 4/8] add test for transcriptions --- .../resources/test_transcriptions.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 tests/integration/resources/test_transcriptions.py diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py new file mode 100644 index 00000000..0e4e40d2 --- /dev/null +++ b/tests/integration/resources/test_transcriptions.py @@ -0,0 +1,100 @@ +import os + +import pytest + +from together.client import Together +from together.types.audio_speech import ( + AudioTranscriptionResponse, + AudioTranscriptionVerboseResponse, +) + + +class TestTogetherTranscriptions: + @pytest.fixture + def sync_together_client(self) -> Together: + """ + Initialize object with API key from environment + """ + TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY") + return Together(api_key=TOGETHER_API_KEY) + + def test_basic_transcription_url(self, sync_together_client): + """ + Test basic transcription with URL audio file + """ + 
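+        # A local-file variant would exercise the BinaryIO path (sketch;
+        # `tests/data/sample.wav` is a hypothetical fixture):
+        #
+        #     with open("tests/data/sample.wav", "rb") as f:
+        #         resp = sync_together_client.audio.transcriptions.create(file=f)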
audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create( + file=audio_url, model="openai/whisper-large-v3" + ) + + assert isinstance(response, AudioTranscriptionResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 + + def test_transcription_with_language(self, sync_together_client): + """ + Test transcription with language parameter + """ + audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create( + file=audio_url, model="openai/whisper-large-v3", language="en" + ) + + assert isinstance(response, AudioTranscriptionResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 + + def test_transcription_verbose_json(self, sync_together_client): + """ + Test transcription with verbose JSON format and timestamps + """ + audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create( + file=audio_url, + model="openai/whisper-large-v3", + response_format="verbose_json", + timestamp_granularities="segment", + ) + + assert isinstance(response, AudioTranscriptionVerboseResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 + assert hasattr(response, "segments") + + def test_transcription_with_temperature(self, sync_together_client): + """ + Test transcription with temperature parameter + """ + audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create( + file=audio_url, model="openai/whisper-large-v3", temperature=0.2 + ) + + assert isinstance(response, AudioTranscriptionResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 + + def test_transcription_missing_file(self, sync_together_client): + """ + Test transcription with missing file parameter + """ + with pytest.raises(TypeError): + sync_together_client.audio.transcriptions.create( + model="openai/whisper-large-v3" + ) + + def test_transcription_missing_model(self, sync_together_client): + """ + Test transcription with missing model parameter - should use default model + """ + audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create(file=audio_url) + + assert isinstance(response, AudioTranscriptionResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 From e61e077504a4040e071f3396abd55f09accb8fea Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 12:14:02 -0700 Subject: [PATCH 5/8] Bump package version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 35bc164a..55a69e7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "together" -version = "1.5.20" +version = "1.5.21" authors = ["Together AI "] description = "Python client for Together's Cloud Platform!" 
readme = "README.md" From e4473c58d79c5bcfc1acc79c88f03db18f6fddba Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 12:48:12 -0700 Subject: [PATCH 6/8] remove task from response --- src/together/types/audio_speech.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py index f078dd5d..b3c110f0 100644 --- a/src/together/types/audio_speech.py +++ b/src/together/types/audio_speech.py @@ -165,7 +165,6 @@ class AudioTranscriptionResponse(BaseModel): class AudioTranscriptionVerboseResponse(BaseModel): - task: Optional[str] = None language: Optional[str] = None duration: Optional[float] = None text: str From 25d031a899a698a5a042aad3322bc1088718bff9 Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 13:07:27 -0700 Subject: [PATCH 7/8] replace audio url --- tests/integration/resources/test_transcriptions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py index 0e4e40d2..4aaa150c 100644 --- a/tests/integration/resources/test_transcriptions.py +++ b/tests/integration/resources/test_transcriptions.py @@ -22,7 +22,7 @@ def test_basic_transcription_url(self, sync_together_client): """ Test basic transcription with URL audio file """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3" @@ -36,7 +36,7 @@ def test_transcription_with_language(self, sync_together_client): """ Test transcription with language parameter """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3", language="en" @@ -50,7 +50,7 @@ def test_transcription_verbose_json(self, sync_together_client): """ Test transcription with verbose JSON format and timestamps """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, @@ -68,7 +68,7 @@ def test_transcription_with_temperature(self, sync_together_client): """ Test transcription with temperature parameter """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3", temperature=0.2 @@ -91,7 +91,7 @@ def test_transcription_missing_model(self, sync_together_client): """ Test transcription with missing model parameter - should use default model """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create(file=audio_url) From c77f643ebd42f6441d4678a1d3dbf191b3b19e56 Mon Sep 17 00:00:00 2001 From: 
Sahil Yadav Date: Thu, 10 Jul 2025 13:58:38 -0700 Subject: [PATCH 8/8] change file --- tests/integration/resources/test_transcriptions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py index 4aaa150c..f36f7c4d 100644 --- a/tests/integration/resources/test_transcriptions.py +++ b/tests/integration/resources/test_transcriptions.py @@ -22,7 +22,7 @@ def test_basic_transcription_url(self, sync_together_client): """ Test basic transcription with URL audio file """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3" @@ -36,7 +36,7 @@ def test_transcription_with_language(self, sync_together_client): """ Test transcription with language parameter """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3", language="en" @@ -50,7 +50,7 @@ def test_transcription_verbose_json(self, sync_together_client): """ Test transcription with verbose JSON format and timestamps """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, @@ -68,7 +68,7 @@ def test_transcription_with_temperature(self, sync_together_client): """ Test transcription with temperature parameter """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3", temperature=0.2 @@ -91,7 +91,7 @@ def test_transcription_missing_model(self, sync_together_client): """ Test transcription with missing model parameter - should use default model """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create(file=audio_url)
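
End-to-end usage sketch for the series (assuming TOGETHER_API_KEY is set; the
recording URL is the one the final test above settles on):

    from together import Together
    from together.types import AudioTranscriptionVerboseResponse

    client = Together()
    jfk_wav = (
        "https://ia801605.us.archive.org/28/items/jfks19630626/"
        "jfk_1963_0626_berliner.wav"
    )

    # Plain JSON transcription: an AudioTranscriptionResponse with a .text field.
    transcript = client.audio.transcriptions.create(file=jfk_wav, language="en")
    print(transcript.text)

    # Verbose transcription: segment timestamps arrive on .segments.
    verbose = client.audio.transcriptions.create(
        file=jfk_wav,
        response_format="verbose_json",
        timestamp_granularities="segment",
    )
    if isinstance(verbose, AudioTranscriptionVerboseResponse):
        for seg in verbose.segments or []:
            print(f"[{seg.start:.2f}-{seg.end:.2f}] {seg.text}")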