From 5cbdcd7ed5c10896a32c77efc2df20feb6bf4eb2 Mon Sep 17 00:00:00 2001 From: Rishabh Bhargava Date: Thu, 10 Jul 2025 01:35:07 -0700 Subject: [PATCH 1/8] Adding support for Speech to text --- src/together/client.py | 2 + src/together/resources/audio/__init__.py | 18 ++ .../resources/audio/transcriptions.py | 234 ++++++++++++++++++ src/together/resources/audio/translations.py | 234 ++++++++++++++++++ src/together/types/__init__.py | 16 ++ src/together/types/audio_speech.py | 81 +++++- 6 files changed, 584 insertions(+), 1 deletion(-) create mode 100644 src/together/resources/audio/transcriptions.py create mode 100644 src/together/resources/audio/translations.py diff --git a/src/together/client.py b/src/together/client.py index a1784529..47a43c24 100644 --- a/src/together/client.py +++ b/src/together/client.py @@ -103,6 +103,7 @@ class AsyncTogether: models: resources.AsyncModels fine_tuning: resources.AsyncFineTuning rerank: resources.AsyncRerank + audio: resources.AsyncAudio code_interpreter: CodeInterpreter batches: resources.AsyncBatches # client options @@ -167,6 +168,7 @@ def __init__( self.models = resources.AsyncModels(self.client) self.fine_tuning = resources.AsyncFineTuning(self.client) self.rerank = resources.AsyncRerank(self.client) + self.audio = resources.AsyncAudio(self.client) self.code_interpreter = CodeInterpreter(self.client) self.batches = resources.AsyncBatches(self.client) diff --git a/src/together/resources/audio/__init__.py b/src/together/resources/audio/__init__.py index 5703d348..5c8abc43 100644 --- a/src/together/resources/audio/__init__.py +++ b/src/together/resources/audio/__init__.py @@ -1,6 +1,8 @@ from functools import cached_property from together.resources.audio.speech import AsyncSpeech, Speech +from together.resources.audio.transcriptions import AsyncTranscriptions, Transcriptions +from together.resources.audio.translations import AsyncTranslations, Translations from together.types import ( TogetherClient, ) @@ -14,6 +16,14 @@ def __init__(self, client: TogetherClient) -> None: def speech(self) -> Speech: return Speech(self._client) + @cached_property + def transcriptions(self) -> Transcriptions: + return Transcriptions(self._client) + + @cached_property + def translations(self) -> Translations: + return Translations(self._client) + class AsyncAudio: def __init__(self, client: TogetherClient) -> None: @@ -22,3 +32,11 @@ def __init__(self, client: TogetherClient) -> None: @cached_property def speech(self) -> AsyncSpeech: return AsyncSpeech(self._client) + + @cached_property + def transcriptions(self) -> AsyncTranscriptions: + return AsyncTranscriptions(self._client) + + @cached_property + def translations(self) -> AsyncTranslations: + return AsyncTranslations(self._client) diff --git a/src/together/resources/audio/transcriptions.py b/src/together/resources/audio/transcriptions.py new file mode 100644 index 00000000..a574a81d --- /dev/null +++ b/src/together/resources/audio/transcriptions.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +from typing import Any, Union, BinaryIO, Optional +from pathlib import Path + +from together.abstract import api_requestor +from together.together_response import TogetherResponse +from together.types import ( + AudioTranscriptionRequest, + AudioTranscriptionResponse, + AudioTranscriptionVerboseResponse, + AudioTranscriptionResponseFormat, + AudioTimestampGranularities, + TogetherClient, + TogetherRequest, +) + + +class Transcriptions: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + 
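+    # Usage sketch (illustrative; assumes TOGETHER_API_KEY is exported and
+    # `sample.wav` is a hypothetical local file):
+    #
+    #     from together import Together
+    #
+    #     client = Together()
+    #     resp = client.audio.transcriptions.create(
+    #         file="sample.wav",
+    #         model="openai/whisper-large-v3",
+    #     )
+    #     print(resp.text)
+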
def create( + self, + *, + file: Union[str, BinaryIO, Path], + model: str = "openai/whisper-large-v3", + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Union[str, AudioTranscriptionResponseFormat] = "json", + temperature: float = 0.0, + timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + **kwargs: Any, + ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: + """ + Transcribes audio into the input language. + + Args: + file: The audio file object (not file name) to transcribe, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + Can be a file path (str/Path), file object (BinaryIO), or URL (str). + model: ID of the model to use. Defaults to "openai/whisper-large-v3". + language: The language of the input audio. Supplying the input language in + ISO-639-1 format will improve accuracy and latency. + prompt: An optional text to guide the model's style or continue a previous + audio segment. The prompt should match the audio language. + response_format: The format of the transcript output, in one of these options: + json, verbose_json. + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 + will make the output more random, while lower values like 0.2 will make it + more focused and deterministic. + timestamp_granularities: The timestamp granularities to populate for this + transcription. response_format must be set verbose_json to use timestamp + granularities. Either or both of these options are supported: word, or segment. + + Returns: + The transcribed text in the requested format. + """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Handle file input - could be a path, URL, or file object + files_data = {} + params_data = {} + + if isinstance(file, (str, Path)): + if isinstance(file, str) and file.startswith(('http://', 'https://')): + # URL string - send as multipart field + files_data["file"] = (None, file) + else: + # Local file path + file_path = Path(file) + files_data["file"] = open(file_path, "rb") + else: + # File object + files_data["file"] = file + + # Build request parameters + params_data.update({ + "model": model, + "response_format": response_format if isinstance(response_format, str) else response_format.value, + "temperature": temperature, + }) + + if language is not None: + params_data["language"] = language + + if prompt is not None: + params_data["prompt"] = prompt + + if timestamp_granularities is not None: + params_data["timestamp_granularities"] = ( + timestamp_granularities if isinstance(timestamp_granularities, str) + else timestamp_granularities.value + ) + + # Add any additional kwargs + params_data.update(kwargs) + + try: + response, _, _ = requestor.request( + options=TogetherRequest( + method="POST", + url="audio/transcriptions", + params=params_data, + files=files_data, + ), + ) + finally: + # Close file if we opened it + if files_data and "file" in files_data: + try: + # Only close if it's a file object (not a tuple for URL) + if hasattr(files_data["file"], 'close'): + files_data["file"].close() + except: + pass + + # Parse response based on format + if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + return AudioTranscriptionVerboseResponse(**response.data) + else: + return AudioTranscriptionResponse(**response.data) + + +class AsyncTranscriptions: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + 
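+    # Async usage sketch (illustrative; assumes TOGETHER_API_KEY is exported
+    # and `sample.wav` is a hypothetical local file):
+    #
+    #     import asyncio
+    #
+    #     from together import AsyncTogether
+    #
+    #     async def main() -> None:
+    #         client = AsyncTogether()
+    #         resp = await client.audio.transcriptions.create(file="sample.wav")
+    #         print(resp.text)
+    #
+    #     asyncio.run(main())
+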
async def create( + self, + *, + file: Union[str, BinaryIO, Path], + model: str = "openai/whisper-large-v3", + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Union[str, AudioTranscriptionResponseFormat] = "json", + temperature: float = 0.0, + timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + **kwargs: Any, + ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: + """ + Async version of transcribe audio into the input language. + + Args: + file: The audio file object (not file name) to transcribe, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + Can be a file path (str/Path), file object (BinaryIO), or URL (str). + model: ID of the model to use. Defaults to "openai/whisper-large-v3". + language: The language of the input audio. Supplying the input language in + ISO-639-1 format will improve accuracy and latency. + prompt: An optional text to guide the model's style or continue a previous + audio segment. The prompt should match the audio language. + response_format: The format of the transcript output, in one of these options: + json, verbose_json. + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 + will make the output more random, while lower values like 0.2 will make it + more focused and deterministic. + timestamp_granularities: The timestamp granularities to populate for this + transcription. response_format must be set verbose_json to use timestamp + granularities. Either or both of these options are supported: word, or segment. + + Returns: + The transcribed text in the requested format. + """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Handle file input - could be a path, URL, or file object + files_data = {} + params_data = {} + + if isinstance(file, (str, Path)): + if isinstance(file, str) and file.startswith(('http://', 'https://')): + # URL string - send as multipart field + files_data["file"] = (None, file) + else: + # Local file path + file_path = Path(file) + files_data["file"] = open(file_path, "rb") + else: + # File object + files_data["file"] = file + + # Build request parameters + params_data.update({ + "model": model, + "response_format": response_format if isinstance(response_format, str) else response_format.value, + "temperature": temperature, + }) + + if language is not None: + params_data["language"] = language + + if prompt is not None: + params_data["prompt"] = prompt + + if timestamp_granularities is not None: + params_data["timestamp_granularities"] = ( + timestamp_granularities if isinstance(timestamp_granularities, str) + else timestamp_granularities.value + ) + + # Add any additional kwargs + params_data.update(kwargs) + + try: + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="POST", + url="audio/transcriptions", + params=params_data, + files=files_data, + ), + ) + finally: + # Close file if we opened it + if files_data and "file" in files_data: + try: + # Only close if it's a file object (not a tuple for URL) + if hasattr(files_data["file"], 'close'): + files_data["file"].close() + except: + pass + + # Parse response based on format + if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + return AudioTranscriptionVerboseResponse(**response.data) + else: + return AudioTranscriptionResponse(**response.data) \ No newline at end of file diff --git a/src/together/resources/audio/translations.py 
b/src/together/resources/audio/translations.py new file mode 100644 index 00000000..7b31f4ae --- /dev/null +++ b/src/together/resources/audio/translations.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +from typing import Any, Union, BinaryIO, Optional +from pathlib import Path + +from together.abstract import api_requestor +from together.together_response import TogetherResponse +from together.types import ( + AudioTranslationRequest, + AudioTranslationResponse, + AudioTranslationVerboseResponse, + AudioTranscriptionResponseFormat, + AudioTimestampGranularities, + TogetherClient, + TogetherRequest, +) + + +class Translations: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + def create( + self, + *, + file: Union[str, BinaryIO, Path], + model: str = "openai/whisper-large-v3", + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Union[str, AudioTranscriptionResponseFormat] = "json", + temperature: float = 0.0, + timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + **kwargs: Any, + ) -> Union[AudioTranslationResponse, AudioTranslationVerboseResponse]: + """ + Translates audio into English. + + Args: + file: The audio file object (not file name) to translate, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + Can be a file path (str/Path), file object (BinaryIO), or URL (str). + model: ID of the model to use. Defaults to "openai/whisper-large-v3". + language: The language of the input audio. Optional ISO-639-1 language code. + If omitted, language is set to English. + prompt: An optional text to guide the model's style or continue a previous + audio segment. The prompt should be in English. + response_format: The format of the transcript output, in one of these options: + json, verbose_json. + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 + will make the output more random, while lower values like 0.2 will make it + more focused and deterministic. + timestamp_granularities: The timestamp granularities to populate for this + translation. response_format must be set verbose_json to use timestamp + granularities. Either or both of these options are supported: word, or segment. + + Returns: + The translated text in the requested format. 
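+
+        Example (a sketch; `speech_de.mp3` is a hypothetical German recording):
+            >>> resp = client.audio.translations.create(file="speech_de.mp3")
+            >>> print(resp.text)  # the English translation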
+ """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Handle file input - could be a path, URL, or file object + files_data = {} + params_data = {} + + if isinstance(file, (str, Path)): + if isinstance(file, str) and file.startswith(('http://', 'https://')): + # URL string - send as multipart field + files_data["file"] = (None, file) + else: + # Local file path + file_path = Path(file) + files_data["file"] = open(file_path, "rb") + else: + # File object + files_data["file"] = file + + # Build request parameters + params_data.update({ + "model": model, + "response_format": response_format if isinstance(response_format, str) else response_format.value, + "temperature": temperature, + }) + + if language is not None: + params_data["language"] = language + + if prompt is not None: + params_data["prompt"] = prompt + + if timestamp_granularities is not None: + params_data["timestamp_granularities"] = ( + timestamp_granularities if isinstance(timestamp_granularities, str) + else timestamp_granularities.value + ) + + # Add any additional kwargs + params_data.update(kwargs) + + try: + response, _, _ = requestor.request( + options=TogetherRequest( + method="POST", + url="audio/translations", + params=params_data, + files=files_data, + ), + ) + finally: + # Close file if we opened it + if files_data and "file" in files_data: + try: + # Only close if it's a file object (not a tuple for URL) + if hasattr(files_data["file"], 'close'): + files_data["file"].close() + except: + pass + + # Parse response based on format + if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + return AudioTranslationVerboseResponse(**response.data) + else: + return AudioTranslationResponse(**response.data) + + +class AsyncTranslations: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + async def create( + self, + *, + file: Union[str, BinaryIO, Path], + model: str = "openai/whisper-large-v3", + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Union[str, AudioTranscriptionResponseFormat] = "json", + temperature: float = 0.0, + timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + **kwargs: Any, + ) -> Union[AudioTranslationResponse, AudioTranslationVerboseResponse]: + """ + Async version of translate audio into English. + + Args: + file: The audio file object (not file name) to translate, in one of these formats: + flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + Can be a file path (str/Path), file object (BinaryIO), or URL (str). + model: ID of the model to use. Defaults to "openai/whisper-large-v3". + language: The language of the input audio. Optional ISO-639-1 language code. + If omitted, language is set to English. + prompt: An optional text to guide the model's style or continue a previous + audio segment. The prompt should be in English. + response_format: The format of the transcript output, in one of these options: + json, verbose_json. + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 + will make the output more random, while lower values like 0.2 will make it + more focused and deterministic. + timestamp_granularities: The timestamp granularities to populate for this + translation. response_format must be set verbose_json to use timestamp + granularities. Either or both of these options are supported: word, or segment. + + Returns: + The translated text in the requested format. 
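+
+        Example (a sketch, inside an async context; URLs are also accepted
+        and forwarded as a multipart field):
+            >>> resp = await client.audio.translations.create(
+            ...     file="https://example.com/speech_de.mp3",  # hypothetical URL
+            ... )
+            >>> print(resp.text)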
+ """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Handle file input - could be a path, URL, or file object + files_data = {} + params_data = {} + + if isinstance(file, (str, Path)): + if isinstance(file, str) and file.startswith(('http://', 'https://')): + # URL string - send as multipart field + files_data["file"] = (None, file) + else: + # Local file path + file_path = Path(file) + files_data["file"] = open(file_path, "rb") + else: + # File object + files_data["file"] = file + + # Build request parameters + params_data.update({ + "model": model, + "response_format": response_format if isinstance(response_format, str) else response_format.value, + "temperature": temperature, + }) + + if language is not None: + params_data["language"] = language + + if prompt is not None: + params_data["prompt"] = prompt + + if timestamp_granularities is not None: + params_data["timestamp_granularities"] = ( + timestamp_granularities if isinstance(timestamp_granularities, str) + else timestamp_granularities.value + ) + + # Add any additional kwargs + params_data.update(kwargs) + + try: + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="POST", + url="audio/translations", + params=params_data, + files=files_data, + ), + ) + finally: + # Close file if we opened it + if files_data and "file" in files_data: + try: + # Only close if it's a file object (not a tuple for URL) + if hasattr(files_data["file"], 'close'): + files_data["file"].close() + except: + pass + + # Parse response based on format + if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + return AudioTranslationVerboseResponse(**response.data) + else: + return AudioTranslationResponse(**response.data) \ No newline at end of file diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py index e218a02e..6e89baff 100644 --- a/src/together/types/__init__.py +++ b/src/together/types/__init__.py @@ -7,6 +7,14 @@ AudioSpeechStreamChunk, AudioSpeechStreamEvent, AudioSpeechStreamResponse, + AudioTranscriptionRequest, + AudioTranslationRequest, + AudioTranscriptionResponse, + AudioTranscriptionVerboseResponse, + AudioTranslationResponse, + AudioTranslationVerboseResponse, + AudioTranscriptionResponseFormat, + AudioTimestampGranularities, ) from together.types.chat_completions import ( ChatCompletionChunk, @@ -102,6 +110,14 @@ "AudioSpeechStreamChunk", "AudioSpeechStreamEvent", "AudioSpeechStreamResponse", + "AudioTranscriptionRequest", + "AudioTranslationRequest", + "AudioTranscriptionResponse", + "AudioTranscriptionVerboseResponse", + "AudioTranslationResponse", + "AudioTranslationVerboseResponse", + "AudioTranscriptionResponseFormat", + "AudioTimestampGranularities", "DedicatedEndpoint", "ListEndpoint", "Autoscaling", diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py index fb0cf786..a00b8fbd 100644 --- a/src/together/types/audio_speech.py +++ b/src/together/types/audio_speech.py @@ -1,7 +1,7 @@ from __future__ import annotations from enum import Enum -from typing import Iterator +from typing import Iterator, Union, BinaryIO, Optional, List import threading from pydantic import BaseModel, ConfigDict @@ -108,3 +108,82 @@ def stream_to_file(self, file_path: str) -> None: audio = base64.b64decode(stream_event_response.response.data.b64) f.write(audio) + + +class AudioTranscriptionResponseFormat(str, Enum): + JSON = "json" + VERBOSE_JSON = "verbose_json" + + +class 
AudioTimestampGranularities(str, Enum): + SEGMENT = "segment" + WORD = "word" + + +class AudioTranscriptionRequest(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + file: Union[str, BinaryIO] + model: str = "openai/whisper-large-v3" + language: Optional[str] = None + prompt: Optional[str] = None + response_format: AudioTranscriptionResponseFormat = AudioTranscriptionResponseFormat.JSON + temperature: float = 0.0 + timestamp_granularities: Optional[AudioTimestampGranularities] = AudioTimestampGranularities.SEGMENT + + +class AudioTranslationRequest(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + file: Union[str, BinaryIO] + model: str = "openai/whisper-large-v3" + language: Optional[str] = None + prompt: Optional[str] = None + response_format: AudioTranscriptionResponseFormat = AudioTranscriptionResponseFormat.JSON + temperature: float = 0.0 + timestamp_granularities: Optional[AudioTimestampGranularities] = AudioTimestampGranularities.SEGMENT + + +class AudioTranscriptionSegment(BaseModel): + id: int + seek: Optional[int] = None + start: float + end: float + text: str + tokens: Optional[List[int]] = None + temperature: Optional[float] = None + avg_logprob: Optional[float] = None + compression_ratio: Optional[float] = None + no_speech_prob: Optional[float] = None + + +class AudioTranscriptionWord(BaseModel): + word: str + start: float + end: float + + +class AudioTranscriptionResponse(BaseModel): + text: str + + +class AudioTranscriptionVerboseResponse(BaseModel): + task: Optional[str] = None + language: Optional[str] = None + duration: Optional[float] = None + text: str + segments: Optional[List[AudioTranscriptionSegment]] = None + words: Optional[List[AudioTranscriptionWord]] = None + + +class AudioTranslationResponse(BaseModel): + text: str + + +class AudioTranslationVerboseResponse(BaseModel): + task: Optional[str] = None + language: Optional[str] = None + duration: Optional[float] = None + text: str + segments: Optional[List[AudioTranscriptionSegment]] = None + words: Optional[List[AudioTranscriptionWord]] = None From 0e4d164d880c5204195a0db019a82ef830431b01 Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 10:39:51 -0700 Subject: [PATCH 2/8] lint fixes --- .../resources/audio/transcriptions.py | 118 ++++++++++------- src/together/resources/audio/translations.py | 120 ++++++++++++------ src/together/types/audio_speech.py | 20 ++- 3 files changed, 170 insertions(+), 88 deletions(-) diff --git a/src/together/resources/audio/transcriptions.py b/src/together/resources/audio/transcriptions.py index a574a81d..766d4175 100644 --- a/src/together/resources/audio/transcriptions.py +++ b/src/together/resources/audio/transcriptions.py @@ -1,16 +1,14 @@ from __future__ import annotations -from typing import Any, Union, BinaryIO, Optional from pathlib import Path +from typing import Any, BinaryIO, Dict, Optional, Tuple, Union from together.abstract import api_requestor -from together.together_response import TogetherResponse from together.types import ( - AudioTranscriptionRequest, + AudioTimestampGranularities, AudioTranscriptionResponse, - AudioTranscriptionVerboseResponse, AudioTranscriptionResponseFormat, - AudioTimestampGranularities, + AudioTranscriptionVerboseResponse, TogetherClient, TogetherRequest, ) @@ -29,7 +27,9 @@ def create( prompt: Optional[str] = None, response_format: Union[str, AudioTranscriptionResponseFormat] = "json", temperature: float = 0.0, - timestamp_granularities: Optional[Union[str, 
AudioTimestampGranularities]] = None, + timestamp_granularities: Optional[ + Union[str, AudioTimestampGranularities] + ] = None, **kwargs: Any, ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: """ @@ -56,17 +56,17 @@ def create( Returns: The transcribed text in the requested format. """ - + requestor = api_requestor.APIRequestor( client=self._client, ) # Handle file input - could be a path, URL, or file object - files_data = {} + files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {} params_data = {} - + if isinstance(file, (str, Path)): - if isinstance(file, str) and file.startswith(('http://', 'https://')): + if isinstance(file, str) and file.startswith(("http://", "https://")): # URL string - send as multipart field files_data["file"] = (None, file) else: @@ -78,22 +78,29 @@ def create( files_data["file"] = file # Build request parameters - params_data.update({ - "model": model, - "response_format": response_format if isinstance(response_format, str) else response_format.value, - "temperature": temperature, - }) - + params_data.update( + { + "model": model, + "response_format": ( + response_format.value + if hasattr(response_format, "value") + else response_format + ), + "temperature": temperature, + } + ) + if language is not None: params_data["language"] = language - + if prompt is not None: params_data["prompt"] = prompt - + if timestamp_granularities is not None: params_data["timestamp_granularities"] = ( - timestamp_granularities if isinstance(timestamp_granularities, str) - else timestamp_granularities.value + timestamp_granularities.value + if hasattr(timestamp_granularities, "value") + else timestamp_granularities ) # Add any additional kwargs @@ -113,13 +120,17 @@ def create( if files_data and "file" in files_data: try: # Only close if it's a file object (not a tuple for URL) - if hasattr(files_data["file"], 'close'): - files_data["file"].close() + file_obj = files_data["file"] + if hasattr(file_obj, "close") and not isinstance(file_obj, tuple): + file_obj.close() except: pass # Parse response based on format - if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + if ( + response_format == "verbose_json" + or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + ): return AudioTranscriptionVerboseResponse(**response.data) else: return AudioTranscriptionResponse(**response.data) @@ -133,12 +144,14 @@ async def create( self, *, file: Union[str, BinaryIO, Path], - model: str = "openai/whisper-large-v3", + model: str = "openai/whisper-large-v3", language: Optional[str] = None, prompt: Optional[str] = None, response_format: Union[str, AudioTranscriptionResponseFormat] = "json", temperature: float = 0.0, - timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + timestamp_granularities: Optional[ + Union[str, AudioTimestampGranularities] + ] = None, **kwargs: Any, ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: """ @@ -165,17 +178,17 @@ async def create( Returns: The transcribed text in the requested format. 
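
        Example (a sketch, inside an async context; word timestamps require
        response_format="verbose_json"):
            >>> resp = await client.audio.transcriptions.create(
            ...     file="sample.wav",  # hypothetical local file
            ...     response_format="verbose_json",
            ...     timestamp_granularities="word",
            ... )
            >>> resp.words[0].word if resp.words else None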
""" - + requestor = api_requestor.APIRequestor( client=self._client, ) # Handle file input - could be a path, URL, or file object - files_data = {} + files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {} params_data = {} - + if isinstance(file, (str, Path)): - if isinstance(file, str) and file.startswith(('http://', 'https://')): + if isinstance(file, str) and file.startswith(("http://", "https://")): # URL string - send as multipart field files_data["file"] = (None, file) else: @@ -187,22 +200,37 @@ async def create( files_data["file"] = file # Build request parameters - params_data.update({ - "model": model, - "response_format": response_format if isinstance(response_format, str) else response_format.value, - "temperature": temperature, - }) - + params_data.update( + { + "model": model, + "response_format": ( + response_format + if isinstance(response_format, str) + else ( + response_format.value + if hasattr(response_format, "value") + else response_format + ) + ), + "temperature": temperature, + } + ) + if language is not None: params_data["language"] = language - + if prompt is not None: params_data["prompt"] = prompt - + if timestamp_granularities is not None: params_data["timestamp_granularities"] = ( - timestamp_granularities if isinstance(timestamp_granularities, str) - else timestamp_granularities.value + timestamp_granularities + if isinstance(timestamp_granularities, str) + else ( + timestamp_granularities.value + if hasattr(timestamp_granularities, "value") + else timestamp_granularities + ) ) # Add any additional kwargs @@ -222,13 +250,17 @@ async def create( if files_data and "file" in files_data: try: # Only close if it's a file object (not a tuple for URL) - if hasattr(files_data["file"], 'close'): - files_data["file"].close() + file_obj = files_data["file"] + if hasattr(file_obj, "close") and not isinstance(file_obj, tuple): + file_obj.close() except: pass # Parse response based on format - if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + if ( + response_format == "verbose_json" + or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + ): return AudioTranscriptionVerboseResponse(**response.data) else: - return AudioTranscriptionResponse(**response.data) \ No newline at end of file + return AudioTranscriptionResponse(**response.data) diff --git a/src/together/resources/audio/translations.py b/src/together/resources/audio/translations.py index 7b31f4ae..4b5cc59b 100644 --- a/src/together/resources/audio/translations.py +++ b/src/together/resources/audio/translations.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union, BinaryIO, Optional +from typing import Any, Union, BinaryIO, Optional, Dict, Tuple from pathlib import Path from together.abstract import api_requestor @@ -29,7 +29,9 @@ def create( prompt: Optional[str] = None, response_format: Union[str, AudioTranscriptionResponseFormat] = "json", temperature: float = 0.0, - timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + timestamp_granularities: Optional[ + Union[str, AudioTimestampGranularities] + ] = None, **kwargs: Any, ) -> Union[AudioTranslationResponse, AudioTranslationVerboseResponse]: """ @@ -56,17 +58,17 @@ def create( Returns: The translated text in the requested format. 
""" - + requestor = api_requestor.APIRequestor( client=self._client, ) # Handle file input - could be a path, URL, or file object - files_data = {} + files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {} params_data = {} - + if isinstance(file, (str, Path)): - if isinstance(file, str) and file.startswith(('http://', 'https://')): + if isinstance(file, str) and file.startswith(("http://", "https://")): # URL string - send as multipart field files_data["file"] = (None, file) else: @@ -78,22 +80,37 @@ def create( files_data["file"] = file # Build request parameters - params_data.update({ - "model": model, - "response_format": response_format if isinstance(response_format, str) else response_format.value, - "temperature": temperature, - }) - + params_data.update( + { + "model": model, + "response_format": ( + response_format + if isinstance(response_format, str) + else ( + response_format.value + if hasattr(response_format, "value") + else response_format + ) + ), + "temperature": temperature, + } + ) + if language is not None: params_data["language"] = language - + if prompt is not None: params_data["prompt"] = prompt - + if timestamp_granularities is not None: params_data["timestamp_granularities"] = ( - timestamp_granularities if isinstance(timestamp_granularities, str) - else timestamp_granularities.value + timestamp_granularities + if isinstance(timestamp_granularities, str) + else ( + timestamp_granularities.value + if hasattr(timestamp_granularities, "value") + else timestamp_granularities + ) ) # Add any additional kwargs @@ -113,13 +130,17 @@ def create( if files_data and "file" in files_data: try: # Only close if it's a file object (not a tuple for URL) - if hasattr(files_data["file"], 'close'): - files_data["file"].close() + file_obj = files_data["file"] + if hasattr(file_obj, "close") and not isinstance(file_obj, tuple): + file_obj.close() except: pass # Parse response based on format - if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + if ( + response_format == "verbose_json" + or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + ): return AudioTranslationVerboseResponse(**response.data) else: return AudioTranslationResponse(**response.data) @@ -133,12 +154,14 @@ async def create( self, *, file: Union[str, BinaryIO, Path], - model: str = "openai/whisper-large-v3", + model: str = "openai/whisper-large-v3", language: Optional[str] = None, prompt: Optional[str] = None, response_format: Union[str, AudioTranscriptionResponseFormat] = "json", temperature: float = 0.0, - timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None, + timestamp_granularities: Optional[ + Union[str, AudioTimestampGranularities] + ] = None, **kwargs: Any, ) -> Union[AudioTranslationResponse, AudioTranslationVerboseResponse]: """ @@ -165,17 +188,17 @@ async def create( Returns: The translated text in the requested format. 
""" - + requestor = api_requestor.APIRequestor( client=self._client, ) # Handle file input - could be a path, URL, or file object - files_data = {} + files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {} params_data = {} - + if isinstance(file, (str, Path)): - if isinstance(file, str) and file.startswith(('http://', 'https://')): + if isinstance(file, str) and file.startswith(("http://", "https://")): # URL string - send as multipart field files_data["file"] = (None, file) else: @@ -187,22 +210,37 @@ async def create( files_data["file"] = file # Build request parameters - params_data.update({ - "model": model, - "response_format": response_format if isinstance(response_format, str) else response_format.value, - "temperature": temperature, - }) - + params_data.update( + { + "model": model, + "response_format": ( + response_format + if isinstance(response_format, str) + else ( + response_format.value + if hasattr(response_format, "value") + else response_format + ) + ), + "temperature": temperature, + } + ) + if language is not None: params_data["language"] = language - + if prompt is not None: params_data["prompt"] = prompt - + if timestamp_granularities is not None: params_data["timestamp_granularities"] = ( - timestamp_granularities if isinstance(timestamp_granularities, str) - else timestamp_granularities.value + timestamp_granularities + if isinstance(timestamp_granularities, str) + else ( + timestamp_granularities.value + if hasattr(timestamp_granularities, "value") + else timestamp_granularities + ) ) # Add any additional kwargs @@ -222,13 +260,17 @@ async def create( if files_data and "file" in files_data: try: # Only close if it's a file object (not a tuple for URL) - if hasattr(files_data["file"], 'close'): - files_data["file"].close() + file_obj = files_data["file"] + if hasattr(file_obj, "close") and not isinstance(file_obj, tuple): + file_obj.close() except: pass # Parse response based on format - if response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON: + if ( + response_format == "verbose_json" + or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + ): return AudioTranslationVerboseResponse(**response.data) else: - return AudioTranslationResponse(**response.data) \ No newline at end of file + return AudioTranslationResponse(**response.data) diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py index a00b8fbd..0f7de7dd 100644 --- a/src/together/types/audio_speech.py +++ b/src/together/types/audio_speech.py @@ -122,26 +122,34 @@ class AudioTimestampGranularities(str, Enum): class AudioTranscriptionRequest(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - + file: Union[str, BinaryIO] model: str = "openai/whisper-large-v3" language: Optional[str] = None prompt: Optional[str] = None - response_format: AudioTranscriptionResponseFormat = AudioTranscriptionResponseFormat.JSON + response_format: AudioTranscriptionResponseFormat = ( + AudioTranscriptionResponseFormat.JSON + ) temperature: float = 0.0 - timestamp_granularities: Optional[AudioTimestampGranularities] = AudioTimestampGranularities.SEGMENT + timestamp_granularities: Optional[AudioTimestampGranularities] = ( + AudioTimestampGranularities.SEGMENT + ) class AudioTranslationRequest(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - + file: Union[str, BinaryIO] model: str = "openai/whisper-large-v3" language: Optional[str] = None prompt: Optional[str] = None - response_format: 
AudioTranscriptionResponseFormat = AudioTranscriptionResponseFormat.JSON + response_format: AudioTranscriptionResponseFormat = ( + AudioTranscriptionResponseFormat.JSON + ) temperature: float = 0.0 - timestamp_granularities: Optional[AudioTimestampGranularities] = AudioTimestampGranularities.SEGMENT + timestamp_granularities: Optional[AudioTimestampGranularities] = ( + AudioTimestampGranularities.SEGMENT + ) class AudioTranscriptionSegment(BaseModel): From a5fd61be8abb2b23c82bdb59d7808bfeee78f62a Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 11:26:20 -0700 Subject: [PATCH 3/8] remove unsupported fields from response --- src/together/types/audio_speech.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py index 0f7de7dd..f078dd5d 100644 --- a/src/together/types/audio_speech.py +++ b/src/together/types/audio_speech.py @@ -1,13 +1,12 @@ from __future__ import annotations +import base64 from enum import Enum -from typing import Iterator, Union, BinaryIO, Optional, List -import threading +from typing import BinaryIO, Iterator, List, Optional, Union from pydantic import BaseModel, ConfigDict from together.together_response import TogetherResponse -import base64 class AudioResponseFormat(str, Enum): @@ -79,23 +78,19 @@ class AudioSpeechStreamEventResponse(BaseModel): class AudioSpeechStreamResponse(BaseModel): - response: TogetherResponse | Iterator[TogetherResponse] model_config = ConfigDict(arbitrary_types_allowed=True) def stream_to_file(self, file_path: str) -> None: - if isinstance(self.response, TogetherResponse): # save response to file with open(file_path, "wb") as f: f.write(self.response.data) elif isinstance(self.response, Iterator): - with open(file_path, "wb") as f: for chunk in self.response: - # Try to parse as stream chunk stream_event_response = AudioSpeechStreamEventResponse( response={"data": chunk.data} @@ -154,15 +149,9 @@ class AudioTranslationRequest(BaseModel): class AudioTranscriptionSegment(BaseModel): id: int - seek: Optional[int] = None start: float end: float text: str - tokens: Optional[List[int]] = None - temperature: Optional[float] = None - avg_logprob: Optional[float] = None - compression_ratio: Optional[float] = None - no_speech_prob: Optional[float] = None class AudioTranscriptionWord(BaseModel): From 0b9e19685e670d8046e6650e926d23a102c40638 Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 12:11:01 -0700 Subject: [PATCH 4/8] add test for transcriptions --- .../resources/test_transcriptions.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 tests/integration/resources/test_transcriptions.py diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py new file mode 100644 index 00000000..0e4e40d2 --- /dev/null +++ b/tests/integration/resources/test_transcriptions.py @@ -0,0 +1,100 @@ +import os + +import pytest + +from together.client import Together +from together.types.audio_speech import ( + AudioTranscriptionResponse, + AudioTranscriptionVerboseResponse, +) + + +class TestTogetherTranscriptions: + @pytest.fixture + def sync_together_client(self) -> Together: + """ + Initialize object with API key from environment + """ + TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY") + return Together(api_key=TOGETHER_API_KEY) + + def test_basic_transcription_url(self, sync_together_client): + """ + Test basic transcription with URL audio file + """ + 
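+        # A local-file variant would exercise the BinaryIO path (sketch;
+        # `tests/data/sample.wav` is a hypothetical fixture):
+        #
+        #     with open("tests/data/sample.wav", "rb") as f:
+        #         resp = sync_together_client.audio.transcriptions.create(file=f)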
audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create( + file=audio_url, model="openai/whisper-large-v3" + ) + + assert isinstance(response, AudioTranscriptionResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 + + def test_transcription_with_language(self, sync_together_client): + """ + Test transcription with language parameter + """ + audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create( + file=audio_url, model="openai/whisper-large-v3", language="en" + ) + + assert isinstance(response, AudioTranscriptionResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 + + def test_transcription_verbose_json(self, sync_together_client): + """ + Test transcription with verbose JSON format and timestamps + """ + audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create( + file=audio_url, + model="openai/whisper-large-v3", + response_format="verbose_json", + timestamp_granularities="segment", + ) + + assert isinstance(response, AudioTranscriptionVerboseResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 + assert hasattr(response, "segments") + + def test_transcription_with_temperature(self, sync_together_client): + """ + Test transcription with temperature parameter + """ + audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create( + file=audio_url, model="openai/whisper-large-v3", temperature=0.2 + ) + + assert isinstance(response, AudioTranscriptionResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 + + def test_transcription_missing_file(self, sync_together_client): + """ + Test transcription with missing file parameter + """ + with pytest.raises(TypeError): + sync_together_client.audio.transcriptions.create( + model="openai/whisper-large-v3" + ) + + def test_transcription_missing_model(self, sync_together_client): + """ + Test transcription with missing model parameter - should use default model + """ + audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + + response = sync_together_client.audio.transcriptions.create(file=audio_url) + + assert isinstance(response, AudioTranscriptionResponse) + assert isinstance(response.text, str) + assert len(response.text) > 0 From e61e077504a4040e071f3396abd55f09accb8fea Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 12:14:02 -0700 Subject: [PATCH 5/8] Bump package version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 35bc164a..55a69e7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "together" -version = "1.5.20" +version = "1.5.21" authors = ["Together AI "] description = "Python client for Together's Cloud Platform!" 
readme = "README.md" From e4473c58d79c5bcfc1acc79c88f03db18f6fddba Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 12:48:12 -0700 Subject: [PATCH 6/8] remove task from response --- src/together/types/audio_speech.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py index f078dd5d..b3c110f0 100644 --- a/src/together/types/audio_speech.py +++ b/src/together/types/audio_speech.py @@ -165,7 +165,6 @@ class AudioTranscriptionResponse(BaseModel): class AudioTranscriptionVerboseResponse(BaseModel): - task: Optional[str] = None language: Optional[str] = None duration: Optional[float] = None text: str From 25d031a899a698a5a042aad3322bc1088718bff9 Mon Sep 17 00:00:00 2001 From: Sahil Yadav Date: Thu, 10 Jul 2025 13:07:27 -0700 Subject: [PATCH 7/8] replace audio url --- tests/integration/resources/test_transcriptions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py index 0e4e40d2..4aaa150c 100644 --- a/tests/integration/resources/test_transcriptions.py +++ b/tests/integration/resources/test_transcriptions.py @@ -22,7 +22,7 @@ def test_basic_transcription_url(self, sync_together_client): """ Test basic transcription with URL audio file """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3" @@ -36,7 +36,7 @@ def test_transcription_with_language(self, sync_together_client): """ Test transcription with language parameter """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3", language="en" @@ -50,7 +50,7 @@ def test_transcription_verbose_json(self, sync_together_client): """ Test transcription with verbose JSON format and timestamps """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, @@ -68,7 +68,7 @@ def test_transcription_with_temperature(self, sync_together_client): """ Test transcription with temperature parameter """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3", temperature=0.2 @@ -91,7 +91,7 @@ def test_transcription_missing_model(self, sync_together_client): """ Test transcription with missing model parameter - should use default model """ - audio_url = "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" response = sync_together_client.audio.transcriptions.create(file=audio_url) From c77f643ebd42f6441d4678a1d3dbf191b3b19e56 Mon Sep 17 00:00:00 2001 From: 
Sahil Yadav Date: Thu, 10 Jul 2025 13:58:38 -0700 Subject: [PATCH 8/8] change file --- tests/integration/resources/test_transcriptions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py index 4aaa150c..f36f7c4d 100644 --- a/tests/integration/resources/test_transcriptions.py +++ b/tests/integration/resources/test_transcriptions.py @@ -22,7 +22,7 @@ def test_basic_transcription_url(self, sync_together_client): """ Test basic transcription with URL audio file """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3" @@ -36,7 +36,7 @@ def test_transcription_with_language(self, sync_together_client): """ Test transcription with language parameter """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3", language="en" @@ -50,7 +50,7 @@ def test_transcription_verbose_json(self, sync_together_client): """ Test transcription with verbose JSON format and timestamps """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, @@ -68,7 +68,7 @@ def test_transcription_with_temperature(self, sync_together_client): """ Test transcription with temperature parameter """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create( file=audio_url, model="openai/whisper-large-v3", temperature=0.2 @@ -91,7 +91,7 @@ def test_transcription_missing_model(self, sync_together_client): """ Test transcription with missing model parameter - should use default model """ - audio_url = "https://www.soundjay.com/misc/sounds/bell-ringing-05.wav" + audio_url = "https://ia801605.us.archive.org/28/items/jfks19630626/jfk_1963_0626_berliner.wav" response = sync_together_client.audio.transcriptions.create(file=audio_url)
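
End-to-end usage sketch for the series (assuming TOGETHER_API_KEY is set; the
recording URL is the one the final test above settles on):

    from together import Together
    from together.types import AudioTranscriptionVerboseResponse

    client = Together()
    jfk_wav = (
        "https://ia801605.us.archive.org/28/items/jfks19630626/"
        "jfk_1963_0626_berliner.wav"
    )

    # Plain JSON transcription: an AudioTranscriptionResponse with a .text field.
    transcript = client.audio.transcriptions.create(file=jfk_wav, language="en")
    print(transcript.text)

    # Verbose transcription: segment timestamps arrive on .segments.
    verbose = client.audio.transcriptions.create(
        file=jfk_wav,
        response_format="verbose_json",
        timestamp_granularities="segment",
    )
    if isinstance(verbose, AudioTranscriptionVerboseResponse):
        for seg in verbose.segments or []:
            print(f"[{seg.start:.2f}-{seg.end:.2f}] {seg.text}")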