Skip to content

Commit 5cbdcd7

Browse files
rishabh-bhargava and yadavsahil197
authored and committed
Adding support for Speech to text
1 parent 724a192 commit 5cbdcd7

File tree

6 files changed

+584
-1
lines changed

6 files changed

+584
-1
lines changed

src/together/client.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ class AsyncTogether:
103103
models: resources.AsyncModels
104104
fine_tuning: resources.AsyncFineTuning
105105
rerank: resources.AsyncRerank
106+
audio: resources.AsyncAudio
106107
code_interpreter: CodeInterpreter
107108
batches: resources.AsyncBatches
108109
# client options
@@ -167,6 +168,7 @@ def __init__(
167168
self.models = resources.AsyncModels(self.client)
168169
self.fine_tuning = resources.AsyncFineTuning(self.client)
169170
self.rerank = resources.AsyncRerank(self.client)
171+
self.audio = resources.AsyncAudio(self.client)
170172
self.code_interpreter = CodeInterpreter(self.client)
171173
self.batches = resources.AsyncBatches(self.client)
172174

src/together/resources/audio/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from functools import cached_property
22

33
from together.resources.audio.speech import AsyncSpeech, Speech
4+
from together.resources.audio.transcriptions import AsyncTranscriptions, Transcriptions
5+
from together.resources.audio.translations import AsyncTranslations, Translations
46
from together.types import (
57
TogetherClient,
68
)
@@ -14,6 +16,14 @@ def __init__(self, client: TogetherClient) -> None:
1416
def speech(self) -> Speech:
1517
return Speech(self._client)
1618

19+
@cached_property
20+
def transcriptions(self) -> Transcriptions:
21+
return Transcriptions(self._client)
22+
23+
@cached_property
24+
def translations(self) -> Translations:
25+
return Translations(self._client)
26+
1727

1828
class AsyncAudio:
1929
def __init__(self, client: TogetherClient) -> None:
@@ -22,3 +32,11 @@ def __init__(self, client: TogetherClient) -> None:
2232
@cached_property
2333
def speech(self) -> AsyncSpeech:
2434
return AsyncSpeech(self._client)
35+
36+
@cached_property
37+
def transcriptions(self) -> AsyncTranscriptions:
38+
return AsyncTranscriptions(self._client)
39+
40+
@cached_property
41+
def translations(self) -> AsyncTranslations:
42+
return AsyncTranslations(self._client)
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
from __future__ import annotations
2+
3+
from typing import Any, Union, BinaryIO, Optional
4+
from pathlib import Path
5+
6+
from together.abstract import api_requestor
7+
from together.together_response import TogetherResponse
8+
from together.types import (
9+
AudioTranscriptionRequest,
10+
AudioTranscriptionResponse,
11+
AudioTranscriptionVerboseResponse,
12+
AudioTranscriptionResponseFormat,
13+
AudioTimestampGranularities,
14+
TogetherClient,
15+
TogetherRequest,
16+
)
17+
18+
19+
class Transcriptions:
    """Synchronous client for the ``audio/transcriptions`` endpoint."""

    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    def create(
        self,
        *,
        file: Union[str, BinaryIO, Path],
        model: str = "openai/whisper-large-v3",
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: Union[str, AudioTranscriptionResponseFormat] = "json",
        temperature: float = 0.0,
        timestamp_granularities: Optional[
            Union[str, AudioTimestampGranularities]
        ] = None,
        **kwargs: Any,
    ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
        """
        Transcribes audio into the input language.

        Args:
            file: The audio to transcribe, in one of these formats:
                flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
                Can be a file path (str/Path), file object (BinaryIO), or URL (str).
            model: ID of the model to use. Defaults to "openai/whisper-large-v3".
            language: The language of the input audio. Supplying the input language in
                ISO-639-1 format will improve accuracy and latency.
            prompt: An optional text to guide the model's style or continue a previous
                audio segment. The prompt should match the audio language.
            response_format: The format of the transcript output, in one of these options:
                json, verbose_json.
            temperature: The sampling temperature, between 0 and 1. Higher values like 0.8
                will make the output more random, while lower values like 0.2 will make it
                more focused and deterministic.
            timestamp_granularities: The timestamp granularities to populate for this
                transcription. response_format must be set verbose_json to use timestamp
                granularities. Either or both of these options are supported: word, or segment.
            **kwargs: Additional provider-specific parameters forwarded verbatim.

        Returns:
            The transcribed text in the requested format.
        """

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        # Handle file input - could be a path, URL, or file object.
        files_data: dict = {}
        params_data: dict = {}
        # Track whether *we* opened the file handle; caller-supplied file
        # objects must not be closed by us (the caller owns their lifetime).
        owns_file = False

        if isinstance(file, (str, Path)):
            if isinstance(file, str) and file.startswith(("http://", "https://")):
                # URL string - send as a plain multipart field (no filename)
                files_data["file"] = (None, file)
            else:
                # Local file path - we open it, so we are responsible for closing it
                files_data["file"] = open(Path(file), "rb")
                owns_file = True
        else:
            # File object supplied by the caller
            files_data["file"] = file

        # Build request parameters
        params_data.update(
            {
                "model": model,
                "response_format": (
                    response_format
                    if isinstance(response_format, str)
                    else response_format.value
                ),
                "temperature": temperature,
            }
        )

        if language is not None:
            params_data["language"] = language

        if prompt is not None:
            params_data["prompt"] = prompt

        if timestamp_granularities is not None:
            params_data["timestamp_granularities"] = (
                timestamp_granularities
                if isinstance(timestamp_granularities, str)
                else timestamp_granularities.value
            )

        # Add any additional kwargs
        params_data.update(kwargs)

        try:
            response, _, _ = requestor.request(
                options=TogetherRequest(
                    method="POST",
                    url="audio/transcriptions",
                    params=params_data,
                    files=files_data,
                ),
            )
        finally:
            # Close the handle only if this method opened it.  (Fixes two
            # defects in the original: it closed caller-supplied file objects,
            # and used a bare ``except:`` that swallowed even SystemExit.)
            if owns_file:
                try:
                    files_data["file"].close()
                except Exception:
                    pass

        # Parse response based on the requested format
        if (
            response_format == "verbose_json"
            or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
        ):
            return AudioTranscriptionVerboseResponse(**response.data)
        else:
            return AudioTranscriptionResponse(**response.data)
128+
class AsyncTranscriptions:
    """Asynchronous client for the ``audio/transcriptions`` endpoint."""

    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    async def create(
        self,
        *,
        file: Union[str, BinaryIO, Path],
        model: str = "openai/whisper-large-v3",
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: Union[str, AudioTranscriptionResponseFormat] = "json",
        temperature: float = 0.0,
        timestamp_granularities: Optional[
            Union[str, AudioTimestampGranularities]
        ] = None,
        **kwargs: Any,
    ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
        """
        Async version of transcribe audio into the input language.

        Args:
            file: The audio to transcribe, in one of these formats:
                flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
                Can be a file path (str/Path), file object (BinaryIO), or URL (str).
            model: ID of the model to use. Defaults to "openai/whisper-large-v3".
            language: The language of the input audio. Supplying the input language in
                ISO-639-1 format will improve accuracy and latency.
            prompt: An optional text to guide the model's style or continue a previous
                audio segment. The prompt should match the audio language.
            response_format: The format of the transcript output, in one of these options:
                json, verbose_json.
            temperature: The sampling temperature, between 0 and 1. Higher values like 0.8
                will make the output more random, while lower values like 0.2 will make it
                more focused and deterministic.
            timestamp_granularities: The timestamp granularities to populate for this
                transcription. response_format must be set verbose_json to use timestamp
                granularities. Either or both of these options are supported: word, or segment.
            **kwargs: Additional provider-specific parameters forwarded verbatim.

        Returns:
            The transcribed text in the requested format.
        """

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        # Handle file input - could be a path, URL, or file object.
        files_data: dict = {}
        params_data: dict = {}
        # Track whether *we* opened the file handle; caller-supplied file
        # objects must not be closed by us (the caller owns their lifetime).
        owns_file = False

        if isinstance(file, (str, Path)):
            if isinstance(file, str) and file.startswith(("http://", "https://")):
                # URL string - send as a plain multipart field (no filename)
                files_data["file"] = (None, file)
            else:
                # Local file path - we open it, so we are responsible for closing it
                files_data["file"] = open(Path(file), "rb")
                owns_file = True
        else:
            # File object supplied by the caller
            files_data["file"] = file

        # Build request parameters
        params_data.update(
            {
                "model": model,
                "response_format": (
                    response_format
                    if isinstance(response_format, str)
                    else response_format.value
                ),
                "temperature": temperature,
            }
        )

        if language is not None:
            params_data["language"] = language

        if prompt is not None:
            params_data["prompt"] = prompt

        if timestamp_granularities is not None:
            params_data["timestamp_granularities"] = (
                timestamp_granularities
                if isinstance(timestamp_granularities, str)
                else timestamp_granularities.value
            )

        # Add any additional kwargs
        params_data.update(kwargs)

        try:
            response, _, _ = await requestor.arequest(
                options=TogetherRequest(
                    method="POST",
                    url="audio/transcriptions",
                    params=params_data,
                    files=files_data,
                ),
            )
        finally:
            # Close the handle only if this method opened it.  (Fixes two
            # defects in the original: it closed caller-supplied file objects,
            # and used a bare ``except:`` that swallowed even SystemExit.)
            if owns_file:
                try:
                    files_data["file"].close()
                except Exception:
                    pass

        # Parse response based on the requested format
        if (
            response_format == "verbose_json"
            or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
        ):
            return AudioTranscriptionVerboseResponse(**response.data)
        else:
            return AudioTranscriptionResponse(**response.data)

0 commit comments

Comments (0)