1+ from __future__ import annotations
2+
3+ from typing import Any , Union , BinaryIO , Optional
4+ from pathlib import Path
5+
6+ from together .abstract import api_requestor
7+ from together .together_response import TogetherResponse
8+ from together .types import (
9+ AudioTranscriptionRequest ,
10+ AudioTranscriptionResponse ,
11+ AudioTranscriptionVerboseResponse ,
12+ AudioTranscriptionResponseFormat ,
13+ AudioTimestampGranularities ,
14+ TogetherClient ,
15+ TogetherRequest ,
16+ )
17+
18+
class Transcriptions:
    """Synchronous client for the ``audio/transcriptions`` endpoint."""

    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    def create(
        self,
        *,
        file: Union[str, BinaryIO, Path],
        model: str = "openai/whisper-large-v3",
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: Union[str, AudioTranscriptionResponseFormat] = "json",
        temperature: float = 0.0,
        timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None,
        **kwargs: Any,
    ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
        """
        Transcribes audio into the input language.

        Args:
            file: The audio to transcribe, in one of these formats:
                flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
                Can be a local file path (str/Path), an already-open binary file
                object (BinaryIO), or a URL (a str starting with "http://" or
                "https://"). A file object supplied by the caller is left open;
                only a file opened here from a path is closed afterwards.
            model: ID of the model to use. Defaults to "openai/whisper-large-v3".
            language: The language of the input audio. Supplying the input language in
                ISO-639-1 format will improve accuracy and latency.
            prompt: An optional text to guide the model's style or continue a previous
                audio segment. The prompt should match the audio language.
            response_format: The format of the transcript output, in one of these options:
                json, verbose_json.
            temperature: The sampling temperature, between 0 and 1. Higher values like 0.8
                will make the output more random, while lower values like 0.2 will make it
                more focused and deterministic.
            timestamp_granularities: The timestamp granularities to populate for this
                transcription. response_format must be set verbose_json to use timestamp
                granularities. Supported options: word, or segment.
            **kwargs: Extra request parameters, merged last so they override the
                named parameters above when keys collide.

        Returns:
            AudioTranscriptionVerboseResponse when response_format is verbose_json,
            otherwise AudioTranscriptionResponse.
        """

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        # Build the request parameters first so that a failure here cannot
        # leak a file handle opened further down.
        params_data: dict[str, Any] = {
            "model": model,
            "response_format": (
                response_format
                if isinstance(response_format, str)
                else response_format.value
            ),
            "temperature": temperature,
        }

        if language is not None:
            params_data["language"] = language

        if prompt is not None:
            params_data["prompt"] = prompt

        if timestamp_granularities is not None:
            params_data["timestamp_granularities"] = (
                timestamp_granularities
                if isinstance(timestamp_granularities, str)
                else timestamp_granularities.value
            )

        # Add any additional kwargs
        params_data.update(kwargs)

        # Handle file input - could be a path, URL, or file object.
        # `opened_file` is set only when this method opens the handle itself,
        # so the cleanup below never closes a caller-owned file object.
        files_data: dict[str, Any] = {}
        opened_file: Optional[BinaryIO] = None

        if isinstance(file, str) and file.startswith(("http://", "https://")):
            # URL string - send as multipart field
            files_data["file"] = (None, file)
        elif isinstance(file, (str, Path)):
            # Local file path
            opened_file = open(Path(file), "rb")
            files_data["file"] = opened_file
        else:
            # File object owned by the caller
            files_data["file"] = file

        try:
            response, _, _ = requestor.request(
                options=TogetherRequest(
                    method="POST",
                    url="audio/transcriptions",
                    params=params_data,
                    files=files_data,
                ),
            )
        finally:
            # Close the file only if we opened it
            if opened_file is not None:
                try:
                    opened_file.close()
                except OSError:
                    # Best-effort cleanup: a failed close must not mask the
                    # request result or the original exception.
                    pass

        # Parse response based on format
        if (
            response_format == "verbose_json"
            or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
        ):
            return AudioTranscriptionVerboseResponse(**response.data)
        return AudioTranscriptionResponse(**response.data)
127+
class AsyncTranscriptions:
    """Asynchronous client for the ``audio/transcriptions`` endpoint."""

    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    async def create(
        self,
        *,
        file: Union[str, BinaryIO, Path],
        model: str = "openai/whisper-large-v3",
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: Union[str, AudioTranscriptionResponseFormat] = "json",
        temperature: float = 0.0,
        timestamp_granularities: Optional[Union[str, AudioTimestampGranularities]] = None,
        **kwargs: Any,
    ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
        """
        Async version of transcribe audio into the input language.

        Args:
            file: The audio to transcribe, in one of these formats:
                flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
                Can be a local file path (str/Path), an already-open binary file
                object (BinaryIO), or a URL (a str starting with "http://" or
                "https://"). A file object supplied by the caller is left open;
                only a file opened here from a path is closed afterwards.
            model: ID of the model to use. Defaults to "openai/whisper-large-v3".
            language: The language of the input audio. Supplying the input language in
                ISO-639-1 format will improve accuracy and latency.
            prompt: An optional text to guide the model's style or continue a previous
                audio segment. The prompt should match the audio language.
            response_format: The format of the transcript output, in one of these options:
                json, verbose_json.
            temperature: The sampling temperature, between 0 and 1. Higher values like 0.8
                will make the output more random, while lower values like 0.2 will make it
                more focused and deterministic.
            timestamp_granularities: The timestamp granularities to populate for this
                transcription. response_format must be set verbose_json to use timestamp
                granularities. Supported options: word, or segment.
            **kwargs: Extra request parameters, merged last so they override the
                named parameters above when keys collide.

        Returns:
            AudioTranscriptionVerboseResponse when response_format is verbose_json,
            otherwise AudioTranscriptionResponse.
        """

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        # Build the request parameters first so that a failure here cannot
        # leak a file handle opened further down.
        params_data: dict[str, Any] = {
            "model": model,
            "response_format": (
                response_format
                if isinstance(response_format, str)
                else response_format.value
            ),
            "temperature": temperature,
        }

        if language is not None:
            params_data["language"] = language

        if prompt is not None:
            params_data["prompt"] = prompt

        if timestamp_granularities is not None:
            params_data["timestamp_granularities"] = (
                timestamp_granularities
                if isinstance(timestamp_granularities, str)
                else timestamp_granularities.value
            )

        # Add any additional kwargs
        params_data.update(kwargs)

        # Handle file input - could be a path, URL, or file object.
        # `opened_file` is set only when this method opens the handle itself,
        # so the cleanup below never closes a caller-owned file object.
        files_data: dict[str, Any] = {}
        opened_file: Optional[BinaryIO] = None

        if isinstance(file, str) and file.startswith(("http://", "https://")):
            # URL string - send as multipart field
            files_data["file"] = (None, file)
        elif isinstance(file, (str, Path)):
            # Local file path
            opened_file = open(Path(file), "rb")
            files_data["file"] = opened_file
        else:
            # File object owned by the caller
            files_data["file"] = file

        try:
            response, _, _ = await requestor.arequest(
                options=TogetherRequest(
                    method="POST",
                    url="audio/transcriptions",
                    params=params_data,
                    files=files_data,
                ),
            )
        finally:
            # Close the file only if we opened it
            if opened_file is not None:
                try:
                    opened_file.close()
                except OSError:
                    # Best-effort cleanup: a failed close must not mask the
                    # request result or the original exception.
                    pass

        # Parse response based on format
        if (
            response_format == "verbose_json"
            or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
        ):
            return AudioTranscriptionVerboseResponse(**response.data)
        return AudioTranscriptionResponse(**response.data)
0 commit comments