Skip to content

Commit 7fb2ef0

Browse files
authored
Adds Whisper model (#18819)
1 parent 196b23d commit 7fb2ef0

File tree

1 file changed

+150
-0
lines changed

1 file changed

+150
-0
lines changed
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
{
2+
"id": "200f0812-148c-48c1-915d-fb3277a94a08",
3+
"source": 1,
4+
"name": "@cf/openai/whisper-large-v3-turbo",
5+
"description": "Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation.",
6+
"task": {
7+
"id": "dfce1c48-2a81-462e-a7fd-de97ce985207",
8+
"name": "Automatic Speech Recognition",
9+
"description": "Automatic speech recognition (ASR) models convert a speech signal, typically an audio input, to text."
10+
},
11+
"tags": [],
12+
"properties": [
13+
{
14+
"property_id": "beta",
15+
"value": "true"
16+
}
17+
],
18+
"schema": {
19+
"input": {
20+
"type": "object",
21+
"properties": {
22+
"audio": {
23+
"type": "string",
24+
"description": "Base64 encoded value of the audio data."
25+
},
26+
"task": {
27+
"type": "string",
28+
"default": "transcribe",
29+
"description": "Supported tasks are 'translate' or 'transcribe'."
30+
},
31+
"language": {
32+
"type": "string",
33+
"default": "en",
34+
"description": "The language of the audio being transcribed or translated."
35+
},
36+
"vad_filter": {
37+
"type": "string",
38+
"default": "false",
39+
"description": "Preprocess the audio with a voice activity detection model."
40+
},
41+
"initial_prompt": {
42+
"type": "string",
43+
"description": "A text prompt to help provide context to the model on the contents of the audio."
44+
},
45+
"prefix": {
46+
"type": "string",
47+
"description": "The prefix is added to the beginning of the output of the transcription and can guide the transcription result."
48+
}
49+
},
50+
"required": [
51+
"audio"
52+
]
53+
},
54+
"output": {
55+
"type": "object",
56+
"contentType": "application/json",
57+
"properties": {
58+
"transcription_info": {
59+
"type": "object",
60+
"properties": {
61+
"language": {
62+
"type": "string",
63+
"description": "The language of the audio being transcribed or translated."
64+
},
65+
"language_probability": {
66+
"type": "number",
67+
"description": "The confidence level or probability of the detected language being accurate, represented as a decimal between 0 and 1."
68+
},
69+
"duration": {
70+
"type": "number",
71+
"description": "The total duration of the original audio file, in seconds."
72+
},
73+
"duration_after_vad": {
74+
"type": "number",
75+
"description": "The duration of the audio after applying Voice Activity Detection (VAD) to remove silent or irrelevant sections, in seconds."
76+
}
77+
}
78+
},
79+
"text": {
80+
"type": "string",
81+
"description": "The complete transcription of the audio."
82+
},
83+
"word_count": {
84+
"type": "number",
85+
"description": "The total number of words in the transcription."
86+
},
87+
"segments": {
88+
"type": "object",
89+
"properties": {
90+
"start": {
91+
"type": "number",
92+
"description": "The starting time of the segment within the audio, in seconds."
93+
},
94+
"end": {
95+
"type": "number",
96+
"description": "The ending time of the segment within the audio, in seconds."
97+
},
98+
"text": {
99+
"type": "string",
100+
"description": "The transcription of the segment."
101+
},
102+
"temperature": {
103+
"type": "number",
104+
"description": "The temperature used in the decoding process, controlling randomness in predictions. Lower values result in more deterministic outputs."
105+
},
106+
"avg_logprob": {
107+
"type": "number",
108+
"description": "The average log probability of the predictions for the words in this segment, indicating overall confidence."
109+
},
110+
"compression_ratio": {
111+
"type": "number",
112+
"description": "The compression ratio of the input to the output, measuring how much the text was compressed during the transcription process."
113+
},
114+
"no_speech_prob": {
115+
"type": "number",
116+
"description": "The probability that the segment contains no speech, represented as a decimal between 0 and 1."
117+
},
118+
"words": {
119+
"type": "array",
120+
"items": {
121+
"type": "object",
122+
"properties": {
123+
"word": {
124+
"type": "string",
125+
"description": "The individual word transcribed from the audio."
126+
},
127+
"start": {
128+
"type": "number",
129+
"description": "The starting time of the word within the audio, in seconds."
130+
},
131+
"end": {
132+
"type": "number",
133+
"description": "The ending time of the word within the audio, in seconds."
134+
}
135+
}
136+
}
137+
}
138+
}
139+
},
140+
"vtt": {
141+
"type": "string",
142+
"description": "The transcription in WebVTT format, which includes timing and text information for use in subtitles."
143+
}
144+
},
145+
"required": [
146+
"text"
147+
]
148+
}
149+
}
150+
}

0 commit comments

Comments
 (0)