You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: articles/cognitive-services/Speech-Service/batch-transcription-get.md
+52-29Lines changed: 52 additions & 29 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -43,8 +43,8 @@ You should receive a response body in the following format:
43
43
},
44
44
"properties": {
45
45
"diarizationEnabled": false,
46
-
"wordLevelTimestampsEnabled": true,
47
-
"displayFormWordLevelTimestampsEnabled": false,
46
+
"wordLevelTimestampsEnabled": false,
47
+
"displayFormWordLevelTimestampsEnabled": true,
48
48
"channels": [
49
49
0,
50
50
1
@@ -98,8 +98,8 @@ You should receive a response body in the following format:
98
98
},
99
99
"properties": {
100
100
"diarizationEnabled": false,
101
-
"wordLevelTimestampsEnabled": true,
102
-
"displayFormWordLevelTimestampsEnabled": false,
101
+
"wordLevelTimestampsEnabled": false,
102
+
"displayFormWordLevelTimestampsEnabled": true,
103
103
"channels": [
104
104
0,
105
105
1
@@ -285,9 +285,9 @@ The contents of each transcription result file are formatted as JSON, as shown i
285
285
```json
286
286
{
287
287
"source": "...",
288
-
"timestamp": "2022-09-16T09:30:21Z",
289
-
"durationInTicks": 41200000,
290
-
"duration": "PT4.12S",
288
+
"timestamp": "2023-07-10T14:28:16Z",
289
+
"durationInTicks": 25800000,
290
+
"duration": "PT2.58S",
291
291
"combinedRecognizedPhrases": [
292
292
{
293
293
"channel": 0,
@@ -300,39 +300,62 @@ The contents of each transcription result file are formatted as JSON, as shown i
300
300
"recognizedPhrases": [
301
301
{
302
302
"recognitionStatus": "Success",
303
-
"speaker": 1,
304
303
"channel": 0,
305
-
"offset": "PT0.07S",
306
-
"duration": "PT1.59S",
307
-
"offsetInTicks": 700000.0,
308
-
"durationInTicks": 15900000.0,
309
-
304
+
"offset": "PT0.76S",
305
+
"duration": "PT1.32S",
306
+
"offsetInTicks": 7600000.0,
307
+
"durationInTicks": 13200000.0,
310
308
"nBest": [
311
309
{
312
-
"confidence": 0.898652852,
310
+
"confidence": 0.5643338,
313
311
"lexical": "hello world",
314
312
"itn": "hello world",
315
313
"maskedITN": "hello world",
316
314
"display": "Hello world.",
317
-
318
-
"words": [
315
+
"displayWords": [
319
316
{
320
-
"word": "hello",
321
-
"offset": "PT0.09S",
322
-
"duration": "PT0.48S",
323
-
"offsetInTicks": 900000.0,
324
-
"durationInTicks": 4800000.0,
325
-
"confidence": 0.987572
317
+
"displayText": "Hello",
318
+
"offset": "PT0.76S",
319
+
"duration": "PT0.76S",
320
+
"offsetInTicks": 7600000.0,
321
+
"durationInTicks": 7600000.0
326
322
},
327
323
{
328
-
"word": "world",
329
-
"offset": "PT0.59S",
330
-
"duration": "PT0.16S",
331
-
"offsetInTicks": 5900000.0,
332
-
"durationInTicks": 1600000.0,
333
-
"confidence": 0.906032
324
+
"displayText": "world.",
325
+
"offset": "PT1.52S",
326
+
"duration": "PT0.56S",
327
+
"offsetInTicks": 15200000.0,
328
+
"durationInTicks": 5600000.0
334
329
}
335
330
]
331
+
},
332
+
{
333
+
"confidence": 0.1769063,
334
+
"lexical": "helloworld",
335
+
"itn": "helloworld",
336
+
"maskedITN": "helloworld",
337
+
"display": "helloworld"
338
+
},
339
+
{
340
+
"confidence": 0.49964225,
341
+
"lexical": "hello worlds",
342
+
"itn": "hello worlds",
343
+
"maskedITN": "hello worlds",
344
+
"display": "hello worlds"
345
+
},
346
+
{
347
+
"confidence": 0.4995761,
348
+
"lexical": "hello worm",
349
+
"itn": "hello worm",
350
+
"maskedITN": "hello worm",
351
+
"display": "hello worm"
352
+
},
353
+
{
354
+
"confidence": 0.49418187,
355
+
"lexical": "hello word",
356
+
"itn": "hello word",
357
+
"maskedITN": "hello word",
358
+
"display": "hello word"
336
359
}
337
360
]
338
361
}
@@ -348,7 +371,7 @@ Depending in part on the request parameters set when you created the transcripti
348
371
|`combinedRecognizedPhrases`|The concatenated results of all phrases for the channel.|
349
372
|`confidence`|The confidence value for the recognition.|
350
373
|`display`|The display form of the recognized text. Added punctuation and capitalization are included.|
351
-
|`displayPhraseElements`|A list of results with display text for each word of the phrase. The `displayFormWordLevelTimestampsEnabled` request property must be set to `true`, otherwise this property is not present.<br/><br/>**Note**: This property is only available with Speech to text REST API version 3.1.|
374
+
|`displayWords`|The timestamps for each word of the transcription. The `displayFormWordLevelTimestampsEnabled` request property must be set to `true`; otherwise, this property is not present.<br/><br/>**Note**: This property is only available with Speech to text REST API version 3.1.|
352
375
|`duration`|The audio duration. The value is an ISO 8601 encoded duration.|
353
376
|`durationInTicks`|The audio duration in ticks (1 tick is 100 nanoseconds).|
354
377
|`itn`|The inverse text normalized (ITN) form of the recognized text. Abbreviations (for example, "Doctor Smith" becomes "Dr Smith"), phone numbers, and other transformations are applied.|
0 commit comments