Skip to content

Commit a15c512

Browse files
Merge pull request #244434 from alexeyo26/alexeyo/displayWords-fix
[CogSvc] Speech. Fix for displayPhraseElements / displayWords issue
2 parents dac2cca + e6e8556 commit a15c512

File tree

1 file changed

+52
-29
lines changed

1 file changed

+52
-29
lines changed

articles/cognitive-services/Speech-Service/batch-transcription-get.md

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ You should receive a response body in the following format:
4343
},
4444
"properties": {
4545
"diarizationEnabled": false,
46-
"wordLevelTimestampsEnabled": true,
47-
"displayFormWordLevelTimestampsEnabled": false,
46+
"wordLevelTimestampsEnabled": false,
47+
"displayFormWordLevelTimestampsEnabled": true,
4848
"channels": [
4949
0,
5050
1
@@ -98,8 +98,8 @@ You should receive a response body in the following format:
9898
},
9999
"properties": {
100100
"diarizationEnabled": false,
101-
"wordLevelTimestampsEnabled": true,
102-
"displayFormWordLevelTimestampsEnabled": false,
101+
"wordLevelTimestampsEnabled": false,
102+
"displayFormWordLevelTimestampsEnabled": true,
103103
"channels": [
104104
0,
105105
1
@@ -285,9 +285,9 @@ The contents of each transcription result file are formatted as JSON, as shown i
285285
```json
286286
{
287287
"source": "...",
288-
"timestamp": "2022-09-16T09:30:21Z",
289-
"durationInTicks": 41200000,
290-
"duration": "PT4.12S",
288+
"timestamp": "2023-07-10T14:28:16Z",
289+
"durationInTicks": 25800000,
290+
"duration": "PT2.58S",
291291
"combinedRecognizedPhrases": [
292292
{
293293
"channel": 0,
@@ -300,39 +300,62 @@ The contents of each transcription result file are formatted as JSON, as shown i
300300
"recognizedPhrases": [
301301
{
302302
"recognitionStatus": "Success",
303-
"speaker": 1,
304303
"channel": 0,
305-
"offset": "PT0.07S",
306-
"duration": "PT1.59S",
307-
"offsetInTicks": 700000.0,
308-
"durationInTicks": 15900000.0,
309-
304+
"offset": "PT0.76S",
305+
"duration": "PT1.32S",
306+
"offsetInTicks": 7600000.0,
307+
"durationInTicks": 13200000.0,
310308
"nBest": [
311309
{
312-
"confidence": 0.898652852,
310+
"confidence": 0.5643338,
313311
"lexical": "hello world",
314312
"itn": "hello world",
315313
"maskedITN": "hello world",
316314
"display": "Hello world.",
317-
318-
"words": [
315+
"displayWords": [
319316
{
320-
"word": "hello",
321-
"offset": "PT0.09S",
322-
"duration": "PT0.48S",
323-
"offsetInTicks": 900000.0,
324-
"durationInTicks": 4800000.0,
325-
"confidence": 0.987572
317+
"displayText": "Hello",
318+
"offset": "PT0.76S",
319+
"duration": "PT0.76S",
320+
"offsetInTicks": 7600000.0,
321+
"durationInTicks": 7600000.0
326322
},
327323
{
328-
"word": "world",
329-
"offset": "PT0.59S",
330-
"duration": "PT0.16S",
331-
"offsetInTicks": 5900000.0,
332-
"durationInTicks": 1600000.0,
333-
"confidence": 0.906032
324+
"displayText": "world.",
325+
"offset": "PT1.52S",
326+
"duration": "PT0.56S",
327+
"offsetInTicks": 15200000.0,
328+
"durationInTicks": 5600000.0
334329
}
335330
]
331+
},
332+
{
333+
"confidence": 0.1769063,
334+
"lexical": "helloworld",
335+
"itn": "helloworld",
336+
"maskedITN": "helloworld",
337+
"display": "helloworld"
338+
},
339+
{
340+
"confidence": 0.49964225,
341+
"lexical": "hello worlds",
342+
"itn": "hello worlds",
343+
"maskedITN": "hello worlds",
344+
"display": "hello worlds"
345+
},
346+
{
347+
"confidence": 0.4995761,
348+
"lexical": "hello worm",
349+
"itn": "hello worm",
350+
"maskedITN": "hello worm",
351+
"display": "hello worm"
352+
},
353+
{
354+
"confidence": 0.49418187,
355+
"lexical": "hello word",
356+
"itn": "hello word",
357+
"maskedITN": "hello word",
358+
"display": "hello word"
336359
}
337360
]
338361
}
@@ -348,7 +371,7 @@ Depending in part on the request parameters set when you created the transcripti
348371
|`combinedRecognizedPhrases`|The concatenated results of all phrases for the channel.|
349372
|`confidence`|The confidence value for the recognition.|
350373
|`display`|The display form of the recognized text. Added punctuation and capitalization are included.|
351-
|`displayPhraseElements`|A list of results with display text for each word of the phrase. The `displayFormWordLevelTimestampsEnabled` request property must be set to `true`, otherwise this property is not present.<br/><br/>**Note**: This property is only available with Speech to text REST API version 3.1.|
374+
|`displayWords`|The timestamps for each word of the transcription. The `displayFormWordLevelTimestampsEnabled` request property must be set to `true`; otherwise, this property is not present.<br/><br/>**Note**: This property is only available with Speech to text REST API version 3.1.|
352375
|`duration`|The audio duration. The value is an ISO 8601 encoded duration.|
353376
|`durationInTicks`|The audio duration in ticks (1 tick is 100 nanoseconds).|
354377
|`itn`|The inverse text normalized (ITN) form of the recognized text. Abbreviations (for example, "Doctor Smith" to "Dr Smith"), phone numbers, and other transformations are applied.|

0 commit comments

Comments
 (0)