Skip to content

Commit 0652a04

Browse files
authored
Add SpeechToTextResponse.Usage (#6546)
1 parent e69d3e9 commit 0652a04

File tree

4 files changed

+103
-66
lines changed

4 files changed

+103
-66
lines changed

src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,10 @@ public SpeechToTextResponse(string? content)
4747
/// <summary>Gets or sets the ID of the speech to text response.</summary>
4848
public string? ResponseId { get; set; }
4949

50-
/// <summary>Gets or sets the model ID used in the creation of the speech to text completion.</summary>
50+
/// <summary>Gets or sets the model ID used in the creation of the speech to text response.</summary>
5151
public string? ModelId { get; set; }
5252

53-
/// <summary>Gets or sets the raw representation of the speech to text completion from an underlying implementation.</summary>
53+
/// <summary>Gets or sets the raw representation of the speech to text response from an underlying implementation.</summary>
5454
/// <remarks>
5555
/// If a <see cref="SpeechToTextResponse"/> is created to represent some underlying object from another object
5656
/// model, this property can be used to store that original object. This can be useful for debugging or
@@ -59,7 +59,7 @@ public SpeechToTextResponse(string? content)
5959
[JsonIgnore]
6060
public object? RawRepresentation { get; set; }
6161

62-
/// <summary>Gets or sets any additional properties associated with the speech to text completion.</summary>
62+
/// <summary>Gets or sets any additional properties associated with the speech to text response.</summary>
6363
public AdditionalPropertiesDictionary? AdditionalProperties { get; set; }
6464

6565
/// <summary>Gets the text of this speech to text response.</summary>
@@ -76,9 +76,15 @@ public SpeechToTextResponse(string? content)
7676
/// <returns>An array of <see cref="SpeechToTextResponseUpdate" /> instances that may be used to represent this <see cref="SpeechToTextResponse" />.</returns>
7777
public SpeechToTextResponseUpdate[] ToSpeechToTextResponseUpdates()
7878
{
79-
SpeechToTextResponseUpdate update = new SpeechToTextResponseUpdate
79+
IList<AIContent> contents = Contents;
80+
if (Usage is { } usage)
8081
{
81-
Contents = Contents,
82+
contents = [.. contents, new UsageContent(usage)];
83+
}
84+
85+
SpeechToTextResponseUpdate update = new()
86+
{
87+
Contents = contents,
8288
AdditionalProperties = AdditionalProperties,
8389
RawRepresentation = RawRepresentation,
8490
StartTime = StartTime,
@@ -98,4 +104,7 @@ public IList<AIContent> Contents
98104
get => _contents ??= [];
99105
set => _contents = value;
100106
}
107+
108+
/// <summary>Gets or sets usage details for the speech to text response.</summary>
109+
public UsageDetails? Usage { get; set; }
101110
}

src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs

Lines changed: 39 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4-
using System;
54
using System.Collections.Generic;
65
using System.Diagnostics.CodeAnalysis;
76
using System.Threading;
87
using System.Threading.Tasks;
98
using Microsoft.Shared.Diagnostics;
109

10+
#pragma warning disable S1121 // Assignments should not be made from within sub-expressions
11+
1112
namespace Microsoft.Extensions.AI;
1213

1314
/// <summary>
@@ -25,32 +26,13 @@ public static SpeechToTextResponse ToSpeechToTextResponse(
2526
_ = Throw.IfNull(updates);
2627

2728
SpeechToTextResponse response = new();
28-
List<AIContent> contents = [];
29-
string? responseId = null;
30-
string? modelId = null;
31-
AdditionalPropertiesDictionary? additionalProperties = null;
3229

33-
TimeSpan? endTime = null;
3430
foreach (var update in updates)
3531
{
36-
// Track the first start time provided by the updates
37-
response.StartTime ??= update.StartTime;
38-
39-
// Track the last end time provided by the updates
40-
if (update.EndTime is not null)
41-
{
42-
endTime = update.EndTime;
43-
}
44-
45-
ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties);
32+
ProcessUpdate(update, response);
4633
}
4734

48-
ChatResponseExtensions.CoalesceTextContent(contents);
49-
response.EndTime = endTime;
50-
response.Contents = contents;
51-
response.ResponseId = responseId;
52-
response.ModelId = modelId;
53-
response.AdditionalProperties = additionalProperties;
35+
ChatResponseExtensions.CoalesceTextContent((List<AIContent>)response.Contents);
5436

5537
return response;
5638
}
@@ -70,74 +52,73 @@ static async Task<SpeechToTextResponse> ToResponseAsync(
7052
IAsyncEnumerable<SpeechToTextResponseUpdate> updates, CancellationToken cancellationToken)
7153
{
7254
SpeechToTextResponse response = new();
73-
List<AIContent> contents = [];
74-
string? responseId = null;
75-
string? modelId = null;
76-
AdditionalPropertiesDictionary? additionalProperties = null;
7755

78-
TimeSpan? endTime = null;
7956
await foreach (var update in updates.WithCancellation(cancellationToken).ConfigureAwait(false))
8057
{
81-
// Track the first start time provided by the updates
82-
response.StartTime ??= update.StartTime;
83-
84-
// Track the last end time provided by the updates
85-
if (update.EndTime is not null)
86-
{
87-
endTime = update.EndTime;
88-
}
89-
90-
ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties);
58+
ProcessUpdate(update, response);
9159
}
9260

93-
ChatResponseExtensions.CoalesceTextContent(contents);
94-
95-
response.EndTime = endTime;
96-
response.Contents = contents;
97-
response.ResponseId = responseId;
98-
response.ModelId = modelId;
99-
response.AdditionalProperties = additionalProperties;
61+
ChatResponseExtensions.CoalesceTextContent((List<AIContent>)response.Contents);
10062

10163
return response;
10264
}
10365
}
10466

10567
/// <summary>Processes the <see cref="SpeechToTextResponseUpdate"/>, incorporating its contents and properties.</summary>
10668
/// <param name="update">The update to process.</param>
107-
/// <param name="contents">The list of content items being accumulated.</param>
108-
/// <param name="responseId">The response ID to update if the update has one.</param>
109-
/// <param name="modelId">The model ID to update if the update has one.</param>
110-
/// <param name="additionalProperties">The additional properties to update if the update has any.</param>
69+
/// <param name="response">The <see cref="SpeechToTextResponse"/> object that should be updated based on <paramref name="update"/>.</param>
11170
private static void ProcessUpdate(
11271
SpeechToTextResponseUpdate update,
113-
List<AIContent> contents,
114-
ref string? responseId,
115-
ref string? modelId,
116-
ref AdditionalPropertiesDictionary? additionalProperties)
72+
SpeechToTextResponse response)
11773
{
11874
if (update.ResponseId is not null)
11975
{
120-
responseId = update.ResponseId;
76+
response.ResponseId = update.ResponseId;
12177
}
12278

12379
if (update.ModelId is not null)
12480
{
125-
modelId = update.ModelId;
81+
response.ModelId = update.ModelId;
12682
}
12783

128-
contents.AddRange(update.Contents);
84+
if (response.StartTime is null || (update.StartTime is not null && update.StartTime < response.StartTime))
85+
{
86+
// Track the earliest start time provided by the updates (the minimum, regardless of arrival order)
87+
response.StartTime = update.StartTime;
88+
}
89+
90+
if (response.EndTime is null || (update.EndTime is not null && update.EndTime > response.EndTime))
91+
{
92+
// Track the latest end time provided by the updates (the maximum, regardless of arrival order)
93+
response.EndTime = update.EndTime;
94+
}
95+
96+
foreach (var content in update.Contents)
97+
{
98+
switch (content)
99+
{
100+
// Usage content is treated specially and propagated to the response's Usage.
101+
case UsageContent usage:
102+
(response.Usage ??= new()).Add(usage.Details);
103+
break;
104+
105+
default:
106+
response.Contents.Add(content);
107+
break;
108+
}
109+
}
129110

130111
if (update.AdditionalProperties is not null)
131112
{
132-
if (additionalProperties is null)
113+
if (response.AdditionalProperties is null)
133114
{
134-
additionalProperties = new(update.AdditionalProperties);
115+
response.AdditionalProperties = new(update.AdditionalProperties);
135116
}
136117
else
137118
{
138119
foreach (var entry in update.AdditionalProperties)
139120
{
140-
additionalProperties[entry.Key] = entry.Value;
121+
response.AdditionalProperties[entry.Key] = entry.Value;
141122
}
142123
}
143124
}

test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ public void Constructor_Parameterless_PropsDefaulted()
3131
Assert.Null(response.StartTime);
3232
Assert.Null(response.EndTime);
3333
Assert.Equal(string.Empty, response.ToString());
34+
Assert.Null(response.Usage);
3435
}
3536

3637
[Theory]
@@ -132,6 +133,11 @@ public void Properties_Roundtrip()
132133
List<AIContent> newContents = [new TextContent("text1"), new TextContent("text2")];
133134
response.Contents = newContents;
134135
Assert.Same(newContents, response.Contents);
136+
137+
Assert.Null(response.Usage);
138+
UsageDetails usageDetails = new();
139+
response.Usage = usageDetails;
140+
Assert.Same(usageDetails, response.Usage);
135141
}
136142

137143
[Fact]
@@ -152,6 +158,7 @@ public void JsonSerialization_Roundtrips()
152158
EndTime = TimeSpan.FromSeconds(2),
153159
RawRepresentation = new(),
154160
AdditionalProperties = new() { ["key"] = "value" },
161+
Usage = new() { InputTokenCount = 42, OutputTokenCount = 84, TotalTokenCount = 126 },
155162
};
156163

157164
string json = JsonSerializer.Serialize(original, TestJsonSerializerContext.Default.SpeechToTextResponse);
@@ -176,6 +183,11 @@ public void JsonSerialization_Roundtrips()
176183
Assert.True(result.AdditionalProperties.TryGetValue("key", out object? value));
177184
Assert.IsType<JsonElement>(value);
178185
Assert.Equal("value", ((JsonElement)value!).GetString());
186+
187+
Assert.NotNull(result.Usage);
188+
Assert.Equal(42, result.Usage.InputTokenCount);
189+
Assert.Equal(84, result.Usage.OutputTokenCount);
190+
Assert.Equal(126, result.Usage.TotalTokenCount);
179191
}
180192

181193
[Fact]
@@ -185,8 +197,10 @@ public void ToString_OutputsText()
185197
Assert.Equal("This is a test." + Environment.NewLine + "It's multiple lines.", response.ToString());
186198
}
187199

188-
[Fact]
189-
public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
200+
[Theory]
201+
[InlineData(false)]
202+
[InlineData(true)]
203+
public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate(bool withUsage)
190204
{
191205
// Arrange: create a response with contents
192206
SpeechToTextResponse response = new()
@@ -202,6 +216,7 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
202216
ResponseId = "12345",
203217
ModelId = "someModel",
204218
AdditionalProperties = new() { ["key1"] = "value1", ["key2"] = 42 },
219+
Usage = withUsage ? new UsageDetails { InputTokenCount = 100, OutputTokenCount = 200, TotalTokenCount = 300 } : null
205220
};
206221

207222
// Act: convert to streaming updates
@@ -217,13 +232,21 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
217232
Assert.Equal(TimeSpan.FromSeconds(1), update.StartTime);
218233
Assert.Equal(TimeSpan.FromSeconds(2), update.EndTime);
219234

220-
Assert.Equal(3, update.Contents.Count);
235+
Assert.Equal(withUsage ? 4 : 3, update.Contents.Count);
221236
Assert.Equal("Hello, ", Assert.IsType<TextContent>(update.Contents[0]).Text);
222237
Assert.Equal("image/png", Assert.IsType<DataContent>(update.Contents[1]).MediaType);
223238
Assert.Equal("world!", Assert.IsType<TextContent>(update.Contents[2]).Text);
224239

225240
Assert.NotNull(update.AdditionalProperties);
226241
Assert.Equal("value1", update.AdditionalProperties["key1"]);
227242
Assert.Equal(42, update.AdditionalProperties["key2"]);
243+
244+
if (withUsage)
245+
{
246+
var usage = Assert.IsType<UsageContent>(update.Contents[3]);
247+
Assert.Equal(100, usage.Details.InputTokenCount);
248+
Assert.Equal(200, usage.Details.OutputTokenCount);
249+
Assert.Equal(300, usage.Details.TotalTokenCount);
250+
}
228251
}
229252
}

test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ public async Task ToSpeechToTextResponse_SuccessfullyCreatesResponse(bool useAsy
7070
Assert.Equal("d", response.AdditionalProperties["c"]);
7171

7272
Assert.Equal("Hello human, How are You?", response.Text);
73+
74+
Assert.Null(response.Usage);
7375
}
7476

7577
[Theory]
@@ -129,6 +131,28 @@ void AddGap()
129131
}
130132
}
131133

134+
[Fact]
135+
public async Task ToSpeechToTextResponse_UsageContentExtractedFromContents()
136+
{
137+
SpeechToTextResponseUpdate[] updates =
138+
{
139+
new() { Contents = [new TextContent("Hello, ")] },
140+
new() { Contents = [new UsageContent(new() { TotalTokenCount = 42 })] },
141+
new() { Contents = [new TextContent("world!")] },
142+
new() { Contents = [new UsageContent(new() { InputTokenCount = 12, TotalTokenCount = 24 })] },
143+
};
144+
145+
SpeechToTextResponse response = await YieldAsync(updates).ToSpeechToTextResponseAsync();
146+
147+
Assert.NotNull(response);
148+
149+
Assert.NotNull(response.Usage);
150+
Assert.Equal(12, response.Usage.InputTokenCount);
151+
Assert.Equal(66, response.Usage.TotalTokenCount);
152+
153+
Assert.Equal("Hello, world!", Assert.IsType<TextContent>(Assert.Single(response.Contents)).Text);
154+
}
155+
132156
private static async IAsyncEnumerable<SpeechToTextResponseUpdate> YieldAsync(IEnumerable<SpeechToTextResponseUpdate> updates)
133157
{
134158
foreach (SpeechToTextResponseUpdate update in updates)

0 commit comments

Comments
 (0)