Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ await model.DownloadAsync(progress =>

var audioClient = await model.GetAudioClientAsync();
var session = audioClient.CreateLiveTranscriptionSession();
session.Settings.SampleRate = 16000;
session.Settings.SampleRate = 16000; // Default is 16000; shown here to match the NAudio WaveFormat below
session.Settings.Channels = 1;
session.Settings.Language = "en";

Expand All @@ -54,16 +54,17 @@ await model.DownloadAsync(progress =>
{
await foreach (var result in session.GetTranscriptionStream())
{
var text = result.Content?[0]?.Text;
if (result.IsFinal)
{
Console.WriteLine();
Console.WriteLine($" [FINAL] {result.Text}");
Console.WriteLine($" [FINAL] {text}");
Console.Out.Flush();
}
else if (!string.IsNullOrEmpty(result.Text))
else if (!string.IsNullOrEmpty(text))
{
Console.ForegroundColor = ConsoleColor.Cyan;
Console.Write(result.Text);
Console.Write(text);
Console.ResetColor();
Console.Out.Flush();
}
Expand Down
23 changes: 11 additions & 12 deletions sdk/cs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,12 +259,12 @@ waveIn.DataAvailable += (sender, e) =>
// Read transcription results as they arrive
await foreach (var result in session.GetTranscriptionStream())
{
// result inherits from AudioCreateTranscriptionResponse
// - result.Text — incremental transcribed text (per chunk, not accumulated)
// - result.IsFinal — true for final results, false for interim hypotheses
// - result.Segments — segment-level timing data (Start/End in seconds)
// - result.Language — language code
Console.Write(result.Text);
// result follows the OpenAI Realtime ConversationItem pattern:
// - result.Content[0].Text — incremental transcribed text (per chunk, not accumulated)
// - result.Content[0].Transcript — alias for Text (OpenAI Realtime compatibility)
// - result.IsFinal — true for final results, false for interim hypotheses
// - result.StartTime / EndTime — segment timing in seconds
Console.Write(result.Content?[0]?.Text);
}

await session.StopAsync();
Expand All @@ -274,12 +274,11 @@ await session.StopAsync();

| Field | Type | Description |
|-------|------|-------------|
| `Text` | `string` | Transcribed text from this audio chunk (inherited from `AudioCreateTranscriptionResponse`) |
| `Content` | `List<TranscriptionContentPart>` | Content parts. Access text via `Content[0].Text` or `Content[0].Transcript`. |
| `IsFinal` | `bool` | Whether this is a final or interim result. Nemotron always returns `true`. |
| `Language` | `string` | Language code (inherited) |
| `Duration` | `float` | Audio duration in seconds (inherited) |
| `Segments` | `List<Segment>` | Segment timing with `Start`/`End` offsets (inherited) |
| `Words` | `List<WordSegment>` | Word-level timing (inherited, when available) |
| `StartTime` | `double?` | Start time offset in the audio stream (seconds). |
| `EndTime` | `double?` | End time offset in the audio stream (seconds). |
| `Id` | `string?` | Unique identifier for this result (if available). |

#### Session Lifecycle

Expand Down Expand Up @@ -356,7 +355,7 @@ Key types:
| [`OpenAIChatClient`](./docs/api/microsoft.ai.foundry.local.openaichatclient.md) | Chat completions (sync + streaming) |
| [`OpenAIAudioClient`](./docs/api/microsoft.ai.foundry.local.openaiaudioclient.md) | Audio transcription (sync + streaming) |
| [`LiveAudioTranscriptionSession`](./docs/api/microsoft.ai.foundry.local.openai.liveaudiotranscriptionsession.md) | Real-time audio streaming session |
| [`LiveAudioTranscriptionResponse`](./docs/api/microsoft.ai.foundry.local.openai.liveaudiotranscriptionresponse.md) | Streaming transcription result (extends `AudioCreateTranscriptionResponse`) |
| [`LiveAudioTranscriptionResponse`](./docs/api/microsoft.ai.foundry.local.openai.liveaudiotranscriptionresponse.md) | Streaming transcription result (ConversationItem-shaped) |
| [`ModelInfo`](./docs/api/microsoft.ai.foundry.local.modelinfo.md) | Full model metadata record |

## Tests
Expand Down
5 changes: 2 additions & 3 deletions sdk/cs/src/Detail/JsonSerializationContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,10 @@ namespace Microsoft.AI.Foundry.Local.Detail;
[JsonSerializable(typeof(IList<FunctionDefinition>))]
[JsonSerializable(typeof(PropertyDefinition))]
[JsonSerializable(typeof(IList<PropertyDefinition>))]
// --- Audio streaming types ---
[JsonSerializable(typeof(LiveAudioTranscriptionResponse))]
// --- Audio streaming types (LiveAudioTranscriptionResponse inherits ConversationItem
// which has AOT-incompatible JsonConverters, so we only register the raw deserialization type) ---
[JsonSerializable(typeof(LiveAudioTranscriptionRaw))]
[JsonSerializable(typeof(CoreErrorResponse))]
[JsonSerializable(typeof(AudioCreateTranscriptionResponse.Segment))]
[JsonSourceGenerationOptions(DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
WriteIndented = false)]
internal partial class JsonSerializationContext : JsonSerializerContext
Expand Down
4 changes: 2 additions & 2 deletions sdk/cs/src/OpenAI/LiveAudioTranscriptionClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ private async Task PushLoopAsync(CancellationToken ct)
try
{
var transcription = LiveAudioTranscriptionResponse.FromJson(response.Data);
if (!string.IsNullOrEmpty(transcription.Text))
if (!string.IsNullOrEmpty(transcription.Content?[0]?.Text))
{
_outputChannel?.Writer.TryWrite(transcription);
}
Expand Down Expand Up @@ -331,7 +331,7 @@ public async Task StopAsync(CancellationToken ct = default)
try
{
var finalResult = LiveAudioTranscriptionResponse.FromJson(response.Data);
if (!string.IsNullOrEmpty(finalResult.Text))
if (!string.IsNullOrEmpty(finalResult.Content?[0]?.Text))
{
_outputChannel?.Writer.TryWrite(finalResult);
}
Expand Down
45 changes: 23 additions & 22 deletions sdk/cs/src/OpenAI/LiveAudioTranscriptionTypes.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@ namespace Microsoft.AI.Foundry.Local.OpenAI;

using System.Text.Json;
using System.Text.Json.Serialization;
using Betalgo.Ranul.OpenAI.ObjectModels.ResponseModels;
using Betalgo.Ranul.OpenAI.ObjectModels.RealtimeModels;
using Microsoft.AI.Foundry.Local;
using Microsoft.AI.Foundry.Local.Detail;

/// <summary>
/// Transcription result for real-time audio streaming sessions.
/// Extends <see cref="AudioCreateTranscriptionResponse"/> to provide a consistent
/// output format with file-based transcription, while adding streaming-specific fields.
/// Extends the OpenAI Realtime API's <see cref="ConversationItem"/> so that
/// customers access text via <c>result.Content[0].Text</c> or
/// <c>result.Content[0].Transcript</c>, ensuring forward compatibility
/// when the transport layer moves to WebSocket.
/// </summary>
public record LiveAudioTranscriptionResponse : AudioCreateTranscriptionResponse
public class LiveAudioTranscriptionResponse : ConversationItem
{
/// <summary>
/// Whether this is a final or partial (interim) result.
Expand All @@ -22,35 +24,34 @@ public record LiveAudioTranscriptionResponse : AudioCreateTranscriptionResponse
[JsonPropertyName("is_final")]
public bool IsFinal { get; init; }

/// <summary>Start time offset of this segment in the audio stream (seconds).</summary>
[JsonPropertyName("start_time")]
public double? StartTime { get; init; }
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're tracking this?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes,

/// <summary>
/// Transcription result sent back to SDK via callback during streaming.
/// Must match the SDK's AudioStreamingTranscriptionResult type.
/// </summary>
public record AudioStreamingTranscriptionResult
{
    [JsonPropertyName("is_final")]
    public bool IsFinal { get; init; }

    [JsonPropertyName("text")]
    public string Text { get; init; } = string.Empty;

    [JsonPropertyName("start_time")]
    public double? StartTime { get; init; }

    [JsonPropertyName("end_time")]
    public double? EndTime { get; init; }
}

This is inside our Core code, as you can see, we have the start_time in JSON response.

It is useful for the caller to have timestamp display, subtitle generation etc.


/// <summary>End time offset of this segment in the audio stream (seconds).</summary>
[JsonPropertyName("end_time")]
public double? EndTime { get; init; }

internal static LiveAudioTranscriptionResponse FromJson(string json)
{
// Deserialize the core's JSON (which has is_final, text, start_time, end_time)
// into an intermediate record, then map to the response type.
var raw = JsonSerializer.Deserialize(json,
JsonSerializationContext.Default.LiveAudioTranscriptionRaw)
?? throw new FoundryLocalException("Failed to deserialize live audio transcription result");

var response = new LiveAudioTranscriptionResponse
return new LiveAudioTranscriptionResponse
{
Text = raw.Text,
IsFinal = raw.IsFinal,
};

// Map start_time/end_time into a Segment for OpenAI-compatible output
if (raw.StartTime.HasValue || raw.EndTime.HasValue)
{
response.Segments =
StartTime = raw.StartTime,
EndTime = raw.EndTime,
Content =
[
new Segment
new ContentPart
{
Start = (float)(raw.StartTime ?? 0),
End = (float)(raw.EndTime ?? 0),
Text = raw.Text
Text = raw.Text,
Transcript = raw.Text
}
];
}

return response;
]
};
}
}

Expand Down
36 changes: 17 additions & 19 deletions sdk/cs/test/FoundryLocal.Tests/LiveAudioTranscriptionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,24 @@ public async Task FromJson_ParsesTextAndIsFinal()

var result = LiveAudioTranscriptionResponse.FromJson(json);

await Assert.That(result.Text).IsEqualTo("hello world");
await Assert.That(result.Content).IsNotNull();
await Assert.That(result.Content!.Count).IsEqualTo(1);
await Assert.That(result.Content[0].Text).IsEqualTo("hello world");
await Assert.That(result.Content[0].Transcript).IsEqualTo("hello world");
await Assert.That(result.IsFinal).IsTrue();
await Assert.That(result.Segments).IsNull();
}

[Test]
public async Task FromJson_MapsTimingToSegments()
public async Task FromJson_MapsTimingFields()
{
var json = """{"is_final":false,"text":"partial","start_time":1.5,"end_time":3.0}""";

var result = LiveAudioTranscriptionResponse.FromJson(json);

await Assert.That(result.Text).IsEqualTo("partial");
await Assert.That(result.Content?[0]?.Text).IsEqualTo("partial");
await Assert.That(result.IsFinal).IsFalse();
await Assert.That(result.Segments).IsNotNull();
await Assert.That(result.Segments!.Count).IsEqualTo(1);
await Assert.That(result.Segments[0].Start).IsEqualTo(1.5f);
await Assert.That(result.Segments[0].End).IsEqualTo(3.0f);
await Assert.That(result.Segments[0].Text).IsEqualTo("partial");
await Assert.That(result.StartTime).IsEqualTo(1.5);
await Assert.That(result.EndTime).IsEqualTo(3.0);
}

[Test]
Expand All @@ -49,21 +48,20 @@ public async Task FromJson_EmptyText_ParsesSuccessfully()

var result = LiveAudioTranscriptionResponse.FromJson(json);

await Assert.That(result.Text).IsEqualTo("");
await Assert.That(result.Content?[0]?.Text).IsEqualTo("");
await Assert.That(result.IsFinal).IsTrue();
}

[Test]
public async Task FromJson_OnlyStartTime_CreatesSegment()
public async Task FromJson_OnlyStartTime_SetsStartTime()
{
var json = """{"is_final":true,"text":"word","start_time":2.0,"end_time":null}""";

var result = LiveAudioTranscriptionResponse.FromJson(json);

await Assert.That(result.Segments).IsNotNull();
await Assert.That(result.Segments!.Count).IsEqualTo(1);
await Assert.That(result.Segments[0].Start).IsEqualTo(2.0f);
await Assert.That(result.Segments[0].End).IsEqualTo(0f);
await Assert.That(result.StartTime).IsEqualTo(2.0);
await Assert.That(result.EndTime).IsNull();
await Assert.That(result.Content?[0]?.Text).IsEqualTo("word");
}

[Test]
Expand All @@ -75,15 +73,15 @@ public async Task FromJson_InvalidJson_Throws()
}

[Test]
public async Task FromJson_InheritsFromAudioCreateTranscriptionResponse()
public async Task FromJson_ContentHasTextAndTranscript()
{
var json = """{"is_final":true,"text":"test","start_time":null,"end_time":null}""";

var result = LiveAudioTranscriptionResponse.FromJson(json);

// Verify it's assignable to the base type
Betalgo.Ranul.OpenAI.ObjectModels.ResponseModels.AudioCreateTranscriptionResponse baseRef = result;
await Assert.That(baseRef.Text).IsEqualTo("test");
// Both Text and Transcript should have the same value
await Assert.That(result.Content?[0]?.Text).IsEqualTo("test");
await Assert.That(result.Content?[0]?.Transcript).IsEqualTo("test");
}

// --- LiveAudioTranscriptionOptions tests ---
Expand Down
Loading