Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ await model.DownloadAsync(progress =>

var audioClient = await model.GetAudioClientAsync();
var session = audioClient.CreateLiveTranscriptionSession();
session.Settings.SampleRate = 16000;
session.Settings.SampleRate = 16000; // Default is 16000; shown here to match the NAudio WaveFormat below
session.Settings.Channels = 1;
session.Settings.Language = "en";

Expand All @@ -54,16 +54,17 @@ await model.DownloadAsync(progress =>
{
await foreach (var result in session.GetTranscriptionStream())
{
var text = result.Content?[0]?.Text;
if (result.IsFinal)
{
Console.WriteLine();
Console.WriteLine($" [FINAL] {result.Text}");
Console.WriteLine($" [FINAL] {text}");
Console.Out.Flush();
}
else if (!string.IsNullOrEmpty(result.Text))
else if (!string.IsNullOrEmpty(text))
{
Console.ForegroundColor = ConsoleColor.Cyan;
Console.Write(result.Text);
Console.Write(text);
Console.ResetColor();
Console.Out.Flush();
}
Expand Down
23 changes: 11 additions & 12 deletions sdk/cs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,12 +259,12 @@ waveIn.DataAvailable += (sender, e) =>
// Read transcription results as they arrive
await foreach (var result in session.GetTranscriptionStream())
{
// result inherits from AudioCreateTranscriptionResponse
// - result.Text — incremental transcribed text (per chunk, not accumulated)
// - result.IsFinal — true for final results, false for interim hypotheses
// - result.Segments — segment-level timing data (Start/End in seconds)
// - result.Language — language code
Console.Write(result.Text);
// result follows the OpenAI Realtime ConversationItem pattern:
// - result.Content[0].Text — incremental transcribed text (per chunk, not accumulated)
// - result.Content[0].Transcript — alias for Text (OpenAI Realtime compatibility)
// - result.IsFinal — true for final results, false for interim hypotheses
// - result.StartTime / EndTime — segment timing in seconds
Console.Write(result.Content?[0]?.Text);
}

await session.StopAsync();
Expand All @@ -274,12 +274,11 @@ await session.StopAsync();

| Field | Type | Description |
|-------|------|-------------|
| `Text` | `string` | Transcribed text from this audio chunk (inherited from `AudioCreateTranscriptionResponse`) |
| `Content` | `List<TranscriptionContentPart>` | Content parts. Access text via `Content[0].Text` or `Content[0].Transcript`. |
| `IsFinal` | `bool` | Whether this is a final or interim result. Nemotron always returns `true`. |
| `Language` | `string` | Language code (inherited) |
| `Duration` | `float` | Audio duration in seconds (inherited) |
| `Segments` | `List<Segment>` | Segment timing with `Start`/`End` offsets (inherited) |
| `Words` | `List<WordSegment>` | Word-level timing (inherited, when available) |
| `StartTime` | `double?` | Start time offset in the audio stream (seconds). |
| `EndTime` | `double?` | End time offset in the audio stream (seconds). |
| `Id` | `string?` | Unique identifier for this result (if available). |

#### Session Lifecycle

Expand Down Expand Up @@ -356,7 +355,7 @@ Key types:
| [`OpenAIChatClient`](./docs/api/microsoft.ai.foundry.local.openaichatclient.md) | Chat completions (sync + streaming) |
| [`OpenAIAudioClient`](./docs/api/microsoft.ai.foundry.local.openaiaudioclient.md) | Audio transcription (sync + streaming) |
| [`LiveAudioTranscriptionSession`](./docs/api/microsoft.ai.foundry.local.openai.liveaudiotranscriptionsession.md) | Real-time audio streaming session |
| [`LiveAudioTranscriptionResponse`](./docs/api/microsoft.ai.foundry.local.openai.liveaudiotranscriptionresponse.md) | Streaming transcription result (extends `AudioCreateTranscriptionResponse`) |
| [`LiveAudioTranscriptionResponse`](./docs/api/microsoft.ai.foundry.local.openai.liveaudiotranscriptionresponse.md) | Streaming transcription result (ConversationItem-shaped) |
| [`ModelInfo`](./docs/api/microsoft.ai.foundry.local.modelinfo.md) | Full model metadata record |

## Tests
Expand Down
5 changes: 2 additions & 3 deletions sdk/cs/src/Detail/JsonSerializationContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,10 @@ namespace Microsoft.AI.Foundry.Local.Detail;
[JsonSerializable(typeof(IList<FunctionDefinition>))]
[JsonSerializable(typeof(PropertyDefinition))]
[JsonSerializable(typeof(IList<PropertyDefinition>))]
// --- Audio streaming types ---
[JsonSerializable(typeof(LiveAudioTranscriptionResponse))]
// --- Audio streaming types (LiveAudioTranscriptionResponse inherits ConversationItem
// which has AOT-incompatible JsonConverters, so we only register the raw deserialization type) ---
[JsonSerializable(typeof(LiveAudioTranscriptionRaw))]
[JsonSerializable(typeof(CoreErrorResponse))]
[JsonSerializable(typeof(AudioCreateTranscriptionResponse.Segment))]
[JsonSourceGenerationOptions(DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
WriteIndented = false)]
internal partial class JsonSerializationContext : JsonSerializerContext
Expand Down
4 changes: 2 additions & 2 deletions sdk/cs/src/OpenAI/LiveAudioTranscriptionClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ private async Task PushLoopAsync(CancellationToken ct)
try
{
var transcription = LiveAudioTranscriptionResponse.FromJson(response.Data);
if (!string.IsNullOrEmpty(transcription.Text))
if (!string.IsNullOrEmpty(transcription.Content?[0]?.Text))
{
_outputChannel?.Writer.TryWrite(transcription);
}
Expand Down Expand Up @@ -331,7 +331,7 @@ public async Task StopAsync(CancellationToken ct = default)
try
{
var finalResult = LiveAudioTranscriptionResponse.FromJson(response.Data);
if (!string.IsNullOrEmpty(finalResult.Text))
if (!string.IsNullOrEmpty(finalResult.Content?[0]?.Text))
{
_outputChannel?.Writer.TryWrite(finalResult);
}
Expand Down
45 changes: 23 additions & 22 deletions sdk/cs/src/OpenAI/LiveAudioTranscriptionTypes.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@ namespace Microsoft.AI.Foundry.Local.OpenAI;

using System.Text.Json;
using System.Text.Json.Serialization;
using Betalgo.Ranul.OpenAI.ObjectModels.ResponseModels;
using Betalgo.Ranul.OpenAI.ObjectModels.RealtimeModels;
using Microsoft.AI.Foundry.Local;
using Microsoft.AI.Foundry.Local.Detail;

/// <summary>
/// Transcription result for real-time audio streaming sessions.
/// Extends <see cref="AudioCreateTranscriptionResponse"/> to provide a consistent
/// output format with file-based transcription, while adding streaming-specific fields.
/// Extends the OpenAI Realtime API's <see cref="ConversationItem"/> so that
/// customers access text via <c>result.Content[0].Text</c> or
/// <c>result.Content[0].Transcript</c>, ensuring forward compatibility
/// when the transport layer moves to WebSocket.
/// </summary>
public record LiveAudioTranscriptionResponse : AudioCreateTranscriptionResponse
public class LiveAudioTranscriptionResponse : ConversationItem
{
/// <summary>
/// Whether this is a final or partial (interim) result.
Expand All @@ -22,35 +24,34 @@ public record LiveAudioTranscriptionResponse : AudioCreateTranscriptionResponse
[JsonPropertyName("is_final")]
public bool IsFinal { get; init; }

/// <summary>Start time offset of this segment in the audio stream (seconds).</summary>
[JsonPropertyName("start_time")]
public double? StartTime { get; init; }
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're tracking this?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes,

/// <summary>
/// Transcription result sent back to SDK via callback during streaming.
/// Must match the SDK's AudioStreamingTranscriptionResult type.
/// </summary>
public record AudioStreamingTranscriptionResult
{
    [JsonPropertyName("is_final")]
    public bool IsFinal { get; init; }

    [JsonPropertyName("text")]
    public string Text { get; init; } = string.Empty;

    [JsonPropertyName("start_time")]
    public double? StartTime { get; init; }

    [JsonPropertyName("end_time")]
    public double? EndTime { get; init; }
}

This is inside our Core code, as you can see, we have the start_time in JSON response.

It is useful for the caller to have timestamp display, subtitle generation etc.


/// <summary>End time offset of this segment in the audio stream (seconds).</summary>
[JsonPropertyName("end_time")]
public double? EndTime { get; init; }

internal static LiveAudioTranscriptionResponse FromJson(string json)
{
// Deserialize the core's JSON (which has is_final, text, start_time, end_time)
// into an intermediate record, then map to the response type.
var raw = JsonSerializer.Deserialize(json,
JsonSerializationContext.Default.LiveAudioTranscriptionRaw)
?? throw new FoundryLocalException("Failed to deserialize live audio transcription result");

var response = new LiveAudioTranscriptionResponse
return new LiveAudioTranscriptionResponse
{
Text = raw.Text,
IsFinal = raw.IsFinal,
};

// Map start_time/end_time into a Segment for OpenAI-compatible output
if (raw.StartTime.HasValue || raw.EndTime.HasValue)
{
response.Segments =
StartTime = raw.StartTime,
EndTime = raw.EndTime,
Content =
[
new Segment
new ContentPart
{
Start = (float)(raw.StartTime ?? 0),
End = (float)(raw.EndTime ?? 0),
Text = raw.Text
Text = raw.Text,
Transcript = raw.Text
}
];
}

return response;
]
};
}
}

Expand Down
36 changes: 17 additions & 19 deletions sdk/cs/test/FoundryLocal.Tests/LiveAudioTranscriptionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,24 @@ public async Task FromJson_ParsesTextAndIsFinal()

var result = LiveAudioTranscriptionResponse.FromJson(json);

await Assert.That(result.Text).IsEqualTo("hello world");
await Assert.That(result.Content).IsNotNull();
await Assert.That(result.Content!.Count).IsEqualTo(1);
await Assert.That(result.Content[0].Text).IsEqualTo("hello world");
await Assert.That(result.Content[0].Transcript).IsEqualTo("hello world");
await Assert.That(result.IsFinal).IsTrue();
await Assert.That(result.Segments).IsNull();
}

[Test]
public async Task FromJson_MapsTimingToSegments()
public async Task FromJson_MapsTimingFields()
{
var json = """{"is_final":false,"text":"partial","start_time":1.5,"end_time":3.0}""";

var result = LiveAudioTranscriptionResponse.FromJson(json);

await Assert.That(result.Text).IsEqualTo("partial");
await Assert.That(result.Content?[0]?.Text).IsEqualTo("partial");
await Assert.That(result.IsFinal).IsFalse();
await Assert.That(result.Segments).IsNotNull();
await Assert.That(result.Segments!.Count).IsEqualTo(1);
await Assert.That(result.Segments[0].Start).IsEqualTo(1.5f);
await Assert.That(result.Segments[0].End).IsEqualTo(3.0f);
await Assert.That(result.Segments[0].Text).IsEqualTo("partial");
await Assert.That(result.StartTime).IsEqualTo(1.5);
await Assert.That(result.EndTime).IsEqualTo(3.0);
}

[Test]
Expand All @@ -49,21 +48,20 @@ public async Task FromJson_EmptyText_ParsesSuccessfully()

var result = LiveAudioTranscriptionResponse.FromJson(json);

await Assert.That(result.Text).IsEqualTo("");
await Assert.That(result.Content?[0]?.Text).IsEqualTo("");
await Assert.That(result.IsFinal).IsTrue();
}

[Test]
public async Task FromJson_OnlyStartTime_CreatesSegment()
public async Task FromJson_OnlyStartTime_SetsStartTime()
{
var json = """{"is_final":true,"text":"word","start_time":2.0,"end_time":null}""";

var result = LiveAudioTranscriptionResponse.FromJson(json);

await Assert.That(result.Segments).IsNotNull();
await Assert.That(result.Segments!.Count).IsEqualTo(1);
await Assert.That(result.Segments[0].Start).IsEqualTo(2.0f);
await Assert.That(result.Segments[0].End).IsEqualTo(0f);
await Assert.That(result.StartTime).IsEqualTo(2.0);
await Assert.That(result.EndTime).IsNull();
await Assert.That(result.Content?[0]?.Text).IsEqualTo("word");
}

[Test]
Expand All @@ -75,15 +73,15 @@ public async Task FromJson_InvalidJson_Throws()
}

[Test]
public async Task FromJson_InheritsFromAudioCreateTranscriptionResponse()
public async Task FromJson_ContentHasTextAndTranscript()
{
var json = """{"is_final":true,"text":"test","start_time":null,"end_time":null}""";

var result = LiveAudioTranscriptionResponse.FromJson(json);

// Verify it's assignable to the base type
Betalgo.Ranul.OpenAI.ObjectModels.ResponseModels.AudioCreateTranscriptionResponse baseRef = result;
await Assert.That(baseRef.Text).IsEqualTo("test");
// Both Text and Transcript should have the same value
await Assert.That(result.Content?[0]?.Text).IsEqualTo("test");
await Assert.That(result.Content?[0]?.Transcript).IsEqualTo("test");
}

// --- LiveAudioTranscriptionOptions tests ---
Expand Down
Loading