Skip to content

Commit 926bc13

Browse files
authored
fix: #494 Voice input transcription failing in realtime-demo (#530)
1 parent becabb9 commit 926bc13

File tree

3 files changed

+37
-3
lines changed

3 files changed

+37
-3
lines changed

.changeset/wet-snakes-stare.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@openai/agents-realtime': patch
3+
---
4+
5+
fix: #494 Voice input transcription failing in realtime-demo

packages/agents-realtime/src/realtimeSession.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ import {
2929
RealtimeOutputGuardrailSettings,
3030
} from './guardrail';
3131
import { RealtimeItem } from './items';
32-
import { OpenAIRealtimeModels } from './openaiRealtimeBase';
32+
import {
33+
DEFAULT_OPENAI_REALTIME_SESSION_CONFIG,
34+
OpenAIRealtimeModels,
35+
} from './openaiRealtimeBase';
3336
import { OpenAIRealtimeWebRTC } from './openaiRealtimeWebRtc';
3437
import { OpenAIRealtimeWebSocket } from './openaiRealtimeWebsocket';
3538
import { RealtimeAgent } from './realtimeAgent';
@@ -147,6 +150,12 @@ export type RealtimeSessionConnectOptions = {
147150
url?: string;
148151
};
149152

153+
/**
 * Builds a fresh copy of `DEFAULT_OPENAI_REALTIME_SESSION_CONFIG`.
 *
 * The copy is produced by a JSON round-trip (`JSON.stringify` followed by
 * `JSON.parse`), so the result shares no object references with the exported
 * default. Anything JSON cannot represent (e.g. `undefined`-valued properties
 * or functions) is not carried over into the copy.
 *
 * NOTE(review): the `as Partial<RealtimeSessionConfig>` cast is unchecked; it
 * assumes the default config is already shaped like a session config — confirm
 * against the declaration in `./openaiRealtimeBase`.
 * NOTE(review): presumably the clone exists so that per-session state seeded
 * from the defaults can be changed without affecting the shared constant —
 * verify against the call sites.
 *
 * @returns A new object holding the JSON-serializable contents of the default
 * session config.
 */
function cloneDefaultSessionConfig(): Partial<RealtimeSessionConfig> {
154+
return JSON.parse(
155+
JSON.stringify(DEFAULT_OPENAI_REALTIME_SESSION_CONFIG),
156+
) as Partial<RealtimeSessionConfig>;
157+
}
158+
150159
/**
151160
* A `RealtimeSession` is the cornerstone of building Voice Agents. It's the equivalent of a
152161
* Runner in text-based agents except that it automatically handles multiple turns by maintaining a
@@ -206,7 +215,8 @@ export class RealtimeSession<
206215
// modalities, speed, toolChoice, turnDetection, etc.). Without this, updating
207216
// the agent would drop audio format overrides (e.g. g711_ulaw) and revert to
208217
// transport defaults causing issues for integrations like Twilio.
209-
#lastSessionConfig: Partial<RealtimeSessionConfig> | null = null;
218+
#lastSessionConfig: Partial<RealtimeSessionConfig> | null =
219+
cloneDefaultSessionConfig();
210220
#automaticallyTriggerResponseForMcpToolCalls: boolean = true;
211221

212222
constructor(

packages/agents-realtime/test/realtimeSession.test.ts

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@ import {
1111
} from '@openai/agents-core';
1212
import * as utils from '../src/utils';
1313
import type { TransportToolCallEvent } from '../src/transportLayerEvents';
14-
import { OpenAIRealtimeBase } from '../src/openaiRealtimeBase';
14+
import {
15+
DEFAULT_OPENAI_REALTIME_SESSION_CONFIG,
16+
OpenAIRealtimeBase,
17+
} from '../src/openaiRealtimeBase';
18+
import { toNewSessionConfig } from '../src/clientMessages';
1519

1620
function createMessage(id: string, text: string): RealtimeItem {
1721
return {
@@ -122,6 +126,21 @@ describe('RealtimeSession', () => {
122126
expect(t.connectCalls[0]?.url).toBe('ws://example');
123127
});
124128

129+
it('includes default transcription config when connecting', async () => {
130+
const t = new FakeTransport();
131+
const agent = new RealtimeAgent({ name: 'A', handoffs: [] });
132+
const s = new RealtimeSession(agent, { transport: t });
133+
await s.connect({ apiKey: 'test' });
134+
135+
const normalizedConfig = toNewSessionConfig(
136+
t.connectCalls[0]?.initialSessionConfig ?? {},
137+
);
138+
139+
expect(normalizedConfig.audio?.input?.transcription).toEqual(
140+
DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.input?.transcription,
141+
);
142+
});
143+
125144
it('updateHistory accepts callback', () => {
126145
const item = createMessage('1', 'hi');
127146
session.updateHistory([item]);

0 commit comments

Comments
 (0)