From 858b568a131c3f97497de3b7147db6c622b6325d Mon Sep 17 00:00:00 2001 From: Kavi Mathur Date: Fri, 13 Feb 2026 12:10:38 -0500 Subject: [PATCH 1/6] WIP: Route playback through VPIO for echo cancellation on iOS Put AVAudioEngine in manual rendering mode on iOS (no headphones) so that the VPIO output render callback can pull rendered playback audio and feed it to the speaker through Bus 0. This gives the VPIO full visibility into both input and output for acoustic echo cancellation (AEC), preventing the AI from hearing its own voice through the speaker and self-interrupting. Co-Authored-By: Claude Opus 4.6 --- Sources/AIProxy/AudioController.swift | 40 +++++-- .../AIProxy/MicrophonePCMSampleVendorAT.swift | 108 +++++++++++++++++- 2 files changed, 132 insertions(+), 16 deletions(-) diff --git a/Sources/AIProxy/AudioController.swift b/Sources/AIProxy/AudioController.swift index dfddaf7..9af9aa6 100644 --- a/Sources/AIProxy/AudioController.swift +++ b/Sources/AIProxy/AudioController.swift @@ -18,16 +18,16 @@ import AVFoundation /// We use either AVAudioEngine or AudioToolbox for mic data, depending on the platform and whether headphones are attached. /// The following arrangement provides for the best user experience: /// -/// +----------+---------------+------------------+ -/// | Platform | Headphones | Audio API | -/// +----------+---------------+------------------+ -/// | macOS | Yes | AudioEngine | -/// | macOS | No | AudioToolbox | -/// | iOS | Yes | AudioEngine | -/// | iOS | No | AudioToolbox | -/// | watchOS | Yes | AudioEngine | -/// | watchOS | No | AudioEngine | -/// +----------+---------------+------------------+ +/// +----------+---------------+--------------------------------------+ +/// | Platform | Headphones | Audio API | +/// +----------+---------------+--------------------------------------+ +/// | macOS | Yes | AudioEngine | +/// | macOS | No | AudioToolbox | +/// | iOS | Yes | AudioEngine | +/// | iOS | No | AudioToolbox + manual rendering AEC | +/// | watchOS | Yes | AudioEngine | +/// | watchOS | No | AudioEngine | +/// +----------+---------------+--------------------------------------+ /// @AIProxyActor public final class AudioController { public enum Mode { @@ -61,11 +61,29 @@ import AVFoundation self.audioEngine = AVAudioEngine() + #if os(iOS) + let needsManualRendering = modes.contains(.record) && modes.contains(.playback) + && !AIProxyUtils.headphonesConnected + if needsManualRendering { + let renderFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 44100, + channels: 1, + interleaved: true + )! + try audioEngine.enableManualRenderingMode( + .realtime, + format: renderFormat, + maximumFrameCount: 4096 + ) + } + #endif + if modes.contains(.record) { #if os(macOS) || os(iOS) self.microphonePCMSampleVendor = AIProxyUtils.headphonesConnected ? try MicrophonePCMSampleVendorAE(audioEngine: self.audioEngine) - : MicrophonePCMSampleVendorAT() + : MicrophonePCMSampleVendorAT(audioEngine: self.audioEngine) #else self.microphonePCMSampleVendor = try MicrophonePCMSampleVendorAE(audioEngine: self.audioEngine) #endif diff --git a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift index c131b34..db3dfbf 100644 --- a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift +++ b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift @@ -58,8 +58,11 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 private var audioUnit: AudioUnit? 
     private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon()
     private var continuation: AsyncStream<AVAudioPCMBuffer>.Continuation?
+    private var audioEngine: AVAudioEngine?
 
-    public init() {}
+    public init(audioEngine: AVAudioEngine? = nil) {
+        self.audioEngine = audioEngine
+    }
 
     deinit {
         logIf(.debug)?.debug("MicrophonePCMSampleVendor is being freed")
@@ -101,17 +104,17 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             )
         }
 
-        var zero: UInt32 = 0
+        var one_output: UInt32 = 1
         err = AudioUnitSetProperty(audioUnit,
                                    kAudioOutputUnitProperty_EnableIO,
                                    kAudioUnitScope_Output,
                                    0,
-                                   &zero, // <-- This is not a mistake! If you leave this on, iOS spams the logs with: "from AU (address): auou/vpio/appl, render err: -1"
-                                   UInt32(MemoryLayout.size(ofValue: one)))
+                                   &one_output,
+                                   UInt32(MemoryLayout.size(ofValue: one_output)))
         guard err == noErr else {
             throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
-                "Could not disable the output scope of the speaker bus"
+                "Could not enable the output scope of the speaker bus"
             )
         }
 
@@ -251,6 +254,49 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             )
         }
 
+        // If we have an AVAudioEngine in manual rendering mode, set up the VPIO output bus
+        // to pull rendered audio. This gives the VPIO visibility into playback for AEC.
+        if audioEngine != nil {
+            var outputFormat = AudioStreamBasicDescription(
+                mSampleRate: kVoiceProcessingInputSampleRate, // 44100
+                mFormatID: kAudioFormatLinearPCM,
+                mFormatFlags: kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked,
+                mBytesPerPacket: 4,
+                mFramesPerPacket: 1,
+                mBytesPerFrame: 4,
+                mChannelsPerFrame: 1,
+                mBitsPerChannel: 32,
+                mReserved: 0
+            )
+            err = AudioUnitSetProperty(audioUnit,
+                                       kAudioUnitProperty_StreamFormat,
+                                       kAudioUnitScope_Input,
+                                       0, // Bus 0 (speaker)
+                                       &outputFormat,
+                                       UInt32(MemoryLayout<AudioStreamBasicDescription>.size))
+            guard err == noErr else {
+                throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
+                    "Could not set stream format on the input scope of the speaker bus"
+                )
+            }
+
+            var outputCallbackStruct = AURenderCallbackStruct(
+                inputProc: audioOutputRenderCallback,
+                inputProcRefCon: Unmanaged.passUnretained(self).toOpaque()
+            )
+            err = AudioUnitSetProperty(audioUnit,
+                                       kAudioUnitProperty_SetRenderCallback,
+                                       kAudioUnitScope_Input,
+                                       0, // Bus 0
+                                       &outputCallbackStruct,
+                                       UInt32(MemoryLayout<AURenderCallbackStruct>.size))
+            guard err == noErr else {
+                throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
+                    "Could not set the output render callback on the voice processing audio unit"
+                )
+            }
+        }
+
         err = AudioUnitInitialize(audioUnit)
         guard err == noErr else {
             throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
@@ -336,6 +382,58 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             }
         }
     }
+
+    /// Called from the VPIO output render callback on the real-time audio thread.
+    /// Pulls rendered audio from the AVAudioEngine's manual rendering block and writes it
+    /// into the VPIO's output buffer so the VPIO can use it as the AEC reference signal.
+    fileprivate func didReceiveOutputRenderCallback(
+        _ ioActionFlags: UnsafeMutablePointer<AudioUnitRenderActionFlags>,
+        _ inTimeStamp: UnsafePointer<AudioTimeStamp>,
+        _ inBusNumber: UInt32,
+        _ inNumberFrames: UInt32,
+        _ ioData: UnsafeMutablePointer<AudioBufferList>?
+    ) {
+        guard let ioData = ioData, let audioEngine = audioEngine else {
+            // No engine — render silence
+            if let ioData = ioData {
+                let buf = UnsafeMutableAudioBufferListPointer(ioData)
+                for i in 0..<buf.count {
+                    if let mData = buf[i].mData {
+                        memset(mData, 0, Int(buf[i].mDataByteSize))
+                    }
+                }
+            }
+            return
+        }
+
+        let status = audioEngine.manualRenderingBlock(
+            AVAudioFrameCount(inNumberFrames),
+            ioData,
+            nil
+        )
+        if status != .success {
+            // Rendering failed; fill the output with silence rather than stale data.
+            let buf = UnsafeMutableAudioBufferListPointer(ioData)
+            for i in 0..<buf.count {
+                if let mData = buf[i].mData {
+                    memset(mData, 0, Int(buf[i].mDataByteSize))
+                }
+            }
+        }
+    }
 }
 
+// This @AIProxyActor annotation is a lie.
+@AIProxyActor private let audioOutputRenderCallback: AURenderCallback = { inRefCon,
+    ioActionFlags,
+    inTimeStamp,
+    inBusNumber,
+    inNumberFrames,
+    ioData in
+    let vendor = Unmanaged<MicrophonePCMSampleVendorAT>
+        .fromOpaque(inRefCon)
+        .takeUnretainedValue()
+    vendor.didReceiveOutputRenderCallback(
+        ioActionFlags,
+        inTimeStamp,
+        inBusNumber,
+        inNumberFrames,
+        ioData
+    )
+    return noErr
+}
 
 // This @AIProxyActor annotation is a lie.
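
For reviewers unfamiliar with manual rendering mode, here is a minimal, self-contained sketch of the pull model the new output callback relies on. It is not code from this patch; `makeManualRenderingEngine` and `pullRenderedAudio` are hypothetical names, and the format mirrors the values used in `AudioController` (Float32/44100Hz/mono):

```swift
import AVFoundation

// Put an engine into real-time manual rendering mode: it stops driving
// hardware, and the owner pulls rendered frames on demand instead.
func makeManualRenderingEngine() throws -> AVAudioEngine {
    let engine = AVAudioEngine()
    let format = AVAudioFormat(
        commonFormat: .pcmFormatFloat32,
        sampleRate: 44100,
        channels: 1,
        interleaved: true
    )!
    try engine.enableManualRenderingMode(.realtime, format: format, maximumFrameCount: 4096)
    try engine.start()
    return engine
}

// From a real-time render callback, pull `frameCount` frames into `ioData`.
// manualRenderingBlock is safe to call on the audio thread; any status other
// than .success means the caller should write silence instead.
func pullRenderedAudio(
    from engine: AVAudioEngine,
    frameCount: AVAudioFrameCount,
    into ioData: UnsafeMutablePointer<AudioBufferList>
) -> Bool {
    return engine.manualRenderingBlock(frameCount, ioData, nil) == .success
}
```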
From e5aa09a4507a0661f5422575194498aa5fb95ba5 Mon Sep 17 00:00:00 2001
From: Kavi Mathur
Date: Fri, 13 Feb 2026 12:22:05 -0500
Subject: [PATCH 2/6] Add design doc for VPIO echo cancellation approach

Co-Authored-By: Claude Opus 4.6
---
 docs/vpio-echo-cancellation.md | 84 ++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 docs/vpio-echo-cancellation.md

diff --git a/docs/vpio-echo-cancellation.md b/docs/vpio-echo-cancellation.md
new file mode 100644
index 0000000..f69f5ea
--- /dev/null
+++ b/docs/vpio-echo-cancellation.md
@@ -0,0 +1,84 @@
+# VPIO Echo Cancellation Fix for iOS
+
+## Problem
+
+When using Joice on iOS with speaker output (no headphones), the AI's voice plays through the speaker, gets picked up by the microphone, and OpenAI's server-side VAD interprets it as user speech — causing the AI to interrupt itself.
+
+## Root Cause
+
+The AIProxy SDK uses `kAudioUnitSubType_VoiceProcessingIO` (VPIO) for mic capture on iOS without headphones. VPIO is designed to provide Acoustic Echo Cancellation (AEC), but **AEC was non-functional** because:
+
+1. The VPIO output bus (Bus 0) was **explicitly disabled** in `MicrophonePCMSampleVendorAT.swift` (the comment explained that enabling it caused `render err: -1` without a data source)
+2. Playback went through a completely separate `AVAudioEngine` that the VPIO had no visibility into
+
+**For VPIO AEC to work, playback audio must flow through the VPIO's output bus as a reference signal.** The VPIO cannot cancel echo from audio it doesn't know about.
+
+## Solution: AVAudioEngine Manual Rendering + VPIO I/O
+
+Route playback through the VPIO by putting `AVAudioEngine` in **manual rendering mode**. The existing `AudioPCMPlayer` continues to schedule buffers on the playerNode as before, but the engine no longer renders to hardware. Instead, the VPIO output render callback **pulls** rendered audio from the engine and feeds it to the speaker through the VPIO. This gives the VPIO full visibility into both input and output for AEC.
+
+This is the same pattern used by Twilio's Voice SDK for iOS echo cancellation.
+
+### Signal Path
+
+```
+OpenAI 24kHz PCM16 --> AudioPCMPlayer --> playerNode --> AVAudioEngine (manual rendering, 44100Hz)
+                                                              |
+                                                              v
+                                              VPIO output render callback
+                                              calls engine.manualRenderingBlock
+                                                              |
+                                                              v
+                                                      VPIO Bus 0 output
+                                                (plays to speaker + AEC ref)
+
+Hardware mic --> VPIO Bus 1 input (echo-cancelled) --> input callback --> resample --> OpenAI
+```
+
+## Changes
+
+### `MicrophonePCMSampleVendorAT.swift`
+
+1. **Added `audioEngine` property + updated `init`** — accepts an optional `AVAudioEngine` in manual rendering mode
+2. **Enabled output bus 0** — replaced the `zero` flag with `one_output` so the VPIO speaker bus is active (previously disabled to avoid `render err: -1`)
+3. **Set stream format on output bus 0 (Input scope)** — Float32 at 44100Hz mono, matching the manual rendering engine format
+4. **Registered a render callback on bus 0** — only when `audioEngine` is provided
+5. **Implemented `didReceiveOutputRenderCallback`** — pulls audio from `audioEngine.manualRenderingBlock` into the VPIO's output buffer; fills silence on error or when no engine is present
+6. **Added C-level `audioOutputRenderCallback`** — bridges to the instance method (same pattern as the existing input callback)
+
+### `AudioController.swift`
+
+1. **Enable manual rendering** — on iOS without headphones, puts `AVAudioEngine` into `.realtime` manual rendering mode at Float32/44100Hz/mono before any nodes are attached
+2. **Pass `audioEngine` to VPIO vendor** — `MicrophonePCMSampleVendorAT(audioEngine: self.audioEngine)` so the render callback can pull from it
+3. **Updated doc comment table** — iOS without headphones now notes "AudioToolbox + manual rendering AEC"
+
+### `AudioPCMPlayer.swift`
+
+No changes needed. The playerNode scheduling API works identically in manual rendering mode. The engine buffers audio internally and renders it when `manualRenderingBlock` is called from the VPIO output callback.
+
+## Why This Works
+
+1. `AudioPCMPlayer` schedules playback buffers on the playerNode at 24kHz (sketched in the appendix below)
+2. `AVAudioEngine` (in manual rendering mode at 44100Hz) internally upsamples and mixes
+3. The VPIO output render callback pulls mixed audio via `manualRenderingBlock`
+4. VPIO sends this audio to the hardware speaker **and** uses it as the AEC reference
+5. VPIO subtracts the reference from the mic input on Bus 1, producing echo-cancelled audio
+6. The echo-cancelled mic audio flows through the existing input callback unchanged
+
+## Risks and Mitigations
+
+| Risk | Mitigation |
+|------|------------|
+| Volume bug with VPIO (documented in AudioPCMPlayer) | AudioPCMPlayer is initialized before VPIO (existing order in AudioController). In manual rendering mode, the engine doesn't drive hardware directly, so the bug may not apply. |
+| `manualRenderingBlock` called on real-time thread | Apple docs confirm this is the intended usage — the block is designed for real-time contexts. |
+| Format mismatch between engine output and VPIO bus | Both configured to Float32/44100Hz/mono. The engine handles 24kHz to 44100Hz upsampling internally. |
+| macOS not addressed | macOS AT path unchanged (separate concern, less acute due to speaker/mic distance). Only iOS gets the manual rendering + VPIO AEC fix. |
+| Headphones path unchanged | When headphones are connected, the `MicrophonePCMSampleVendorAE` (AVAudioEngine-based) path is used instead — no regression risk. |
+
+## Testing
+
+1. Build and run on a **physical iOS device** with speaker (no headphones)
+2. Start a voice session — AI should speak full responses without self-interrupting
+3. Speak over the AI to verify user interruption still works
+4. Test with headphones to confirm no regression (headphones path is unchanged)
+5. Test that playback volume is acceptable
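+
+## Appendix: Player-Side Format Handling (Sketch)
+
+A minimal, self-contained sketch of the scheduling behavior described in "Why This Works". It is illustrative only, not code from this change set. The point is that `AVAudioPlayerNode` accepts 24kHz buffers while the engine mixes at 44100Hz; the engine inserts the sample-rate converter itself, whether it renders to hardware or to a manual-rendering consumer.
+
+```swift
+import AVFoundation
+
+let engine = AVAudioEngine()
+let player = AVAudioPlayerNode()
+engine.attach(player)
+
+// OpenAI delivers 24kHz mono audio. Connecting the player with the 24kHz
+// source format makes the engine resample to the 44100Hz output format.
+let sourceFormat = AVAudioFormat(
+    commonFormat: .pcmFormatFloat32,
+    sampleRate: 24000,
+    channels: 1,
+    interleaved: false
+)!
+engine.connect(player, to: engine.mainMixerNode, format: sourceFormat)
+
+// Scheduling is identical in hardware and manual rendering modes: buffers
+// queue on the player node until a render pass consumes them.
+let buffer = AVAudioPCMBuffer(pcmFormat: sourceFormat, frameCapacity: 2400)!
+buffer.frameLength = 2400 // 100ms at 24kHz (zeros here; real code fills PCM samples)
+player.scheduleBuffer(buffer, completionHandler: nil)
+player.play()
+```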
         self.audioEngine = audioEngine
@@ -104,7 +115,12 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             )
         }
 
-        var one_output: UInt32 = 1
+        #if os(iOS)
+        let shouldEnableSpeakerBusForAEC = audioEngine?.isInManualRenderingMode ?? false
+        #else
+        let shouldEnableSpeakerBusForAEC = true
+        #endif
+        var one_output: UInt32 = shouldEnableSpeakerBusForAEC ? 1 : 0
         err = AudioUnitSetProperty(audioUnit,
                                    kAudioOutputUnitProperty_EnableIO,
                                    kAudioUnitScope_Output,
@@ -114,7 +130,7 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
 
         guard err == noErr else {
             throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
-                "Could not enable the output scope of the speaker bus"
+                "Could not configure the output scope of the speaker bus"
             )
         }
 
@@ -239,24 +255,39 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             )
         }
 
-        // Do not use auto gain control. Remove in a future commit.
-        // var enable: UInt32 = 1
-        // err = AudioUnitSetProperty(audioUnit,
-        //                            kAUVoiceIOProperty_VoiceProcessingEnableAGC,
-        //                            kAudioUnitScope_Output,
-        //                            1,
-        //                            &enable,
-        //                            UInt32(MemoryLayout.size(ofValue: enable)))
-        //
-        guard err == noErr else {
-            throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
-                "Could not configure auto gain control"
-            )
+        #if os(iOS)
+        // Make voice processing explicit so route changes do not accidentally bypass AEC.
+        var disableBypass: UInt32 = 0
+        err = AudioUnitSetProperty(
+            audioUnit,
+            kAUVoiceIOProperty_BypassVoiceProcessing,
+            kAudioUnitScope_Global,
+            0,
+            &disableBypass,
+            UInt32(MemoryLayout<UInt32>.size)
+        )
+        if err != noErr {
+            logIf(.warning)?.warning("Could not force-enable VPIO voice processing: \(err)")
         }
 
+        // Disable AGC to avoid amplifying far-end leakage into the uplink.
+        var disableAGC: UInt32 = 0
+        err = AudioUnitSetProperty(
+            audioUnit,
+            kAUVoiceIOProperty_VoiceProcessingEnableAGC,
+            kAudioUnitScope_Global,
+            1,
+            &disableAGC,
+            UInt32(MemoryLayout<UInt32>.size)
+        )
+        if err != noErr {
+            logIf(.warning)?.warning("Could not disable VPIO AGC: \(err)")
+        }
+        #endif
+
         // If we have an AVAudioEngine in manual rendering mode, set up the VPIO output bus
         // to pull rendered audio. This gives the VPIO visibility into playback for AEC.
-        if audioEngine != nil {
+        if shouldEnableSpeakerBusForAEC, audioEngine != nil {
             var outputFormat = AudioStreamBasicDescription(
                 mSampleRate: kVoiceProcessingInputSampleRate, // 44100
                 mFormatID: kAudioFormatLinearPCM,
@@ -364,6 +395,12 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             return
         }
 
+        #if os(iOS)
+        if self.shouldSuppressLikelyEchoInput(bufferList: bufferList, frameCount: inNumberFrames) {
+            return
+        }
+        #endif
+
         guard let audioFormat = AVAudioFormat(
             commonFormat: .pcmFormatInt16,
             sampleRate: kVoiceProcessingInputSampleRate,
@@ -411,8 +448,122 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             for i in 0..<buf.count {
                 if let mData = buf[i].mData {
                     memset(mData, 0, Int(buf[i].mDataByteSize))
                 }
             }
         }
+
+        #if os(iOS)
+        self.updateEchoGuardOutputActivity(ioData: ioData, frameCount: inNumberFrames)
+        #endif
     }
+
+    #if os(iOS)
+    private func updateEchoGuardOutputActivity(
+        ioData: UnsafeMutablePointer<AudioBufferList>,
+        frameCount: UInt32
+    ) {
+        let outputRMS = self.rms(ofFloat32BufferList: ioData)
+        let now = ProcessInfo.processInfo.systemUptime
+
+        if outputRMS > kEchoGuardOutputRMSFloor {
+            let bufferDuration = Double(frameCount) / kVoiceProcessingInputSampleRate
+            self.outputLikelyActiveUntilUptime = now + bufferDuration + kEchoGuardOutputTailSeconds
+            if self.outputSmoothedRMS == 0 {
+                self.outputSmoothedRMS = outputRMS
+            } else {
+                self.outputSmoothedRMS = (self.outputSmoothedRMS * (1 - kEchoGuardRMSSmoothingFactor))
+                    + (outputRMS * kEchoGuardRMSSmoothingFactor)
+            }
+            return
+        }
+
+        if now > self.outputLikelyActiveUntilUptime {
+            self.outputSmoothedRMS = 0
+            self.micLoudFrameStreak = 0
+        }
+    }
+
+    private func shouldSuppressLikelyEchoInput(
+        bufferList: AudioBufferList,
+        frameCount: UInt32
+    ) -> Bool {
+        let now = ProcessInfo.processInfo.systemUptime
+        if now <= self.bargeInOpenUntilUptime {
+            return false
+        }
+
+        guard now <= self.outputLikelyActiveUntilUptime else {
+            self.micLoudFrameStreak = 0
+            return false
+        }
+
+        let micRMS = self.rms(ofPCM16BufferList: bufferList, frameCount: frameCount)
+        let bargeInThreshold = max(
+            kEchoGuardBargeInThresholdFloor,
+            self.outputSmoothedRMS * kEchoGuardBargeInRelativeMultiplier
+        )
+
+        if micRMS >= bargeInThreshold {
+            self.micLoudFrameStreak += 1
+            if self.micLoudFrameStreak >= kEchoGuardFramesForBargeIn {
+                self.micLoudFrameStreak = 0
+                self.bargeInOpenUntilUptime = now + kEchoGuardBargeInHoldSeconds
+                return false
+            }
+        } else {
+            self.micLoudFrameStreak = 0
+        }
+
+        return true
+    }
+
+    private func rms(
+        ofPCM16BufferList bufferList: AudioBufferList,
+        frameCount: UInt32
+    ) -> Float {
+        guard frameCount > 0,
+              let data = bufferList.mBuffers.mData else {
+            return 0
+        }
+
+        let sampleCount = Int(frameCount)
+        let samples = data.bindMemory(to: Int16.self, capacity: sampleCount)
+        let scale = Float(Int16.max)
+        var sumSquares: Float = 0
+        for i in 0..<sampleCount {
+            let sample = Float(samples[i]) / scale
+            sumSquares += sample * sample
+        }
+        return sqrt(sumSquares / Float(sampleCount))
+    }
+
+    private func rms(ofFloat32BufferList ioData: UnsafeMutablePointer<AudioBufferList>) -> Float {
+        let buffers = UnsafeMutableAudioBufferListPointer(ioData)
+        guard !buffers.isEmpty else {
+            return 0
+        }
+
+        var sampleCount = 0
+        var sumSquares: Float = 0
+        for buffer in buffers {
+            guard let mData = buffer.mData else { continue }
+            let count = Int(buffer.mDataByteSize) / MemoryLayout<Float>.size
+            if count == 0 { continue }
+            let samples = mData.bindMemory(to: Float.self, capacity: count)
+            sampleCount += count
+            for i in 0..<count {
+                sumSquares += samples[i] * samples[i]
+            }
+        }
+
+        guard sampleCount > 0 else {
+            return 0
+        }
+        return sqrt(sumSquares / Float(sampleCount))
+    }
+    #endif
 }
 
 // This @AIProxyActor annotation is a lie.
diff --git a/docs/realtime-self-interruption-fix.md b/docs/realtime-self-interruption-fix.md
new file mode 100644
index 0000000..e8a7888
--- /dev/null
+++ b/docs/realtime-self-interruption-fix.md
@@ -0,0 +1,83 @@
+# Realtime Self-Interruption Fix Plan
+
+## Goal
+
+Prevent the OpenAI Realtime model from hearing its own speaker playback on iOS and incorrectly interrupting itself, while preserving intentional user barge-in behavior.
+
+## Scope
+
+- Primary implementation target: `AIProxySwift` (`wip/vpio-echo-cancellation`)
+- Integration context reviewed: `/Users/kavimathur/workspace/joice` (CallKit + Realtime)
+- APIs involved: iOS audio stack (VPIO + AVAudioEngine), OpenAI Realtime turn detection
+
+## Mechanism Review
+
+### AIProxySwift audio path (current branch)
+
+1. `AudioController` chooses AudioToolbox VPIO on the iOS speaker path (no headphones).
+2. `MicrophonePCMSampleVendorAT` captures microphone audio via VPIO bus 1.
+3. Playback is scheduled with `AudioPCMPlayer` on `AVAudioEngine`.
+4. The VPIO bus 0 render callback pulls from `audioEngine.manualRenderingBlock` as the AEC reference.
+
+This closes the biggest architectural gap from issue #240 and PR #264.
+
+### Why interruption can still happen
+
+Even with VPIO AEC working, residual echo can remain due to acoustics, device variance, AGC interactions, and startup/adaptation windows. OpenAI turn detection may treat that residual as user speech and emit `input_audio_buffer.speech_started`, which typical app code maps to playback interruption.
+
+## Plan
+
+1. Harden the VPIO voice-processing configuration.
+1. Add a client-side mic uplink echo guard in the VPIO callback path for iOS speaker mode:
+   - Track assistant playback activity/level.
+   - Suppress likely echo frames while assistant audio is active.
+   - Re-open the mic quickly on strong near-end speech (barge-in).
+1. Keep behavior isolated to iOS speaker full-duplex mode to avoid regressions.
+1. Validate with build/tests and document expected on-device QA scenarios.
+
+## Risk Notes
+
+- Over-aggressive suppression can make barge-in harder.
+- Under-aggressive suppression can still allow self-interruption.
+- iOS real-time audio timing varies by route/device, so thresholds must be conservative and tunable in code (see the gate sketch below).
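+
+### Echo guard gate sketch (illustrative)
+
+A minimal, self-contained model of the gating approach described above. This is
+not the code added to `MicrophonePCMSampleVendorAT.swift`; the struct and its
+API are hypothetical, though the tunables mirror the constants in this change.
+
+```swift
+import Foundation
+
+struct EchoGuardGate {
+    // Tunables (mirroring the kEchoGuard* constants in this change).
+    let outputRMSFloor: Float = 0.0015          // output below this counts as silence
+    let outputTailSeconds: TimeInterval = 0.12  // gate stays closed this long after output
+    let bargeInThresholdFloor: Float = 0.018    // absolute mic-RMS floor for barge-in
+    let bargeInRelativeMultiplier: Float = 2.3  // mic must exceed smoothed output RMS by this
+    let framesForBargeIn = 2                    // consecutive loud frames required
+    let bargeInHoldSeconds: TimeInterval = 1.0  // gate stays open after a barge-in
+
+    var outputActiveUntil: TimeInterval = 0
+    var smoothedOutputRMS: Float = 0
+    var loudStreak = 0
+    var bargeInOpenUntil: TimeInterval = 0
+
+    // Call from the output path: note when assistant audio is audible.
+    mutating func observeOutput(rms: Float, bufferDuration: TimeInterval, now: TimeInterval) {
+        guard rms > outputRMSFloor else { return }
+        outputActiveUntil = now + bufferDuration + outputTailSeconds
+        smoothedOutputRMS = smoothedOutputRMS == 0
+            ? rms
+            : smoothedOutputRMS * 0.8 + rms * 0.2  // kEchoGuardRMSSmoothingFactor = 0.2
+    }
+
+    // Call from the input path: returns true if the mic frame is likely echo.
+    mutating func shouldSuppress(micRMS: Float, now: TimeInterval) -> Bool {
+        if now <= bargeInOpenUntil { return false }  // user already barged in
+        guard now <= outputActiveUntil else {        // assistant is quiet
+            loudStreak = 0
+            return false
+        }
+        let threshold = max(bargeInThresholdFloor, smoothedOutputRMS * bargeInRelativeMultiplier)
+        if micRMS >= threshold {
+            loudStreak += 1
+            if loudStreak >= framesForBargeIn {      // sustained near-end speech
+                loudStreak = 0
+                bargeInOpenUntil = now + bargeInHoldSeconds
+                return false
+            }
+        } else {
+            loudStreak = 0
+        }
+        return true                                  // likely echo: drop the frame
+    }
+}
+```
+
+For intuition: at 44100Hz a 1024-frame callback covers ~23ms, so two consecutive
+loud frames open the gate after roughly 46ms, and the 0.12s tail keeps it closed
+for about five callbacks after playback goes quiet (callback size is an assumption;
+it varies by route and session configuration).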
+
+## Learning Log
+
+- Initial finding: current branch correctly routes far-end audio through the VPIO output callback, but does not guard against residual leakage in the uplink.
+- Initial finding: `joice` uses `.semanticVAD(eagerness: .medium)` and interrupts playback on every `input_audio_buffer.speech_started`, so any leakage becomes user-visible immediately.
+- Implementation direction: add defense-in-depth in the SDK rather than relying only on VPIO AEC.
+- Attempted approach: add a mic suppression wrapper in `AudioController.micStream()`.
+- Course correction: Swift 6 sendability checks rejected forwarding `AVAudioPCMBuffer` across actor boundaries in that wrapper, so the echo guard moved into `MicrophonePCMSampleVendorAT` callback code (no cross-actor buffer forwarding).
+- Implemented: explicitly force VPIO voice processing on (`kAUVoiceIOProperty_BypassVoiceProcessing = 0`) and disable AGC (`kAUVoiceIOProperty_VoiceProcessingEnableAGC = 0`) to reduce amplified leakage.
+- Implemented: output callback now measures rendered output RMS and tracks an "assistant audio active" window.
+- Implemented: input callback computes mic RMS and suppresses frames likely to be echo while assistant output is active, but allows barge-in after consecutive loud mic frames and keeps a short barge-in-open window.
+- Implemented: speaker output bus + output callback setup are now limited to manual-rendering mode, preserving pre-existing macOS/non-manual behavior.
+- Platform-safety update: the AGC/bypass settings and echo suppression are gated to iOS; macOS/watchOS paths retain their prior behavior.
+- Validation: `swift build` succeeds and `swift test` passes (172 tests, 0 failures).
+
+## Execution Summary
+
+- Changed `/Users/kavimathur/workspace/AIProxySwift/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift`:
+  - Added VPIO property hardening for bypass/AGC.
+  - Added residual echo suppression logic in the real-time callback path (iOS only).
+  - Added real-time RMS tracking helpers for output and microphone buffers.
+- Changed `/Users/kavimathur/workspace/AIProxySwift/Sources/AIProxy/AudioController.swift`:
+  - Start `AVAudioEngine` synchronously on iOS to reduce startup race risk in the call path (watchOS and macOS keep the Task-deferred start).
+- Added plan + log file:
+  - `/Users/kavimathur/workspace/AIProxySwift/docs/realtime-self-interruption-fix.md`
+
+## On-Device QA Checklist
+
+1. iPhone speaker mode, no headphones: AI should complete full responses without self-interrupting.
+1. While AI is speaking, interrupt loudly and verify barge-in still works within ~200-300ms.
+1. Quiet room and noisy room checks: verify no constant false interruptions.
+1. Headphones/Bluetooth route: verify no regression (existing non-speaker behavior unchanged).
+1. CallKit route changes (speaker toggle, lock screen controls): verify conversation remains stable.
+
+## References
+
+- Issue context:
+- Prior attempt (PR):
+- OpenAI Realtime turn detection fields (`interrupt_response`, `create_response`):
+- Apple Audio Unit bus model (I/O unit fundamentals):
+- Apple WWDC discussion of full-duplex voice processing echo cancellation:

From a6c680195ec52af4e36fd82e82b35d8949dc6941 Mon Sep 17 00:00:00 2001
From: Kavi Mathur
Date: Sun, 15 Feb 2026 01:43:35 -0500
Subject: [PATCH 4/6] Clarify actor annotation caveat on Core Audio callbacks

---
 Sources/AIProxy/MicrophonePCMSampleVendorAT.swift | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift
index a5ebe34..c0f9771 100644
--- a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift
+++ b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift
@@ -566,7 +570,11 @@ nonisolated private let kEchoGuardBargeInHoldSeconds: TimeInterval = 1.0
     #endif
 }
 
-// This @AIProxyActor annotation is a lie.
+// NOTE:
+// This callback is invoked by Core Audio on a real-time I/O thread via C APIs.
+// It is not scheduled onto AIProxyActor at runtime, even though the symbol is
+// annotated with @AIProxyActor for Swift type-checking ergonomics.
+// Do not assume actor isolation/synchronization inside this callback.
 @AIProxyActor private let audioOutputRenderCallback: AURenderCallback = { inRefCon,
     ioActionFlags,
     inTimeStamp,
     inBusNumber,
     inNumberFrames,
@@ -587,7 +591,11 @@ nonisolated private let kEchoGuardBargeInHoldSeconds: TimeInterval = 1.0
     return noErr
 }
 
-// This @AIProxyActor annotation is a lie.
+// NOTE:
+// This callback is invoked by Core Audio on a real-time I/O thread via C APIs.
+// It is not scheduled onto AIProxyActor at runtime, even though the symbol is +// annotated with @AIProxyActor for Swift type-checking ergonomics. +// Do not assume actor isolation/synchronization inside this callback. @AIProxyActor private let audioRenderCallback: AURenderCallback = { inRefCon, ioActionFlags, From 8ea809830c61e506ed9e51f8b437e08a1714ab8d Mon Sep 17 00:00:00 2001 From: Kavi Mathur Date: Sun, 15 Feb 2026 02:05:53 -0500 Subject: [PATCH 5/6] Annotate iOS-only guards as iOS-first pending cross-platform validation --- Sources/AIProxy/AudioController.swift | 8 ++++---- Sources/AIProxy/MicrophonePCMSampleVendorAT.swift | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Sources/AIProxy/AudioController.swift b/Sources/AIProxy/AudioController.swift index ec8c0a6..82381a9 100644 --- a/Sources/AIProxy/AudioController.swift +++ b/Sources/AIProxy/AudioController.swift @@ -42,14 +42,14 @@ import AVFoundation public init(modes: [Mode]) async throws { self.modes = modes - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. let needsManualRendering = modes.contains(.record) && modes.contains(.playback) && !AIProxyUtils.headphonesConnected #else let needsManualRendering = false #endif - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. // This is not respected if `setVoiceProcessingEnabled(true)` is used :/ // Instead, I've added my own accumulator. // try? AVAudioSession.sharedInstance().setPreferredIOBufferDuration(0.1) @@ -68,7 +68,7 @@ import AVFoundation self.audioEngine = AVAudioEngine() - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. if needsManualRendering { let renderFormat = AVAudioFormat( commonFormat: .pcmFormatFloat32, @@ -104,7 +104,7 @@ import AVFoundation // Nesting `start` in a Task is necessary on watchOS. // There is some sort of race, and letting the runloop tick seems to "fix" it. // If I call `prepare` and `start` in serial succession, then there is no playback on watchOS (sometimes). - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. try self.audioEngine.start() #elseif os(watchOS) Task { diff --git a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift index c0f9771..a93a6c9 100644 --- a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift +++ b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift @@ -115,7 +115,7 @@ nonisolated private let kEchoGuardBargeInHoldSeconds: TimeInterval = 1.0 ) } - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. let shouldEnableSpeakerBusForAEC = audioEngine?.isInManualRenderingMode ?? false #else let shouldEnableSpeakerBusForAEC = true @@ -255,7 +255,7 @@ nonisolated private let kEchoGuardBargeInHoldSeconds: TimeInterval = 1.0 ) } - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. // Make voice processing explicit so route changes do not accidentally bypass AEC. var disableBypass: UInt32 = 0 err = AudioUnitSetProperty( @@ -395,7 +395,7 @@ nonisolated private let kEchoGuardBargeInHoldSeconds: TimeInterval = 1.0 return } - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. 
         if self.shouldSuppressLikelyEchoInput(bufferList: bufferList, frameCount: inNumberFrames) {
             return
         }
@@ -448,17 +448,17 @@ nonisolated private let kEchoGuardBargeInHoldSeconds: TimeInterval = 1.0
             for i in 0..<buf.count {
                 if let mData = buf[i].mData {
                     memset(mData, 0, Int(buf[i].mDataByteSize))
                 }
             }
         }
 
-        #if os(iOS)
+        #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet.
         self.updateEchoGuardOutputActivity(ioData: ioData, frameCount: inNumberFrames)
         #endif
     }
 
-    #if os(iOS)
+    #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet.
     private func updateEchoGuardOutputActivity(
         ioData: UnsafeMutablePointer<AudioBufferList>,
         frameCount: UInt32

From fb2480c48cae3dce5cc44a2b7323cfa2efea8b3d Mon Sep 17 00:00:00 2001
From: Kavi Mathur
Date: Sun, 15 Feb 2026 19:31:13 -0500
Subject: [PATCH 6/6] Expose Realtime input audio noise reduction config

---
 .../OpenAIRealtimeSessionConfiguration.swift  | 22 +++++++++
 ...nAIRealtimeSessionConfigurationTests.swift | 46 +++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift

diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift
index 2178df0..c6c69c8 100644
--- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift
+++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift
@@ -61,6 +61,9 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl
     /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     public let inputAudioFormat: AudioFormat?
 
+    /// Configuration for input audio noise reduction. Set to nil to turn off.
+    public let inputAudioNoiseReduction: InputAudioNoiseReduction?
+
     /// Configuration for input audio transcription. Set to nil to turn off.
     public let inputAudioTranscription: InputAudioTranscription?
 
@@ -112,6 +115,7 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl
     private enum CodingKeys: String, CodingKey {
         case inputAudioFormat = "input_audio_format"
+        case inputAudioNoiseReduction = "input_audio_noise_reduction"
         case inputAudioTranscription = "input_audio_transcription"
         case instructions
         case maxResponseOutputTokens = "max_response_output_tokens"
@@ -127,6 +131,7 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl
     public init(
         inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
+        inputAudioNoiseReduction: OpenAIRealtimeSessionConfiguration.InputAudioNoiseReduction? = nil,
         inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil,
         instructions: String? = nil,
         maxResponseOutputTokens: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens? = nil,
         voice: String?
= nil ) { self.inputAudioFormat = inputAudioFormat + self.inputAudioNoiseReduction = inputAudioNoiseReduction self.inputAudioTranscription = inputAudioTranscription self.instructions = instructions self.maxResponseOutputTokens = maxResponseOutputTokens @@ -154,6 +160,22 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl } } +// MARK: - +extension OpenAIRealtimeSessionConfiguration { + nonisolated public struct InputAudioNoiseReduction: Encodable, Sendable { + nonisolated public enum NoiseReductionType: String, Encodable, Sendable { + case nearField = "near_field" + case farField = "far_field" + } + + public let type: NoiseReductionType + + public init(type: NoiseReductionType) { + self.type = type + } + } +} + // MARK: - extension OpenAIRealtimeSessionConfiguration { nonisolated public struct InputAudioTranscription: Encodable, Sendable { diff --git a/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift b/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift new file mode 100644 index 0000000..8825579 --- /dev/null +++ b/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift @@ -0,0 +1,46 @@ +// +// OpenAIRealtimeSessionConfigurationTests.swift +// AIProxy +// +// Created by Codex on 2/15/26. +// + +import XCTest +@testable import AIProxy + +final class OpenAIRealtimeSessionConfigurationTests: XCTestCase { + + func testInputAudioNoiseReductionNearFieldIsEncodable() throws { + let config = OpenAIRealtimeSessionConfiguration( + inputAudioNoiseReduction: .init(type: .nearField), + speed: nil + ) + + XCTAssertEqual( + """ + { + "input_audio_noise_reduction" : { + "type" : "near_field" + } + } + """, + try config.serialize(pretty: true) + ) + } + + func testInputAudioNoiseReductionIsOptional() throws { + let config = OpenAIRealtimeSessionConfiguration( + inputAudioFormat: .pcm16, + speed: nil + ) + + XCTAssertEqual( + """ + { + "input_audio_format" : "pcm16" + } + """, + try config.serialize(pretty: true) + ) + } +}
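
Usage sketch for the configuration surface added in PATCH 6. This is illustrative only, not part of the patch; the surrounding session setup in a host app will differ, and the `instructions` text is a placeholder:

```swift
import AIProxy

// Far-field noise reduction matches the speakerphone scenario this series
// targets; .nearField suits headset microphones. Parameters other than
// inputAudioNoiseReduction are optional and shown only for context.
let sessionConfiguration = OpenAIRealtimeSessionConfiguration(
    inputAudioFormat: .pcm16,
    inputAudioNoiseReduction: .init(type: .farField),
    instructions: "You are a helpful voice assistant."
)
```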