diff --git a/Sources/AIProxy/AudioController.swift b/Sources/AIProxy/AudioController.swift index dfddaf7..82381a9 100644 --- a/Sources/AIProxy/AudioController.swift +++ b/Sources/AIProxy/AudioController.swift @@ -18,16 +18,16 @@ import AVFoundation /// We use either AVAudioEngine or AudioToolbox for mic data, depending on the platform and whether headphones are attached. /// The following arrangement provides for the best user experience: /// -/// +----------+---------------+------------------+ -/// | Platform | Headphones | Audio API | -/// +----------+---------------+------------------+ -/// | macOS | Yes | AudioEngine | -/// | macOS | No | AudioToolbox | -/// | iOS | Yes | AudioEngine | -/// | iOS | No | AudioToolbox | -/// | watchOS | Yes | AudioEngine | -/// | watchOS | No | AudioEngine | -/// +----------+---------------+------------------+ +/// +----------+---------------+--------------------------------------+ +/// | Platform | Headphones | Audio API | +/// +----------+---------------+--------------------------------------+ +/// | macOS | Yes | AudioEngine | +/// | macOS | No | AudioToolbox | +/// | iOS | Yes | AudioEngine | +/// | iOS | No | AudioToolbox + manual rendering AEC | +/// | watchOS | Yes | AudioEngine | +/// | watchOS | No | AudioEngine | +/// +----------+---------------+--------------------------------------+ /// @AIProxyActor public final class AudioController { public enum Mode { @@ -42,7 +42,14 @@ import AVFoundation public init(modes: [Mode]) async throws { self.modes = modes - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. + let needsManualRendering = modes.contains(.record) && modes.contains(.playback) + && !AIProxyUtils.headphonesConnected + #else + let needsManualRendering = false + #endif + + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. // This is not respected if `setVoiceProcessingEnabled(true)` is used :/ // Instead, I've added my own accumulator. // try? AVAudioSession.sharedInstance().setPreferredIOBufferDuration(0.1) @@ -61,11 +68,27 @@ import AVFoundation self.audioEngine = AVAudioEngine() + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. + if needsManualRendering { + let renderFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 44100, + channels: 1, + interleaved: true + )! + try audioEngine.enableManualRenderingMode( + .realtime, + format: renderFormat, + maximumFrameCount: 4096 + ) + } + #endif + if modes.contains(.record) { #if os(macOS) || os(iOS) self.microphonePCMSampleVendor = AIProxyUtils.headphonesConnected ? try MicrophonePCMSampleVendorAE(audioEngine: self.audioEngine) - : MicrophonePCMSampleVendorAT() + : MicrophonePCMSampleVendorAT(audioEngine: self.audioEngine) #else self.microphonePCMSampleVendor = try MicrophonePCMSampleVendorAE(audioEngine: self.audioEngine) #endif @@ -81,9 +104,17 @@ import AVFoundation // Nesting `start` in a Task is necessary on watchOS. // There is some sort of race, and letting the runloop tick seems to "fix" it. // If I call `prepare` and `start` in serial succession, then there is no playback on watchOS (sometimes). + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. 
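+        // Starting synchronously here reduces the startup race risk on iOS now that the
+        // VPIO output callback pulls playback from this engine's manual rendering block.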
+ try self.audioEngine.start() + #elseif os(watchOS) Task { try self.audioEngine.start() } + #else + Task { + try self.audioEngine.start() + } + #endif } deinit { diff --git a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift index c131b34..a93a6c9 100644 --- a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift +++ b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift @@ -12,6 +12,13 @@ import AudioToolbox import Foundation nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 +nonisolated private let kEchoGuardOutputRMSFloor: Float = 0.0015 +nonisolated private let kEchoGuardOutputTailSeconds: TimeInterval = 0.12 +nonisolated private let kEchoGuardRMSSmoothingFactor: Float = 0.2 +nonisolated private let kEchoGuardBargeInThresholdFloor: Float = 0.018 +nonisolated private let kEchoGuardBargeInRelativeMultiplier: Float = 2.3 +nonisolated private let kEchoGuardFramesForBargeIn = 2 +nonisolated private let kEchoGuardBargeInHoldSeconds: TimeInterval = 1.0 /// This is an AudioToolbox-based implementation that vends PCM16 microphone samples at a /// sample rate that OpenAI's realtime models expect. @@ -58,8 +65,15 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 private var audioUnit: AudioUnit? private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon() private var continuation: AsyncStream.Continuation? - - public init() {} + private var audioEngine: AVAudioEngine? + private var outputLikelyActiveUntilUptime: TimeInterval = 0 + private var outputSmoothedRMS: Float = 0 + private var micLoudFrameStreak = 0 + private var bargeInOpenUntilUptime: TimeInterval = 0 + + public init(audioEngine: AVAudioEngine? = nil) { + self.audioEngine = audioEngine + } deinit { logIf(.debug)?.debug("MicrophonePCMSampleVendor is being freed") @@ -101,17 +115,22 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 ) } - var zero: UInt32 = 0 + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. + let shouldEnableSpeakerBusForAEC = audioEngine?.isInManualRenderingMode ?? false + #else + let shouldEnableSpeakerBusForAEC = true + #endif + var one_output: UInt32 = shouldEnableSpeakerBusForAEC ? 1 : 0 err = AudioUnitSetProperty(audioUnit, kAudioOutputUnitProperty_EnableIO, kAudioUnitScope_Output, 0, - &zero, // <-- This is not a mistake! If you leave this on, iOS spams the logs with: "from AU (address): auou/vpio/appl, render err: -1" - UInt32(MemoryLayout.size(ofValue: one))) + &one_output, + UInt32(MemoryLayout.size(ofValue: one_output))) guard err == noErr else { throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit( - "Could not disable the output scope of the speaker bus" + "Could not configure the output scope of the speaker bus" ) } @@ -236,19 +255,77 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 ) } - // Do not use auto gain control. Remove in a future commit. - // var enable: UInt32 = 1 - // err = AudioUnitSetProperty(audioUnit, - // kAUVoiceIOProperty_VoiceProcessingEnableAGC, - // kAudioUnitScope_Output, - // 1, - // &enable, - // UInt32(MemoryLayout.size(ofValue: enable))) - // - guard err == noErr else { - throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit( - "Could not configure auto gain control" + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. + // Make voice processing explicit so route changes do not accidentally bypass AEC. 
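+        // For kAUVoiceIOProperty_BypassVoiceProcessing, 0 keeps voice processing (and AEC) active; 1 bypasses it.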
+        var disableBypass: UInt32 = 0
+        err = AudioUnitSetProperty(
+            audioUnit,
+            kAUVoiceIOProperty_BypassVoiceProcessing,
+            kAudioUnitScope_Global,
+            0,
+            &disableBypass,
+            UInt32(MemoryLayout<UInt32>.size)
+        )
+        if err != noErr {
+            logIf(.warning)?.warning("Could not force-enable VPIO voice processing: \(err)")
+        }
+
+        // Disable AGC to avoid amplifying far-end leakage into the uplink.
+        var disableAGC: UInt32 = 0
+        err = AudioUnitSetProperty(
+            audioUnit,
+            kAUVoiceIOProperty_VoiceProcessingEnableAGC,
+            kAudioUnitScope_Global,
+            1,
+            &disableAGC,
+            UInt32(MemoryLayout<UInt32>.size)
+        )
+        if err != noErr {
+            logIf(.warning)?.warning("Could not disable VPIO AGC: \(err)")
+        }
+        #endif
+
+        // If we have an AVAudioEngine in manual rendering mode, set up the VPIO output bus
+        // to pull rendered audio. This gives the VPIO visibility into playback for AEC.
+        if shouldEnableSpeakerBusForAEC, audioEngine != nil {
+            var outputFormat = AudioStreamBasicDescription(
+                mSampleRate: kVoiceProcessingInputSampleRate, // 44100
+                mFormatID: kAudioFormatLinearPCM,
+                mFormatFlags: kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked,
+                mBytesPerPacket: 4,
+                mFramesPerPacket: 1,
+                mBytesPerFrame: 4,
+                mChannelsPerFrame: 1,
+                mBitsPerChannel: 32,
+                mReserved: 0
+            )
+            err = AudioUnitSetProperty(audioUnit,
+                                       kAudioUnitProperty_StreamFormat,
+                                       kAudioUnitScope_Input,
+                                       0, // Bus 0 (speaker)
+                                       &outputFormat,
+                                       UInt32(MemoryLayout<AudioStreamBasicDescription>.size))
+            guard err == noErr else {
+                throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
+                    "Could not set stream format on the input scope of the speaker bus"
+                )
+            }
+
+            var outputCallbackStruct = AURenderCallbackStruct(
+                inputProc: audioOutputRenderCallback,
+                inputProcRefCon: Unmanaged.passUnretained(self).toOpaque()
+            )
+            err = AudioUnitSetProperty(audioUnit,
+                                       kAudioUnitProperty_SetRenderCallback,
+                                       kAudioUnitScope_Input,
+                                       0, // Bus 0
+                                       &outputCallbackStruct,
+                                       UInt32(MemoryLayout<AURenderCallbackStruct>.size))
+            guard err == noErr else {
+                throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
+                    "Could not set the output render callback on the voice processing audio unit"
+                )
+            }
+        }

         err = AudioUnitInitialize(audioUnit)
@@ -318,6 +395,12 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             return
         }

+        #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet.
+        if self.shouldSuppressLikelyEchoInput(bufferList: bufferList, frameCount: inNumberFrames) {
+            return
+        }
+        #endif
+
         guard let audioFormat = AVAudioFormat(
             commonFormat: .pcmFormatInt16,
             sampleRate: kVoiceProcessingInputSampleRate,
@@ -336,9 +419,183 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             }
         }
     }
+
+    /// Called from the VPIO output render callback on the real-time audio thread.
+    /// Pulls rendered audio from the AVAudioEngine's manual rendering block and writes it
+    /// into the VPIO's output buffer so the VPIO can use it as the AEC reference signal.
+    fileprivate func didReceiveOutputRenderCallback(
+        _ ioActionFlags: UnsafeMutablePointer<AudioUnitRenderActionFlags>,
+        _ inTimeStamp: UnsafePointer<AudioTimeStamp>,
+        _ inBusNumber: UInt32,
+        _ inNumberFrames: UInt32,
+        _ ioData: UnsafeMutablePointer<AudioBufferList>?
+    ) {
+        guard let ioData = ioData, let audioEngine = audioEngine else {
+            // No engine — render silence
+            if let ioData = ioData {
+                let buf = UnsafeMutableAudioBufferListPointer(ioData)
+                for i in 0..<buf.count {
+                    memset(buf[i].mData, 0, Int(buf[i].mDataByteSize))
+                }
+            }
+            return
+        }
+
+        // Pull the mixed playback audio out of the engine. The VPIO plays this to the
+        // speaker and uses it as the AEC reference signal.
+        let status = audioEngine.manualRenderingBlock(inNumberFrames, ioData, nil)
+        if status != .success {
+            // Render silence rather than leaving stale data in the output buffers.
+            let buf = UnsafeMutableAudioBufferListPointer(ioData)
+            for i in 0..<buf.count {
+                memset(buf[i].mData, 0, Int(buf[i].mDataByteSize))
+            }
+            return
+        }
+
+        #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet.
+        self.trackAssistantOutputActivity(ioData: ioData, frameCount: inNumberFrames)
+        #endif
+    }
+
+    #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet.
+    private func trackAssistantOutputActivity(
+        ioData: UnsafeMutablePointer<AudioBufferList>,
+        frameCount: UInt32
+    ) {
+        let outputRMS = self.rms(ofFloat32BufferList: ioData)
+        let now = ProcessInfo.processInfo.systemUptime
+
+        if outputRMS > kEchoGuardOutputRMSFloor {
+            let bufferDuration = Double(frameCount) / kVoiceProcessingInputSampleRate
+            self.outputLikelyActiveUntilUptime = now + bufferDuration + kEchoGuardOutputTailSeconds
+            if self.outputSmoothedRMS == 0 {
+                self.outputSmoothedRMS = outputRMS
+            } else {
+                self.outputSmoothedRMS = (self.outputSmoothedRMS * (1 - kEchoGuardRMSSmoothingFactor)) +
+                    (outputRMS * kEchoGuardRMSSmoothingFactor)
+            }
+            return
+        }
+
+        if now > self.outputLikelyActiveUntilUptime {
+            self.outputSmoothedRMS = 0
+            self.micLoudFrameStreak = 0
+        }
+    }
+
+    private func shouldSuppressLikelyEchoInput(
+        bufferList: AudioBufferList,
+        frameCount: UInt32
+    ) -> Bool {
+        let now = ProcessInfo.processInfo.systemUptime
+        if now <= self.bargeInOpenUntilUptime {
+            return false
+        }
+
+        guard now <= self.outputLikelyActiveUntilUptime else {
+            self.micLoudFrameStreak = 0
+            return false
+        }

+        let micRMS = self.rms(ofPCM16BufferList: bufferList, frameCount: frameCount)
+        let bargeInThreshold = max(
+            kEchoGuardBargeInThresholdFloor,
+            self.outputSmoothedRMS * kEchoGuardBargeInRelativeMultiplier
+        )
+
+        if micRMS >= bargeInThreshold {
+            self.micLoudFrameStreak += 1
+            if self.micLoudFrameStreak >= kEchoGuardFramesForBargeIn {
+                self.micLoudFrameStreak = 0
+                self.bargeInOpenUntilUptime = now + kEchoGuardBargeInHoldSeconds
+                return false
+            }
+        } else {
+            self.micLoudFrameStreak = 0
+        }
+
+        return true
+    }
+
+    private func rms(
+        ofPCM16BufferList bufferList: AudioBufferList,
+        frameCount: UInt32
+    ) -> Float {
+        guard frameCount > 0,
+              let data = bufferList.mBuffers.mData else {
+            return 0
+        }
+
+        let sampleCount = Int(frameCount)
+        let samples = data.bindMemory(to: Int16.self, capacity: sampleCount)
+        let scale = Float(Int16.max)
+        var sumSquares: Float = 0
+        for i in 0..<sampleCount {
+            // Normalize to [-1, 1] so the mic RMS is comparable to the Float32 output RMS.
+            let sample = Float(samples[i]) / scale
+            sumSquares += sample * sample
+        }
+
+        return sqrt(sumSquares / Float(sampleCount))
+    }
+
+    private func rms(
+        ofFloat32BufferList ioData: UnsafeMutablePointer<AudioBufferList>
+    ) -> Float {
+        let buffers = UnsafeMutableAudioBufferListPointer(ioData)
+        guard !buffers.isEmpty else {
+            return 0
+        }
+
+        var sampleCount = 0
+        var sumSquares: Float = 0
+        for buffer in buffers {
+            guard let mData = buffer.mData else { continue }
+            let count = Int(buffer.mDataByteSize) / MemoryLayout<Float>.size
+            if count == 0 { continue }
+            let samples = mData.bindMemory(to: Float.self, capacity: count)
+            sampleCount += count
+            for i in 0..<count {
+                sumSquares += samples[i] * samples[i]
+            }
+        }
+
+        guard sampleCount > 0 else {
+            return 0
+        }
+        return sqrt(sumSquares / Float(sampleCount))
+    }
+    #endif
+}
+
+// NOTE:
+// This callback is invoked by Core Audio on a real-time I/O thread via C APIs.
+// It is not scheduled onto AIProxyActor at runtime, even though the symbol is
+// annotated with @AIProxyActor for Swift type-checking ergonomics.
+// Do not assume actor isolation/synchronization inside this callback.
+@AIProxyActor private let audioOutputRenderCallback: AURenderCallback = {
+    inRefCon,
+    ioActionFlags,
+    inTimeStamp,
+    inBusNumber,
+    inNumberFrames,
+    ioData in
+    let vendor = Unmanaged<MicrophonePCMSampleVendorAT>
+        .fromOpaque(inRefCon)
+        .takeUnretainedValue()
+    vendor.didReceiveOutputRenderCallback(
+        ioActionFlags,
+        inTimeStamp,
+        inBusNumber,
+        inNumberFrames,
+        ioData
+    )
+    return noErr
+}
-// This @AIProxyActor annotation is a lie.
+// NOTE:
+// This callback is invoked by Core Audio on a real-time I/O thread via C APIs.
+// It is not scheduled onto AIProxyActor at runtime, even though the symbol is +// annotated with @AIProxyActor for Swift type-checking ergonomics. +// Do not assume actor isolation/synchronization inside this callback. @AIProxyActor private let audioRenderCallback: AURenderCallback = { inRefCon, ioActionFlags, diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift index 2178df0..c6c69c8 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift @@ -61,6 +61,9 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. public let inputAudioFormat: AudioFormat? + /// Configuration for input audio noise reduction. Set to nil to turn off. + public let inputAudioNoiseReduction: InputAudioNoiseReduction? + /// Configuration for input audio transcription. Set to nil to turn off. public let inputAudioTranscription: InputAudioTranscription? @@ -112,6 +115,7 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl private enum CodingKeys: String, CodingKey { case inputAudioFormat = "input_audio_format" + case inputAudioNoiseReduction = "input_audio_noise_reduction" case inputAudioTranscription = "input_audio_transcription" case instructions case maxResponseOutputTokens = "max_response_output_tokens" @@ -127,6 +131,7 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl public init( inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil, + inputAudioNoiseReduction: OpenAIRealtimeSessionConfiguration.InputAudioNoiseReduction? = nil, inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil, instructions: String? = nil, maxResponseOutputTokens: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens? = nil, @@ -140,6 +145,7 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl voice: String? = nil ) { self.inputAudioFormat = inputAudioFormat + self.inputAudioNoiseReduction = inputAudioNoiseReduction self.inputAudioTranscription = inputAudioTranscription self.instructions = instructions self.maxResponseOutputTokens = maxResponseOutputTokens @@ -154,6 +160,22 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl } } +// MARK: - +extension OpenAIRealtimeSessionConfiguration { + nonisolated public struct InputAudioNoiseReduction: Encodable, Sendable { + nonisolated public enum NoiseReductionType: String, Encodable, Sendable { + case nearField = "near_field" + case farField = "far_field" + } + + public let type: NoiseReductionType + + public init(type: NoiseReductionType) { + self.type = type + } + } +} + // MARK: - extension OpenAIRealtimeSessionConfiguration { nonisolated public struct InputAudioTranscription: Encodable, Sendable { diff --git a/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift b/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift new file mode 100644 index 0000000..8825579 --- /dev/null +++ b/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift @@ -0,0 +1,46 @@ +// +// OpenAIRealtimeSessionConfigurationTests.swift +// AIProxy +// +// Created by Codex on 2/15/26. 
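+//  Verifies JSON encoding of the new `input_audio_noise_reduction` session field.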
+// + +import XCTest +@testable import AIProxy + +final class OpenAIRealtimeSessionConfigurationTests: XCTestCase { + + func testInputAudioNoiseReductionNearFieldIsEncodable() throws { + let config = OpenAIRealtimeSessionConfiguration( + inputAudioNoiseReduction: .init(type: .nearField), + speed: nil + ) + + XCTAssertEqual( + """ + { + "input_audio_noise_reduction" : { + "type" : "near_field" + } + } + """, + try config.serialize(pretty: true) + ) + } + + func testInputAudioNoiseReductionIsOptional() throws { + let config = OpenAIRealtimeSessionConfiguration( + inputAudioFormat: .pcm16, + speed: nil + ) + + XCTAssertEqual( + """ + { + "input_audio_format" : "pcm16" + } + """, + try config.serialize(pretty: true) + ) + } +} diff --git a/docs/realtime-self-interruption-fix.md b/docs/realtime-self-interruption-fix.md new file mode 100644 index 0000000..e8a7888 --- /dev/null +++ b/docs/realtime-self-interruption-fix.md @@ -0,0 +1,83 @@ +# Realtime Self-Interruption Fix Plan + +## Goal + +Prevent the OpenAI Realtime model from hearing its own speaker playback on iOS and incorrectly interrupting itself, while preserving intentional user barge-in behavior. + +## Scope + +- Primary implementation target: `AIProxySwift` (`wip/vpio-echo-cancellation`) +- Integration context reviewed: `/Users/kavimathur/workspace/joice` (CallKit + Realtime) +- APIs involved: iOS audio stack (VPIO + AVAudioEngine), OpenAI Realtime turn detection + +## Mechanism Review + +### AIProxySwift audio path (current branch) + +1. `AudioController` chooses AudioToolbox VPIO on iOS speaker path (no headphones). +2. `MicrophonePCMSampleVendorAT` captures microphone via VPIO bus 1. +3. Playback is scheduled with `AudioPCMPlayer` on `AVAudioEngine`. +4. VPIO bus 0 render callback pulls from `audioEngine.manualRenderingBlock` as AEC reference. + +This closes the biggest architectural gap from issue #240 and PR #264. + +### Why interruption can still happen + +Even with VPIO AEC working, residual echo can remain due to acoustics, device variance, AGC interactions, and startup/adaptation windows. OpenAI turn detection may treat that residual as user speech and emit `input_audio_buffer.speech_started`, which typical app code maps to playback interruption. + +## Plan + +1. Harden VPIO voice-processing configuration. +1. Add a client-side mic uplink echo guard in the VPIO callback path for iOS speaker mode: + - Track assistant playback activity/level. + - Suppress likely echo frames while assistant audio is active. + - Re-open mic quickly on strong near-end speech (barge-in). +1. Keep behavior isolated to iOS speaker full-duplex mode to avoid regressions. +1. Validate by build/tests and document expected on-device QA scenarios. + +## Risk Notes + +- Over-aggressive suppression can make barge-in harder. +- Under-aggressive suppression can still allow self-interruption. +- iOS real-time audio timing varies by route/device, so thresholds must be conservative and tunable in code. + +## Learning Log + +- Initial finding: current branch correctly routes far-end audio through VPIO output callback, but does not guard against residual leakage in uplink. +- Initial finding: `joice` uses `.semanticVAD(eagerness: .medium)` and interrupts playback on every `input_audio_buffer.speech_started`, so any leakage becomes user-visible immediately. +- Implementation direction: add defense-in-depth in SDK rather than relying only on VPIO AEC. +- Attempted approach: add mic suppression wrapper in `AudioController.micStream()`. 
+- Course correction: Swift 6 sendability checks rejected forwarding `AVAudioPCMBuffer` across actor boundaries in that wrapper, so the echo guard moved into `MicrophonePCMSampleVendorAT` callback code (no cross-actor buffer forwarding). +- Implemented: explicitly force VPIO voice processing on (`kAUVoiceIOProperty_BypassVoiceProcessing = 0`) and disable AGC (`kAUVoiceIOProperty_VoiceProcessingEnableAGC = 0`) to reduce amplified leakage. +- Implemented: output callback now measures rendered output RMS and tracks an "assistant audio active" window. +- Implemented: input callback computes mic RMS and suppresses frames likely to be echo while assistant output is active, but allows barge-in after consecutive loud mic frames and keeps a short barge-in-open window. +- Implemented: speaker output bus + output callback setup are now limited to manual-rendering mode, preserving pre-existing macOS/non-manual behavior. +- Platform-safety update: iOS-only behavior changes are enforced for AGC/bypass settings and echo suppression; macOS/watchOS paths retain prior behavior unless unchanged baseline setup is required. +- Validation: `swift build` succeeds and `swift test` passes (172 tests, 0 failures). + +## Execution Summary + +- Changed `/Users/kavimathur/workspace/AIProxySwift/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift`: + - Added VPIO property hardening for bypass/AGC. + - Added residual echo suppression logic in the real-time callback path (iOS only). + - Added real-time RMS tracking helpers for output and microphone buffers. +- Changed `/Users/kavimathur/workspace/AIProxySwift/Sources/AIProxy/AudioController.swift`: + - Start `AVAudioEngine` synchronously on non-watchOS to reduce startup race risk in iOS/macOS call paths. +- Added plan + log file: + - `/Users/kavimathur/workspace/AIProxySwift/docs/realtime-self-interruption-fix.md` + +## On-Device QA Checklist + +1. iPhone speaker mode, no headphones: AI should complete full responses without self-interrupting. +1. While AI is speaking, interrupt loudly and verify barge-in still works within ~200-300ms. +1. Quiet room and noisy room checks: verify no constant false interruptions. +1. Headphones/Bluetooth route: verify no regression (existing non-speaker behavior unchanged). +1. CallKit route changes (speaker toggle, lock screen controls): verify conversation remains stable. + +## References + +- Issue context: +- Prior attempt (PR): +- OpenAI Realtime turn detection fields (`interrupt_response`, `create_response`): +- Apple Audio Unit bus model (I/O unit fundamentals): +- Apple WWDC discussion of full-duplex voice processing echo cancellation: diff --git a/docs/vpio-echo-cancellation.md b/docs/vpio-echo-cancellation.md new file mode 100644 index 0000000..f69f5ea --- /dev/null +++ b/docs/vpio-echo-cancellation.md @@ -0,0 +1,84 @@ +# VPIO Echo Cancellation Fix for iOS + +## Problem + +When using Joice on iOS with speaker output (no headphones), the AI's voice plays through the speaker, gets picked up by the microphone, and OpenAI's server-side VAD interprets it as user speech — causing the AI to interrupt itself. + +## Root Cause + +The AIProxy SDK uses `kAudioUnitSubType_VoiceProcessingIO` (VPIO) for mic capture on iOS without headphones. VPIO is designed to provide Acoustic Echo Cancellation (AEC), but **AEC was non-functional** because: + +1. The VPIO output bus (Bus 0) was **explicitly disabled** in `MicrophonePCMSampleVendorAT.swift` (the comment explained that enabling it caused `render err: -1` without a data source) +2. 
Playback went through a completely separate `AVAudioEngine` that the VPIO had no visibility into + +**For VPIO AEC to work, playback audio must flow through the VPIO's output bus as a reference signal.** The VPIO cannot cancel echo from audio it doesn't know about. + +## Solution: AVAudioEngine Manual Rendering + VPIO I/O + +Route playback through the VPIO by putting `AVAudioEngine` in **manual rendering mode**. The existing `AudioPCMPlayer` continues to schedule buffers on the playerNode as before, but the engine no longer renders to hardware. Instead, the VPIO output render callback **pulls** rendered audio from the engine and feeds it to the speaker through the VPIO. This gives the VPIO full visibility into both input and output for AEC. + +This is the same pattern used by Twilio's Voice SDK for iOS echo cancellation. + +### Signal Path + +``` +OpenAI 24kHz PCM16 --> AudioPCMPlayer --> playerNode --> AVAudioEngine (manual rendering, 44100Hz) + | + v + VPIO output render callback + calls engine.manualRenderingBlock + | + v + VPIO Bus 0 output + (plays to speaker + AEC ref) + +Hardware mic --> VPIO Bus 1 input (echo-cancelled) --> input callback --> resample --> OpenAI +``` + +## Changes + +### `MicrophonePCMSampleVendorAT.swift` + +1. **Added `audioEngine` property + updated `init`** — accepts an optional `AVAudioEngine` in manual rendering mode +2. **Enabled output bus 0** — changed the `zero` → `one_output` so the VPIO speaker bus is active (previously disabled to avoid `render err: -1`) +3. **Set stream format on output bus 0 (Input scope)** — Float32 at 44100Hz mono, matching the manual rendering engine format +4. **Registered a render callback on bus 0** — only when `audioEngine` is provided +5. **Implemented `didReceiveOutputRenderCallback`** — pulls audio from `audioEngine.manualRenderingBlock` into the VPIO's output buffer; fills silence on error or when no engine is present +6. **Added C-level `audioOutputRenderCallback`** — bridges to the instance method (same pattern as the existing input callback) + +### `AudioController.swift` + +1. **Enable manual rendering** — on iOS without headphones, puts `AVAudioEngine` into `.realtime` manual rendering mode at Float32/44100Hz/mono before any nodes are attached +2. **Pass `audioEngine` to VPIO vendor** — `MicrophonePCMSampleVendorAT(audioEngine: self.audioEngine)` so the render callback can pull from it +3. **Updated doc comment table** — iOS without headphones now notes "AudioToolbox + manual rendering AEC" + +### `AudioPCMPlayer.swift` + +No changes needed. The playerNode scheduling API works identically in manual rendering mode. The engine buffers audio internally and renders it when `manualRenderingBlock` is called from the VPIO output callback. + +## Why This Works + +1. `AudioPCMPlayer` schedules playback buffers on the playerNode at 24kHz +2. `AVAudioEngine` (in manual rendering mode at 44100Hz) internally upsamples and mixes +3. The VPIO output render callback pulls mixed audio via `manualRenderingBlock` +4. VPIO sends this audio to the hardware speaker **and** uses it as the AEC reference +5. VPIO subtracts the reference from the mic input on Bus 1, producing echo-cancelled audio +6. The echo-cancelled mic audio flows through the existing input callback unchanged + +## Risks and Mitigations + +| Risk | Mitigation | +|------|------------| +| Volume bug with VPIO (documented in AudioPCMPlayer) | AudioPCMPlayer is initialized before VPIO (existing order in AudioController). 
In manual rendering mode, the engine doesn't drive hardware directly, so the bug may not apply. | +| `manualRenderingBlock` called on real-time thread | Apple docs confirm this is the intended usage — the block is designed for real-time contexts. | +| Format mismatch between engine output and VPIO bus | Both configured to Float32/44100Hz/mono. The engine handles 24kHz to 44100Hz upsampling internally. | +| macOS not addressed | macOS AT path unchanged (separate concern, less acute due to speaker/mic distance). Only iOS gets the manual rendering + VPIO AEC fix. | +| Headphones path unchanged | When headphones are connected, the `MicrophonePCMSampleVendorAE` (AVAudioEngine-based) path is used instead — no regression risk. | + +## Testing + +1. Build and run on a **physical iOS device** with speaker (no headphones) +2. Start a voice session — AI should speak full responses without self-interrupting +3. Speak over the AI to verify user interruption still works +4. Test with headphones to confirm no regression (headphones path is unchanged) +5. Test that playback volume is acceptable
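
## Optional: Server-Side Input Noise Reduction

This branch also exposes OpenAI's session-level `input_audio_noise_reduction` option through `OpenAIRealtimeSessionConfiguration`, which filters the audio added to the input buffer before server-side VAD runs and can make turn detection more robust. The snippet below is a minimal sketch of opting in: the choice of `farField` for built-in speaker/mic capture is an assumption (use `nearField` for close-talking setups such as headsets), every other session field is omitted for brevity, and the resulting configuration is passed to whatever session-update path your app already uses.

```swift
import AIProxy

// Minimal sketch: request far-field input noise reduction for speaker-mode capture.
// `farField` is an assumption for built-in speaker playback; `nearField` suits headsets.
let sessionConfig = OpenAIRealtimeSessionConfiguration(
    inputAudioFormat: .pcm16,
    inputAudioNoiseReduction: .init(type: .farField),
    speed: nil
)
// Encodes as:
//   "input_audio_format": "pcm16",
//   "input_audio_noise_reduction": { "type": "far_field" }
```

This is complementary to the VPIO work above, not a replacement for it: the client-side AEC still has to keep echo out of the uplink, while noise reduction only cleans up what the model receives. The new `OpenAIRealtimeSessionConfigurationTests` in this branch cover the expected JSON shape.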