diff --git a/Sources/AIProxy/AudioController.swift b/Sources/AIProxy/AudioController.swift index dfddaf7..82381a9 100644 --- a/Sources/AIProxy/AudioController.swift +++ b/Sources/AIProxy/AudioController.swift @@ -18,16 +18,16 @@ import AVFoundation /// We use either AVAudioEngine or AudioToolbox for mic data, depending on the platform and whether headphones are attached. /// The following arrangement provides for the best user experience: /// -/// +----------+---------------+------------------+ -/// | Platform | Headphones | Audio API | -/// +----------+---------------+------------------+ -/// | macOS | Yes | AudioEngine | -/// | macOS | No | AudioToolbox | -/// | iOS | Yes | AudioEngine | -/// | iOS | No | AudioToolbox | -/// | watchOS | Yes | AudioEngine | -/// | watchOS | No | AudioEngine | -/// +----------+---------------+------------------+ +/// +----------+---------------+--------------------------------------+ +/// | Platform | Headphones | Audio API | +/// +----------+---------------+--------------------------------------+ +/// | macOS | Yes | AudioEngine | +/// | macOS | No | AudioToolbox | +/// | iOS | Yes | AudioEngine | +/// | iOS | No | AudioToolbox + manual rendering AEC | +/// | watchOS | Yes | AudioEngine | +/// | watchOS | No | AudioEngine | +/// +----------+---------------+--------------------------------------+ /// @AIProxyActor public final class AudioController { public enum Mode { @@ -42,7 +42,14 @@ import AVFoundation public init(modes: [Mode]) async throws { self.modes = modes - #if os(iOS) + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. + let needsManualRendering = modes.contains(.record) && modes.contains(.playback) + && !AIProxyUtils.headphonesConnected + #else + let needsManualRendering = false + #endif + + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. // This is not respected if `setVoiceProcessingEnabled(true)` is used :/ // Instead, I've added my own accumulator. // try? AVAudioSession.sharedInstance().setPreferredIOBufferDuration(0.1) @@ -61,11 +68,27 @@ import AVFoundation self.audioEngine = AVAudioEngine() + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. + if needsManualRendering { + let renderFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 44100, + channels: 1, + interleaved: true + )! + try audioEngine.enableManualRenderingMode( + .realtime, + format: renderFormat, + maximumFrameCount: 4096 + ) + } + #endif + if modes.contains(.record) { #if os(macOS) || os(iOS) self.microphonePCMSampleVendor = AIProxyUtils.headphonesConnected ? try MicrophonePCMSampleVendorAE(audioEngine: self.audioEngine) - : MicrophonePCMSampleVendorAT() + : MicrophonePCMSampleVendorAT(audioEngine: self.audioEngine) #else self.microphonePCMSampleVendor = try MicrophonePCMSampleVendorAE(audioEngine: self.audioEngine) #endif @@ -81,9 +104,17 @@ import AVFoundation // Nesting `start` in a Task is necessary on watchOS. // There is some sort of race, and letting the runloop tick seems to "fix" it. // If I call `prepare` and `start` in serial succession, then there is no playback on watchOS (sometimes). + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. 
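+        // Starting synchronously here reduces the startup race risk on iOS now that the
+        // VPIO output callback pulls playback from this engine's manual rendering block.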
+ try self.audioEngine.start() + #elseif os(watchOS) Task { try self.audioEngine.start() } + #else + Task { + try self.audioEngine.start() + } + #endif } deinit { diff --git a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift index c131b34..a93a6c9 100644 --- a/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift +++ b/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift @@ -12,6 +12,13 @@ import AudioToolbox import Foundation nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 +nonisolated private let kEchoGuardOutputRMSFloor: Float = 0.0015 +nonisolated private let kEchoGuardOutputTailSeconds: TimeInterval = 0.12 +nonisolated private let kEchoGuardRMSSmoothingFactor: Float = 0.2 +nonisolated private let kEchoGuardBargeInThresholdFloor: Float = 0.018 +nonisolated private let kEchoGuardBargeInRelativeMultiplier: Float = 2.3 +nonisolated private let kEchoGuardFramesForBargeIn = 2 +nonisolated private let kEchoGuardBargeInHoldSeconds: TimeInterval = 1.0 /// This is an AudioToolbox-based implementation that vends PCM16 microphone samples at a /// sample rate that OpenAI's realtime models expect. @@ -58,8 +65,15 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 private var audioUnit: AudioUnit? private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon() private var continuation: AsyncStream.Continuation? - - public init() {} + private var audioEngine: AVAudioEngine? + private var outputLikelyActiveUntilUptime: TimeInterval = 0 + private var outputSmoothedRMS: Float = 0 + private var micLoudFrameStreak = 0 + private var bargeInOpenUntilUptime: TimeInterval = 0 + + public init(audioEngine: AVAudioEngine? = nil) { + self.audioEngine = audioEngine + } deinit { logIf(.debug)?.debug("MicrophonePCMSampleVendor is being freed") @@ -101,17 +115,22 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 ) } - var zero: UInt32 = 0 + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. + let shouldEnableSpeakerBusForAEC = audioEngine?.isInManualRenderingMode ?? false + #else + let shouldEnableSpeakerBusForAEC = true + #endif + var one_output: UInt32 = shouldEnableSpeakerBusForAEC ? 1 : 0 err = AudioUnitSetProperty(audioUnit, kAudioOutputUnitProperty_EnableIO, kAudioUnitScope_Output, 0, - &zero, // <-- This is not a mistake! If you leave this on, iOS spams the logs with: "from AU (address): auou/vpio/appl, render err: -1" - UInt32(MemoryLayout.size(ofValue: one))) + &one_output, + UInt32(MemoryLayout.size(ofValue: one_output))) guard err == noErr else { throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit( - "Could not disable the output scope of the speaker bus" + "Could not configure the output scope of the speaker bus" ) } @@ -236,19 +255,77 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100 ) } - // Do not use auto gain control. Remove in a future commit. - // var enable: UInt32 = 1 - // err = AudioUnitSetProperty(audioUnit, - // kAUVoiceIOProperty_VoiceProcessingEnableAGC, - // kAudioUnitScope_Output, - // 1, - // &enable, - // UInt32(MemoryLayout.size(ofValue: enable))) - // - guard err == noErr else { - throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit( - "Could not configure auto gain control" + #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet. + // Make voice processing explicit so route changes do not accidentally bypass AEC. 
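+        // For kAUVoiceIOProperty_BypassVoiceProcessing, 0 keeps voice processing (and AEC) active; 1 bypasses it.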
+        var disableBypass: UInt32 = 0
+        err = AudioUnitSetProperty(
+            audioUnit,
+            kAUVoiceIOProperty_BypassVoiceProcessing,
+            kAudioUnitScope_Global,
+            0,
+            &disableBypass,
+            UInt32(MemoryLayout<UInt32>.size)
+        )
+        if err != noErr {
+            logIf(.warning)?.warning("Could not force-enable VPIO voice processing: \(err)")
+        }
+
+        // Disable AGC to avoid amplifying far-end leakage into the uplink.
+        var disableAGC: UInt32 = 0
+        err = AudioUnitSetProperty(
+            audioUnit,
+            kAUVoiceIOProperty_VoiceProcessingEnableAGC,
+            kAudioUnitScope_Global,
+            1,
+            &disableAGC,
+            UInt32(MemoryLayout<UInt32>.size)
+        )
+        if err != noErr {
+            logIf(.warning)?.warning("Could not disable VPIO AGC: \(err)")
+        }
+        #endif
+
+        // If we have an AVAudioEngine in manual rendering mode, set up the VPIO output bus
+        // to pull rendered audio. This gives the VPIO visibility into playback for AEC.
+        if shouldEnableSpeakerBusForAEC, audioEngine != nil {
+            var outputFormat = AudioStreamBasicDescription(
+                mSampleRate: kVoiceProcessingInputSampleRate, // 44100
+                mFormatID: kAudioFormatLinearPCM,
+                mFormatFlags: kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked,
+                mBytesPerPacket: 4,
+                mFramesPerPacket: 1,
+                mBytesPerFrame: 4,
+                mChannelsPerFrame: 1,
+                mBitsPerChannel: 32,
+                mReserved: 0
+            )
+            err = AudioUnitSetProperty(audioUnit,
+                                       kAudioUnitProperty_StreamFormat,
+                                       kAudioUnitScope_Input,
+                                       0, // Bus 0 (speaker)
+                                       &outputFormat,
+                                       UInt32(MemoryLayout<AudioStreamBasicDescription>.size))
+            guard err == noErr else {
+                throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
+                    "Could not set stream format on the input scope of the speaker bus"
+                )
+            }
+
+            var outputCallbackStruct = AURenderCallbackStruct(
+                inputProc: audioOutputRenderCallback,
+                inputProcRefCon: Unmanaged.passUnretained(self).toOpaque()
+            )
+            err = AudioUnitSetProperty(audioUnit,
+                                       kAudioUnitProperty_SetRenderCallback,
+                                       kAudioUnitScope_Input,
+                                       0, // Bus 0
+                                       &outputCallbackStruct,
+                                       UInt32(MemoryLayout<AURenderCallbackStruct>.size))
+            guard err == noErr else {
+                throw MicrophonePCMSampleVendorError.couldNotConfigureAudioUnit(
+                    "Could not set the output render callback on the voice processing audio unit"
+                )
+            }
+        }

         err = AudioUnitInitialize(audioUnit)
@@ -318,6 +395,12 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             return
         }

+        #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet.
+        if self.shouldSuppressLikelyEchoInput(bufferList: bufferList, frameCount: inNumberFrames) {
+            return
+        }
+        #endif
+
         guard let audioFormat = AVAudioFormat(
             commonFormat: .pcmFormatInt16,
             sampleRate: kVoiceProcessingInputSampleRate,
@@ -336,9 +419,183 @@ nonisolated private let kVoiceProcessingInputSampleRate: Double = 44100
             }
         }
     }
+
+    /// Called from the VPIO output render callback on the real-time audio thread.
+    /// Pulls rendered audio from the AVAudioEngine's manual rendering block and writes it
+    /// into the VPIO's output buffer so the VPIO can use it as the AEC reference signal.
+    fileprivate func didReceiveOutputRenderCallback(
+        _ ioActionFlags: UnsafeMutablePointer<AudioUnitRenderActionFlags>,
+        _ inTimeStamp: UnsafePointer<AudioTimeStamp>,
+        _ inBusNumber: UInt32,
+        _ inNumberFrames: UInt32,
+        _ ioData: UnsafeMutablePointer<AudioBufferList>?
+    ) {
+        guard let ioData = ioData, let audioEngine = audioEngine else {
+            // No engine — render silence
+            if let ioData = ioData {
+                let buf = UnsafeMutableAudioBufferListPointer(ioData)
+                for i in 0..<buf.count {
+                    memset(buf[i].mData, 0, Int(buf[i].mDataByteSize))
+                }
+            }
+            return
+        }
+
+        // Pull the mixed playback audio out of the engine. The VPIO plays this to the
+        // speaker and uses it as the AEC reference signal.
+        let status = audioEngine.manualRenderingBlock(inNumberFrames, ioData, nil)
+        if status != .success {
+            // Render silence rather than leaving stale data in the output buffers.
+            let buf = UnsafeMutableAudioBufferListPointer(ioData)
+            for i in 0..<buf.count {
+                memset(buf[i].mData, 0, Int(buf[i].mDataByteSize))
+            }
+            return
+        }
+
+        #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet.
+        self.trackAssistantOutputActivity(ioData: ioData, frameCount: inNumberFrames)
+        #endif
+    }
+
+    #if os(iOS) // iOS-first guard: non-iOS behavior has not been validated for this path yet.
+    private func trackAssistantOutputActivity(
+        ioData: UnsafeMutablePointer<AudioBufferList>,
+        frameCount: UInt32
+    ) {
+        let outputRMS = self.rms(ofFloat32BufferList: ioData)
+        let now = ProcessInfo.processInfo.systemUptime
+
+        if outputRMS > kEchoGuardOutputRMSFloor {
+            let bufferDuration = Double(frameCount) / kVoiceProcessingInputSampleRate
+            self.outputLikelyActiveUntilUptime = now + bufferDuration + kEchoGuardOutputTailSeconds
+            if self.outputSmoothedRMS == 0 {
+                self.outputSmoothedRMS = outputRMS
+            } else {
+                self.outputSmoothedRMS = (self.outputSmoothedRMS * (1 - kEchoGuardRMSSmoothingFactor)) +
+                    (outputRMS * kEchoGuardRMSSmoothingFactor)
+            }
+            return
+        }
+
+        if now > self.outputLikelyActiveUntilUptime {
+            self.outputSmoothedRMS = 0
+            self.micLoudFrameStreak = 0
+        }
+    }
+
+    private func shouldSuppressLikelyEchoInput(
+        bufferList: AudioBufferList,
+        frameCount: UInt32
+    ) -> Bool {
+        let now = ProcessInfo.processInfo.systemUptime
+        if now <= self.bargeInOpenUntilUptime {
+            return false
+        }
+
+        guard now <= self.outputLikelyActiveUntilUptime else {
+            self.micLoudFrameStreak = 0
+            return false
+        }

+        let micRMS = self.rms(ofPCM16BufferList: bufferList, frameCount: frameCount)
+        let bargeInThreshold = max(
+            kEchoGuardBargeInThresholdFloor,
+            self.outputSmoothedRMS * kEchoGuardBargeInRelativeMultiplier
+        )
+
+        if micRMS >= bargeInThreshold {
+            self.micLoudFrameStreak += 1
+            if self.micLoudFrameStreak >= kEchoGuardFramesForBargeIn {
+                self.micLoudFrameStreak = 0
+                self.bargeInOpenUntilUptime = now + kEchoGuardBargeInHoldSeconds
+                return false
+            }
+        } else {
+            self.micLoudFrameStreak = 0
+        }
+
+        return true
+    }
+
+    private func rms(
+        ofPCM16BufferList bufferList: AudioBufferList,
+        frameCount: UInt32
+    ) -> Float {
+        guard frameCount > 0,
+              let data = bufferList.mBuffers.mData else {
+            return 0
+        }
+
+        let sampleCount = Int(frameCount)
+        let samples = data.bindMemory(to: Int16.self, capacity: sampleCount)
+        let scale = Float(Int16.max)
+        var sumSquares: Float = 0
+        for i in 0..<sampleCount {
+            // Normalize to [-1, 1] so the mic RMS is comparable to the Float32 output RMS.
+            let sample = Float(samples[i]) / scale
+            sumSquares += sample * sample
+        }
+
+        return sqrt(sumSquares / Float(sampleCount))
+    }
+
+    private func rms(
+        ofFloat32BufferList ioData: UnsafeMutablePointer<AudioBufferList>
+    ) -> Float {
+        let buffers = UnsafeMutableAudioBufferListPointer(ioData)
+        guard !buffers.isEmpty else {
+            return 0
+        }
+
+        var sampleCount = 0
+        var sumSquares: Float = 0
+        for buffer in buffers {
+            guard let mData = buffer.mData else { continue }
+            let count = Int(buffer.mDataByteSize) / MemoryLayout<Float>.size
+            if count == 0 { continue }
+            let samples = mData.bindMemory(to: Float.self, capacity: count)
+            sampleCount += count
+            for i in 0..<count {
+                sumSquares += samples[i] * samples[i]
+            }
+        }
+
+        guard sampleCount > 0 else {
+            return 0
+        }
+        return sqrt(sumSquares / Float(sampleCount))
+    }
+    #endif
+}
+
+// NOTE:
+// This callback is invoked by Core Audio on a real-time I/O thread via C APIs.
+// It is not scheduled onto AIProxyActor at runtime, even though the symbol is
+// annotated with @AIProxyActor for Swift type-checking ergonomics.
+// Do not assume actor isolation/synchronization inside this callback.
+@AIProxyActor private let audioOutputRenderCallback: AURenderCallback = {
+    inRefCon,
+    ioActionFlags,
+    inTimeStamp,
+    inBusNumber,
+    inNumberFrames,
+    ioData in
+    let vendor = Unmanaged<MicrophonePCMSampleVendorAT>
+        .fromOpaque(inRefCon)
+        .takeUnretainedValue()
+    vendor.didReceiveOutputRenderCallback(
+        ioActionFlags,
+        inTimeStamp,
+        inBusNumber,
+        inNumberFrames,
+        ioData
+    )
+    return noErr
+}
-// This @AIProxyActor annotation is a lie.
+// NOTE:
+// This callback is invoked by Core Audio on a real-time I/O thread via C APIs.
+// It is not scheduled onto AIProxyActor at runtime, even though the symbol is +// annotated with @AIProxyActor for Swift type-checking ergonomics. +// Do not assume actor isolation/synchronization inside this callback. @AIProxyActor private let audioRenderCallback: AURenderCallback = { inRefCon, ioActionFlags, diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift index 2178df0..c6c69c8 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift @@ -61,6 +61,9 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. public let inputAudioFormat: AudioFormat? + /// Configuration for input audio noise reduction. Set to nil to turn off. + public let inputAudioNoiseReduction: InputAudioNoiseReduction? + /// Configuration for input audio transcription. Set to nil to turn off. public let inputAudioTranscription: InputAudioTranscription? @@ -112,6 +115,7 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl private enum CodingKeys: String, CodingKey { case inputAudioFormat = "input_audio_format" + case inputAudioNoiseReduction = "input_audio_noise_reduction" case inputAudioTranscription = "input_audio_transcription" case instructions case maxResponseOutputTokens = "max_response_output_tokens" @@ -127,6 +131,7 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl public init( inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil, + inputAudioNoiseReduction: OpenAIRealtimeSessionConfiguration.InputAudioNoiseReduction? = nil, inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil, instructions: String? = nil, maxResponseOutputTokens: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens? = nil, @@ -140,6 +145,7 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl voice: String? = nil ) { self.inputAudioFormat = inputAudioFormat + self.inputAudioNoiseReduction = inputAudioNoiseReduction self.inputAudioTranscription = inputAudioTranscription self.instructions = instructions self.maxResponseOutputTokens = maxResponseOutputTokens @@ -154,6 +160,22 @@ nonisolated public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl } } +// MARK: - +extension OpenAIRealtimeSessionConfiguration { + nonisolated public struct InputAudioNoiseReduction: Encodable, Sendable { + nonisolated public enum NoiseReductionType: String, Encodable, Sendable { + case nearField = "near_field" + case farField = "far_field" + } + + public let type: NoiseReductionType + + public init(type: NoiseReductionType) { + self.type = type + } + } +} + // MARK: - extension OpenAIRealtimeSessionConfiguration { nonisolated public struct InputAudioTranscription: Encodable, Sendable { diff --git a/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift b/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift new file mode 100644 index 0000000..8825579 --- /dev/null +++ b/Tests/AIProxyTests/OpenAIRealtimeSessionConfigurationTests.swift @@ -0,0 +1,46 @@ +// +// OpenAIRealtimeSessionConfigurationTests.swift +// AIProxy +// +// Created by Codex on 2/15/26. 
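+//  Verifies JSON encoding of the new `input_audio_noise_reduction` session field.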
+// + +import XCTest +@testable import AIProxy + +final class OpenAIRealtimeSessionConfigurationTests: XCTestCase { + + func testInputAudioNoiseReductionNearFieldIsEncodable() throws { + let config = OpenAIRealtimeSessionConfiguration( + inputAudioNoiseReduction: .init(type: .nearField), + speed: nil + ) + + XCTAssertEqual( + """ + { + "input_audio_noise_reduction" : { + "type" : "near_field" + } + } + """, + try config.serialize(pretty: true) + ) + } + + func testInputAudioNoiseReductionIsOptional() throws { + let config = OpenAIRealtimeSessionConfiguration( + inputAudioFormat: .pcm16, + speed: nil + ) + + XCTAssertEqual( + """ + { + "input_audio_format" : "pcm16" + } + """, + try config.serialize(pretty: true) + ) + } +} diff --git a/docs/realtime-self-interruption-fix.md b/docs/realtime-self-interruption-fix.md new file mode 100644 index 0000000..e8a7888 --- /dev/null +++ b/docs/realtime-self-interruption-fix.md @@ -0,0 +1,83 @@ +# Realtime Self-Interruption Fix Plan + +## Goal + +Prevent the OpenAI Realtime model from hearing its own speaker playback on iOS and incorrectly interrupting itself, while preserving intentional user barge-in behavior. + +## Scope + +- Primary implementation target: `AIProxySwift` (`wip/vpio-echo-cancellation`) +- Integration context reviewed: `/Users/kavimathur/workspace/joice` (CallKit + Realtime) +- APIs involved: iOS audio stack (VPIO + AVAudioEngine), OpenAI Realtime turn detection + +## Mechanism Review + +### AIProxySwift audio path (current branch) + +1. `AudioController` chooses AudioToolbox VPIO on iOS speaker path (no headphones). +2. `MicrophonePCMSampleVendorAT` captures microphone via VPIO bus 1. +3. Playback is scheduled with `AudioPCMPlayer` on `AVAudioEngine`. +4. VPIO bus 0 render callback pulls from `audioEngine.manualRenderingBlock` as AEC reference. + +This closes the biggest architectural gap from issue #240 and PR #264. + +### Why interruption can still happen + +Even with VPIO AEC working, residual echo can remain due to acoustics, device variance, AGC interactions, and startup/adaptation windows. OpenAI turn detection may treat that residual as user speech and emit `input_audio_buffer.speech_started`, which typical app code maps to playback interruption. + +## Plan + +1. Harden VPIO voice-processing configuration. +1. Add a client-side mic uplink echo guard in the VPIO callback path for iOS speaker mode: + - Track assistant playback activity/level. + - Suppress likely echo frames while assistant audio is active. + - Re-open mic quickly on strong near-end speech (barge-in). +1. Keep behavior isolated to iOS speaker full-duplex mode to avoid regressions. +1. Validate by build/tests and document expected on-device QA scenarios. + +## Risk Notes + +- Over-aggressive suppression can make barge-in harder. +- Under-aggressive suppression can still allow self-interruption. +- iOS real-time audio timing varies by route/device, so thresholds must be conservative and tunable in code. + +## Learning Log + +- Initial finding: current branch correctly routes far-end audio through VPIO output callback, but does not guard against residual leakage in uplink. +- Initial finding: `joice` uses `.semanticVAD(eagerness: .medium)` and interrupts playback on every `input_audio_buffer.speech_started`, so any leakage becomes user-visible immediately. +- Implementation direction: add defense-in-depth in SDK rather than relying only on VPIO AEC. +- Attempted approach: add mic suppression wrapper in `AudioController.micStream()`. 
+- Course correction: Swift 6 sendability checks rejected forwarding `AVAudioPCMBuffer` across actor boundaries in that wrapper, so the echo guard moved into `MicrophonePCMSampleVendorAT` callback code (no cross-actor buffer forwarding). +- Implemented: explicitly force VPIO voice processing on (`kAUVoiceIOProperty_BypassVoiceProcessing = 0`) and disable AGC (`kAUVoiceIOProperty_VoiceProcessingEnableAGC = 0`) to reduce amplified leakage. +- Implemented: output callback now measures rendered output RMS and tracks an "assistant audio active" window. +- Implemented: input callback computes mic RMS and suppresses frames likely to be echo while assistant output is active, but allows barge-in after consecutive loud mic frames and keeps a short barge-in-open window. +- Implemented: speaker output bus + output callback setup are now limited to manual-rendering mode, preserving pre-existing macOS/non-manual behavior. +- Platform-safety update: iOS-only behavior changes are enforced for AGC/bypass settings and echo suppression; macOS/watchOS paths retain prior behavior unless unchanged baseline setup is required. +- Validation: `swift build` succeeds and `swift test` passes (172 tests, 0 failures). + +## Execution Summary + +- Changed `/Users/kavimathur/workspace/AIProxySwift/Sources/AIProxy/MicrophonePCMSampleVendorAT.swift`: + - Added VPIO property hardening for bypass/AGC. + - Added residual echo suppression logic in the real-time callback path (iOS only). + - Added real-time RMS tracking helpers for output and microphone buffers. +- Changed `/Users/kavimathur/workspace/AIProxySwift/Sources/AIProxy/AudioController.swift`: + - Start `AVAudioEngine` synchronously on non-watchOS to reduce startup race risk in iOS/macOS call paths. +- Added plan + log file: + - `/Users/kavimathur/workspace/AIProxySwift/docs/realtime-self-interruption-fix.md` + +## On-Device QA Checklist + +1. iPhone speaker mode, no headphones: AI should complete full responses without self-interrupting. +1. While AI is speaking, interrupt loudly and verify barge-in still works within ~200-300ms. +1. Quiet room and noisy room checks: verify no constant false interruptions. +1. Headphones/Bluetooth route: verify no regression (existing non-speaker behavior unchanged). +1. CallKit route changes (speaker toggle, lock screen controls): verify conversation remains stable. + +## References + +- Issue context: +- Prior attempt (PR): +- OpenAI Realtime turn detection fields (`interrupt_response`, `create_response`): +- Apple Audio Unit bus model (I/O unit fundamentals): +- Apple WWDC discussion of full-duplex voice processing echo cancellation: diff --git a/docs/vpio-echo-cancellation.md b/docs/vpio-echo-cancellation.md new file mode 100644 index 0000000..f69f5ea --- /dev/null +++ b/docs/vpio-echo-cancellation.md @@ -0,0 +1,84 @@ +# VPIO Echo Cancellation Fix for iOS + +## Problem + +When using Joice on iOS with speaker output (no headphones), the AI's voice plays through the speaker, gets picked up by the microphone, and OpenAI's server-side VAD interprets it as user speech — causing the AI to interrupt itself. + +## Root Cause + +The AIProxy SDK uses `kAudioUnitSubType_VoiceProcessingIO` (VPIO) for mic capture on iOS without headphones. VPIO is designed to provide Acoustic Echo Cancellation (AEC), but **AEC was non-functional** because: + +1. The VPIO output bus (Bus 0) was **explicitly disabled** in `MicrophonePCMSampleVendorAT.swift` (the comment explained that enabling it caused `render err: -1` without a data source) +2. 
Playback went through a completely separate `AVAudioEngine` that the VPIO had no visibility into + +**For VPIO AEC to work, playback audio must flow through the VPIO's output bus as a reference signal.** The VPIO cannot cancel echo from audio it doesn't know about. + +## Solution: AVAudioEngine Manual Rendering + VPIO I/O + +Route playback through the VPIO by putting `AVAudioEngine` in **manual rendering mode**. The existing `AudioPCMPlayer` continues to schedule buffers on the playerNode as before, but the engine no longer renders to hardware. Instead, the VPIO output render callback **pulls** rendered audio from the engine and feeds it to the speaker through the VPIO. This gives the VPIO full visibility into both input and output for AEC. + +This is the same pattern used by Twilio's Voice SDK for iOS echo cancellation. + +### Signal Path + +``` +OpenAI 24kHz PCM16 --> AudioPCMPlayer --> playerNode --> AVAudioEngine (manual rendering, 44100Hz) + | + v + VPIO output render callback + calls engine.manualRenderingBlock + | + v + VPIO Bus 0 output + (plays to speaker + AEC ref) + +Hardware mic --> VPIO Bus 1 input (echo-cancelled) --> input callback --> resample --> OpenAI +``` + +## Changes + +### `MicrophonePCMSampleVendorAT.swift` + +1. **Added `audioEngine` property + updated `init`** — accepts an optional `AVAudioEngine` in manual rendering mode +2. **Enabled output bus 0** — changed the `zero` → `one_output` so the VPIO speaker bus is active (previously disabled to avoid `render err: -1`) +3. **Set stream format on output bus 0 (Input scope)** — Float32 at 44100Hz mono, matching the manual rendering engine format +4. **Registered a render callback on bus 0** — only when `audioEngine` is provided +5. **Implemented `didReceiveOutputRenderCallback`** — pulls audio from `audioEngine.manualRenderingBlock` into the VPIO's output buffer; fills silence on error or when no engine is present +6. **Added C-level `audioOutputRenderCallback`** — bridges to the instance method (same pattern as the existing input callback) + +### `AudioController.swift` + +1. **Enable manual rendering** — on iOS without headphones, puts `AVAudioEngine` into `.realtime` manual rendering mode at Float32/44100Hz/mono before any nodes are attached +2. **Pass `audioEngine` to VPIO vendor** — `MicrophonePCMSampleVendorAT(audioEngine: self.audioEngine)` so the render callback can pull from it +3. **Updated doc comment table** — iOS without headphones now notes "AudioToolbox + manual rendering AEC" + +### `AudioPCMPlayer.swift` + +No changes needed. The playerNode scheduling API works identically in manual rendering mode. The engine buffers audio internally and renders it when `manualRenderingBlock` is called from the VPIO output callback. + +## Why This Works + +1. `AudioPCMPlayer` schedules playback buffers on the playerNode at 24kHz +2. `AVAudioEngine` (in manual rendering mode at 44100Hz) internally upsamples and mixes +3. The VPIO output render callback pulls mixed audio via `manualRenderingBlock` +4. VPIO sends this audio to the hardware speaker **and** uses it as the AEC reference +5. VPIO subtracts the reference from the mic input on Bus 1, producing echo-cancelled audio +6. The echo-cancelled mic audio flows through the existing input callback unchanged + +## Risks and Mitigations + +| Risk | Mitigation | +|------|------------| +| Volume bug with VPIO (documented in AudioPCMPlayer) | AudioPCMPlayer is initialized before VPIO (existing order in AudioController). 
In manual rendering mode, the engine doesn't drive hardware directly, so the bug may not apply. | +| `manualRenderingBlock` called on real-time thread | Apple docs confirm this is the intended usage — the block is designed for real-time contexts. | +| Format mismatch between engine output and VPIO bus | Both configured to Float32/44100Hz/mono. The engine handles 24kHz to 44100Hz upsampling internally. | +| macOS not addressed | macOS AT path unchanged (separate concern, less acute due to speaker/mic distance). Only iOS gets the manual rendering + VPIO AEC fix. | +| Headphones path unchanged | When headphones are connected, the `MicrophonePCMSampleVendorAE` (AVAudioEngine-based) path is used instead — no regression risk. | + +## Testing + +1. Build and run on a **physical iOS device** with speaker (no headphones) +2. Start a voice session — AI should speak full responses without self-interrupting +3. Speak over the AI to verify user interruption still works +4. Test with headphones to confirm no regression (headphones path is unchanged) +5. Test that playback volume is acceptable
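
## Optional: Server-Side Input Noise Reduction

This branch also exposes OpenAI's session-level `input_audio_noise_reduction` option through `OpenAIRealtimeSessionConfiguration`, which filters the audio added to the input buffer before server-side VAD runs and can make turn detection more robust. The snippet below is a minimal sketch of opting in: the choice of `farField` for built-in speaker/mic capture is an assumption (use `nearField` for close-talking setups such as headsets), every other session field is omitted for brevity, and the resulting configuration is passed to whatever session-update path your app already uses.

```swift
import AIProxy

// Minimal sketch: request far-field input noise reduction for speaker-mode capture.
// `farField` is an assumption for built-in speaker playback; `nearField` suits headsets.
let sessionConfig = OpenAIRealtimeSessionConfiguration(
    inputAudioFormat: .pcm16,
    inputAudioNoiseReduction: .init(type: .farField),
    speed: nil
)
// Encodes as:
//   "input_audio_format": "pcm16",
//   "input_audio_noise_reduction": { "type": "far_field" }
```

This is complementary to the VPIO work above, not a replacement for it: the client-side AEC still has to keep echo out of the uplink, while noise reduction only cleans up what the model receives. The new `OpenAIRealtimeSessionConfigurationTests` in this branch cover the expected JSON shape.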