Add copy() to KVCache protocol and all implementations (#158)

alankessler · web-flow · commit 807885c1cef9 · 2026-03-27T14:47:06.000-07:00
* Add copy() to KVCache protocol and all implementations

  Add an independent deep-copy method to enable reusing a prefix cache across multiple ChatSession instances without reloading from disk.

  - Add copy() requirement to KVCache protocol
  - Implement on KVCacheSimple, RotatingKVCache, QuantizedKVCache, ChunkedKVCache, ArraysCache, MambaCache, and CacheList
  - Guard against empty state in all copy() methods to avoid fatalError from state setters that reject empty arrays
  - Preserve leftPadding in ArraysCache/MambaCache copies
  - Add CacheList array-based initializer to support copy()
  - Change ArraysCache.leftPadding from private to internal for subclass access in MambaCache.copy()

  Tests:

  - testCacheCopyIsIndependent: parameterized across 6 cache types, verifies copy has same state and mutation of copy leaves original unchanged
  - testCacheCopyOnEmptyCache: verifies copy of unpopulated cache does not crash
  - testCacheListCopyIsIndependent: verifies CacheList with heterogeneous sub-caches copies independently

* Bump mlx-swift dependency to 0.31.1

  Picks up the fix for array[.ellipsis] returning self instead of a copy (ml-explore/mlx-swift#367), plus mlx 0.31.1 C++ updates.
diff --git a/Libraries/MLXLMCommon/KVCache.swift b/Libraries/MLXLMCommon/KVCache.swift
@@ -71,6 +71,9 @@ public protocol KVCache: Evaluatable {
     func makeMask(
         n: Int, windowSize: Int?, returnArray: Bool
     ) -> MLXFast.ScaledDotProductAttentionMaskMode
+
+    /// Create an independent deep copy of this cache.
+    ///
+    /// The copy is expected to report the same concrete type, `offset`, `metaState`
+    /// and `state` contents as the receiver. Mutating the copy afterwards (for
+    /// example by pushing more tokens through `update`) must leave the receiver
+    /// unchanged.
+    ///
+    /// - Returns: A new cache that can be used and mutated independently of this one.
+    func copy() -> any KVCache
 }
 
 /// Protocol for caches that support efficient quantized operations
@@ -149,6 +152,10 @@ open class BaseKVCache: KVCache {
     @discardableResult
     open func trim(_ n: Int) -> Int { 0 }
 
+    /// Create an independent deep copy of this cache.
+    ///
+    /// The base class cannot know what a subclass stores, so this implementation
+    /// always traps. Every concrete subclass must override it.
+    ///
+    /// - Returns: Never returns; the call terminates with `fatalError`.
+    open func copy() -> any KVCache {
+        // Name the dynamic type so the crash log points at the subclass that
+        // is missing the override rather than at the base class.
+        fatalError("\(type(of: self)).copy() must be implemented by subclass")
+    }
+
     /// Default implementation for caches without special mask requirements
     open func makeMask(
         n: Int, windowSize: Int?, returnArray: Bool
@@ -419,6 +426,16 @@ public class KVCacheSimple: BaseKVCache, CustomDebugStringConvertible {
         return quantizedCache
     }
 
+    /// Create an independent copy of this cache.
+    ///
+    /// The new cache receives the same `step`. When this cache holds any state,
+    /// each state array is duplicated with `[.ellipsis]` and assigned to the new
+    /// cache; an unpopulated cache is returned without touching the `state`
+    /// setter, so the setter is never handed an empty array.
+    ///
+    /// - Returns: A new `KVCacheSimple`.
+    public override func copy() -> any KVCache {
+        let duplicate = KVCacheSimple()
+        duplicate.step = step
+
+        let arrays = state
+        guard !arrays.isEmpty else { return duplicate }
+
+        duplicate.state = arrays.map { array in array[.ellipsis] }
+        return duplicate
+    }
+
     public var debugDescription: String {
         "\(String(describing: Self.self)) \(Unmanaged.passUnretained(self).toOpaque()), offset: \(offset), step: \(step), keys: \(keys?.shape.description ?? "-"), values: \(values?.shape.description ?? "-")"
     }
@@ -680,6 +697,16 @@ public class RotatingKVCache: BaseKVCache, CustomDebugStringConvertible {
         "\(String(describing: Self.self)) offset: \(offset), maxSize: \(maxCacheSize.description), keep: \(keep), idx: \(idx)"
     }
 
+    /// Create an independent copy of this cache.
+    ///
+    /// The new cache is built with the same `maxCacheSize`, `keep` and `step`.
+    /// State arrays are duplicated with `[.ellipsis]` and assigned only when
+    /// there is at least one, so the `state` setter never sees an empty array.
+    /// `metaState` is transferred last, after the state has been assigned.
+    ///
+    /// - Returns: A new `RotatingKVCache`.
+    public override func copy() -> any KVCache {
+        let duplicate = RotatingKVCache(maxSize: maxCacheSize, keep: keep, step: step)
+
+        let arrays = state.map { array in array[.ellipsis] }
+        if arrays.isEmpty == false {
+            duplicate.state = arrays
+        }
+
+        duplicate.metaState = metaState
+        return duplicate
+    }
+
     /// Convert to quantized cache
     /// Note: This is complex due to the rotating nature and temporal ordering
     public func toQuantized(groupSize: Int = 64, bits: Int = 4) -> QuantizedKVCache {
@@ -925,6 +952,16 @@ public class QuantizedKVCache: BaseKVCache, QuantizedKVCacheProtocol {
         return trimmed
     }
 
+    /// Create an independent copy of this cache.
+    ///
+    /// The new cache is built with the same `groupSize`, `bits` and `mode`.
+    /// State arrays are duplicated with `[.ellipsis]` and assigned only when
+    /// there is at least one, so the `state` setter never sees an empty array.
+    /// `metaState` is transferred last, after the state has been assigned.
+    ///
+    /// - Returns: A new `QuantizedKVCache`.
+    public override func copy() -> any KVCache {
+        let duplicate = QuantizedKVCache(groupSize: groupSize, bits: bits, mode: mode)
+
+        let arrays = state.map { array in array[.ellipsis] }
+        if arrays.isEmpty == false {
+            duplicate.state = arrays
+        }
+
+        duplicate.metaState = metaState
+        return duplicate
+    }
+
     /// Convert to unquantized cache
     public func toUnquantized() -> KVCacheSimple {
         let simpleCache = KVCacheSimple()
@@ -1014,6 +1051,17 @@ public class ChunkedKVCache: KVCacheSimple {
         return trimmed
     }
 
+    /// Create an independent copy of this cache.
+    ///
+    /// The new cache is built with the same `chunkSize` and receives the same
+    /// `step`. State arrays are duplicated with `[.ellipsis]` and assigned only
+    /// when there is at least one, so the `state` setter never sees an empty
+    /// array. `metaState` is transferred last, after the state has been assigned.
+    ///
+    /// - Returns: A new `ChunkedKVCache`.
+    public override func copy() -> any KVCache {
+        let duplicate = ChunkedKVCache(chunkSize: chunkSize)
+        duplicate.step = step
+
+        let arrays = state.map { array in array[.ellipsis] }
+        if arrays.isEmpty == false {
+            duplicate.state = arrays
+        }
+
+        duplicate.metaState = metaState
+        return duplicate
+    }
+
     public override var metaState: [String] {
         get {
             let chunkSizeStr = chunkSize?.description ?? "None"
@@ -1036,7 +1084,7 @@ public class ChunkedKVCache: KVCacheSimple {
 /// Base cache for array-based state storage
 public class ArraysCache: BaseKVCache {
     private var cache: [MLXArray?]
-    private var leftPadding: MLXArray?
+    internal var leftPadding: MLXArray?
 
     public init(size: Int, leftPadding: [Int]? = nil) {
         self.cache = Array(repeating: nil, count: size)
@@ -1062,6 +1110,17 @@ public class ArraysCache: BaseKVCache {
         }
     }
 
+    /// Create an independent deep copy of this cache.
+    ///
+    /// The new cache has the same number of slots and the same `offset`. State
+    /// arrays are duplicated with `[.ellipsis]` and assigned only when there is
+    /// at least one, so the `state` setter never sees an empty array.
+    /// `leftPadding` is duplicated the same way, so the copy does not share any
+    /// `MLXArray` instance with the original.
+    ///
+    /// - Returns: A new `ArraysCache`.
+    public override func copy() -> any KVCache {
+        let new = ArraysCache(size: cache.count)
+        let s = self.state
+        if !s.isEmpty {
+            new.state = s.map { $0[.ellipsis] }
+        }
+        new.offset = self.offset
+        // Copy rather than alias: assigning the reference would leave both
+        // caches pointing at the same padding array instance.
+        new.leftPadding = self.leftPadding?[.ellipsis]
+        return new
+    }
+
     /// In-place filter to keep just the given indices in the cache
     public func filter(batchIndices: MLXArray) {
         cache = cache.map { c in
@@ -1096,6 +1155,17 @@ public class MambaCache: ArraysCache {
     public init(leftPadding: [Int]? = nil) {
         super.init(size: 2, leftPadding: leftPadding)
     }
+
+    /// Create an independent deep copy of this cache.
+    ///
+    /// Overrides the `ArraysCache` implementation so the result is a
+    /// `MambaCache`. State arrays are duplicated with `[.ellipsis]` and assigned
+    /// only when there is at least one, so the `state` setter never sees an
+    /// empty array. `offset` is carried over, and `leftPadding` is duplicated as
+    /// well so the copy does not share any `MLXArray` instance with the original.
+    ///
+    /// - Returns: A new `MambaCache`.
+    public override func copy() -> any KVCache {
+        let new = MambaCache()
+        let s = self.state
+        if !s.isEmpty {
+            new.state = s.map { $0[.ellipsis] }
+        }
+        new.offset = self.offset
+        // Copy rather than alias: assigning the reference would leave both
+        // caches pointing at the same padding array instance.
+        new.leftPadding = self.leftPadding?[.ellipsis]
+        return new
+    }
 }
 
 /// Composite cache that manages multiple sub-caches
@@ -1107,6 +1177,11 @@ public class CacheList: BaseKVCache {
         super.init()
     }
 
+    /// Create a composite cache from an existing array of sub-caches.
+    ///
+    /// The array is stored as given; the sub-caches themselves are not copied.
+    ///
+    /// - Parameter caches: The sub-caches to manage, in order.
+    public init(_ caches: [any KVCache]) {
+        self.caches = caches
+        super.init()
+    }
+
     public override func innerState() -> [MLXArray] {
         caches.flatMap { $0.innerState() }
     }
@@ -1132,6 +1207,12 @@ public class CacheList: BaseKVCache {
         }
     }
 
+    /// Create an independent copy of this composite cache.
+    ///
+    /// Each sub-cache is copied with its own `copy()` and the results are
+    /// wrapped, in the same order, in a new `CacheList`.
+    ///
+    /// - Returns: A new `CacheList` holding the copied sub-caches.
+    public override func copy() -> any KVCache {
+        CacheList(caches.map { subCache in subCache.copy() })
+    }
+
     public override var isTrimmable: Bool {
         caches.allSatisfy { $0.isTrimmable }
     }
diff --git a/Package.swift b/Package.swift
@@ -26,7 +26,7 @@ let package = Package(
             targets: ["MLXEmbedders"]),
     ],
     dependencies: [
-        .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.30.6")),
+        .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.31.1")),
         .package(
             url: "https://github.com/huggingface/swift-transformers",
             .upToNextMinor(from: "1.2.0")
diff --git a/Tests/MLXLMTests/KVCacheTests.swift b/Tests/MLXLMTests/KVCacheTests.swift
@@ -3,16 +3,18 @@ import MLX
 import MLXLMCommon
 import Testing
 
+// Factories shared by the parameterized tests in this file. Each closure
+// returns a fresh, unpopulated instance of one cache type, so every test
+// argument starts from a clean cache.
+private let cacheCreators: [() -> any KVCache] = [
+    { KVCacheSimple() },
+    { RotatingKVCache(maxSize: 32) },
+    { QuantizedKVCache() },
+    { ChunkedKVCache(chunkSize: 16) },
+    { ArraysCache(size: 2) },
+    { MambaCache() },
+]
+
 @Test(
     .serialized,
-    arguments: [
-        ({ KVCacheSimple() }),
-        ({ RotatingKVCache(maxSize: 32) }),
-        ({ QuantizedKVCache() }),
-        ({ ChunkedKVCache(chunkSize: 16) }),
-        ({ ArraysCache(size: 2) }),
-        ({ MambaCache() }),
-    ])
+    arguments: cacheCreators)
 func testCacheSerialization(creator: (() -> any KVCache)) async throws {
     let cache = (0 ..< 10).map { _ in creator() }
     let keys = MLXArray.ones([1, 8, 32, 64], dtype: .bfloat16)
@@ -43,3 +45,136 @@ func testCacheSerialization(creator: (() -> any KVCache)) async throws {
         #expect(lhs.state.count == rhs.state.count)
     }
 }
+
+/// Verify that copy() produces an independent cache: same type, same state,
+/// but mutating the copy does not affect the original.
+///
+/// Besides comparing values, the test checks that no state array instance is
+/// shared between the original and the copy. Value comparison alone cannot
+/// detect a shallow copy, because aliased arrays always compare equal.
+@Test(
+    .serialized,
+    arguments: cacheCreators)
+func testCacheCopyIsIndependent(creator: (() -> any KVCache)) async throws {
+    let original = creator()
+
+    let keys = MLXArray.ones([1, 8, 4, 64], dtype: .bfloat16)
+    let values = MLXArray.ones([1, 8, 4, 64], dtype: .bfloat16)
+
+    // populate the original
+    switch original {
+    case let arrays as ArraysCache:
+        arrays[0] = keys
+        arrays[1] = values
+    case let quantized as QuantizedKVCache:
+        _ = quantized.updateQuantized(keys: keys, values: values)
+    default:
+        _ = original.update(keys: keys, values: values)
+    }
+
+    let originalOffset = original.offset
+    let originalState = original.state
+    eval(originalState)
+    let originalMeta = original.metaState
+
+    // copy
+    let copied = original.copy()
+
+    // same type
+    #expect(type(of: original) == type(of: copied))
+
+    // same offset and metadata
+    #expect(copied.offset == originalOffset)
+    #expect(copied.metaState == originalMeta)
+
+    // same state values, held in distinct array instances
+    let copiedState = copied.state
+    eval(copiedState)
+    #expect(copiedState.count == originalState.count)
+    for (origArr, copyArr) in zip(originalState, copiedState) {
+        #expect(origArr.shape == copyArr.shape)
+        #expect(allClose(origArr, copyArr).item(Bool.self))
+        // a deep copy must not hand back the very same array object
+        #expect(origArr !== copyArr)
+    }
+
+    // mutate the copy — push more tokens through it
+    let moreKeys = MLXArray.zeros([1, 8, 2, 64], dtype: .bfloat16)
+    let moreValues = MLXArray.zeros([1, 8, 2, 64], dtype: .bfloat16)
+
+    switch copied {
+    case let arrays as ArraysCache:
+        // overwrite slot 0 with a different array
+        arrays[0] = moreKeys
+    case let quantized as QuantizedKVCache:
+        _ = quantized.updateQuantized(keys: moreKeys, values: moreValues)
+    default:
+        _ = copied.update(keys: moreKeys, values: moreValues)
+    }
+
+    // original must be unchanged
+    #expect(original.offset == originalOffset)
+    #expect(original.metaState == originalMeta)
+    let currentState = original.state
+    eval(currentState)
+    #expect(currentState.count == originalState.count)
+    for (origArr, savedArr) in zip(currentState, originalState) {
+        #expect(origArr.shape == savedArr.shape)
+        #expect(allClose(origArr, savedArr).item(Bool.self))
+    }
+}
+
+/// Copying a cache that has never been populated must not trap, and the
+/// result must look like a fresh cache of the same type.
+@Test(
+    .serialized,
+    arguments: cacheCreators)
+func testCacheCopyOnEmptyCache(creator: (() -> any KVCache)) async throws {
+    let pristine = creator()
+    let duplicate = pristine.copy()
+
+    // same concrete type as the source
+    #expect(type(of: pristine) == type(of: duplicate))
+
+    // nothing was ever written, so the copy starts at offset zero
+    #expect(duplicate.offset == 0)
+
+    // and exposes the same number of state arrays as the source
+    let expectedCount = pristine.state.count
+    #expect(duplicate.state.count == expectedCount)
+}
+
+/// CacheList.copy() produces independent sub-caches.
+///
+/// Builds a composite of two different cache types, copies it, mutates the
+/// first sub-cache of the copy and checks that the original sub-caches keep
+/// their offsets and state.
+@Test
+func testCacheListCopyIsIndependent() async throws {
+    let sub1 = KVCacheSimple()
+    let sub2 = RotatingKVCache(maxSize: 32)
+    let composite = CacheList(sub1, sub2)
+
+    let keys = MLXArray.ones([1, 8, 4, 64], dtype: .bfloat16)
+    let values = MLXArray.ones([1, 8, 4, 64], dtype: .bfloat16)
+    _ = sub1.update(keys: keys, values: values)
+    _ = sub2.update(keys: keys, values: values)
+
+    // snapshot original state — eval to materialize before copy
+    let originalState = composite.state
+    eval(originalState)
+    let originalOffset0 = sub1.offset
+    let originalOffset1 = sub2.offset
+
+    let copied = composite.copy()
+
+    // Unwrap with #require instead of a force-cast: a wrong type is then
+    // recorded as a test failure rather than crashing the test process.
+    let copiedList = try #require(copied as? CacheList)
+    let copiedState = copied.state
+    eval(copiedState)
+    #expect(copiedState.count == originalState.count)
+    for (orig, copy) in zip(originalState, copiedState) {
+        #expect(orig.shape == copy.shape)
+        #expect(allClose(orig, copy).item(Bool.self))
+    }
+
+    // mutate inside the copy
+    _ = copiedList[0].update(
+        keys: MLXArray.zeros([1, 8, 2, 64], dtype: .bfloat16),
+        values: MLXArray.zeros([1, 8, 2, 64], dtype: .bfloat16)
+    )
+
+    // originals unchanged
+    #expect(sub1.offset == originalOffset0)
+    #expect(sub2.offset == originalOffset1)
+    let currentState = composite.state
+    eval(currentState)
+    #expect(currentState.count == originalState.count)
+    for (orig, saved) in zip(currentState, originalState) {
+        #expect(orig.shape == saved.shape)
+        #expect(allClose(orig, saved).item(Bool.self))
+    }
+}