Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
101 commits
Select commit Hold shift + click to select a range
0464b2e
Add mission infrastructure for continuous batching
ronaldmannak Mar 14, 2026
a4fda6b
Implement BatchKVCache with left-padding strategy for continuous batc…
ronaldmannak Mar 14, 2026
81cc226
Add batch-aware masking, BatchPositionedKVCache protocol, and applyRo…
ronaldmannak Mar 14, 2026
917b015
Implement BatchRotatingKVCache for sliding-window batch attention
ronaldmannak Mar 14, 2026
8c2f0a2
Add Metal availability guard to skip MLX-dependent tests in SPM builds
ronaldmannak Mar 14, 2026
bb5c180
Fix swift-format lint violations in batch files
ronaldmannak Mar 14, 2026
d77658c
Record batch-kv-cache scrutiny findings
ronaldmannak Mar 14, 2026
6c2ff25
Update worker skill with MLX Metal TDD guidance
ronaldmannak Mar 14, 2026
6156819
Fix BatchKVCache state serialization, makeMask offset timing, and Sen…
ronaldmannak Mar 14, 2026
e1b91a8
Add prepare/finalize for cached-prompt prefill and preserve keep in B…
ronaldmannak Mar 14, 2026
951c08c
Record batch-kv-cache scrutiny rerun findings
ronaldmannak Mar 14, 2026
ab9bd83
Fix BatchRotatingKVCache to preserve keep prefix tokens during slidin…
ronaldmannak Mar 14, 2026
f24d18a
Record batch-kv-cache keep-semantics scrutiny rerun
ronaldmannak Mar 14, 2026
2933b3a
Fix BatchRotatingKVCache.extract() negative leftPadding after overflow
ronaldmannak Mar 14, 2026
1f9748d
Record batch-kv-cache scrutiny pass after extract fix
ronaldmannak Mar 14, 2026
dd5364b
Record batch-kv-cache user-testing findings
ronaldmannak Mar 14, 2026
d90b54b
Fix BatchKVCache.makeMask() key-width to equal _idx
ronaldmannak Mar 14, 2026
6c5a890
Fix BatchRotatingKVCache keep-prefix corruption for padded sequences …
ronaldmannak Mar 14, 2026
0e91d2a
Record batch-kv-cache user-testing rerun pass
ronaldmannak Mar 14, 2026
4045514
Implement BatchTokenIterator core batch generation engine
ronaldmannak Mar 14, 2026
72ef687
Add per-request sampler/processor support and correctness tests for B…
ronaldmannak Mar 14, 2026
d8877b1
Record batch-engine scrutiny findings
ronaldmannak Mar 14, 2026
ee437f6
Fix batch admission scheduling and add concurrency safety to BatchTok…
ronaldmannak Mar 14, 2026
3aea647
Record batch-engine scrutiny rerun pass
ronaldmannak Mar 14, 2026
35d83e9
Record batch-engine user-testing findings
ronaldmannak Mar 14, 2026
e5cd48c
Fix per-request sampler concatenate crash in BatchTokenIterator
ronaldmannak Mar 14, 2026
2831491
Record batch-engine user-testing rerun pass
ronaldmannak Mar 14, 2026
138c89e
Implement InferenceScheduler actor with single-first upgrade strategy
ronaldmannak Mar 14, 2026
2c327de
Integrate InferenceScheduler with ModelContainer for transparent batc…
ronaldmannak Mar 14, 2026
30875a4
Record scheduler scrutiny findings
ronaldmannak Mar 14, 2026
4949457
Fix scheduler upgrade stream continuity, KV cache migration, and Chat…
ronaldmannak Mar 14, 2026
5f4244b
Record scheduler scrutiny rerun findings
ronaldmannak Mar 14, 2026
c2731b3
Fix scheduler upgrade to use live TokenIterator state and rebind canc…
ronaldmannak Mar 14, 2026
c5d6d87
Record scheduler scrutiny rerun findings
ronaldmannak Mar 14, 2026
6d8fea6
Fix scheduler upgrade tensor shape, token boundary drop, and actor re…
ronaldmannak Mar 14, 2026
ea6a496
Record scheduler scrutiny rerun findings
ronaldmannak Mar 14, 2026
214907d
Fix maxTokens off-by-one in upgradeToBatch() and Sendable warnings
ronaldmannak Mar 14, 2026
0d82970
Record scheduler scrutiny rerun findings
ronaldmannak Mar 14, 2026
6cb705d
Override scheduler scrutiny: all xcodebuild tests pass, code correct
ronaldmannak Mar 14, 2026
c3c7d7d
Record scheduler user-testing findings
ronaldmannak Mar 14, 2026
1f790fa
Defer 6 scheduler assertions to cross-area, add Sendable fix feature,…
ronaldmannak Mar 14, 2026
72b1d9d
Fix Sendable/data-race warnings in scheduler integration
ronaldmannak Mar 14, 2026
b42bea2
Implement LRUPromptCache with trie-based prefix matching and LRU evic…
ronaldmannak Mar 14, 2026
543dcea
Integrate LRUPromptCache with batch generation for cached prompt prefill
ronaldmannak Mar 14, 2026
dc95c98
Record prompt-cache scrutiny findings
ronaldmannak Mar 14, 2026
f3fb872
Fix 4 correctness bugs in LRUPromptCache with regression tests
ronaldmannak Mar 14, 2026
fd76aa9
Fix 2 correctness bugs in prompt-cache batch integration
ronaldmannak Mar 14, 2026
67621ff
Record prompt-cache scrutiny rerun findings
ronaldmannak Mar 14, 2026
ba84c09
Fix mixed-depth cached-prefill holes and RotatingKVCache support in b…
ronaldmannak Mar 14, 2026
7784fc7
Record prompt-cache scrutiny round 3 findings
ronaldmannak Mar 14, 2026
5003535
Fix mixed-depth cached-prefill with prepare/finalize lifecycle
ronaldmannak Mar 14, 2026
5caf3b6
Record prompt-cache scrutiny round 4 findings
ronaldmannak Mar 14, 2026
c4b7e60
Record prompt-cache user-testing findings
ronaldmannak Mar 14, 2026
6f2ec9c
Fix trimPromptCache to trim all layers and correct exact-hit test exp…
ronaldmannak Mar 14, 2026
76eb7fb
Record prompt-cache user-testing rerun findings
ronaldmannak Mar 14, 2026
1ec0540
Migrate all MLXLLM model RoPE calls to applyRotaryPosition
ronaldmannak Mar 14, 2026
d1ecb3c
Add comprehensive cross-area integration tests for batching
ronaldmannak Mar 14, 2026
cbf660f
Record example-app scrutiny findings
ronaldmannak Mar 14, 2026
81f9048
Strengthen cross-area integration test assertions and fix compile war…
ronaldmannak Mar 14, 2026
09a3f1b
Record example-app scrutiny rerun findings
ronaldmannak Mar 14, 2026
dc5b4e2
Override example-app scrutiny: all tests pass, dead code paths dismissed
ronaldmannak Mar 14, 2026
6302688
Record example-app user-testing findings
ronaldmannak Mar 14, 2026
8d94311
Record example-app user-testing rerun findings
ronaldmannak Mar 14, 2026
79716d6
Complete example-app milestone: all 86 assertions passed
ronaldmannak Mar 14, 2026
202e74a
Add continuous batching section to README
ronaldmannak Mar 14, 2026
4653d69
Fix 3rd+ requests missing streaming events in batch mode
ronaldmannak Mar 15, 2026
7e42f13
Fix rotating/sliding-window caches silently dropped during batch crea…
ronaldmannak Mar 15, 2026
0544fab
Fix batched .info events: report correct promptTokenCount and preserv…
ronaldmannak Mar 15, 2026
d687b55
Wire LRUPromptCache into scheduler path for upstream parity
ronaldmannak Mar 15, 2026
1914be7
Record post-review scrutiny findings
ronaldmannak Mar 15, 2026
9f25ef8
Fix vacuous rotating cache preservation test with actual content veri…
ronaldmannak Mar 15, 2026
ed1a06d
Fix incorrect promptTime for 3rd+ requests joining existing batch
ronaldmannak Mar 15, 2026
0e321ef
Fix prompt cache wiring completeness: write-back, single-path cache u…
ronaldmannak Mar 15, 2026
f9db3fb
Record post-review scrutiny rerun findings
ronaldmannak Mar 15, 2026
5ae3e88
Make testUpgradePreservesRotatingKVCacheState deterministic
ronaldmannak Mar 15, 2026
af4171d
Fix prompt cache write-back to use full token sequence key
ronaldmannak Mar 15, 2026
eb137ec
Fix flaky testUpgradePreservesRotatingKVCacheState timing
ronaldmannak Mar 15, 2026
c1ed95a
Record post-review scrutiny round 3 findings
ronaldmannak Mar 15, 2026
27ec16a
Fix RotatingCacheMockModel to never produce EOS token 0
ronaldmannak Mar 15, 2026
ac076d7
Fix prompt cache write-back to include pre-upgrade generated tokens
ronaldmannak Mar 15, 2026
e524983
Record post-review scrutiny round 4 findings
ronaldmannak Mar 15, 2026
83f37d3
Record post-review user testing results
ronaldmannak Mar 15, 2026
5603b7e
Fix mixed-layer cached partial-hit to use per-layer type check
ronaldmannak Mar 15, 2026
b8c389a
Fix BatchKVCache masks for post-update attention width
ronaldmannak Mar 15, 2026
83bbd80
Fix mixed-depth cached-prefill final cache extraction
ronaldmannak Mar 15, 2026
21a2e85
Record post-review-followup scrutiny findings
ronaldmannak Mar 15, 2026
a8a06a5
Record post-review-followup user testing results
ronaldmannak Mar 15, 2026
f2cb539
Fix scheduler fallback prompt-cache propagation
ronaldmannak Mar 16, 2026
3d5efee
Record post-review-followup-2 scrutiny findings
ronaldmannak Mar 16, 2026
7034d8b
Record post-review-followup-2 user testing results
ronaldmannak Mar 16, 2026
f349916
Update gitignore, remove .factory from git repo
ronaldmannak Mar 16, 2026
9063a11
Add batching support for NanoChat and Phi3
ronaldmannak Mar 16, 2026
c6b6a8c
Batch-aware attention masks for FalconH1 and Gemma
ronaldmannak Mar 16, 2026
6ed9754
swift lint
ronaldmannak Mar 17, 2026
46a3c18
Add wired memory support
ronaldmannak Mar 17, 2026
691b14d
swift lint
ronaldmannak Mar 17, 2026
424c6ab
improve dual path routing
ronaldmannak Mar 20, 2026
7f25571
Add raw token batching
ronaldmannak Mar 20, 2026
1160f8a
Add raw token batching
ronaldmannak Mar 20, 2026
27fc9b9
Update SKILL.md
ronaldmannak Mar 20, 2026
42cdda7
Revert order of Model Factories (VLM is first again)
ronaldmannak Mar 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,5 @@ iOSInjectionProject/

.idea
.vscode

.claude/
.factory/
9 changes: 2 additions & 7 deletions Libraries/MLXLLM/Models/AfMoE.swift
Original file line number Diff line number Diff line change
Expand Up @@ -197,13 +197,8 @@ class AfMoEAttention: Module {

// Apply RoPE only for local (sliding window) attention
if isLocalAttention, let rope = rope {
if let cache = cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
} else {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
}
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)
}

var output = attentionWithCacheUpdate(
Expand Down
9 changes: 3 additions & 6 deletions Libraries/MLXLLM/Models/Apertus.swift
Original file line number Diff line number Diff line change
Expand Up @@ -224,17 +224,14 @@ private class ApertusAttention: Module {
values = values.transposed(0, 2, 1, 3)

// 4. RoPE
if let cache = cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

if let cache = cache {
// Update cache (expects [B, H, L, D])
let (k, v) = cache.update(keys: keys, values: values)
keys = k
values = v
} else {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
}

// 5. Attention (SDPA expects [B, H, L, D])
Expand Down
7 changes: 3 additions & 4 deletions Libraries/MLXLLM/Models/BaichuanM1.swift
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,11 @@ class BaichuanM1Attention: Module {
var keys = qkv[1].reshaped(B, L, numKVHeads, headDim).transposed(0, 2, 1, 3)
var values = qkv[2].reshaped(B, L, numKVHeads, headDim).transposed(0, 2, 1, 3)

var offset = 0
var lastK: MLXArray? = nil
var lastV: MLXArray? = nil
let kvSubCache: KVCache? = (cache as? CacheList)?[1]

if let cacheList = cache as? CacheList {
offset = cacheList[1].offset
if let mambaCache = cacheList[0] as? MambaCache {
lastK = mambaCache[0]
lastV = mambaCache[1]
Expand All @@ -131,8 +130,8 @@ class BaichuanM1Attention: Module {
keys = customConvolution(keys, convK, state: lastK)
values = customConvolution(values, convV, state: lastV)

queries = rope(queries, offset: offset)
keys = rope(keys, offset: offset)
queries = applyRotaryPosition(rope, to: queries, cache: kvSubCache)
keys = applyRotaryPosition(rope, to: keys, cache: kvSubCache)

if let cache = cache as? CacheList {
let kvCache = cache[1]
Expand Down
9 changes: 2 additions & 7 deletions Libraries/MLXLLM/Models/BailingMoe.swift
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,8 @@ class BailingMoeAttention: Module {
keys = keys.transposed(0, 2, 1, 3)
values = values.reshaped(B, L, kvHeads, -1).transposed(0, 2, 1, 3)

if let cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
} else {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
}
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

let output = attentionWithCacheUpdate(
queries: queries,
Expand Down
8 changes: 3 additions & 5 deletions Libraries/MLXLLM/Models/Bitnet.swift
Original file line number Diff line number Diff line change
Expand Up @@ -316,13 +316,11 @@ class BitnetAttention: Module {
keys = keys.reshaped(B, L, args.resolvedKvHeads, -1).transposed(0, 2, 1, 3)
values = values.reshaped(B, L, args.resolvedKvHeads, -1).transposed(0, 2, 1, 3)

queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

if let cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
(keys, values) = cache.update(keys: keys, values: values)
} else {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
}

let output = MLXFast.scaledDotProductAttention(
Expand Down
9 changes: 2 additions & 7 deletions Libraries/MLXLLM/Models/Cohere.swift
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,8 @@ class CohereAttention: Module {
keys = keys.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)
values = values.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)

if let cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
} else {
queries = rope(queries)
keys = rope(keys)
}
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

let output = attentionWithCacheUpdate(
queries: queries,
Expand Down
10 changes: 4 additions & 6 deletions Libraries/MLXLLM/Models/DeepseekV3.swift
Original file line number Diff line number Diff line change
Expand Up @@ -197,17 +197,15 @@ class DeepseekV3Attention: Module {

var (kNope, values) = (splitKv[0], splitKv[1])

qPe = applyRotaryPosition(self.rope, to: qPe, cache: cache)
kPe = applyRotaryPosition(self.rope, to: kPe, cache: cache)
kPe = repeated(kPe, count: numHeads, axis: 1)

var keys: MLXArray
if let cache = cache {
qPe = self.rope(qPe, offset: cache.offset)
kPe = self.rope(kPe, offset: cache.offset)
kPe = repeated(kPe, count: numHeads, axis: 1)
(keys, values) = cache.update(
keys: concatenated([kNope, kPe], axis: -1), values: values)
} else {
qPe = self.rope(qPe, offset: 0)
kPe = self.rope(kPe, offset: 0)
kPe = repeated(kPe, count: numHeads, axis: 1)
keys = concatenated([kNope, kPe], axis: -1)
}

Expand Down
9 changes: 2 additions & 7 deletions Libraries/MLXLLM/Models/Ernie4_5.swift
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,8 @@ class Ernie45Attention: Module {
keys = keys.reshaped(B, L, nKVHeads, -1).transposed(0, 2, 1, 3)
values = values.reshaped(B, L, nKVHeads, -1).transposed(0, 2, 1, 3)

if let cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
} else {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
}
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

let output = attentionWithCacheUpdate(
queries: queries,
Expand Down
9 changes: 3 additions & 6 deletions Libraries/MLXLLM/Models/Exaone4.swift
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,9 @@ class Exaone4Attention: Module {
keys = kNorm(keys.reshaped(B, L, args.kvHeads, -1)).transposed(0, 2, 1, 3)
values = values.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)

if let cache, useRope, let rope {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
} else if useRope, let rope {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
if useRope, let rope {
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)
}

let output = attentionWithCacheUpdate(
Expand Down
48 changes: 23 additions & 25 deletions Libraries/MLXLLM/Models/FalconH1.swift
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,11 @@ class FalconH1Attention: Module {
maxPositionEmbeddings: args.maxPositionEmbeddings)
}

func callAsFunction(_ x: MLXArray, mask: MLXArray? = nil, cache: KVCache? = nil) -> MLXArray {
func callAsFunction(
_ x: MLXArray,
mask: MLXFast.ScaledDotProductAttentionMaskMode = .none,
cache: KVCache? = nil
) -> MLXArray {
let (B, L, _) = (x.dim(0), x.dim(1), x.dim(2))

var queries = qProj(x)
Expand All @@ -302,19 +306,14 @@ class FalconH1Attention: Module {
keys = keys.reshaped(B, L, numKVHeads, -1).transposed(0, 2, 1, 3)
values = values.reshaped(B, L, numKVHeads, -1).transposed(0, 2, 1, 3)

if let cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
(keys, values) = cache.update(keys: keys, values: values)
} else {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
}
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

var output = MLXFast.scaledDotProductAttention(
var output = attentionWithCacheUpdate(
queries: queries,
keys: keys,
values: values,
cache: cache,
scale: scale,
mask: mask
)
Expand Down Expand Up @@ -578,7 +577,7 @@ class FalconH1DecoderLayer: Module {
func callAsFunction(
_ h: MLXArray,
cache: CacheList?,
attnMask: MLXArray?,
attnMask: MLXFast.ScaledDotProductAttentionMaskMode,
mambaMask: MLXArray?
) -> MLXArray {
var residual = h
Expand Down Expand Up @@ -610,17 +609,6 @@ private func createSSMMask(h: MLXArray, cache: ArraysCache?) -> MLXArray? {
return nil
}

/// Builds the attention mask for the FalconH1 attention layers.
///
/// Always returns `nil`: when no explicit mask tensor is supplied, SDPA
/// applies causal masking internally, so neither single-token decode nor
/// multi-token prefill needs a materialized mask here.
///
/// - Parameters:
///   - h: Hidden states of shape `[B, L, D]` (unused; kept for call-site compatibility).
///   - cache: Optional per-layer KV caches (unused).
/// - Returns: Always `nil`; SDPA handles the causal mask when given `nil`.
private func createAttentionMask(h: MLXArray, cache: [KVCache]?) -> MLXArray? {
    // The original branched on `h.dim(1) == 1`, but both branches returned
    // nil — the check and the `N` binding were dead code. Behavior unchanged.
    return nil
}

// MARK: - Model

public class FalconH1ModelInner: Module {
Expand Down Expand Up @@ -649,16 +637,26 @@ public class FalconH1ModelInner: Module {
_finalLayerNorm.wrappedValue = RMSNorm(dimensions: hiddenSize, eps: args.rmsNormEps)
}

func callAsFunction(_ inputs: MLXArray, mask: MLXArray? = nil, cache: [CacheList]? = nil)
func callAsFunction(
_ inputs: MLXArray,
mask: MLXFast.ScaledDotProductAttentionMaskMode = .none,
cache: [CacheList]? = nil
)
-> MLXArray
{
var h = embedTokens(inputs)

let cache: [CacheList?] = cache ?? Array(repeating: nil, count: layers.count)

let mambaMask = createSSMMask(h: h, cache: cache[0]?[0] as? MambaCache)
let attnMask: MLXArray? = createAttentionMask(
h: h, cache: cache[0]?[1] != nil ? [cache[0]![1]] : nil)
let attnMask: MLXFast.ScaledDotProductAttentionMaskMode = {
switch mask {
case .none:
return createAttentionMask(h: h, cache: cache[0]?[1])
default:
return mask
}
}()

for (layer, c) in zip(layers, cache) {
h = layer(
Expand Down
9 changes: 2 additions & 7 deletions Libraries/MLXLLM/Models/GLM4.swift
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,8 @@ class GLM4Attention: Module {
keys = keys.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)
values = values.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)

if let cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
} else {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
}
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

let output = attentionWithCacheUpdate(
queries: queries,
Expand Down
9 changes: 2 additions & 7 deletions Libraries/MLXLLM/Models/GLM4MOE.swift
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,8 @@ class GLM4MoEAttention: Module {
keys = keys.transposed(0, 2, 1, 3)
values = values.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)

if let cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
} else {
queries = rope(queries, offset: 0)
keys = rope(keys, offset: 0)
}
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

let output = attentionWithCacheUpdate(
queries: queries,
Expand Down
5 changes: 2 additions & 3 deletions Libraries/MLXLLM/Models/GLM4MOELite.swift
Original file line number Diff line number Diff line change
Expand Up @@ -254,9 +254,8 @@ class GLM4MoELiteAttention: Module {
kPe = kPe.reshaped(B, L, 1, qkRopeHeadDim).transposed(0, 2, 1, 3)
var kvLatent = kvALayerNorm(compressedKv)

let offset = cache?.offset ?? 0
qPe = rope(qPe, offset: offset)
kPe = rope(kPe, offset: offset)
qPe = applyRotaryPosition(rope, to: qPe, cache: cache)
kPe = applyRotaryPosition(rope, to: kPe, cache: cache)

// Expand kvLatent for attention: [B, L, kvLoraRank] -> [B, 1, L, kvLoraRank]
kvLatent = expandedDimensions(kvLatent, axis: 1)
Expand Down
17 changes: 5 additions & 12 deletions Libraries/MLXLLM/Models/GPTOSS.swift
Original file line number Diff line number Diff line change
Expand Up @@ -229,13 +229,8 @@ class AttentionBlock: Module {
if sinksActive {
fatalError("Quantized attention does not support non-zero sinks.")
}
if qcache.offset == 0 {
q = rope(q)
k = rope(k)
} else {
q = rope(q, offset: qcache.offset)
k = rope(k, offset: qcache.offset)
}
q = applyRotaryPosition(rope, to: q, cache: cache)
k = applyRotaryPosition(rope, to: k, cache: cache)

let (qKeys, qValues) = qcache.updateQuantized(keys: k, values: v)
let vHat = quantizedScaledDotProductAttention(
Expand All @@ -252,13 +247,11 @@ class AttentionBlock: Module {
return oProj(vHat.swappedAxes(1, 2).reshaped(B, L, -1))
}

q = applyRotaryPosition(rope, to: q, cache: cache)
k = applyRotaryPosition(rope, to: k, cache: cache)

if let cache {
q = rope(q, offset: cache.offset)
k = rope(k, offset: cache.offset)
(k, v) = cache.update(keys: k, values: v)
} else {
q = rope(q)
k = rope(k)
}

let vHat = MLXFast.scaledDotProductAttention(
Expand Down
9 changes: 2 additions & 7 deletions Libraries/MLXLLM/Models/Gemma.swift
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,8 @@ class GemmaAttention: Module {
keys = keys.reshaped(B, L, nKVHeads, -1).transposed(0, 2, 1, 3)
values = values.reshaped(B, L, nKVHeads, -1).transposed(0, 2, 1, 3)

if let cache {
queries = rope(queries, offset: cache.offset)
keys = rope(keys, offset: cache.offset)
} else {
queries = rope(queries)
keys = rope(keys)
}
queries = applyRotaryPosition(rope, to: queries, cache: cache)
keys = applyRotaryPosition(rope, to: keys, cache: cache)

let output = attentionWithCacheUpdate(
queries: queries,
Expand Down
Loading
Loading