
Commit 9bff95c

mlx-swift 0.29.1 (#411)
- handle changes in quantization
1 parent a7c99ec commit 9bff95c

12 files changed: +115 −85 lines changed

Libraries/Embedders/BaseConfiguration.swift

Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.
 
 import Foundation
+import MLX
 
 /// Base ``LanguageModel`` configuration -- provides `modelType`
 /// and `quantization` (used in loading the model).
@@ -18,12 +19,15 @@ public struct BaseConfiguration: Codable, Sendable {
 
         public let groupSize: Int
         public let bits: Int
+        private var _mode: QuantizationMode? = nil
+        public var mode: QuantizationMode { _mode ?? .affine }
 
-        public var asTuple: (Int, Int) { (groupSize, bits) }
+        public var asTuple: (Int, Int, QuantizationMode) { (groupSize, bits, mode) }
 
         enum CodingKeys: String, CodingKey {
             case groupSize = "group_size"
             case bits = "bits"
+            case _mode = "mode"
         }
     }
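
Note: the "mode" key is optional in a model's quantization config and falls back to `.affine` when absent. A minimal decoding sketch, assuming `BaseConfiguration.Quantization` is visible from your target and that `QuantizationMode` decodes from its string raw value (e.g. "affine", "mxfp4"); the JSON values are illustrative:

import Foundation
import MLX

// Illustrative quantization fragment from a model's config.json.
let json = #"{"group_size": 32, "bits": 4, "mode": "mxfp4"}"#

let quantization = try! JSONDecoder().decode(
    BaseConfiguration.Quantization.self, from: Data(json.utf8))

// Without a "mode" key in the JSON, `mode` would fall back to .affine.
print(quantization.asTuple)  // roughly (32, 4, QuantizationMode.mxfp4)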

Libraries/MLXLLM/LLMModelFactory.swift

Lines changed: 6 additions & 0 deletions
@@ -340,6 +340,11 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
         defaultPrompt: ""
     )
 
+    static public let gpt_oss_20b_MXFP4_Q8 = ModelConfiguration(
+        id: "mlx-community/gpt-oss-20b-MXFP4-Q8",
+        defaultPrompt: "Why is the sky blue?"
+    )
+
     private static func all() -> [ModelConfiguration] {
         [
             codeLlama13b4bit,
@@ -389,6 +394,7 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
             ling_mini_2_2bit,
             lfm2_8b_a1b_3bit_mlx,
             nanochat_d20_mlx,
+            gpt_oss_20b_MXFP4_Q8,
         ]
     }
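
For context, the new registry entry is consumed like any other configuration. A hypothetical usage sketch, assuming the `loadContainer(configuration:)` factory API from MLXLMCommon; the id resolves against the Hugging Face hub as with other entries:

import MLXLLM
import MLXLMCommon

// Downloads (if needed) and loads the MXFP4-quantized gpt-oss-20b.
let container = try await LLMModelFactory.shared.loadContainer(
    configuration: LLMRegistry.gpt_oss_20b_MXFP4_Q8)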

Libraries/MLXLLM/Models/GPTOSS.swift

Lines changed: 2 additions & 1 deletion
@@ -305,7 +305,8 @@ private class AttentionBlock: Module {
             scale: smScale,
             mask: .array(mask),
             groupSize: qcache.groupSize,
-            bits: qcache.bits
+            bits: qcache.bits,
+            mode: qcache.mode
         )
 
         return oProj(vHat.swappedAxes(1, 2).reshaped(B, L, -1))

Libraries/MLXLLM/SwitchLayers.swift

Lines changed: 10 additions & 5 deletions
@@ -142,24 +142,28 @@ class SwitchLinear: Module, Quantizable {
         return result
     }
 
-    func toQuantized(groupSize: Int = 64, bits: Int = 4) -> Module {
-        QuantizedSwitchLinear(self, groupSize: groupSize, bits: bits)
+    func toQuantized(groupSize: Int = 64, bits: Int = 4, mode: QuantizationMode) -> Module {
+        QuantizedSwitchLinear(self, groupSize: groupSize, bits: bits, mode: mode)
     }
 }
 
 class QuantizedSwitchLinear: SwitchLinear, Quantized {
     @ModuleInfo(key: "scales") var scales: MLXArray
-    @ModuleInfo(key: "biases") var biases: MLXArray
+    @ModuleInfo(key: "biases") var biases: MLXArray?
 
     let groupSize: Int
     let bits: Int
+    let mode: QuantizationMode
 
-    init(_ other: SwitchLinear, groupSize: Int = 64, bits: Int = 4) {
+    init(
+        _ other: SwitchLinear, groupSize: Int = 64, bits: Int = 4, mode: QuantizationMode = .affine
+    ) {
         self.groupSize = groupSize
         self.bits = bits
+        self.mode = mode
 
         let (quantizedWeight, scales, biases) = MLX.quantized(
-            other.weight, groupSize: groupSize, bits: bits)
+            other.weight, groupSize: groupSize, bits: bits, mode: mode)
 
         self._scales.wrappedValue = scales
         self._biases.wrappedValue = biases
@@ -183,6 +187,7 @@ class QuantizedSwitchLinear: SwitchLinear, Quantized {
             transpose: true,
             groupSize: self.groupSize,
             bits: self.bits,
+            mode: mode,
             sortedIndices: sortedIndices
         )
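
The switch to `biases: MLXArray?` reflects the updated `MLX.quantized` return, which can omit the biases component for some modes. A hedged sketch of the quantization call, mirroring the initializer above; the shapes and the group size of 32 for mxfp4 are assumptions:

import MLX
import MLXRandom

// Illustrative expert weight stack: (experts, outputDims, inputDims).
let weight = MLXRandom.normal([8, 64, 64])

// Quantize with an explicit mode; under .mxfp4 the biases component may be
// absent, which is why QuantizedSwitchLinear now stores an optional.
let (quantizedWeight, scales, biases) = MLX.quantized(
    weight, groupSize: 32, bits: 4, mode: .mxfp4)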

Libraries/MLXLMCommon/Adapters/LoRA/DoRA+Layers.swift

Lines changed: 4 additions & 2 deletions
@@ -147,7 +147,8 @@ public class QDoRALinear: QuantizedLinear, LoRALayer {
         super.init(
             weight: linear.weight, bias: linear.bias,
             scales: linear.scales, biases: linear.biases,
-            groupSize: linear.groupSize, bits: linear.bits
+            groupSize: linear.groupSize, bits: linear.bits,
+            mode: linear.mode
         )
 
         freeze()
@@ -171,7 +172,8 @@ public class QDoRALinear: QuantizedLinear, LoRALayer {
 
     public override func callAsFunction(_ x: MLXArray) -> MLXArray {
         let y = quantizedMatmul(
-            x, weight, scales: scales, biases: biases, groupSize: groupSize, bits: bits)
+            x, weight, scales: scales, biases: biases, groupSize: groupSize, bits: bits,
+            mode: mode)
         return forward(
             x: x, y: y,
             weight: dequantizedWeight, bias: bias,
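
A hedged round-trip sketch of the updated forward path: quantize a weight, then multiply with the mode threaded through, as `callAsFunction` now does. Sizes are illustrative, and the call shape mirrors the diff above:

import MLX
import MLXRandom

let w = MLXRandom.normal([64, 64])
let x = MLXRandom.normal([1, 64])

// Quantize and multiply with matching groupSize/bits/mode.
let (qw, scales, biases) = MLX.quantized(w, groupSize: 64, bits: 4, mode: .affine)
let y = quantizedMatmul(
    x, qw, scales: scales, biases: biases, groupSize: 64, bits: 4, mode: .affine)
// y approximates x.matmul(w.T) up to quantization error
// (transpose defaults to true in quantizedMatmul).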

Libraries/MLXLMCommon/Adapters/LoRA/LoRAModel.swift

Lines changed: 2 additions & 1 deletion
@@ -79,7 +79,8 @@ extension QuantizedLinear {
             scales: scales,
             biases: biases,
             groupSize: groupSize,
-            bits: bits
+            bits: bits,
+            mode: mode
         )
     }
 }

Libraries/MLXLMCommon/AttentionUtils.swift

Lines changed: 3 additions & 2 deletions
@@ -52,7 +52,7 @@ public func attentionWithCacheUpdate(
             mask: mask
         )
     }
-    if let quantizedKVCache = cache as? QuantizedKVCache {
+    if let quantizedKVCache = cache as? QuantizedKVCacheProtocol {
         let (quantizedKeys, quantizedValues) = quantizedKVCache.updateQuantized(
             keys: keys, values: values)
         return quantizedScaledDotProductAttention(
@@ -62,7 +62,8 @@
             scale: scale,
             mask: mask,
             groupSize: quantizedKVCache.groupSize,
-            bits: quantizedKVCache.bits
+            bits: quantizedKVCache.bits,
+            mode: quantizedKVCache.mode
         )
     } else {
         let (cachedKeys, cachedValues) = cache.update(keys: keys, values: values)
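
Widening the cast from the concrete `QuantizedKVCache` class to a protocol lets any conforming cache take the quantized-attention path. A hedged sketch of what the protocol presumably requires, inferred only from the call sites above; the real definition in MLXLMCommon may differ, so the sketch is renamed to avoid claiming it:

import MLX

// Inferred shape only, based on the members used in attentionWithCacheUpdate.
protocol QuantizedKVCacheProtocolSketch {
    var groupSize: Int { get }
    var bits: Int { get }
    var mode: QuantizationMode { get }

    // Assumed return: (weights, scales, biases) tuples for keys and values.
    func updateQuantized(keys: MLXArray, values: MLXArray)
        -> ((MLXArray, MLXArray, MLXArray), (MLXArray, MLXArray, MLXArray))
}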

Libraries/MLXLMCommon/BaseConfiguration.swift

Lines changed: 10 additions & 13 deletions
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.
 
 import Foundation
+import MLX
 
 /// Base ``LanguageModel`` configuration -- provides `modelType`
 /// and `quantization` (used in loading the model).
@@ -18,20 +19,15 @@ public struct BaseConfiguration: Codable, Sendable {
 
         public let groupSize: Int
         public let bits: Int
-        public var quantMethod: String? = nil
-        public var linearClass: String? = nil
-        public var quantizationMode: String? = nil
-        public var mode: String? = nil
+        private var _mode: QuantizationMode? = nil
+        public var mode: QuantizationMode { _mode ?? .affine }
 
-        public var asTuple: (Int, Int) { (groupSize, bits) }
+        public var asTuple: (Int, Int, QuantizationMode) { (groupSize, bits, mode) }
 
         enum CodingKeys: String, CodingKey {
             case groupSize = "group_size"
             case bits = "bits"
-            case quantMethod = "quant_method"
-            case linearClass = "linear_class"
-            case quantizationMode = "quantization_mode"
-            case mode = "mode"
+            case _mode = "mode"
         }
     }
 
@@ -115,10 +111,11 @@ public struct BaseConfiguration: Codable, Sendable {
             switch key.stringValue {
             case Quantization.CodingKeys.groupSize.rawValue: continue
             case Quantization.CodingKeys.bits.rawValue: continue
-            case Quantization.CodingKeys.quantMethod.rawValue: continue
-            case Quantization.CodingKeys.linearClass.rawValue: continue
-            case Quantization.CodingKeys.quantizationMode.rawValue: continue
-            case Quantization.CodingKeys.mode.rawValue: continue
+            case Quantization.CodingKeys._mode.rawValue: continue
+
+            // additional keys that are not layer instructions, see
+            // mlx-community/bitnet-b1.58-2B-4T-4bit
+            case "quant_method", "linear_class", "quantization_mode": continue
 
             default:
                 if let f = try? container.decode(Bool.self, forKey: key) {
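
The decoder treats unknown keys inside the quantization block as per-layer instructions (for example a `Bool` marking a layer as quantized or not), so known metadata keys must be skipped explicitly. An illustrative fragment, with invented values, that now decodes cleanly under this change:

// "mode" feeds _mode; "quant_method" is metadata and is skipped; "lm_head"
// is a per-layer instruction handled by the default branch's Bool decode.
let quantizationJSON = """
    {
        "group_size": 32,
        "bits": 4,
        "mode": "mxfp4",
        "quant_method": "mlx",
        "lm_head": false
    }
    """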
