
Commit 9bff95c

mlx-swift 0.29.1 (#411)
- handle changes in quantization
1 parent a7c99ec commit 9bff95c

12 files changed: +115 −85 lines changed

Libraries/Embedders/BaseConfiguration.swift

Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.
 
 import Foundation
+import MLX
 
 /// Base ``LanguageModel`` configuration -- provides `modelType`
 /// and `quantization` (used in loading the model).
@@ -18,12 +19,15 @@ public struct BaseConfiguration: Codable, Sendable {
 
         public let groupSize: Int
         public let bits: Int
+        private var _mode: QuantizationMode? = nil
+        public var mode: QuantizationMode { _mode ?? .affine }
 
-        public var asTuple: (Int, Int) { (groupSize, bits) }
+        public var asTuple: (Int, Int, QuantizationMode) { (groupSize, bits, mode) }
 
         enum CodingKeys: String, CodingKey {
             case groupSize = "group_size"
             case bits = "bits"
+            case _mode = "mode"
         }
     }
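
Note: the "mode" key is optional in a model's quantization config and falls back to `.affine` when absent. A minimal decoding sketch, assuming `BaseConfiguration.Quantization` is visible from your target and that `QuantizationMode` decodes from its string raw value (e.g. "affine", "mxfp4"); the JSON values are illustrative:

import Foundation
import MLX

// Illustrative quantization fragment from a model's config.json.
let json = #"{"group_size": 32, "bits": 4, "mode": "mxfp4"}"#

let quantization = try! JSONDecoder().decode(
    BaseConfiguration.Quantization.self, from: Data(json.utf8))

// Without a "mode" key in the JSON, `mode` would fall back to .affine.
print(quantization.asTuple)  // roughly (32, 4, QuantizationMode.mxfp4)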

Libraries/MLXLLM/LLMModelFactory.swift

Lines changed: 6 additions & 0 deletions
@@ -340,6 +340,11 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
         defaultPrompt: ""
     )
 
+    static public let gpt_oss_20b_MXFP4_Q8 = ModelConfiguration(
+        id: "mlx-community/gpt-oss-20b-MXFP4-Q8",
+        defaultPrompt: "Why is the sky blue?"
+    )
+
     private static func all() -> [ModelConfiguration] {
         [
             codeLlama13b4bit,
@@ -389,6 +394,7 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
             ling_mini_2_2bit,
             lfm2_8b_a1b_3bit_mlx,
             nanochat_d20_mlx,
+            gpt_oss_20b_MXFP4_Q8,
         ]
     }
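
For context, the new registry entry is consumed like any other configuration. A hypothetical usage sketch, assuming the `loadContainer(configuration:)` factory API from MLXLMCommon; the id resolves against the Hugging Face hub as with other entries:

import MLXLLM
import MLXLMCommon

// Downloads (if needed) and loads the MXFP4-quantized gpt-oss-20b.
let container = try await LLMModelFactory.shared.loadContainer(
    configuration: LLMRegistry.gpt_oss_20b_MXFP4_Q8)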

Libraries/MLXLLM/Models/GPTOSS.swift

Lines changed: 2 additions & 1 deletion
@@ -305,7 +305,8 @@ private class AttentionBlock: Module {
             scale: smScale,
             mask: .array(mask),
             groupSize: qcache.groupSize,
-            bits: qcache.bits
+            bits: qcache.bits,
+            mode: qcache.mode
         )
 
         return oProj(vHat.swappedAxes(1, 2).reshaped(B, L, -1))

Libraries/MLXLLM/SwitchLayers.swift

Lines changed: 10 additions & 5 deletions
@@ -142,24 +142,28 @@ class SwitchLinear: Module, Quantizable {
         return result
     }
 
-    func toQuantized(groupSize: Int = 64, bits: Int = 4) -> Module {
-        QuantizedSwitchLinear(self, groupSize: groupSize, bits: bits)
+    func toQuantized(groupSize: Int = 64, bits: Int = 4, mode: QuantizationMode) -> Module {
+        QuantizedSwitchLinear(self, groupSize: groupSize, bits: bits, mode: mode)
     }
 }
 
 class QuantizedSwitchLinear: SwitchLinear, Quantized {
     @ModuleInfo(key: "scales") var scales: MLXArray
-    @ModuleInfo(key: "biases") var biases: MLXArray
+    @ModuleInfo(key: "biases") var biases: MLXArray?
 
     let groupSize: Int
     let bits: Int
+    let mode: QuantizationMode
 
-    init(_ other: SwitchLinear, groupSize: Int = 64, bits: Int = 4) {
+    init(
+        _ other: SwitchLinear, groupSize: Int = 64, bits: Int = 4, mode: QuantizationMode = .affine
+    ) {
         self.groupSize = groupSize
         self.bits = bits
+        self.mode = mode
 
         let (quantizedWeight, scales, biases) = MLX.quantized(
-            other.weight, groupSize: groupSize, bits: bits)
+            other.weight, groupSize: groupSize, bits: bits, mode: mode)
 
         self._scales.wrappedValue = scales
         self._biases.wrappedValue = biases
@@ -183,6 +187,7 @@ class QuantizedSwitchLinear: SwitchLinear, Quantized {
             transpose: true,
             groupSize: self.groupSize,
             bits: self.bits,
+            mode: mode,
             sortedIndices: sortedIndices
         )
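
The switch to `biases: MLXArray?` reflects the updated `MLX.quantized` return, which can omit the biases component for some modes. A hedged sketch of the quantization call, mirroring the initializer above; the shapes and the group size of 32 for mxfp4 are assumptions:

import MLX
import MLXRandom

// Illustrative expert weight stack: (experts, outputDims, inputDims).
let weight = MLXRandom.normal([8, 64, 64])

// Quantize with an explicit mode; under .mxfp4 the biases component may be
// absent, which is why QuantizedSwitchLinear now stores an optional.
let (quantizedWeight, scales, biases) = MLX.quantized(
    weight, groupSize: 32, bits: 4, mode: .mxfp4)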

Libraries/MLXLMCommon/Adapters/LoRA/DoRA+Layers.swift

Lines changed: 4 additions & 2 deletions
@@ -147,7 +147,8 @@ public class QDoRALinear: QuantizedLinear, LoRALayer {
         super.init(
             weight: linear.weight, bias: linear.bias,
             scales: linear.scales, biases: linear.biases,
-            groupSize: linear.groupSize, bits: linear.bits
+            groupSize: linear.groupSize, bits: linear.bits,
+            mode: linear.mode
         )
 
         freeze()
@@ -171,7 +172,8 @@ public class QDoRALinear: QuantizedLinear, LoRALayer {
 
     public override func callAsFunction(_ x: MLXArray) -> MLXArray {
         let y = quantizedMatmul(
-            x, weight, scales: scales, biases: biases, groupSize: groupSize, bits: bits)
+            x, weight, scales: scales, biases: biases, groupSize: groupSize, bits: bits,
+            mode: mode)
         return forward(
             x: x, y: y,
             weight: dequantizedWeight, bias: bias,
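
A hedged round-trip sketch of the updated forward path: quantize a weight, then multiply with the mode threaded through, as `callAsFunction` now does. Sizes are illustrative, and the call shape mirrors the diff above:

import MLX
import MLXRandom

let w = MLXRandom.normal([64, 64])
let x = MLXRandom.normal([1, 64])

// Quantize and multiply with matching groupSize/bits/mode.
let (qw, scales, biases) = MLX.quantized(w, groupSize: 64, bits: 4, mode: .affine)
let y = quantizedMatmul(
    x, qw, scales: scales, biases: biases, groupSize: 64, bits: 4, mode: .affine)
// y approximates x.matmul(w.T) up to quantization error
// (transpose defaults to true in quantizedMatmul).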

Libraries/MLXLMCommon/Adapters/LoRA/LoRAModel.swift

Lines changed: 2 additions & 1 deletion
@@ -79,7 +79,8 @@ extension QuantizedLinear {
             scales: scales,
             biases: biases,
             groupSize: groupSize,
-            bits: bits
+            bits: bits,
+            mode: mode
         )
     }
 }

Libraries/MLXLMCommon/AttentionUtils.swift

Lines changed: 3 additions & 2 deletions
@@ -52,7 +52,7 @@ public func attentionWithCacheUpdate(
             mask: mask
         )
     }
-    if let quantizedKVCache = cache as? QuantizedKVCache {
+    if let quantizedKVCache = cache as? QuantizedKVCacheProtocol {
         let (quantizedKeys, quantizedValues) = quantizedKVCache.updateQuantized(
             keys: keys, values: values)
         return quantizedScaledDotProductAttention(
@@ -62,7 +62,8 @@
             scale: scale,
             mask: mask,
             groupSize: quantizedKVCache.groupSize,
-            bits: quantizedKVCache.bits
+            bits: quantizedKVCache.bits,
+            mode: quantizedKVCache.mode
         )
     } else {
         let (cachedKeys, cachedValues) = cache.update(keys: keys, values: values)
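
Widening the cast from the concrete `QuantizedKVCache` class to a protocol lets any conforming cache take the quantized-attention path. A hedged sketch of what the protocol presumably requires, inferred only from the call sites above; the real definition in MLXLMCommon may differ, so the sketch is renamed to avoid claiming it:

import MLX

// Inferred shape only, based on the members used in attentionWithCacheUpdate.
protocol QuantizedKVCacheProtocolSketch {
    var groupSize: Int { get }
    var bits: Int { get }
    var mode: QuantizationMode { get }

    // Assumed return: (weights, scales, biases) tuples for keys and values.
    func updateQuantized(keys: MLXArray, values: MLXArray)
        -> ((MLXArray, MLXArray, MLXArray), (MLXArray, MLXArray, MLXArray))
}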

Libraries/MLXLMCommon/BaseConfiguration.swift

Lines changed: 10 additions & 13 deletions
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.
 
 import Foundation
+import MLX
 
 /// Base ``LanguageModel`` configuration -- provides `modelType`
 /// and `quantization` (used in loading the model).
@@ -18,20 +19,15 @@ public struct BaseConfiguration: Codable, Sendable {
 
         public let groupSize: Int
         public let bits: Int
-        public var quantMethod: String? = nil
-        public var linearClass: String? = nil
-        public var quantizationMode: String? = nil
-        public var mode: String? = nil
+        private var _mode: QuantizationMode? = nil
+        public var mode: QuantizationMode { _mode ?? .affine }
 
-        public var asTuple: (Int, Int) { (groupSize, bits) }
+        public var asTuple: (Int, Int, QuantizationMode) { (groupSize, bits, mode) }
 
         enum CodingKeys: String, CodingKey {
             case groupSize = "group_size"
             case bits = "bits"
-            case quantMethod = "quant_method"
-            case linearClass = "linear_class"
-            case quantizationMode = "quantization_mode"
-            case mode = "mode"
+            case _mode = "mode"
         }
     }
 
@@ -115,10 +111,11 @@ public struct BaseConfiguration: Codable, Sendable {
             switch key.stringValue {
             case Quantization.CodingKeys.groupSize.rawValue: continue
             case Quantization.CodingKeys.bits.rawValue: continue
-            case Quantization.CodingKeys.quantMethod.rawValue: continue
-            case Quantization.CodingKeys.linearClass.rawValue: continue
-            case Quantization.CodingKeys.quantizationMode.rawValue: continue
-            case Quantization.CodingKeys.mode.rawValue: continue
+            case Quantization.CodingKeys._mode.rawValue: continue
+
+            // additional keys that are not layer instructions, see
+            // mlx-community/bitnet-b1.58-2B-4T-4bit
+            case "quant_method", "linear_class", "quantization_mode": continue
 
             default:
                 if let f = try? container.decode(Bool.self, forKey: key) {
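
The decoder treats unknown keys inside the quantization block as per-layer instructions (for example a `Bool` marking a layer as quantized or not), so known metadata keys must be skipped explicitly. An illustrative fragment, with invented values, that now decodes cleanly under this change:

// "mode" feeds _mode; "quant_method" is metadata and is skipped; "lm_head"
// is a per-layer instruction handled by the default branch's Bool decode.
let quantizationJSON = """
    {
        "group_size": 32,
        "bits": 4,
        "mode": "mxfp4",
        "quant_method": "mlx",
        "lm_head": false
    }
    """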
