feat: add Qwen3 embedding (#402)

sxy-trans-n · web-flow · commit 6ff3c1fb9501 · 2025-10-07T09:41:07.000-07:00
* feat: add Qwen3 embedding model implementation
diff --git a/Libraries/Embedders/Configuration.swift b/Libraries/Embedders/Configuration.swift
@@ -69,6 +69,13 @@ private class ModelTypeRegistry: @unchecked Sendable {
             let model = NomicBertModel(configuration)
             return model
         },
+        "qwen3": {
+            url in
+            let configuration = try JSONDecoder().decode(
+                Qwen3Configuration.self, from: Data(contentsOf: url))
+            let model = Qwen3Model(configuration)
+            return model
+        },
     ]
 
     public func registerModelType(
diff --git a/Libraries/Embedders/Models.swift b/Libraries/Embedders/Models.swift
@@ -108,6 +108,8 @@ extension ModelConfiguration {
     public static let bge_m3 = ModelConfiguration(id: "BAAI/bge-m3")
     public static let mixedbread_large = ModelConfiguration(
         id: "mixedbread-ai/mxbai-embed-large-v1")
+    public static let qwen3_embedding = ModelConfiguration(
+        id: "mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ")
 
     private enum BootstrapState: Sendable {
         case idle
@@ -138,6 +140,7 @@ extension ModelConfiguration {
                 snowflake_lg,
                 bge_m3,
                 mixedbread_large,
+                qwen3_embedding,
             ])
             bootstrapState = .bootstrapped
 
diff --git a/Libraries/Embedders/NomicBert.swift b/Libraries/Embedders/NomicBert.swift
@@ -312,7 +312,7 @@ private class Encoder: Module {
 
     func callAsFunction(_ inputs: MLXArray, attentionMask: MLXArray? = nil) -> MLXArray {
         var outputs = inputs
-        for (index, layer) in layers.enumerated() {
+        for layer in layers {
             outputs = layer(outputs, mask: attentionMask)
         }
         return outputs
diff --git a/Libraries/Embedders/Qwen3.swift b/Libraries/Embedders/Qwen3.swift
@@ -0,0 +1,278 @@
+// Copyright © 2024 Apple Inc.
+
+import Foundation
+import MLX
+import MLXFast
+import MLXLMCommon
+import MLXNN
+
+/// Self-attention layer for Qwen3.
+///
+/// Queries and keys are RMS-normalized per head (`q_norm` / `k_norm`) and then
+/// passed through `RoPE` before scaled dot-product attention.
+private class Attention: Module {
+    let args: Qwen3Configuration
+    /// Attention scale, computed as `headDim ^ -0.5`.
+    let scale: Float
+
+    @ModuleInfo(key: "q_proj") var wq: Linear
+    @ModuleInfo(key: "k_proj") var wk: Linear
+    @ModuleInfo(key: "v_proj") var wv: Linear
+    @ModuleInfo(key: "o_proj") var wo: Linear
+
+    @ModuleInfo(key: "q_norm") var qNorm: RMSNorm
+    @ModuleInfo(key: "k_norm") var kNorm: RMSNorm
+
+    let rope: RoPE
+
+    /// Builds the bias-free projections, the per-head norms and the rotary embedding
+    /// from `args`.
+    ///
+    /// - Parameter args: Model configuration supplying the hidden size, head counts,
+    ///   head dimension, RMS-norm epsilon and RoPE settings.
+    public init(_ args: Qwen3Configuration) {
+        self.args = args
+
+        let dim = args.hiddenSize
+        let heads = args.attentionHeads
+        let kvHeads = args.kvHeads
+
+        let headDim = args.headDim
+        self.scale = Float(pow(Double(headDim), -0.5))
+
+        _wq.wrappedValue = Linear(dim, heads * headDim, bias: false)
+        _wk.wrappedValue = Linear(dim, kvHeads * headDim, bias: false)
+        _wv.wrappedValue = Linear(dim, kvHeads * headDim, bias: false)
+        _wo.wrappedValue = Linear(heads * headDim, dim, bias: false)
+
+        _qNorm.wrappedValue = RMSNorm(dimensions: headDim, eps: args.rmsNormEps)
+        _kNorm.wrappedValue = RMSNorm(dimensions: headDim, eps: args.rmsNormEps)
+
+        // A `rope_scaling` entry of type "linear" sets the RoPE scale to `1 / factor`.
+        // Any other type, or a factor that cannot be read as a number, keeps scale 1.
+        var ropeScale: Float = 1
+        if let ropeScaling = args.ropeScaling,
+            let typeValue = ropeScaling["type"],
+            case .string(let type) = typeValue, type == "linear",
+            let factorValue = ropeScaling["factor"]
+        {
+            let factor: Float?
+            switch factorValue {
+            case .float(let v):
+                factor = v
+            case .int(let v):
+                // Integral factors must be honoured too; previously they were dropped.
+                // NOTE(review): assumes `StringOrNumber` exposes an `.int` case that is
+                // produced for integral JSON values — confirm against MLXLMCommon.
+                factor = Float(v)
+            case .string(let s):
+                // Non-numeric strings yield nil and leave the scale untouched.
+                factor = Float(s)
+            default:
+                factor = nil
+            }
+            if let factor {
+                ropeScale = 1 / factor
+            }
+        }
+
+        self.rope = RoPE(
+            dimensions: headDim, traditional: false, base: args.ropeTheta,
+            scale: ropeScale)
+    }
+
+    /// Runs attention over `x`.
+    ///
+    /// - Parameters:
+    ///   - x: Input whose first two dimensions are read as batch size and sequence length.
+    ///   - mask: Optional mask forwarded to `MLXFast.scaledDotProductAttention`.
+    ///   - cache: Optional key/value cache. When present, its `offset` is used as the
+    ///     RoPE offset and the cache is updated with the new keys and values.
+    /// - Returns: The attention output after the `o_proj` projection.
+    public func callAsFunction(
+        _ x: MLXArray, mask: MLXArray? = nil, cache: KVCache?
+    ) -> MLXArray {
+        let (B, L) = (x.dim(0), x.dim(1))
+
+        var queries = wq(x)
+        var keys = wk(x)
+        var values = wv(x)
+
+        // Split the projections into heads, normalize queries/keys per head, and move
+        // the head axis ahead of the sequence axis.
+        queries = qNorm(queries.reshaped(B, L, args.attentionHeads, -1)).transposed(0, 2, 1, 3)
+        keys = kNorm(keys.reshaped(B, L, args.kvHeads, -1)).transposed(0, 2, 1, 3)
+        values = values.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)
+
+        if let cache {
+            // Both rotations use the offset from before the cache update.
+            queries = rope(queries, offset: cache.offset)
+            keys = rope(keys, offset: cache.offset)
+            (keys, values) = cache.update(keys: keys, values: values)
+        } else {
+            queries = rope(queries)
+            keys = rope(keys)
+        }
+
+        // Undo the head/sequence swap and merge the heads back into one feature axis.
+        let output = MLXFast.scaledDotProductAttention(
+            queries: queries, keys: keys, values: values, scale: scale, mask: mask
+        )
+        .transposed(0, 2, 1, 3)
+        .reshaped(B, L, -1)
+
+        return wo(output)
+    }
+}
+
+/// Gated feed-forward block computing `down(silu(gate(x)) * up(x))` with
+/// three bias-free linear projections.
+private class MLP: Module, UnaryLayer {
+    @ModuleInfo(key: "gate_proj") var gate: Linear
+    @ModuleInfo(key: "down_proj") var down: Linear
+    @ModuleInfo(key: "up_proj") var up: Linear
+
+    /// - Parameters:
+    ///   - dimensions: Input and output width of the block.
+    ///   - hiddenDimensions: Width of the intermediate gate/up projections.
+    public init(dimensions: Int, hiddenDimensions: Int) {
+        let inner = hiddenDimensions
+        _gate.wrappedValue = Linear(dimensions, inner, bias: false)
+        _down.wrappedValue = Linear(inner, dimensions, bias: false)
+        _up.wrappedValue = Linear(dimensions, inner, bias: false)
+    }
+
+    /// Applies the gated projection to `x` and projects the result back down.
+    public func callAsFunction(_ x: MLXArray) -> MLXArray {
+        let activatedGate = silu(gate(x))
+        let expanded = up(x)
+        return down(activatedGate * expanded)
+    }
+}
+
+/// One Qwen3 decoder layer: normalized self-attention followed by a normalized MLP,
+/// each wrapped in a residual connection.
+private class TransformerBlock: Module {
+    @ModuleInfo(key: "self_attn") var attention: Attention
+    let mlp: MLP
+
+    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: RMSNorm
+    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: RMSNorm
+
+    /// Creates the attention, MLP and both RMS norms from `args`.
+    public init(_ args: Qwen3Configuration) {
+        let width = args.hiddenSize
+        let epsilon = args.rmsNormEps
+
+        _attention.wrappedValue = Attention(args)
+        self.mlp = MLP(dimensions: width, hiddenDimensions: args.intermediateSize)
+        _inputLayerNorm.wrappedValue = RMSNorm(dimensions: width, eps: epsilon)
+        _postAttentionLayerNorm.wrappedValue = RMSNorm(dimensions: width, eps: epsilon)
+    }
+
+    /// Applies the layer to `x`.
+    ///
+    /// - Parameters:
+    ///   - x: Layer input.
+    ///   - mask: Optional attention mask passed through to the attention module.
+    ///   - cache: Optional key/value cache passed through to the attention module.
+    /// - Returns: The input plus the attention residual plus the MLP residual.
+    public func callAsFunction(
+        _ x: MLXArray, mask: MLXArray? = nil, cache: KVCache?
+    ) -> MLXArray {
+        // First residual: attention over the normalized input.
+        let afterAttention = x + attention(inputLayerNorm(x), mask: mask, cache: cache)
+        // Second residual: MLP over the normalized attention output.
+        return afterAttention + mlp(postAttentionLayerNorm(afterAttention))
+    }
+}
+
+/// Token embedding, the stack of transformer blocks, and the final RMS norm.
+private class Qwen3ModelInner: Module {
+    @ModuleInfo(key: "embed_tokens") var embedTokens: Embedding
+
+    fileprivate let layers: [TransformerBlock]
+    let norm: RMSNorm
+
+    /// Builds `args.hiddenLayers` transformer blocks around an embedding table of
+    /// `args.vocabularySize` entries. Traps if the vocabulary size is not positive.
+    public init(_ args: Qwen3Configuration) {
+        precondition(args.vocabularySize > 0)
+
+        _embedTokens.wrappedValue = Embedding(
+            embeddingCount: args.vocabularySize, dimensions: args.hiddenSize)
+
+        var blocks = [TransformerBlock]()
+        for _ in 0 ..< args.hiddenLayers {
+            blocks.append(TransformerBlock(args))
+        }
+        self.layers = blocks
+        self.norm = RMSNorm(dimensions: args.hiddenSize, eps: args.rmsNormEps)
+    }
+
+    /// Embeds `inputs`, runs every layer in order and normalizes the result.
+    ///
+    /// - Parameters:
+    ///   - inputs: Token ids to embed.
+    ///   - cache: Optional per-layer caches; entry `i` is handed to layer `i`.
+    /// - Returns: The normalized hidden states from the last layer.
+    public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]? = nil) -> MLXArray {
+        var hidden = embedTokens(inputs)
+
+        // The mask is derived once from the embedded input and shared by all layers.
+        let mask: MLXArray? = createAttentionMask(h: hidden, cache: cache)
+
+        for index in layers.indices {
+            hidden = layers[index](hidden, mask: mask, cache: cache?[index])
+        }
+
+        return norm(hidden)
+    }
+}
+
+/// Qwen3 exposed as an `EmbeddingModel`: returns the final hidden states and no
+/// pooled output.
+public class Qwen3Model: Module, EmbeddingModel {
+    public let vocabularySize: Int
+    /// Number of key/value heads, repeated once per hidden layer.
+    public let kvHeads: [Int]
+
+    @ModuleInfo(key: "model") private var model: Qwen3ModelInner
+    let configuration: Qwen3Configuration
+
+    /// Creates the model described by `args`.
+    public init(_ args: Qwen3Configuration) {
+        self.configuration = args
+        self.vocabularySize = args.vocabularySize
+        self.kvHeads = Array(repeating: args.kvHeads, count: args.hiddenLayers)
+        self._model.wrappedValue = Qwen3ModelInner(args)
+    }
+
+    /// Runs the model on `inputIds` without a key/value cache.
+    ///
+    /// - Parameters:
+    ///   - inputIds: Token ids passed to the inner model.
+    ///   - positionIds: Accepted but not used by this implementation.
+    ///   - tokenTypeIds: Accepted but not used by this implementation.
+    ///   - attentionMask: Accepted but not used by this implementation.
+    /// - Returns: Output whose `hiddenStates` is the inner model's result and whose
+    ///   `pooledOutput` is `nil`.
+    public func callAsFunction(
+        _ inputIds: MLXArray, positionIds: MLXArray? = nil, tokenTypeIds: MLXArray? = nil,
+        attentionMask: MLXArray? = nil
+    )
+        -> EmbeddingModelOutput
+    {
+        let hiddenStates = model(inputIds, cache: nil)
+        return EmbeddingModelOutput(hiddenStates: hiddenStates, pooledOutput: nil)
+    }
+
+    /// Filters and renames checkpoint weights so their keys match this module tree.
+    ///
+    /// Keys containing `self_attn.rotary_emb.inv_freq` or `lm_head` are dropped;
+    /// every remaining key that lacks the `model.` prefix gets it prepended.
+    public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
+        let unusedMarkers = ["self_attn.rotary_emb.inv_freq", "lm_head"]
+        var result = [String: MLXArray]()
+
+        for (name, tensor) in weights {
+            let isUnused = unusedMarkers.contains { name.contains($0) }
+            guard !isUnused else { continue }
+
+            let qualifiedName = name.hasPrefix("model.") ? name : "model." + name
+            result[qualifiedName] = tensor
+        }
+
+        return result
+    }
+}
+
+/// Decodable description of a Qwen3 checkpoint, keyed by the snake_case names
+/// listed in `CodingKeys`.
+public struct Qwen3Configuration: Codable, Sendable {
+    var hiddenSize: Int
+    var hiddenLayers: Int
+    var intermediateSize: Int
+    var attentionHeads: Int
+    var rmsNormEps: Float
+    var vocabularySize: Int
+    var kvHeads: Int
+    var ropeTheta: Float = 1_000_000
+    var headDim: Int
+    var ropeScaling: [String: StringOrNumber]? = nil
+    var tieWordEmbeddings = false
+    var maxPositionEmbeddings: Int = 32768
+
+    enum CodingKeys: String, CodingKey {
+        case hiddenSize = "hidden_size"
+        case hiddenLayers = "num_hidden_layers"
+        case intermediateSize = "intermediate_size"
+        case attentionHeads = "num_attention_heads"
+        case rmsNormEps = "rms_norm_eps"
+        case vocabularySize = "vocab_size"
+        case kvHeads = "num_key_value_heads"
+        case ropeTheta = "rope_theta"
+        case headDim = "head_dim"
+        case ropeScaling = "rope_scaling"
+        case tieWordEmbeddings = "tie_word_embeddings"
+        case maxPositionEmbeddings = "max_position_embeddings"
+    }
+
+    /// Decodes the configuration.
+    ///
+    /// `rope_theta`, `rope_scaling`, `tie_word_embeddings` and
+    /// `max_position_embeddings` may be absent and then fall back to
+    /// `1_000_000`, `nil`, `false` and `32768` respectively. Every other key is
+    /// required and a missing one makes decoding throw.
+    public init(from decoder: Decoder) throws {
+        let values = try decoder.container(keyedBy: CodingKeys.self)
+
+        // Required keys, decoded in declaration order.
+        hiddenSize = try values.decode(Int.self, forKey: .hiddenSize)
+        hiddenLayers = try values.decode(Int.self, forKey: .hiddenLayers)
+        intermediateSize = try values.decode(Int.self, forKey: .intermediateSize)
+        attentionHeads = try values.decode(Int.self, forKey: .attentionHeads)
+        rmsNormEps = try values.decode(Float.self, forKey: .rmsNormEps)
+        vocabularySize = try values.decode(Int.self, forKey: .vocabularySize)
+        kvHeads = try values.decode(Int.self, forKey: .kvHeads)
+
+        let decodedTheta = try values.decodeIfPresent(Float.self, forKey: .ropeTheta)
+        ropeTheta = decodedTheta ?? 1_000_000
+
+        headDim = try values.decode(Int.self, forKey: .headDim)
+
+        // Optional keys with defaults.
+        ropeScaling = try values.decodeIfPresent(
+            [String: StringOrNumber].self, forKey: .ropeScaling)
+
+        let decodedTie = try values.decodeIfPresent(Bool.self, forKey: .tieWordEmbeddings)
+        tieWordEmbeddings = decodedTie ?? false
+
+        let decodedMaxPositions = try values.decodeIfPresent(
+            Int.self, forKey: .maxPositionEmbeddings)
+        maxPositionEmbeddings = decodedMaxPositions ?? 32768
+    }
+}
diff --git a/Package.resolved b/Package.resolved
diff --git a/Package.swift b/Package.swift
@@ -118,6 +118,7 @@ let package = Package(
                 .product(name: "MLXNN", package: "mlx-swift"),
                 .product(name: "Transformers", package: "swift-transformers"),
                 .product(name: "MLXLinalg", package: "mlx-swift"),
+                .target(name: "MLXLMCommon"),
             ],
             path: "Libraries/Embedders",
             exclude: [

Original file line number	Diff line number	Diff line change
`@@ -312,7 +312,7 @@ private class Encoder: Module {`
`312`	`312`
`313`	`313`	`func callAsFunction(_ inputs: MLXArray, attentionMask: MLXArray? = nil) -> MLXArray {`
`314`	`314`	`var outputs = inputs`
`315`		`- for (index, layer) in layers.enumerated() {`
	`315`	`+ for layer in layers {`
`316`	`316`	`outputs = layer(outputs, mask: attentionMask)`
`317`	`317`	`}`
`318`	`318`	`return outputs`