fix(gemma3n): support per-layer intermediate_size array (#46)

swernerx · web-flow · commit 5d89cc957a8a · 2026-01-08T13:18:23.000-08:00
* fix(gemma3n): support per-layer intermediate_size array

Gemma 3n models from HuggingFace specify intermediate_size as an array
(one value per layer) rather than a single integer. This causes a decoding
error when trying to load these models.

This commit introduces an IntOrArray type that can decode either format,
maintaining backwards compatibility with models that use a single value
while adding support for the per-layer array format.

Fixes loading of models like:
- mlx-community/gemma-3n-E2B-it-4bit
- mlx-community/gemma-3n-E4B-it-4bit

Tested with swift build - compiles successfully.

* fix(gemma3n): make query_pre_attn_scalar optional

Some HuggingFace Gemma 3n configs don't include this field, so it is now
decoded with decodeIfPresent and stored as an optional.

* fix(gemma3n): preserve all weights in sanitize function

The sanitize function kept only weights with the 'model.language_model.'
prefix and discarded all others, which caused missing-weight errors when
loading Gemma 3n models. Weights without that prefix are now kept under
their original keys.
diff --git a/Libraries/MLXLLM/Models/Gemma3nText.swift b/Libraries/MLXLLM/Models/Gemma3nText.swift
@@ -15,18 +15,59 @@ import MLXNN
 
 // MARK: - Configuration
 
+/// A value decoded from either a single `Int` or an array of `Int`s.
+/// Gemma 3n configs give `intermediate_size` as a per-layer array, while other
+/// configs use a single integer, which then applies to every layer.
+public struct IntOrArray: Codable {
+    public let values: [Int]
+
+    /// Decodes an `Int` or a non-empty `[Int]`; an empty array is rejected here so it cannot trap at lookup.
+    public init(from decoder: Decoder) throws {
+        let container = try decoder.singleValueContainer()
+        if let array = try? container.decode([Int].self) {
+            guard !array.isEmpty else {
+                throw DecodingError.dataCorruptedError(in: container, debugDescription: "Expected a non-empty [Int]")
+            }
+            self.values = array
+        } else if let single = try? container.decode(Int.self) {
+            self.values = [single]
+        } else {
+            throw DecodingError.typeMismatch(
+                IntOrArray.self,
+                DecodingError.Context(codingPath: decoder.codingPath, debugDescription: "Expected Int or [Int]"))
+        }
+    }
+
+    /// Encodes a lone value as a bare `Int`, anything else as an array.
+    public func encode(to encoder: Encoder) throws {
+        var container = encoder.singleValueContainer()
+        if values.count == 1 {
+            try container.encode(values[0])
+        } else {
+            try container.encode(values)
+        }
+    }
+
+    /// Returns the value for `layerIdx` (a lone value serves every layer); traps with a message if out of range.
+    public subscript(layerIdx: Int) -> Int {
+        if values.count == 1 { return values[0] }
+        precondition(values.indices.contains(layerIdx), "IntOrArray has no value for layer \(layerIdx)")
+        return values[layerIdx]
+    }
+}
+
 public struct Gemma3nTextConfiguration: Codable {
     let modelType: String
     let hiddenSize: Int
     let numHiddenLayers: Int
-    let intermediateSize: Int
+    let intermediateSize: IntOrArray
     let numAttentionHeads: Int
     let headDim: Int
     let rmsNormEps: Float
     let vocabSize: Int
     let numKeyValueHeads: Int
     let numKvSharedLayers: Int
-    let queryPreAttnScalar: Float
+    let queryPreAttnScalar: Float?  // Optional - not present in all HF configs
     let vocabSizePerLayerInput: Int
     let slidingWindow: Int
     let maxPositionEmbeddings: Int
@@ -92,14 +133,14 @@ public struct Gemma3nTextConfiguration: Codable {
         modelType = try container.decode(String.self, forKey: .modelType)
         hiddenSize = try container.decode(Int.self, forKey: .hiddenSize)
         numHiddenLayers = try container.decode(Int.self, forKey: .numHiddenLayers)
-        intermediateSize = try container.decode(Int.self, forKey: .intermediateSize)
+        intermediateSize = try container.decode(IntOrArray.self, forKey: .intermediateSize)
         numAttentionHeads = try container.decode(Int.self, forKey: .numAttentionHeads)
         headDim = try container.decode(Int.self, forKey: .headDim)
         rmsNormEps = try container.decode(Float.self, forKey: .rmsNormEps)
         vocabSize = try container.decode(Int.self, forKey: .vocabSize)
         numKeyValueHeads = try container.decode(Int.self, forKey: .numKeyValueHeads)
         numKvSharedLayers = try container.decode(Int.self, forKey: .numKvSharedLayers)
-        queryPreAttnScalar = try container.decode(Float.self, forKey: .queryPreAttnScalar)
+        queryPreAttnScalar = try container.decodeIfPresent(Float.self, forKey: .queryPreAttnScalar)
         vocabSizePerLayerInput = try container.decode(Int.self, forKey: .vocabSizePerLayerInput)
         slidingWindow = try container.decode(Int.self, forKey: .slidingWindow)
         maxPositionEmbeddings = try container.decode(Int.self, forKey: .maxPositionEmbeddings)
@@ -309,7 +350,7 @@ class Gemma3nMLP: Module {
     init(_ config: Gemma3nTextConfiguration, layerIdx: Int) {
         self.config = config
         self.hiddenSize = config.hiddenSize
-        self.intermediateSize = config.intermediateSize
+        self.intermediateSize = config.intermediateSize[layerIdx]
 
         if let activationSparsityPattern = config.activationSparsityPattern {
             self.activationSparsity = activationSparsityPattern[layerIdx]
@@ -963,9 +1004,13 @@ public class Gemma3nTextModel: Module, LLMModel {
 
         for (key, value) in weights {
             if key.hasPrefix("model.language_model.") {
+                // Remove "model." prefix for VLM-style weights
                 let newKey = key.replacingOccurrences(
                     of: "model.language_model.", with: "language_model.")
                 processedWeights[newKey] = value
+            } else {
+                // Keep other weights as-is
+                processedWeights[key] = value
             }
         }