Commit a4bc800

feat: add phi-4 to recommended models
Phi-4 uses the same phi3 architecture and works out of the box. It is larger (14B params) and therefore slower (~3 tok/s vs ~31 tok/s for Phi-3), but it produces better-quality responses.
1 parent 866d641 · commit a4bc800

3 files changed: 45 additions, 43 deletions

packages/node-mlx/src/index.ts

Lines changed: 4 additions & 2 deletions
@@ -161,8 +161,10 @@ export const RECOMMENDED_MODELS = {
   "qwen-2.5-1.5b": "Qwen/Qwen2.5-1.5B-Instruct",
   "qwen-2.5-3b": "Qwen/Qwen2.5-3B-Instruct",

-  // Phi 3 (Microsoft) - Working with fused QKV and RoPE
-  phi: "microsoft/Phi-3-mini-4k-instruct",
+  // Phi (Microsoft) - Working with fused QKV and RoPE
+  phi: "microsoft/phi-4", // Default to latest
+  phi4: "microsoft/phi-4",
+  "phi-4": "microsoft/phi-4",
   phi3: "microsoft/Phi-3-mini-4k-instruct",
   "phi-3": "microsoft/Phi-3-mini-4k-instruct",
   "phi-3-mini": "microsoft/Phi-3-mini-4k-instruct",

packages/swift/Sources/NodeMLXCore/Models/Gemma3n.swift

Lines changed: 40 additions & 40 deletions
Every change in this file is whitespace-only: trailing whitespace is stripped from otherwise blank lines (40 deletions paired with 40 identical additions), with no functional change. The affected hunks:

@@ -30,25 +30,25 @@ public struct Gemma3nConfiguration: Decodable, Sendable {
@@ -73,10 +73,10 @@ public struct Gemma3nConfiguration: Decodable, Sendable {
@@ -147,28 +147,28 @@ public struct Gemma3nConfiguration: Decodable, Sendable {
@@ -178,20 +178,20 @@ class Gemma3nTextDecoderLayer: Module {
@@ -204,13 +204,13 @@ class Gemma3nTextDecoderLayer: Module {
@@ -221,33 +221,33 @@ class Gemma3nTextDecoderLayer: Module {
@@ -256,26 +256,26 @@ class Gemma3nTextModelInner: Module {
@@ -285,29 +285,29 @@ class Gemma3nTextModelInner: Module {
@@ -321,28 +321,28 @@ public class Gemma3nModel: Module, LLMModel {

packages/swift/Sources/NodeMLXCore/Models/Phi3.swift

Lines changed: 1 addition & 1 deletion
@@ -353,4 +353,4 @@ public class Phi3Model: Module, LLMModel {
         // Override in subclass if weight key mapping needed
         return weights
     }
-}
+}

(Both the removed and added lines are the file's closing brace; the change is whitespace-only, most likely adding a missing end-of-file newline.)

0 commit comments