# Copyright © 2025 Apple Inc.

# Lille 130M: a decoder-only transformer with grouped-query attention, RoPE,
# RMSNorm, a SiLU-gated MLP, and the token embedding reused as the output
# projection.

from dataclasses import dataclass
from typing import Any, Optional

import mlx.core as mx
import mlx.nn as nn

from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention


@dataclass
class ModelArgs(BaseModelArgs):
    model_type: str
    block_size: int  # maximum context length (not used directly in this module)
    layer_norm_eps: float
    n_embd: int  # embedding / hidden size
    n_head: int  # number of query heads
    n_kv_heads: int  # number of key/value heads (grouped-query attention)
    n_layer: int
    rope_theta: float
    vocab_size: int
    tie_word_embeddings: bool = True


class Lille130mAttention(nn.Module):
    """Grouped-query attention with a fused QKV projection and pre-RMSNorm."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.n_head = args.n_head
        self.n_kv_heads = args.n_kv_heads
        self.head_dim = args.n_embd // args.n_head
        self.scale = self.head_dim**-0.5

        # Queries, keys, and values come from a single fused projection.
        self.qkv_proj = nn.Linear(
            args.n_embd, (args.n_head + 2 * args.n_kv_heads) * self.head_dim, bias=False
        )
        self.out_proj = nn.Linear(args.n_head * self.head_dim, args.n_embd, bias=False)

        self.norm = nn.RMSNorm(args.n_embd, eps=args.layer_norm_eps)

        self.rope = nn.RoPE(self.head_dim, traditional=True, base=args.rope_theta)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        B, L, D = x.shape

        # Pre-norm: this block normalizes its own input before the projection.
        qkv = self.qkv_proj(self.norm(x))

        q_size = self.n_head * self.head_dim
        kv_size = self.n_kv_heads * self.head_dim

        queries, keys, values = mx.split(qkv, [q_size, q_size + kv_size], axis=-1)

        queries = queries.reshape(B, L, self.n_head, -1).transpose(0, 2, 1, 3)
        keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
        values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)

        if cache is not None:
            # Rotate at positions offset by the cached prefix length, then
            # append the new keys/values to the cache.
            queries = self.rope(queries, offset=cache.offset)
            keys = self.rope(keys, offset=cache.offset)
            keys, values = cache.update_and_fetch(keys, values)
        else:
            queries = self.rope(queries)
            keys = self.rope(keys)

        output = scaled_dot_product_attention(
            queries, keys, values, cache=cache, scale=self.scale, mask=mask
        )

        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.out_proj(output)
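
# Illustration (assumed sizes, not the real Lille 130M config): with n_embd=512,
# n_head=8, and n_kv_heads=4, head_dim is 64, so qkv_proj emits
# (8 + 2 * 4) * 64 = 1024 features per token and mx.split cuts them at
# [512, 768] into queries (512), keys (256), and values (256) ahead of the
# grouped-query reshape above.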


class Lille130mMLP(nn.Module):
    """SiLU-gated (SwiGLU-style) MLP with pre-RMSNorm."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        # Roughly 8/3 * n_embd, rounded to a multiple of 256.
        hidden_dim = 256 * round(int(8 * args.n_embd / 3) / 256)

        self.norm = nn.RMSNorm(args.n_embd, eps=args.layer_norm_eps)
        self.gate_proj = nn.Linear(args.n_embd, hidden_dim, bias=False)
        self.up_proj = nn.Linear(args.n_embd, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, args.n_embd, bias=False)

    def __call__(self, x: mx.array) -> mx.array:
        h = self.norm(x)
        return self.down_proj(nn.silu(self.gate_proj(h)) * self.up_proj(h))


class Lille130Block(nn.Module):
    """Pre-norm residual block; the RMSNorms live inside the attention and MLP."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.attention = Lille130mAttention(args)
        self.feed_forward = Lille130mMLP(args)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        h = x + self.attention(x, mask, cache)
        out = h + self.feed_forward(h)
        return out


class Lille130(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.tok_embeddings = nn.Embedding(args.vocab_size, args.n_embd)
        self.layers = [Lille130Block(args=args) for _ in range(args.n_layer)]
        self.norm = nn.RMSNorm(args.n_embd, eps=args.layer_norm_eps)

    def __call__(
        self,
        inputs: mx.array,
        cache: Optional[Any] = None,
    ) -> mx.array:
        h = self.tok_embeddings(inputs)

        if cache is None:
            cache = [None] * len(self.layers)

        mask = create_attention_mask(h, cache[0])

        for layer, c in zip(self.layers, cache):
            h = layer(h, mask, cache=c)

        # Tied embeddings: reuse the token embedding matrix as the output head.
        return self.tok_embeddings.as_linear(self.norm(h))


class Model(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args
        self.model_type = args.model_type
        self.transformer = Lille130(args)

    def __call__(
        self,
        inputs: mx.array,
        cache: Optional[Any] = None,
    ) -> mx.array:
        return self.transformer(inputs, cache=cache)

    @property
    def layers(self):
        return self.transformer.layers

    def sanitize(self, weights):
        # Drop precomputed rotary-embedding buffers from converted checkpoints;
        # nn.RoPE has no weights of its own.
        return {k: v for k, v in weights.items() if "rotary_emb" not in k}
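

if __name__ == "__main__":
    # Hedged usage sketch (not part of the original file). The hyperparameters
    # below are illustrative assumptions, not the published Lille 130M
    # configuration. Because of the relative import at the top, run this as a
    # module from the package root, e.g. `python -m <package>.models.lille130m`
    # (the exact module path is assumed).
    args = ModelArgs(
        model_type="lille-130m",
        block_size=512,
        layer_norm_eps=1e-5,
        n_embd=512,
        n_head=8,
        n_kv_heads=4,
        n_layer=6,
        rope_theta=10000.0,
        vocab_size=32000,
    )
    model = Model(args)
    logits = model(mx.array([[1, 2, 3, 4]]))
    print(logits.shape)  # (1, 4, 32000): next-token logits for each input position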