
Commit bae534a

Add Ministral support to Mistral model
This extends the existing Mistral model to support Ministral variants by adding:

- `attention_head_size`: explicit head_dim support (Ministral uses 128)
- `use_interleaved_attention`: even layers use global attention, odd layers use sliding window attention
- `tie_word_embeddings`: share weights between embedding and lm_head

Also adds function-based `attention_window_size` support in transformer.ex to enable per-layer attention window configuration.

All changes are backward compatible with existing Mistral models.
1 parent 8365426 commit bae534a
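
The new options are ordinary spec options, so they can be toggled on an existing checkpoint by loading a spec and reconfiguring it, the same way the new test in this commit does. A minimal sketch, assuming the tiny test checkpoint used below; the option values are illustrative and `tie_word_embeddings: true` is shown only to cover the full set:

    {:ok, spec} = Bumblebee.load_spec({:hf, "hf-internal-testing/tiny-random-MistralModel"})

    # Illustrative values; a real Ministral checkpoint would carry its own
    # head_dim, sliding_window, and tie_word_embeddings in config.json.
    spec =
      Bumblebee.configure(spec,
        attention_window_size: 2,
        use_interleaved_attention: true,
        tie_word_embeddings: true
      )

    {:ok, %{model: model, params: params}} =
      Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-MistralModel"}, spec: spec)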

3 files changed: +109 -8 lines changed
lib/bumblebee/layers/transformer.ex

Lines changed: 16 additions & 1 deletion
@@ -25,6 +25,12 @@ defmodule Bumblebee.Layers.Transformer do
       - a keyword list (applied to all blocks)
       - a function that takes the block index and returns the configuration
 
+  * `:attention_window_size` - window size for sliding attention. Can be:
+      - a tuple `{left_size, right_size}` (applied to all blocks)
+      - a function that takes the block index and returns the configuration
+        (useful for interleaved attention patterns)
+      - `nil` for global attention
+
   * `:name` - the prefix for layer names
 
   For all other options (including required options) see `block/2`.
@@ -52,7 +58,6 @@ defmodule Bumblebee.Layers.Transformer do
       :output_use_bias,
       :layer_norm,
       :block_type,
-      :attention_window_size,
       :scale_attention_weights
     ]
 
@@ -64,6 +69,7 @@ defmodule Bumblebee.Layers.Transformer do
       :name,
      :num_blocks,
      :rotary_embedding,
+     :attention_window_size,
      attention_mask: Layers.none(),
      attention_head_mask: Layers.none(),
      attention_relative_bias: nil,
@@ -85,6 +91,7 @@ defmodule Bumblebee.Layers.Transformer do
     cross_attention_head_mask = opts[:cross_attention_head_mask]
     cache = opts[:cache]
     rotary_embedding = opts[:rotary_embedding]
+    attention_window_size = opts[:attention_window_size]
 
     block_opts = Keyword.take(opts, block_opts_keys)
 
@@ -121,6 +128,13 @@ defmodule Bumblebee.Layers.Transformer do
             config when is_list(config) -> config
           end
 
+        block_attention_window_size =
+          case attention_window_size do
+            nil -> nil
+            fun when is_function(fun, 1) -> fun.(idx)
+            config -> config
+          end
+
         {hidden_state, attention, cross_attention, block_cache, attention_relative_bias} =
           block(
             state.hidden_state,
@@ -134,6 +148,7 @@ defmodule Bumblebee.Layers.Transformer do
             block_cache: block_cache,
             offset: offset,
             rotary_embedding: block_rotary_embedding,
+            attention_window_size: block_attention_window_size,
             name: join(name, idx)
           ] ++ block_opts
         )
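
As documented above, `:attention_window_size` now also accepts a one-arity function of the block index, which is what makes interleaved patterns possible. A small sketch of how such a function resolves per block, assuming an example 4096-token window:

    # A per-block window function, as accepted by the new option. Even block
    # indices get global attention (nil), odd ones a {left, right} sliding window.
    window_fun = fn idx ->
      if rem(idx, 2) == 0, do: nil, else: {4096, 4096}
    end

    # Inside blocks/2 this is resolved once per block index, e.g.:
    Enum.map(0..3, window_fun)
    #=> [nil, {4096, 4096}, nil, {4096, 4096}]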

lib/bumblebee/text/mistral.ex

Lines changed: 59 additions & 7 deletions
@@ -47,6 +47,28 @@ defmodule Bumblebee.Text.Mistral do
         default: 4096,
         doc: "window size for both sides of the sliding attention window"
       ],
+      attention_head_size: [
+        default: nil,
+        doc: """
+        the projection size for key, value, and query states per attention head.
+        When `nil`, defaults to `hidden_size / num_attention_heads`. Ministral
+        models use an explicit head_dim (typically 128) that differs from this default
+        """
+      ],
+      use_interleaved_attention: [
+        default: false,
+        doc: """
+        whether to use interleaved attention pattern. When enabled, even layers
+        use global attention and odd layers use sliding window attention
+        """
+      ],
+      tie_word_embeddings: [
+        default: false,
+        doc: """
+        whether to tie the word embeddings with the language modeling head weights.
+        When true, the lm_head uses the same weights as the token embedding layer
+        """
+      ],
       activation: [
         default: :silu,
         doc: "the activation function"
@@ -165,7 +187,8 @@ defmodule Bumblebee.Text.Mistral do
     Layers.Decoder.init_cache(batch_size, max_length,
       hidden_size: spec.hidden_size,
       decoder_num_attention_heads: spec.num_attention_heads,
-      decoder_num_blocks: spec.num_blocks
+      decoder_num_blocks: spec.num_blocks,
+      attention_head_size: spec.attention_head_size
     )
   end
 
@@ -315,6 +338,32 @@ defmodule Bumblebee.Text.Mistral do
       ) do
    name = opts[:name]
 
+    # Build attention_window_size configuration
+    # When interleaved attention is enabled, even layers use global attention
+    # and odd layers use sliding window attention
+    attention_window_size =
+      cond do
+        # If no sliding window is configured, use global attention for all layers
+        spec.attention_window_size == nil ->
+          nil
+
+        # Interleaved attention: even layers use global, odd layers use sliding window
+        spec.use_interleaved_attention ->
+          fn layer_idx ->
+            if rem(layer_idx, 2) == 0 do
+              # Even layers: global attention (no window)
+              nil
+            else
+              # Odd layers: sliding window attention
+              {spec.attention_window_size, spec.attention_window_size}
+            end
+          end
+
+        # Non-interleaved: apply sliding window to all layers
+        true ->
+          {spec.attention_window_size, spec.attention_window_size}
+      end
+
     Layers.Transformer.blocks(hidden_state,
       attention_mask: attention_mask,
       attention_head_mask: attention_head_mask,
@@ -323,6 +372,7 @@ defmodule Bumblebee.Text.Mistral do
       num_attention_heads: spec.num_attention_heads,
       num_key_value_heads: spec.num_key_value_heads,
       hidden_size: spec.hidden_size,
+      attention_head_size: spec.attention_head_size,
       kernel_initializer: kernel_initializer(spec),
       layer_norm: &Layers.rms_norm(&1, name: &2, epsilon: spec.layer_norm_epsilon),
       ffn:
@@ -332,8 +382,7 @@ defmodule Bumblebee.Text.Mistral do
         ),
       block_type: :norm_first,
       causal: true,
-      attention_window_size:
-        spec.attention_window_size && {spec.attention_window_size, spec.attention_window_size},
+      attention_window_size: attention_window_size,
       rotary_embedding: [
         position_ids: position_ids,
         max_positions: spec.max_positions,
@@ -367,7 +416,6 @@ defmodule Bumblebee.Text.Mistral do
   defp language_modeling_head(hidden_state, spec, opts) do
     name = opts[:name]
 
-    # TODO: Tie lm-head to word embedding as a spec option
     Layers.dense_transposed(hidden_state, spec.vocab_size,
       kernel_initializer: kernel_initializer(spec),
       name: join(name, "output")
@@ -391,19 +439,22 @@ defmodule Bumblebee.Text.Mistral do
        num_attention_heads: {"num_attention_heads", number()},
        num_key_value_heads: {"num_key_value_heads", number()},
        attention_window_size: {"sliding_window", optional(number())},
+       attention_head_size: {"head_dim", optional(number())},
+       use_interleaved_attention: {"use_interleaved_attention", optional(boolean())},
        intermediate_size: {"intermediate_size", number()},
        activation: {"hidden_act", activation()},
        rotary_embedding_base: {"rope_theta", number()},
        initializer_scale: {"initializer_range", number()},
-       layer_norm_epsilon: {"rms_norm_eps", number()}
+       layer_norm_epsilon: {"rms_norm_eps", number()},
+       tie_word_embeddings: {"tie_word_embeddings", boolean()}
      ) ++ Shared.common_options_from_transformers(data, spec)
 
     @for.config(spec, opts)
   end
 end
 
 defimpl Bumblebee.HuggingFace.Transformers.Model do
-  def params_mapping(_spec) do
+  def params_mapping(spec) do
     %{
       "embedder.token_embedding" => "model.embed_tokens",
       "decoder.blocks.{n}.self_attention.query" => "model.layers.{n}.self_attn.q_proj",
@@ -416,7 +467,8 @@ defmodule Bumblebee.Text.Mistral do
       "decoder.blocks.{n}.ffn.output" => "model.layers.{n}.mlp.down_proj",
       "decoder.blocks.{n}.output_norm" => "model.layers.{n}.post_attention_layernorm",
       "output_norm" => "model.norm",
-      "language_modeling_head.output" => "lm_head",
+      "language_modeling_head.output" =>
+        if(spec.tie_word_embeddings, do: "model.embed_tokens", else: "lm_head"),
       "sequence_classification_head.output" => "score"
     }
   end
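
With `params_mapping/1` now receiving the spec, the source tensor for the LM head is chosen at parameter-loading time. A tiny illustration of how the conditional above resolves; the maps below are stand-ins for a real spec struct:

    resolve_lm_head = fn spec ->
      if(spec.tie_word_embeddings, do: "model.embed_tokens", else: "lm_head")
    end

    resolve_lm_head.(%{tie_word_embeddings: true})
    #=> "model.embed_tokens"
    resolve_lm_head.(%{tie_word_embeddings: false})
    #=> "lm_head"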

test/bumblebee/text/mistral_test.exs

Lines changed: 34 additions & 0 deletions
@@ -58,6 +58,40 @@ defmodule Bumblebee.Text.MistralTest do
            )
   end
 
+  test ":base with interleaved attention" do
+    assert {:ok, spec} =
+             Bumblebee.load_spec({:hf, "hf-internal-testing/tiny-random-MistralModel"})
+
+    # Enable interleaved attention: even layers use global, odd layers use sliding window
+    spec = Bumblebee.configure(spec, attention_window_size: 2, use_interleaved_attention: true)
+
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-MistralModel"},
+               spec: spec
+             )
+
+    assert %Bumblebee.Text.Mistral{architecture: :base, use_interleaved_attention: true} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
+
+    # With interleaved attention, even layers (0, 2, 4...) use global attention
+    # and odd layers (1, 3, 5...) use sliding window attention
+    # The output should be different from both pure global and pure sliding window
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.9450, -1.3945, 0.7331], [-2.1118, -1.3091, -0.7834], [-1.4057, -1.2495, 0.8730]]
+      ])
+    )
+  end
+
   test ":for_sequence_classification" do
     assert {:ok, %{model: model, params: params, spec: spec}} =
              Bumblebee.load_model(
