
Commit 684e256

fix: Add prediction heads and tests for classification architectures
- Add prediction head (dense, activation, norm) to sequence classification
- Add prediction head to token classification
- Add attention_mask to input_template for sequence classification
- Add tests for sequence_classification and token_classification
1 parent 3db832e commit 684e256

2 files changed, +68 -4 lines changed
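
Context for the input_template change: the template is the map of dummy inputs Bumblebee traces when building and initializing a model, and the sequence classification graph mean-pools over the mask, so attention_mask has to be declared up front. A minimal inspection sketch, assuming the tiny-random checkpoint used in the tests below (the behaviour callback is a plain public function, so the new clause can be called directly; inspect output simplified):

{:ok, spec} =
  Bumblebee.load_spec(
    {:hf, "hf-internal-testing/tiny-random-ModernBertForSequenceClassification"}
  )

# With this commit the sequence classification template declares both inputs.
Bumblebee.Text.ModernBert.input_template(spec)
#=> %{"attention_mask" => u32 template {1, 1}, "input_ids" => u32 template {1, 1}}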

lib/bumblebee/text/modernbert.ex

Lines changed: 20 additions & 0 deletions
@@ -159,6 +159,13 @@ defmodule Bumblebee.Text.ModernBert do
   end

   @impl true
+  def input_template(%{architecture: :for_sequence_classification}) do
+    %{
+      "input_ids" => Nx.template({1, 1}, :u32),
+      "attention_mask" => Nx.template({1, 1}, :u32)
+    }
+  end
+
   def input_template(_spec) do
     %{"input_ids" => Nx.template({1, 1}, :u32)}
   end
@@ -193,6 +200,7 @@ defmodule Bumblebee.Text.ModernBert do
       outputs.hidden_state
       |> mean_pooling(inputs["attention_mask"])
       |> Axon.dense(spec.hidden_size,
+        use_bias: false,
         kernel_initializer: kernel_initializer(spec),
         name: "sequence_classification_head.dense"
       )
@@ -223,6 +231,16 @@ defmodule Bumblebee.Text.ModernBert do

     logits =
       outputs.hidden_state
+      |> Axon.dense(spec.hidden_size,
+        use_bias: false,
+        kernel_initializer: kernel_initializer(spec),
+        name: "token_classification_head.dense"
+      )
+      |> Layers.activation(spec.activation)
+      |> layer_norm(
+        epsilon: spec.layer_norm_epsilon,
+        name: "token_classification_head.norm"
+      )
       |> Axon.dropout(
         rate: classifier_dropout_rate(spec),
         name: "token_classification_head.dropout"
@@ -564,6 +582,8 @@ defmodule Bumblebee.Text.ModernBert do
       "sequence_classification_head.dense" => "head.dense",
      "sequence_classification_head.norm" => "head.norm",
      "sequence_classification_head.output" => "classifier",
+      "token_classification_head.dense" => "head.dense",
+      "token_classification_head.norm" => "head.norm",
      "token_classification_head.output" => "classifier"
    }
  end
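
One step this diff leans on but does not show is mean_pooling/2, which the sequence classification head pipes through before the new dense layer. Below is a tensor-level sketch of what masked mean pooling computes; it is an assumed equivalent for illustration, not the module's private helper itself:

defmodule PoolingSketch do
  import Nx.Defn

  # Average token embeddings, counting only positions where the attention
  # mask is 1, so padding does not dilute the pooled vector.
  defn masked_mean_pooling(hidden_state, attention_mask) do
    # {batch, seq} -> {batch, seq, 1}, cast to the hidden state's float type
    mask = attention_mask |> Nx.new_axis(-1) |> Nx.as_type(Nx.type(hidden_state))

    Nx.sum(hidden_state * mask, axes: [1]) / Nx.sum(mask, axes: [1])
  end
end

On the test inputs below, this reduces a {1, 10, hidden_size} hidden state to {1, hidden_size} using only the eight unmasked positions.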

test/bumblebee/text/modernbert_test.exs

Lines changed: 48 additions & 4 deletions
@@ -5,10 +5,6 @@ defmodule Bumblebee.Text.ModernBertTest do

   @moduletag model_test_tags()

-  # Note: sequence_classification and token_classification tests are skipped
-  # because the tiny-random test models have incompatible head structures.
-  # The architectures work correctly with production models.
-
   test ":base" do
     assert {:ok, %{model: model, params: params, spec: spec}} =
              Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-ModernBertModel"})
@@ -54,4 +50,52 @@ defmodule Bumblebee.Text.ModernBertTest do
       ])
     )
   end
+
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-ModernBertForSequenceClassification"}
+             )
+
+    assert %Bumblebee.Text.ModernBert{architecture: :for_sequence_classification} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 2}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[1.2857, 2.1079]])
+    )
+  end
+
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-ModernBertForTokenClassification"}
+             )
+
+    assert %Bumblebee.Text.ModernBert{architecture: :for_token_classification} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, ..]],
+      Nx.tensor([
+        [[5.0522, -0.8999], [-3.2701, 1.8927], [-0.7372, 5.4871]]
+      ])
+    )
+  end
 end
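
With the heads and tests in place, the standard serving entry points should compose with these architectures. A hedged end-to-end sketch; the repo name is hypothetical, and it assumes a ModernBERT sequence classification checkpoint that also ships tokenizer files (the tiny-random test repos may not):

# Hypothetical checkpoint; substitute a real ModernBERT classifier.
repo = {:hf, "some-org/modernbert-sequence-classifier"}

{:ok, model_info} = Bumblebee.load_model(repo)
{:ok, tokenizer} = Bumblebee.load_tokenizer(repo)

serving = Bumblebee.Text.text_classification(model_info, tokenizer)
Nx.Serving.run(serving, "Bumblebee is great!")
#=> %{predictions: [%{label: ..., score: ...}, ...]}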
