Skip to content

Commit d6d5f62

Browse files
committed
test: Add FP8 model tests and tiny model generator
- Add fp8_aware_dense layer unit tests
- Add FP8 Qwen3 model loading test using roulis/tiny-fp8-qwen3
- Include Python script to generate tiny FP8 test models
1 parent cb36413 commit d6d5f62

File tree

2 files changed

+166
-0
lines changed

test/bumblebee/layers_test.exs

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
defmodule Bumblebee.LayersTest do
  use ExUnit.Case, async: true

  import Bumblebee.TestHelpers

  describe "fp8_aware_dense/3" do
    test "dequantizes FP8 kernel with scale_inv" do
      # Create a simple model with fp8_aware_dense.
      model =
        Axon.input("input", shape: {nil, 4})
        |> Bumblebee.Layers.fp8_aware_dense(8, name: "dense", block_size: 2)

      # Create params with known values:
      #   kernel:    [4, 8] - input_features x output_features
      #   scale_inv: [2, 4] - ceil(4/2) x ceil(8/2) blocks
      kernel =
        Nx.tensor(
          [
            [1, 2, 3, 4, 5, 6, 7, 8],
            [1, 2, 3, 4, 5, 6, 7, 8],
            [1, 2, 3, 4, 5, 6, 7, 8],
            [1, 2, 3, 4, 5, 6, 7, 8]
          ],
          type: {:f, 32}
        )

      # Scale of 2.0 for all blocks means output should be 2x what it
      # would be without scaling.
      scale_inv =
        Nx.tensor(
          [
            [2.0, 2.0, 2.0, 2.0],
            [2.0, 2.0, 2.0, 2.0]
          ],
          type: {:f, 32}
        )

      params = %{
        "dense" => %{
          "kernel" => kernel,
          "scale_inv" => scale_inv
        }
      }

      input = Nx.tensor([[1.0, 1.0, 1.0, 1.0]])

      output = Axon.predict(model, params, %{"input" => input})

      # Without scaling: input [1,1,1,1] dot kernel gives [4, 8, 12, 16, 20, 24, 28, 32].
      # With scale_inv of 2.0: [8, 16, 24, 32, 40, 48, 56, 64].
      expected = Nx.tensor([[8.0, 16.0, 24.0, 32.0, 40.0, 48.0, 56.0, 64.0]])

      assert_all_close(output, expected)
    end

    test "dequantizes with identity scale (1.0)" do
      model =
        Axon.input("input", shape: {nil, 4})
        |> Bumblebee.Layers.fp8_aware_dense(4, name: "dense", block_size: 2)

      # Identity kernel, so the dense layer passes the input through.
      kernel =
        Nx.tensor(
          [
            [1, 0, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 1]
          ],
          type: {:f, 32}
        )

      # Identity scale
      scale_inv =
        Nx.tensor(
          [
            [1.0, 1.0],
            [1.0, 1.0]
          ],
          type: {:f, 32}
        )

      params = %{
        "dense" => %{
          "kernel" => kernel,
          "scale_inv" => scale_inv
        }
      }

      input = Nx.tensor([[2.0, 3.0, 4.0, 5.0]])
      output = Axon.predict(model, params, %{"input" => input})

      # Identity matrix with scale 1.0 should return input unchanged.
      assert_all_close(output, input)
    end

    test "handles non-block-aligned dimensions" do
      # 3 input features, 5 output features with block_size 2.
      # This tests the slicing logic for non-aligned dimensions.
      model =
        Axon.input("input", shape: {nil, 3})
        |> Bumblebee.Layers.fp8_aware_dense(5, name: "dense", block_size: 2)

      # kernel: [3, 5]
      kernel = Nx.broadcast(1.0, {3, 5})

      # scale_inv: [ceil(3/2), ceil(5/2)] = [2, 3]
      scale_inv = Nx.broadcast(1.0, {2, 3})

      params = %{
        "dense" => %{
          "kernel" => kernel,
          "scale_inv" => scale_inv
        }
      }

      input = Nx.tensor([[1.0, 1.0, 1.0]])
      output = Axon.predict(model, params, %{"input" => input})

      # Sum of 3 ones = 3.0 for each output.
      expected = Nx.tensor([[3.0, 3.0, 3.0, 3.0, 3.0]])

      assert_all_close(output, expected)
    end

    test "includes bias when use_bias is true" do
      model =
        Axon.input("input", shape: {nil, 2})
        |> Bumblebee.Layers.fp8_aware_dense(2, name: "dense", block_size: 2, use_bias: true)

      kernel =
        Nx.tensor(
          [
            [1, 0],
            [0, 1]
          ],
          type: {:f, 32}
        )

      # Single 2x2 block, so scale_inv is a 1x1 identity scale.
      scale_inv = Nx.tensor([[1.0]], type: {:f, 32})
      bias = Nx.tensor([10.0, 20.0], type: {:f, 32})

      params = %{
        "dense" => %{
          "kernel" => kernel,
          "scale_inv" => scale_inv,
          "bias" => bias
        }
      }

      input = Nx.tensor([[1.0, 2.0]])
      output = Axon.predict(model, params, %{"input" => input})

      # [1, 2] with identity kernel = [1, 2], plus bias [10, 20] = [11, 22].
      expected = Nx.tensor([[11.0, 22.0]])

      assert_all_close(output, expected)
    end
  end
end

test/bumblebee/text/qwen3_test.exs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,32 @@ defmodule Bumblebee.Text.Qwen3Test do
7575
Nx.tensor([[-0.1487, -0.0071]])
7676
)
7777
end
78+
79+
test ":for_causal_language_modeling with FP8 weights" do
  assert {:ok, %{model: model, params: %Axon.ModelState{data: params_data} = params, spec: spec}} =
           Bumblebee.load_model(
             {:hf, "roulis/tiny-fp8-qwen3"},
             preserve_source_types: true
           )

  assert %Bumblebee.Text.Qwen3{architecture: :for_causal_language_modeling} = spec

  # Verify FP8 weights are preserved in their source type.
  q_proj_kernel = params_data["decoder.blocks.0.self_attention.query"]["kernel"]
  assert Nx.type(q_proj_kernel) == {:f8_e4m3fn, 8}

  # Verify scale_inv is loaded alongside the quantized kernel.
  q_proj_scale = params_data["decoder.blocks.0.self_attention.query"]["scale_inv"]
  assert Nx.type(q_proj_scale) == {:f, 32}

  inputs = %{
    "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
    "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
  }

  # Model should run without error (dequantization happens internally).
  outputs = Axon.predict(model, params, inputs)

  assert Nx.shape(outputs.logits) == {1, 10, 1024}
end
78106
end

0 commit comments

Comments (0)