Commit e8e7f67

test: Add FP8 model tests and tiny model generator
- Add fp8_aware_dense layer unit tests
- Add FP8 Qwen3 model loading test using roulis/tiny-fp8-qwen3
- Include Python script to generate tiny FP8 test models
1 parent cb36413 commit e8e7f67

File tree

3 files changed: +326 -0 lines changed

test/bumblebee/layers_test.exs

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
defmodule Bumblebee.LayersTest do
  use ExUnit.Case, async: true

  import Bumblebee.TestHelpers

  describe "fp8_aware_dense/3" do
    test "dequantizes FP8 kernel with scale_inv" do
      # Create a simple model with fp8_aware_dense
      model =
        Axon.input("input", shape: {nil, 4})
        |> Bumblebee.Layers.fp8_aware_dense(8, name: "dense", block_size: 2)

      # Create params with known values
      # kernel: [4, 8] - input_features x output_features
      # scale_inv: [2, 4] - ceil(4/2) x ceil(8/2) blocks
      kernel = Nx.tensor([
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8]
      ], type: {:f, 32})

      # Scale of 2.0 for all blocks means output should be 2x what it would be without scaling
      scale_inv = Nx.tensor([
        [2.0, 2.0, 2.0, 2.0],
        [2.0, 2.0, 2.0, 2.0]
      ], type: {:f, 32})

      params = %{
        "dense" => %{
          "kernel" => kernel,
          "scale_inv" => scale_inv
        }
      }

      input = Nx.tensor([[1.0, 1.0, 1.0, 1.0]])

      output = Axon.predict(model, params, %{"input" => input})

      # Without scaling: input [1,1,1,1] dot kernel gives [4, 8, 12, 16, 20, 24, 28, 32]
      # With scale_inv of 2.0: [8, 16, 24, 32, 40, 48, 56, 64]
      expected = Nx.tensor([[8.0, 16.0, 24.0, 32.0, 40.0, 48.0, 56.0, 64.0]])

      assert_all_close(output, expected)
    end

    test "dequantizes with identity scale (1.0)" do
      model =
        Axon.input("input", shape: {nil, 4})
        |> Bumblebee.Layers.fp8_aware_dense(4, name: "dense", block_size: 2)

      kernel = Nx.tensor([
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1]
      ], type: {:f, 32})

      # Identity scale
      scale_inv = Nx.tensor([
        [1.0, 1.0],
        [1.0, 1.0]
      ], type: {:f, 32})

      params = %{
        "dense" => %{
          "kernel" => kernel,
          "scale_inv" => scale_inv
        }
      }

      input = Nx.tensor([[2.0, 3.0, 4.0, 5.0]])
      output = Axon.predict(model, params, %{"input" => input})

      # Identity matrix with scale 1.0 should return input unchanged
      assert_all_close(output, input)
    end

    test "handles non-block-aligned dimensions" do
      # 3 input features, 5 output features with block_size 2
      # This tests the slicing logic for non-aligned dimensions
      model =
        Axon.input("input", shape: {nil, 3})
        |> Bumblebee.Layers.fp8_aware_dense(5, name: "dense", block_size: 2)

      # kernel: [3, 5]
      kernel = Nx.broadcast(1.0, {3, 5})

      # scale_inv: [ceil(3/2), ceil(5/2)] = [2, 3]
      scale_inv = Nx.broadcast(1.0, {2, 3})

      params = %{
        "dense" => %{
          "kernel" => kernel,
          "scale_inv" => scale_inv
        }
      }

      input = Nx.tensor([[1.0, 1.0, 1.0]])
      output = Axon.predict(model, params, %{"input" => input})

      # Sum of 3 ones = 3.0 for each output
      expected = Nx.tensor([[3.0, 3.0, 3.0, 3.0, 3.0]])

      assert_all_close(output, expected)
    end

    test "includes bias when use_bias is true" do
      model =
        Axon.input("input", shape: {nil, 2})
        |> Bumblebee.Layers.fp8_aware_dense(2, name: "dense", block_size: 2, use_bias: true)

      kernel = Nx.tensor([
        [1, 0],
        [0, 1]
      ], type: {:f, 32})

      scale_inv = Nx.tensor([[1.0]], type: {:f, 32})
      bias = Nx.tensor([10.0, 20.0], type: {:f, 32})

      params = %{
        "dense" => %{
          "kernel" => kernel,
          "scale_inv" => scale_inv,
          "bias" => bias
        }
      }

      input = Nx.tensor([[1.0, 2.0]])
      output = Axon.predict(model, params, %{"input" => input})

      # [1, 2] with identity kernel = [1, 2], plus bias [10, 20] = [11, 22]
      expected = Nx.tensor([[11.0, 22.0]])

      assert_all_close(output, expected)
    end
  end
end
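
The block-wise dequantization these tests exercise can be sketched with plain Nx operations. This is a minimal sketch, not the fp8_aware_dense implementation: it assumes block-aligned dimensions and a kernel already cast to f32, and simply expands each scale_inv entry over its block_size x block_size tile before multiplying.

# kernel {4, 8} of ones, scale_inv {2, 4} of 2.0, block_size 2
kernel = Nx.broadcast(1.0, {4, 8})
scale_inv = Nx.broadcast(2.0, {2, 4})
block_size = 2

# Expand each scale over its block, then scale the kernel element-wise
scales =
  scale_inv
  |> Nx.new_axis(1)
  |> Nx.new_axis(3)
  |> Nx.tile([1, block_size, 1, block_size])
  |> Nx.reshape({4, 8})

dequantized = Nx.multiply(kernel, scales)

# With input [1.0, 1.0, 1.0, 1.0], every output column sums four ones scaled
# by 2.0, i.e. 8.0, matching the doubling the first test above asserts
Nx.dot(Nx.tensor([[1.0, 1.0, 1.0, 1.0]]), dequantized)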

test/bumblebee/text/qwen3_test.exs

Lines changed: 28 additions & 0 deletions
@@ -75,4 +75,32 @@ defmodule Bumblebee.Text.Qwen3Test do
      Nx.tensor([[-0.1487, -0.0071]])
    )
  end

  test ":for_causal_language_modeling with FP8 weights" do
    assert {:ok, %{model: model, params: %Axon.ModelState{data: params_data} = params, spec: spec}} =
             Bumblebee.load_model(
               {:hf, "roulis/tiny-fp8-qwen3"},
               preserve_source_types: true
             )

    assert %Bumblebee.Text.Qwen3{architecture: :for_causal_language_modeling} = spec

    # Verify FP8 weights are preserved
    q_proj_kernel = params_data["decoder.blocks.0.self_attention.query"]["kernel"]
    assert Nx.type(q_proj_kernel) == {:f8_e4m3fn, 8}

    # Verify scale_inv is loaded
    q_proj_scale = params_data["decoder.blocks.0.self_attention.query"]["scale_inv"]
    assert Nx.type(q_proj_scale) == {:f, 32}

    inputs = %{
      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
    }

    # Model should run without error (dequantization happens internally)
    outputs = Axon.predict(model, params, inputs)

    assert Nx.shape(outputs.logits) == {1, 10, 1024}
  end
end
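
For reference, a quick shape check on the fixture this test loads (a sketch derived from the generator config further down, assuming the checkpoint's 128x128 block layout is kept as-is): with hidden_size 32 and q_size = 4 heads * head_dim 8 = 32, every scale_inv in the tiny model collapses to a single block.

block_size = 128
{out_features, in_features} = {32, 32}
{ceil(out_features / block_size), ceil(in_features / block_size)}
# => {1, 1} - each FP8 kernel in the tiny model carries a single scale value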

generate_fp8_qwen3.py

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
"""
Generate a tiny FP8 Qwen3 model for testing Bumblebee's FP8 support.

This creates a minimal model with:
- FP8 E4M3FN weights for linear layers
- Corresponding weight_scale_inv tensors (128x128 block scaling)
- Saved in safetensors format

Usage:
    python generate_fp8_qwen3.py
    # Then upload to HuggingFace: huggingface-cli upload bumblebee-testing/tiny-random-Qwen3ForCausalLM-FP8 ./tiny-fp8-qwen3
"""

import torch
import json
import os
from safetensors.torch import save_file

# Tiny model config matching existing tiny-random-Qwen3ForCausalLM
CONFIG = {
    "architectures": ["Qwen3ForCausalLM"],
    "hidden_size": 32,
    "intermediate_size": 64,
    "num_attention_heads": 4,
    "num_hidden_layers": 2,
    "num_key_value_heads": 2,
    "vocab_size": 1024,
    "head_dim": 8,  # hidden_size / num_attention_heads
    "rms_norm_eps": 1e-6,
    "rope_theta": 1000000.0,
    "max_position_embeddings": 512,
    "torch_dtype": "float8_e4m3fn",
    "model_type": "qwen3",
    "use_qk_norm": True,
    "tie_word_embeddings": True,
    "quantization_config": {
        "quant_method": "fp8",
        "weight_block_size": [128, 128]
    }
}

BLOCK_SIZE = 128


def create_fp8_weight(shape, seed=42):
    """Create a random FP8 E4M3FN weight tensor."""
    torch.manual_seed(seed)
    # Create random values in valid FP8 E4M3FN range (-448 to 448)
    weight_f32 = torch.randn(shape) * 0.1
    weight_fp8 = weight_f32.to(torch.float8_e4m3fn)
    return weight_fp8


def create_scale_inv(weight_shape):
    """Create scale_inv tensor for block-wise dequantization.

    Shape: [ceil(out_features/128), ceil(in_features/128)]
    For testing, use scale of 1.0 (identity) so dequantized = original.
    """
    out_features, in_features = weight_shape
    out_blocks = (out_features + BLOCK_SIZE - 1) // BLOCK_SIZE
    in_blocks = (in_features + BLOCK_SIZE - 1) // BLOCK_SIZE
    # Use 1.0 for identity scaling (easier to verify in tests)
    return torch.ones(out_blocks, in_blocks, dtype=torch.float32)


def generate_model():
    hidden_size = CONFIG["hidden_size"]
    intermediate_size = CONFIG["intermediate_size"]
    num_heads = CONFIG["num_attention_heads"]
    num_kv_heads = CONFIG["num_key_value_heads"]
    head_dim = CONFIG["head_dim"]
    vocab_size = CONFIG["vocab_size"]
    num_layers = CONFIG["num_hidden_layers"]

    tensors = {}
    seed = 0

    # Embedding (not quantized)
    tensors["model.embed_tokens.weight"] = torch.randn(vocab_size, hidden_size)

    for layer_idx in range(num_layers):
        prefix = f"model.layers.{layer_idx}"

        # Self-attention projections (FP8 quantized)
        q_size = num_heads * head_dim
        kv_size = num_kv_heads * head_dim

        # Q projection
        tensors[f"{prefix}.self_attn.q_proj.weight"] = create_fp8_weight((q_size, hidden_size), seed)
        seed += 1
        tensors[f"{prefix}.self_attn.q_proj.weight_scale_inv"] = create_scale_inv((q_size, hidden_size))

        # K projection
        tensors[f"{prefix}.self_attn.k_proj.weight"] = create_fp8_weight((kv_size, hidden_size), seed)
        seed += 1
        tensors[f"{prefix}.self_attn.k_proj.weight_scale_inv"] = create_scale_inv((kv_size, hidden_size))

        # V projection
        tensors[f"{prefix}.self_attn.v_proj.weight"] = create_fp8_weight((kv_size, hidden_size), seed)
        seed += 1
        tensors[f"{prefix}.self_attn.v_proj.weight_scale_inv"] = create_scale_inv((kv_size, hidden_size))

        # O projection
        tensors[f"{prefix}.self_attn.o_proj.weight"] = create_fp8_weight((hidden_size, q_size), seed)
        seed += 1
        tensors[f"{prefix}.self_attn.o_proj.weight_scale_inv"] = create_scale_inv((hidden_size, q_size))

        # QK norms (not quantized)
        tensors[f"{prefix}.self_attn.q_norm.weight"] = torch.ones(head_dim)
        tensors[f"{prefix}.self_attn.k_norm.weight"] = torch.ones(head_dim)

        # MLP (FP8 quantized)
        tensors[f"{prefix}.mlp.gate_proj.weight"] = create_fp8_weight((intermediate_size, hidden_size), seed)
        seed += 1
        tensors[f"{prefix}.mlp.gate_proj.weight_scale_inv"] = create_scale_inv((intermediate_size, hidden_size))

        tensors[f"{prefix}.mlp.up_proj.weight"] = create_fp8_weight((intermediate_size, hidden_size), seed)
        seed += 1
        tensors[f"{prefix}.mlp.up_proj.weight_scale_inv"] = create_scale_inv((intermediate_size, hidden_size))

        tensors[f"{prefix}.mlp.down_proj.weight"] = create_fp8_weight((hidden_size, intermediate_size), seed)
        seed += 1
        tensors[f"{prefix}.mlp.down_proj.weight_scale_inv"] = create_scale_inv((hidden_size, intermediate_size))

        # Layer norms (not quantized)
        tensors[f"{prefix}.input_layernorm.weight"] = torch.ones(hidden_size)
        tensors[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(hidden_size)

    # Final norm (not quantized)
    tensors["model.norm.weight"] = torch.ones(hidden_size)

    # LM head is tied to the embeddings (tie_word_embeddings: True), so no
    # separate lm_head tensor is saved and there is nothing to quantize here

    return tensors


def main():
    output_dir = "tiny-fp8-qwen3"
    os.makedirs(output_dir, exist_ok=True)

    # Generate model tensors
    tensors = generate_model()

    # Save as safetensors
    save_file(tensors, os.path.join(output_dir, "model.safetensors"))

    # Save config
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(CONFIG, f, indent=2)

    print(f"Model saved to {output_dir}/")
    print(f"Total tensors: {len(tensors)}")
    print("\nTo upload to HuggingFace:")
    print(f"  huggingface-cli upload bumblebee-testing/tiny-random-Qwen3ForCausalLM-FP8 {output_dir}")


if __name__ == "__main__":
    main()
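
Before uploading, the generated folder can be smoke-tested from an IEx session (a sketch: the local-directory repository form and the preserve_source_types option mirror the Qwen3 test above, and the path assumes the script's default output_dir):

{:ok, %{model: _model, params: _params, spec: spec}} =
  Bumblebee.load_model({:local, "tiny-fp8-qwen3"}, preserve_source_types: true)

spec.architecture
# => :for_causal_language_modeling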
