
Commit 3a318ff

baodii authored and zufangzhu committed
add fp8_e5m2 support and fixing UT
Signed-off-by: baodii <[email protected]>
1 parent 1b96119 commit 3a318ff

4 files changed: +16 −6 lines changed

csrc/xpu/dispatch_utils.h
Lines changed: 3 additions & 1 deletion

```diff
@@ -22,10 +22,12 @@
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 
 #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \
-  AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
+  AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
+  AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
 
 #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \
   AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)
 
 // When using this dispatch macro, the type is 'fp8_t' not 'scalar_t'.
```
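
The two FP8 formats the dispatch macro now covers trade precision for range: Float8_e4m3fn keeps more mantissa bits, while Float8_e5m2 has a wider exponent and a much larger representable maximum. A quick PyTorch check (illustration only, not part of this commit) shows the difference:

```python
import torch

# The two FP8 formats now handled by VLLM_DISPATCH_CASE_FP8_TYPES:
# e4m3fn: 4 exponent bits, 3 mantissa bits (no infinities), max = 448.0
# e5m2:   5 exponent bits, 2 mantissa bits,                 max = 57344.0
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(f"{dtype}: max={info.max}, smallest normal={info.tiny}")
```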

csrc/xpu/quantization/fp8/fp8_quant.cpp
Lines changed: 1 addition & 0 deletions

```diff
@@ -5,6 +5,7 @@
 #include <sycl/sycl.hpp>
 
 #include "xpu/dispatch_utils.h"
+#include "xpu/ops.h"
 
 #include "fp8_quant.h"
 #include "utils.h"
```

tests/ops/fp8_quant_op.py
Lines changed: 7 additions & 1 deletion

```diff
@@ -7,6 +7,12 @@
 import torch.nn as nn
 import vllm.envs as envs
 
+import sys
+import os
+
+# Add parent directory to Python path
+# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
 import tests.register_ops as ops
 
 
@@ -45,7 +51,7 @@ def scaled_fp8_quant(
     assert (input.ndim == 2)
     shape: Union[tuple[int, int], torch.Size] = input.shape
     # out_dtype: torch.dtype = current_platform.fp8_dtype()
-    out_dtype: torch.dtype = torch.fp8_e5m2
+    out_dtype: torch.dtype = torch.float8_e5m2
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
    if output is None:
```
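
The fix in the second hunk replaces `torch.fp8_e5m2`, which is not a real attribute, with the actual PyTorch dtype `torch.float8_e5m2`. The rest of `scaled_fp8_quant` is not visible in the diff; below is a minimal sketch of how the padded shape and `out_dtype` typically feed into output allocation, assuming the standard pattern (the helper name and the allocation are illustrative, not the file's actual code):

```python
from typing import Optional

import torch

def scaled_fp8_quant_sketch(input: torch.Tensor,
                            num_token_padding: Optional[int] = None,
                            output: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Mirrors the visible lines: 2-D input, optional row padding, e5m2 output.
    assert input.ndim == 2
    shape = input.shape
    out_dtype = torch.float8_e5m2  # torch.fp8_e5m2 does not exist; this dtype does
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
    if output is None:
        # Hypothetical allocation; the actual kernel invocation is not shown in this diff.
        output = torch.empty(shape, dtype=out_dtype, device=input.device)
    return output
```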

tests/test_fp8_quant.py
Lines changed: 5 additions & 4 deletions

```diff
@@ -11,7 +11,7 @@
 def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
     return torch.as_tensor(x, dtype=torch.float32, device='xpu')
 
-def ref_dynamic_per_tensor_fp8_quant(x, fp8_dtype):
+def ref_dynamic_per_tensor_fp8_quant(x, fp8_dtype=torch.float8_e5m2):
 
     fp8_traits = torch.finfo(fp8_dtype)
     fp8_traits_max = fp8_traits.max
@@ -43,16 +43,17 @@ def seed_everything(seed):
 NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing
 SCALE_UBS = [True, False]
 SEEDS = [0]
-FP8_DTYPES = [torch.float8_e5m2, torch.float8_e4m3fn]
+FP8_DTYPES = [torch.float8_e5m2]
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("fp8_dtype", FP8_DTYPES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
 def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
-                                      dtype: torch.dtype,
+                                      fp8_dtype: torch.dtype, dtype: torch.dtype,
                                       seed: int) -> None:
     seed_everything(seed)
 
@@ -93,4 +94,4 @@ def test_fp8_quant_large(seed: int, fp8_dtype: torch.dtype) -> None:
     torch.testing.assert_close(ref_out, ops_out)
 
 if __name__ == "__main__":
-    test_dynamic_per_tensor_fp8_quant(1024, 1024, torch.float16, 0)
+    test_dynamic_per_tensor_fp8_quant(1024, 1024, torch.float8_e5m2, torch.float16, 0)
```
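
The test now parametrizes over `FP8_DTYPES` (currently only `torch.float8_e5m2`) and threads `fp8_dtype` through both the test signature and the `__main__` invocation. For reference, dynamic per-tensor FP8 quantization of the kind this test checks usually follows a scale-clamp-cast recipe; the sketch below is a generic illustration under that assumption, not the repository's `ref_dynamic_per_tensor_fp8_quant`, whose body is only partially visible here:

```python
import torch

def dynamic_per_tensor_fp8_quant_sketch(x: torch.Tensor,
                                        fp8_dtype: torch.dtype = torch.float8_e5m2):
    # One scale for the whole tensor, chosen so the largest magnitude
    # maps onto the FP8 type's maximum representable value.
    fp8_max = torch.finfo(fp8_dtype).max
    x_f32 = x.to(torch.float32)
    scale = x_f32.abs().max().clamp(min=1e-12) / fp8_max
    quantized = (x_f32 / scale).clamp(-fp8_max, fp8_max).to(fp8_dtype)
    return quantized, scale

# Rough usage, mirroring what the parametrized test exercises:
# x = torch.randn(83, 1024, dtype=torch.float16, device="xpu")
# q, s = dynamic_per_tensor_fp8_quant_sketch(x, torch.float8_e5m2)
# dequantized = q.to(torch.float32) * s  # compare against the kernel's output
```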
