|
9 | 9 | fused_silu_mul_fp8_per_tensor_static_quant, |
10 | 10 | ) |
11 | 11 |
|
12 | | -from aiter import ( |
13 | | - rmsnorm2d_fwd, |
14 | | - silu_and_mul, |
15 | | -) |
16 | | - |
17 | 12 | from aiter.test_common import ( |
18 | 13 | checkAllclose, |
19 | 14 | ) |
20 | | - |
21 | | -from aiter.ops.quant import per_tensor_quant_hip |
22 | 15 | import aiter |
23 | 16 | import torch.nn.functional as F |
24 | 17 |
|
@@ -198,17 +191,11 @@ def test_fused_rms_fp8_group_quant(M: int, N1: int, N2: int, dtype): |
198 | 191 | torch.testing.assert_close(y1_upcast_torch, y1_upcast_triton, atol=0.1, rtol=0.1) |
199 | 192 |
|
200 | 193 |
|
def rmsnorm_fp8_quantization_ref(x, w, x_scale, eps, rocm_fp8_dtype):
    """Reference for fused RMSNorm + static per-tensor FP8 quantization.

    Computes RMSNorm in float32, then quantizes the result to FP8 with the
    given static scale. Returns ``(quant_out, rms_out)`` where ``rms_out`` is
    the normalized tensor in ``x.dtype`` (the pre-quantization intermediate).
    """
    x_f32 = x.to(torch.float32)
    w_f32 = w.to(torch.float32)
    # Round-trip through x.dtype to mimic the fused kernel's intermediate
    # precision before the final fp32 quantization step.
    rms_out = rmsnorm(x_f32, w_f32, eps).to(x.dtype)
    quant_out = per_tensor_fp8_static_quant(
        rms_out.to(torch.float32), rocm_fp8_dtype, x_scale.to(torch.float32)
    )
    return quant_out, rms_out
213 | 200 |
|
214 | 201 |
|
@@ -248,14 +235,14 @@ def test_rmsnorm_quant_fuse(m, n): |
248 | 235 | ) |
249 | 236 |
|
250 | 237 | # calculate the correct scale value |
251 | | - rms_out = rmsnorm2d_fwd_(x, w, eps, n) |
| 238 | + rms_out = rmsnorm(x.to(torch.float32), w.to(torch.float32), eps) |
252 | 239 | rms_out_abs = torch.abs(rms_out) |
253 | 240 | rms_out_abs_max = torch.max(rms_out_abs) |
254 | 241 | scale_val = rms_out_abs_max / DTYPE_MAX |
255 | 242 | x_scale = torch.tensor((scale_val), dtype=torch.float32, device="cuda") |
256 | 243 |
|
257 | 244 | fp8_x_ref, rms_out_ref = rmsnorm_fp8_quantization_ref( |
258 | | - x, w, x_scale, eps, n, rocm_fp8_dtype |
| 245 | + x, w, x_scale, eps, rocm_fp8_dtype |
259 | 246 | ) |
260 | 247 | fp8_x, rms_out = triton_rmsnorm_fp8_quantization_fuse( |
261 | 248 | x, w, x_scale, eps, rocm_fp8_dtype |
def silu_mul_fp8_quantization_ref(x, x_scale, rocm_fp8_dtype):
    """Reference for fused SiLU-and-mul + static per-tensor FP8 quantization.

    ``x`` has shape ``(m, 2*n)``; the first half is passed through SiLU and
    multiplied elementwise by the second half, then the product is quantized
    to FP8 with the static scale ``x_scale``. Returns the quantized tensor.
    """
    _, n2 = x.shape  # row count is unused; only the feature dim matters
    assert n2 % 2 == 0, "last dim must split evenly into two halves"
    n = n2 // 2
    x1, x2 = x.split([n, n], dim=-1)
    # Compute in fp32, round-trip through x.dtype to mimic the fused kernel's
    # intermediate precision, then return to fp32 for quantization.
    silu_out = (
        (F.silu(x1.to(torch.float32)) * x2.to(torch.float32))
        .to(x.dtype)
        .to(torch.float32)
    )
    quant_out = per_tensor_fp8_static_quant(silu_out, rocm_fp8_dtype, x_scale)
    return quant_out
665 | 656 |
|
666 | 657 |
|
@@ -688,8 +679,12 @@ def test_silu_mul_quant_fuse(m, n): |
688 | 679 | ) |
689 | 680 |
|
690 | 681 | # calculate the correct scale value |
691 | | - silu_out = torch.empty((m, n), dtype=x.dtype, device=x.device) |
692 | | - silu_and_mul(silu_out, x) |
| 682 | + x1, x2 = x.split([n, n], dim=-1) |
| 683 | + silu_out = ( |
| 684 | + (F.silu(x1.to(torch.float32)) * x2.to(torch.float32)) |
| 685 | + .to(x.dtype) |
| 686 | + .to(torch.float32) |
| 687 | + ) |
693 | 688 | silu_out_abs = torch.abs(silu_out) |
694 | 689 | silu_out_abs_max = torch.max(silu_out_abs) |
695 | 690 | scale_val = silu_out_abs_max / DTYPE_MAX |
|
0 commit comments