Skip to content

Commit 423d734

Browse files
authored
Fix FP16 to Fp8E4M3 RTNE Upper bound to +/-448 (#3814)
This change fixes the previous FP16-to-Fp8E4M3 RTNE upper bound from 256 to 448, according to https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-12-01-pdf-1, page 13, Table 2: "S.1111.110₂ = ±448". It helps to solve the SGLANG Group Quant UT failure #3613.
1 parent 9856962 commit 423d734

File tree

2 files changed

+36
-36
lines changed

2 files changed

+36
-36
lines changed

python/test/unit/language/test_conversions.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def launch_exhaustive_populate(dst_dtype, offset, numel, force_odd, output_bits,
8787

8888

8989
@triton.jit
90-
def arbitrary_fp32_downcast(x, rounding : tl.constexpr, exponent_bits : tl.constexpr, mantissa_bits : tl.constexpr, exponent_bias : tl.constexpr, device_ : tl.constexpr):
90+
def arbitrary_fp32_downcast(x, rounding : tl.constexpr, exponent_bits : tl.constexpr, mantissa_bits : tl.constexpr, exponent_bias : tl.constexpr, from_bf16 : tl.constexpr):
9191

9292
tl.static_assert(x.dtype == tl.float32, "input must be float32")
9393
numbits_dst : tl.constexpr = 1 + exponent_bits + mantissa_bits
@@ -118,7 +118,7 @@ def arbitrary_fp32_downcast(x, rounding : tl.constexpr, exponent_bits : tl.const
118118
mantissa = tl.where(exponent > -1, mantissa, mantissa * 0.5)
119119
exponent = tl.where(exponent > -1, exponent, exponent + 1)
120120

121-
if device_ == 'xpu':
121+
if from_bf16:
122122
# convert mantissa to int with proper rounding without inline asm.
123123
to_cast = mantissa.to(tl.uint32, bitcast=True)
124124
mantissa2 = (to_cast & 0x7fffff)
@@ -165,22 +165,23 @@ def arbitrary_fp32_downcast(x, rounding : tl.constexpr, exponent_bits : tl.const
165165

166166

167167
@triton.jit
168-
def downcast_emulated(src, dst, rounding : tl.constexpr, BLOCK_SIZE : tl.constexpr, exponent_bits : tl.constexpr, mantissa_bits : tl.constexpr, exponent_bias : tl.constexpr, device_: tl.constexpr):
168+
def downcast_emulated(src, dst, rounding : tl.constexpr, BLOCK_SIZE : tl.constexpr, exponent_bits : tl.constexpr, mantissa_bits : tl.constexpr, exponent_bias : tl.constexpr, from_bf16: tl.constexpr):
169169

170170
tl.static_assert(src.dtype.element_ty == tl.float32, "src dtype must be float32")
171171

172172
idxs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
173173
x = tl.load(src + idxs)
174-
y = arbitrary_fp32_downcast(x, rounding, exponent_bits, mantissa_bits, exponent_bias, device_=device_)
174+
y = arbitrary_fp32_downcast(x, rounding, exponent_bits, mantissa_bits, exponent_bias, from_bf16)
175175
y = y.to(dst.dtype.element_ty, bitcast=True)
176176
tl.store(dst + idxs, y)
177177

178178

179179
def launch_downcast_emulated(src, src_dtype, dst_dtype, rounding, exponent_bits, mantissa_bits, exponent_bias, device, BLOCK_SIZE=4096):
180180

181181
dst = torch.empty(src.shape, dtype=matching_int(dst_dtype), device=device)
182+
from_bf16 = src_dtype == tl.bfloat16 and device == 'xpu'
182183
downcast_emulated[(src.shape[0] // BLOCK_SIZE,)](
183-
triton.reinterpret(src, src_dtype), triton.reinterpret(dst, dst_dtype), rounding, BLOCK_SIZE, exponent_bits, mantissa_bits, exponent_bias, device_=device)
184+
triton.reinterpret(src, tl.float32), triton.reinterpret(dst, dst_dtype), rounding, BLOCK_SIZE, exponent_bits, mantissa_bits, exponent_bias, from_bf16=from_bf16)
184185
# 0x80 in float8e4b8 or float8e5b16 represents inf/nan. downcast_emulated kernel will
185186
# convert -0. in higher precision to 0x80 and thus need to fix the result to 0.
186187
if dst_dtype == tl.float8e4b8 or dst_dtype == tl.float8e5b16:
@@ -243,7 +244,7 @@ def downcast_test(src_dtype, dst_dtype, rounding, exponent_bits, mantissa_bits,
243244
else:
244245
src = launch_type_convert_triton(src, src_dtype, tl.float32, device=device)
245246

246-
dst2 = launch_downcast_emulated(src, tl.float32, dst_dtype, rounding, exponent_bits, mantissa_bits, exponent_bias, device=device)
247+
dst2 = launch_downcast_emulated(src, src_dtype, dst_dtype, rounding, exponent_bits, mantissa_bits, exponent_bias, device=device)
247248

248249
dst = launch_upcast_emulated(dst, exponent_bits, mantissa_bits, exponent_bias, device=device)
249250
dst2 = launch_upcast_emulated(dst2, exponent_bits, mantissa_bits, exponent_bias, device=device)

third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -527,57 +527,56 @@ static SmallVector<Value>
527527
Fp16_to_Fp8E4M3Nv_RTNE(Location loc, ConversionPatternRewriter &rewriter,
528528
const SmallVector<Value> &v) {
529529
auto b = TritonLLVMOpBuilder(loc, rewriter);
530-
Value val = b.zext(i32_ty, b.bitcast(v[0], i16_ty));
531-
Value sign = b.and_(i32_ty, val, b.i32_val(0x8000));
532-
Value nosign = b.and_(i32_ty, val, b.i32_val(0x7fff));
530+
Value val = b.bitcast(v[0], i16_ty);
531+
Value sign = b.and_(i16_ty, val, b.i16_val(0x8000));
532+
Value nosign = b.and_(i16_ty, val, b.i16_val(0x7fff));
533533

534-
Value exp = b.and_(i32_ty, b.lshr(nosign, b.i32_val(10)), b.i32_val(0x1f));
534+
Value exp = b.and_(i16_ty, b.lshr(nosign, b.i16_val(10)), b.i16_val(0x1f));
535535
// Check if we need a translation to a subnormal value. This happens when
536536
// exp value is in range [5, 8].
537537
Value is_subnormal =
538-
b.and_(b.icmp_uge(exp, b.i32_val(5)), b.icmp_ule(exp, b.i32_val(8)));
539-
Value shift = b.sub(i32_ty, b.i32_val(8), exp);
540-
Value subnormal = b.and_(i32_ty, nosign, b.i32_val(0x3ff));
541-
subnormal = b.or_(i32_ty, subnormal, b.i32_val(0x400));
538+
b.and_(b.icmp_uge(exp, b.i16_val(5)), b.icmp_ule(exp, b.i16_val(8)));
539+
Value shift = b.sub(i16_ty, b.i16_val(8), exp);
540+
Value subnormal = b.and_(i16_ty, nosign, b.i16_val(0x3ff));
541+
subnormal = b.or_(i16_ty, subnormal, b.i16_val(0x400));
542542
// Make rounding with respect to bits we are going to shift and cut off.
543-
Value round_step = b.shl(i32_ty, b.i32_val(0x100), shift);
544-
Value tail_mask = b.sub(i32_ty, round_step, b.i32_val(1));
545-
Value tail = b.and_(i32_ty, subnormal, tail_mask);
546-
Value threshold = b.shl(i32_ty, b.i32_val(0x80), shift);
543+
Value round_step = b.shl(i16_ty, b.i16_val(0x100), shift);
544+
Value tail_mask = b.sub(i16_ty, round_step, b.i16_val(1));
545+
Value tail = b.and_(i16_ty, subnormal, tail_mask);
546+
Value threshold = b.shl(i16_ty, b.i16_val(0x80), shift);
547547
Value odd_truncated =
548-
b.icmp_ne(b.and_(i32_ty, subnormal, round_step), b.i32_val(0));
548+
b.icmp_ne(b.and_(i16_ty, subnormal, round_step), b.i16_val(0));
549549
Value round_up = b.or_(b.icmp_ugt(tail, threshold),
550550
b.and_(b.icmp_eq(tail, threshold), odd_truncated));
551551
subnormal =
552-
b.select(round_up, b.add(i32_ty, subnormal, round_step), subnormal);
552+
b.select(round_up, b.add(i16_ty, subnormal, round_step), subnormal);
553553
// Now shift to get the final result.
554-
subnormal = b.lshr(i32_ty, subnormal, shift);
554+
subnormal = b.lshr(i16_ty, subnormal, shift);
555555

556556
// Normalized case. Start with rounding, then apply exp range to fit 4 bits,
557557
// adjust bias and shift left.
558558
// TODO: NaN values might be mishandled.
559-
tail = b.and_(i32_ty, nosign, b.i32_val(0x7f));
559+
tail = b.and_(i16_ty, nosign, b.i16_val(0x7f));
560560
odd_truncated =
561-
b.icmp_ne(b.and_(i32_ty, nosign, b.i32_val(0x80)), b.i32_val(0));
562-
round_up = b.or_(b.icmp_ugt(tail, b.i32_val(0x40)),
563-
b.and_(b.icmp_eq(tail, b.i32_val(0x40)), odd_truncated));
561+
b.icmp_ne(b.and_(i16_ty, nosign, b.i16_val(0x80)), b.i16_val(0));
562+
round_up = b.or_(b.icmp_ugt(tail, b.i16_val(0x40)),
563+
b.and_(b.icmp_eq(tail, b.i16_val(0x40)), odd_truncated));
564564
Value rounded =
565-
b.and_(i32_ty, b.add(i32_ty, nosign, b.i32_val(0x80)), b.i32_val(0x7f80));
566-
nosign = b.select(round_up, rounded, nosign);
565+
b.and_(i16_ty, b.add(i16_ty, nosign, b.i16_val(0x80)), b.i16_val(0x7f80));
566+
Value normal = b.select(round_up, rounded, nosign);
567567

568-
nosign = b.umax(i32_ty, nosign, b.i32_val(0x2000));
569-
nosign = b.umin(i32_ty, nosign, b.i32_val(0x5c00));
570-
nosign = b.sub(i32_ty, nosign, b.i32_val(0x2000));
571-
nosign = b.shl(i32_ty, nosign, b.i32_val(1));
568+
normal = b.umax(i16_ty, normal, b.i16_val(0x2000));
569+
normal = b.umin(i16_ty, normal, b.i16_val(0x5f00));
570+
normal = b.sub(i16_ty, normal, b.i16_val(0x2000));
571+
normal = b.shl(i16_ty, normal, b.i16_val(1));
572572

573573
// Choose between subnormal and normal values.
574-
nosign = b.select(is_subnormal, subnormal, nosign);
575-
576-
Value res_val = b.or_(i32_ty, nosign, sign);
577-
auto fp8x4VecTy = vec_ty(i8_ty, 4);
574+
Value res_val = b.select(is_subnormal, subnormal, normal);
575+
res_val = b.or_(i16_ty, res_val, sign);
576+
auto fp8x4VecTy = vec_ty(i8_ty, 2);
578577
Value res = b.bitcast(res_val, fp8x4VecTy);
579578

580-
return {b.extract_element(i8_ty, res, b.i32_val(1))};
579+
return {b.extract_element(i8_ty, res, b.i16_val(1))};
581580
}
582581

583582
static SmallVector<Value> Fp8E4M3Nv_to_Bf16(Location loc,

0 commit comments

Comments
 (0)