Fix libdevice iree_f2h_ieee conversion (#20248)

bjacob · Elias Joseph · commit 1ad60c258eb5 · 2025-03-24T11:30:17.000-05:00
This replaces the implementation with a specialized copy of the code we have in base/internal/math.h.

This also adds an extensive test, so that we can feel better about this code, which isn't shared with anything else (being libdevice) and is relatively little used: it is used only with workloads involving the CPU-hostile f16 type, and only on CPU targets lacking native f16-f32 conversion instructions. That is generally only `generic` CPU targets, as contemporary CPUs tend to have native instructions for these f16-f32 conversions (F16C extension on x86) even though they lack native f16 *arithmetic* beyond these conversions.

Fixes #20163.

---------

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
Signed-off-by: Elias Joseph <eljoseph@amd.com>
diff --git a/runtime/src/iree/builtins/device/device_generic.c b/runtime/src/iree/builtins/device/device_generic.c
@@ -65,65 +65,81 @@ IREE_DEVICE_EXPORT float iree_h2f_ieee(short param) {
 }
 
 IREE_DEVICE_EXPORT short iree_f2h_ieee(float param) {
+  // Some constants about the f32 and f16 types.
+  const int f32_mantissa_bits = 23;
+  const int f32_exp_bias = 127;
+  const uint32_t f32_sign_mask = 0x80000000u;
+  const uint32_t f32_exp_mask = 0x7F800000u;
+  const uint32_t f32_mantissa_mask = 0x007FFFFFu;
+  const int f16_mantissa_bits = 10;
+  const int f16_exp_bits = 5;
+  const int f16_exp_bias = 15;
+  const uint16_t f16_exp_mask = 0x7C00u;
+  const uint16_t f16_mantissa_mask = 0x03FFu;
+
+  // Bitcast float param to uint32.
   union {
     unsigned int u;
     float f;
   } param_bits = {
       .f = param,
   };
-  int sign = param_bits.u >> 31;
-  int mantissa = param_bits.u & 0x007FFFFF;
-  int exp = ((param_bits.u & 0x7F800000) >> 23) + 15 - 127;
-  short res;
-  if (exp > 0 && exp < 30) {
-    // use rte rounding mode, round the significand, combine sign, exponent and
-    // significand into a short.
-    res = (sign << 15) | (exp << 10) | ((mantissa + 0x00001000) >> 13);
-  } else if (param_bits.u == 0) {
-    res = 0;
+  uint32_t u32_value = param_bits.u;
+
+  // Split the f32 sign/exponent/mantissa components.
+  const uint32_t f32_sign = u32_value & f32_sign_mask;
+  const uint32_t f32_exp = u32_value & f32_exp_mask;
+  const uint32_t f32_mantissa = u32_value & f32_mantissa_mask;
+  // Initialize the f16 sign/exponent/mantissa components.
+  uint32_t f16_sign = f32_sign >> 16;
+  uint32_t f16_exp = 0;
+  uint32_t f16_mantissa = 0;
+
+  if (f32_exp >= f32_exp_mask) {
+    // NaN or Inf case.
+    f16_exp = f16_exp_mask;
+    if (f32_mantissa) {
+      // NaN. Generate a quiet NaN.
+      return f16_sign | f16_exp_mask | f16_mantissa_mask;
+    } else {
+      // Inf. Leave zero mantissa.
+    }
+  } else if (f32_exp == 0) {
+    // Zero or subnormal. Generate zero. Leave zero mantissa.
   } else {
-    if (exp <= 0) {
-      if (exp < -10) {
-        // value is less than min half float point
-        res = 0;
-      } else {
-        // normalized single, magnitude is less than min normal half float
-        // point.
-        mantissa = (mantissa | 0x00800000) >> (1 - exp);
-        // round to nearest
-        if ((mantissa & 0x00001000) > 0) {
-          mantissa = mantissa + 0x00002000;
-        }
-        // combine sign & mantissa (exp is zero to get denormalized number)
-        res = (sign << 15) | (mantissa >> 13);
-      }
-    } else if (exp == (255 - 127 + 15)) {
-      if (mantissa == 0) {
-        // input float is infinity, return infinity half
-        res = (sign << 15) | 0x7C00;
-      } else {
-        // input float is NaN, return half NaN
-        res = (sign << 15) | 0x7C00 | (mantissa >> 13);
-      }
+    // Normal finite value.
+    int arithmetic_exp = (f32_exp >> f32_mantissa_bits) - f32_exp_bias;
+    // Test if the exponent is too large for the destination type. If
+    // the destination type does not have infinities, that frees up the
+    // max exponent value for additional finite values.
+    if (arithmetic_exp >= 1 << (f16_exp_bits - 1)) {
+      // Overflow. Generate Inf. Leave zero mantissa.
+      f16_exp = f16_exp_mask;
+    } else if (arithmetic_exp + f16_exp_bias <= 0) {
+      // Underflow. Generate zero. Leave zero mantissa.
+      f16_exp = 0;
     } else {
-      // exp > 0, normalized single, round to nearest
-      if ((mantissa & 0x00001000) > 0) {
-        mantissa = mantissa + 0x00002000;
-        if ((mantissa & 0x00800000) > 0) {
-          mantissa = 0;
-          exp = exp + 1;
-        }
-      }
-      if (exp > 30) {
-        // exponent overflow - return infinity half
-        res = (sign << 15) | 0x7C00;
-      } else {
-        // combine sign, exp and mantissa into normalized half
-        res = (sign << 15) | (exp << 10) | (mantissa >> 13);
+      // Normal case.
+      // Implement round-to-nearest-even, by adding a bias before truncating.
+      int even_bit = 1u << (f32_mantissa_bits - f16_mantissa_bits);
+      int odd_bit = even_bit >> 1;
+      uint32_t biased_f32_mantissa =
+          f32_mantissa +
+          ((f32_mantissa & even_bit) ? (odd_bit) : (odd_bit - 1));
+      // Adding the bias may cause an exponent increment.
+      if (biased_f32_mantissa > f32_mantissa_mask) {
+        biased_f32_mantissa = 0;
+        ++arithmetic_exp;
       }
+      // The exponent increment in the above if() branch may cause overflow.
+      // This is exercised by converting 65520.0f from f32 to f16.
+      f16_exp = (arithmetic_exp + f16_exp_bias) << f16_mantissa_bits;
+      f16_mantissa =
+          biased_f32_mantissa >> (f32_mantissa_bits - f16_mantissa_bits);
     }
   }
-  return res;
+
+  return f16_sign | f16_exp | f16_mantissa;
 }
 
 #if defined(IREE_DEVICE_STANDALONE)
diff --git a/runtime/src/iree/builtins/device/tools/BUILD.bazel b/runtime/src/iree/builtins/device/tools/BUILD.bazel
@@ -29,6 +29,7 @@ iree_runtime_cc_test(
     srcs = ["libdevice_test.cc"],
     deps = [
         "//runtime/src/iree/base",
+        "//runtime/src/iree/base/internal",
         "//runtime/src/iree/base/internal:flags",
         "//runtime/src/iree/builtins/device",
         "//runtime/src/iree/testing:gtest",
diff --git a/runtime/src/iree/builtins/device/tools/CMakeLists.txt b/runtime/src/iree/builtins/device/tools/CMakeLists.txt
@@ -30,6 +30,7 @@ iree_cc_test(
     "libdevice_test.cc"
   DEPS
     iree::base
+    iree::base::internal
     iree::base::internal::flags
     iree::builtins::device
     iree::testing::gtest
diff --git a/runtime/src/iree/builtins/device/tools/libdevice_test.cc b/runtime/src/iree/builtins/device/tools/libdevice_test.cc
@@ -7,16 +7,80 @@
 #include <cstring>
 
 #include "iree/base/api.h"
+#include "iree/base/internal/math.h"
 #include "iree/builtins/device/device.h"
 #include "iree/testing/gtest.h"
 #include "iree/testing/status_matchers.h"
 
+static constexpr uint16_t kF16ExponentMask = 0x7C00;
+static constexpr uint16_t kMantissaMask = 0x03FF;
+
+static uint16_t f16BitsIsNaN(uint16_t bits) {
+  return ((bits & kF16ExponentMask) == kF16ExponentMask) &&
+         (bits & kMantissaMask);
+}
+
+static uint16_t f16BitsIsDenormalOrZero(uint16_t bits) {
+  return !(bits & kF16ExponentMask);
+}
+
 TEST(LibDeviceTest, iree_h2f_ieee) {
-  // Just ensuring that the code links.
-  EXPECT_EQ(0.25f, iree_h2f_ieee(0x3400));
+  // Iterate over all f16 values as u16. Needs a wider type for loop condition.
+  for (uint32_t f16Bits = 0; f16Bits <= 0xffff; ++f16Bits) {
+    float f32 = iree_h2f_ieee(f16Bits);
+    if (f16BitsIsNaN(f16Bits)) {
+      EXPECT_TRUE(std::isnan(f32));
+    } else if (f16Bits == 0) {
+      EXPECT_EQ(f32, 0.f);
+    } else if (f16BitsIsDenormalOrZero(f16Bits)) {
+      EXPECT_LE(std::abs(f32), 6.1e-5f);
+    } else {
+      EXPECT_EQ(f32, iree_math_f16_to_f32(f16Bits));
+    }
+  }
 }
 
 TEST(LibDeviceTest, iree_f2h_ieee) {
-  // Just ensuring that the code links.
-  EXPECT_EQ(0x3400, iree_f2h_ieee(0.25f));
+  auto testcase = [](uint32_t f32Bits) {
+    float f32 = 0.f;
+    memcpy(&f32, &f32Bits, sizeof f32);
+    uint16_t f16Bits = iree_f2h_ieee(f32);
+    if (std::isnan(f32)) {
+      EXPECT_TRUE(f16BitsIsNaN(f16Bits));
+    } else if (f32 == 0.f) {
+      EXPECT_EQ(f16Bits, std::signbit(f32) ? 0x8000 : 0);
+    } else if (std::abs(f32) < 6.1e-5f) {
+      EXPECT_TRUE(f16BitsIsDenormalOrZero(f16Bits));
+    } else {
+      EXPECT_EQ(f16Bits, iree_math_f32_to_f16(f32));
+    }
+  };
+  // Testing all 2^32 float32 values is too much. We test two slices of that
+  // space.
+  //
+  // Test all 2^12 float32 values that have only their top 12 bits potentially
+  // set. That covers all combinations of sign x exponent x the top 3 bits of
+  // mantissa. The bottom 20 mantissa bits stay zero, so this lacks coverage
+  // of rounding behavior.
+  for (uint32_t f32Top12Bits = 0; f32Top12Bits <= 0xfff; ++f32Top12Bits) {
+    testcase(f32Top12Bits << 20);
+  }
+  // For a few select exponent values, test all 2^12 float32 values whose
+  // *mantissa* bits have only their top 12 bits potentially set.
+  // Since float16 has only 10 bits of mantissa, that covers all float16
+  // mantissas plus 2 additional bits of float32 mantissa past the truncation.
+  // Having 2 extra bits should be exactly what is relevant to testing rounding
+  // behavior including tie breaks to "nearest even".
+  for (uint32_t f32MantissaTop12Bits = 0; f32MantissaTop12Bits <= 0xfff;
+       ++f32MantissaTop12Bits) {
+    // A few select exponent values.
+    for (uint32_t f32ExponentBits :
+         {0 /*denormal*/, 1 /*minimum normal*/, 127 /*neutral*/,
+          254 /*maximum finite*/, 255 /*infinite*/}) {
+      for (uint32_t f32SignBit : {0, 1}) {
+        testcase((f32SignBit << 31) | (f32ExponentBits << 23) |
+                 (f32MantissaTop12Bits << 11));
+      }
+    }
+  }
 }
diff --git a/tests/e2e/math/math_ops_llvm-cpu.json b/tests/e2e/math/math_ops_llvm-cpu.json
@@ -155,8 +155,7 @@
   {
     "op": "exp2",
     "type": "f16",
-    "atol": 0.25,
-    "comment": "TODO(#20163)",
+    "atol": 1.0e-03,
     "rtol": 1.0e-02
   },
   {
@@ -339,8 +338,7 @@
   {
     "op": "powf",
     "type": "f16",
-    "atol": 0.25,
-    "comment": "TODO(#20163)",
-    "rtol": 5.0e-03
+    "atol": 1.0e-03,
+    "rtol": 1.0e-02
   }
 ]

Original file line number	Diff line number	Diff line change
`@@ -155,8 +155,7 @@`
`155`	`155`	`{`
`156`	`156`	`"op": "exp2",`
`157`	`157`	`"type": "f16",`
`158`		`- "atol": 0.25,`
`159`		`- "comment": "TODO(#20163)",`
	`158`	`+ "atol": 1.0e-03,`
`160`	`159`	`"rtol": 1.0e-02`
`161`	`160`	`},`
`162`	`161`	`{`
`@@ -339,8 +338,7 @@`
`339`	`338`	`{`
`340`	`339`	`"op": "powf",`
`341`	`340`	`"type": "f16",`
`342`		`- "atol": 0.25,`
`343`		`- "comment": "TODO(#20163)",`
`344`		`- "rtol": 5.0e-03`
	`341`	`+ "atol": 1.0e-03,`
	`342`	`+ "rtol": 1.0e-02`
`345`	`343`	`}`
`346`	`344`	`]`