[libc] Use rint builtin for rounding on the GPU (#98345)

jhuber6 · web-flow · commit 6e86e1114847 · 2024-07-10T13:41:08.000-05:00
Summary: Previously, rounding on the GPU went through the generic bit-twiddling implementation instead of using the dedicated GPU instruction. This patch adds a GPU-specific case to the `nearest_integer` utility that uses the rint builtin, mirroring the special-casing of the x86_64 and AArch64 targets. This results in much nicer code. The following example shows the OpenCL device libs implementation on the left and the LLVM libc on the right: https://godbolt.org/z/3ch48ccf5. The libc version is "branchier", but the results seem similar.
diff --git a/libc/src/__support/FPUtil/nearest_integer.h b/libc/src/__support/FPUtil/nearest_integer.h
@@ -17,6 +17,18 @@
 #include "x86_64/nearest_integer.h"
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
 #include "aarch64/nearest_integer.h"
+#elif defined(LIBC_TARGET_ARCH_IS_GPU)
+
+namespace LIBC_NAMESPACE {
+namespace fputil {
+// GPU path: the float overload forwards directly to the rintf compiler builtin.
+LIBC_INLINE float nearest_integer(float x) { return __builtin_rintf(x); }
+// GPU path: the double overload forwards directly to the rint compiler builtin.
+LIBC_INLINE double nearest_integer(double x) { return __builtin_rint(x); }
+
+} // namespace fputil
+} // namespace LIBC_NAMESPACE
+
 #else
 
 namespace LIBC_NAMESPACE {