device-libs: Stop using CORRECTLY_ROUNDED_SQRT32 (llvm#1549)

arsenm · web-flow · commit 45ecca83e449 · 2025-04-19T09:47:29.000+02:00
diff --git a/amd/comgr/test-lit/device-lib-linking.cl b/amd/comgr/test-lit/device-lib-linking.cl
@@ -17,7 +17,6 @@
 
 extern const __constant bool __oclc_finite_only_opt;
 extern const __constant bool __oclc_unsafe_math_opt;
-extern const __constant bool __oclc_correctly_rounded_sqrt32;
 extern const __constant bool __oclc_wavefrontsize64;
 extern const __constant int __oclc_ISA_version;
 extern const __constant int __oclc_ABI_version;
@@ -26,10 +25,9 @@ void kernel device_libs(__global float *status, float x, float y, float z) {
 
   if (__oclc_finite_only_opt)            status[0] = 1.0;
   if (__oclc_unsafe_math_opt)            status[1] = 1.0;
-  if (__oclc_correctly_rounded_sqrt32)   status[2] = 1.0;
-  if (__oclc_wavefrontsize64)            status[3] = 1.0;
-  if (__oclc_ISA_version)                status[4] = 1.0;
-  if (__oclc_ABI_version)                status[5] = 1.0;
+  if (__oclc_wavefrontsize64)            status[2] = 1.0;
+  if (__oclc_ISA_version)                status[3] = 1.0;
+  if (__oclc_ABI_version)                status[4] = 1.0;
 
   // Math functions to test AMDGPULibCalls Folding optimizations
   // fold_sincos()
diff --git a/amd/comgr/test/source/device_libs.cl b/amd/comgr/test/source/device_libs.cl
@@ -1,6 +1,5 @@
 extern const __constant bool __oclc_finite_only_opt;
 extern const __constant bool __oclc_unsafe_math_opt;
-extern const __constant bool __oclc_correctly_rounded_sqrt32;
 extern const __constant bool __oclc_wavefrontsize64;
 extern const __constant int __oclc_ISA_version;
 extern const __constant int __oclc_ABI_version;
@@ -11,8 +10,6 @@ void kernel device_libs(__global float *status) {
     status[0] = 1.0;
   if (__oclc_unsafe_math_opt)
     status[1] = 1.0;
-  if (__oclc_correctly_rounded_sqrt32)
-    status[3] = 1.0;
   if (__oclc_wavefrontsize64)
     status[4] = 1.0;
   if (__oclc_ISA_version)
diff --git a/amd/device-libs/doc/OCKL.md b/amd/device-libs/doc/OCKL.md
@@ -40,9 +40,9 @@ taken with no control flow overhead.  These functions all have the form (in C)
 The currently supported control are
   * `finite_only_opt` - floating point Inf and NaN are never expected to be consumed or produced
   * `unsafe_math_opt` - lower accuracy results may be produced with higher performance
-  * `correctly_rounded_sqrt32` - float square root must be correctly rounded
   * `ISA_version` - an integer representation of the ISA version of the target device
   * `daz_opt` - unused and deprecated. Will be removed in the future.
+  * `correctly_rounded_sqrt32` - unused and deprecated. Will be removed in the future.
 
 ### Versioning
 
diff --git a/amd/device-libs/doc/OCML.md b/amd/device-libs/doc/OCML.md
@@ -43,13 +43,11 @@ The currently supported control `<name>`s and values `N` are
   * `finite_only_opt` - floating point Inf and NaN are never expected to be consumed or produced.  `N` may be 1 (on/true/enabled), or 0 (off/false/disabled).
   * `unsafe_math_opt` - lower accuracy results may be produced with higher performance.  `N` may be 1 (on/true/enabled) or 0 (off/false/disabled).
   * `daz_opt` - subnormal values consumed and produced may be flushed to zero.  `N`may be 1 (on/true/enabled) or 0 (off/false/disabled).
-  * `correctly_rounded_sqrt32` - float square root must be correctly rounded.  `N` may be 1 (on/true/enabled) or 0 (off/false/disabled).
   * `wavefrontsize64` - the wave front size is 64.  `N` may be 1 (on/true/enabled) or 0 (off/false/disabled).  Very few current devices support a value of 0.
   * `ISA_version` - an integer representation of the ISA version of the target device
 
 The language runtime can link a specific set of OCLC control libraries to properly configure OCML and other device libraries which also use the controls.  If linking OCLC libraries is used to define the control variables, then the runtime must link in:
 
-- Exactly one of `oclc_correctly_rounded_sqrt_on.amdgcn.bc` or `oclc_correctly_rounded_sqrt_off.amdgcn.bc` depending on the kernel's requirements
 - Exactly one of `oclc_daz_opt_on.amdgcn.bc` or `oclc_daz_opt_off.amdgcn.bc` depending on the kernel's requirements
 - Exactly one of `oclc_finite_only_on.amdgcn.bc` or `oclc_finite_only_off.amdgcn.bc` depending on the kernel's requirements
 - Exactly one of `oclc_unsafe_math_on.amdgcn.bc` or `oclc_unsafe_math_off.amdgcn.bc` depending on the kernel's requirements
@@ -84,7 +82,7 @@ where `{function}` is generally the familiar libm name of the function, and `{ty
 
 For example, `__ocml_sqrt_f32` is the name of the OCML single precision square root function.
 
-OCML does not currently support higher precision than double precision due to the lack of hardware support for such precisions. 
+OCML does not currently support higher precision than double precision due to the lack of hardware support for such precisions.
 
 ### Supported functions
 
diff --git a/amd/device-libs/oclc/inc/oclc.h b/amd/device-libs/oclc/inc/oclc.h
@@ -19,9 +19,6 @@
 //    __constant bool __oclc_unsafe_math_opt
 //        - the application accepts optimizations that may lower the accuracy of the results
 //
-//    __constant bool __oclc_correctly_rounded_sqrt32(void)
-//        - the application is expecting sqrt(float) to produce a correctly rounded result
-//
 //    __constant bool __oclc_wavefrontsize64
 //        - the application is being compiled for a wavefront size of 64
 //
@@ -40,7 +37,6 @@
 
 extern const __constant bool __oclc_finite_only_opt;
 extern const __constant bool __oclc_unsafe_math_opt;
-extern const __constant bool __oclc_correctly_rounded_sqrt32;
 extern const __constant bool __oclc_wavefrontsize64;
 extern const __constant uint __oclc_wavefrontsize_log2;
 extern const __constant int __oclc_ISA_version;
diff --git a/amd/device-libs/oclc/src/correctly_rounded_sqrt_off.cl b/amd/device-libs/oclc/src/correctly_rounded_sqrt_off.cl
@@ -1,10 +1 @@
-/*===--------------------------------------------------------------------------
- *                   ROCm Device Libraries
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *===------------------------------------------------------------------------*/
-
-#include "oclc.h"
-
-const __constant bool __oclc_correctly_rounded_sqrt32 = 0;
+// Intentionally empty placeholder; kept only until clang stops trying to link this library.
diff --git a/amd/device-libs/oclc/src/correctly_rounded_sqrt_on.cl b/amd/device-libs/oclc/src/correctly_rounded_sqrt_on.cl
@@ -1,11 +1 @@
-/*===--------------------------------------------------------------------------
- *                   ROCm Device Libraries
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *===------------------------------------------------------------------------*/
-
-#include "oclc.h"
-
-const __constant bool __oclc_correctly_rounded_sqrt32 = 1;
-
+// Intentionally empty placeholder; kept only until clang stops trying to link this library.
diff --git a/amd/device-libs/ocml/CMakeLists.txt b/amd/device-libs/ocml/CMakeLists.txt
@@ -21,4 +21,11 @@ set_source_files_properties(
   ${CMAKE_CURRENT_SOURCE_DIR}/src/native_expF.cl
   PROPERTIES COMPILE_FLAGS "${native_func_flags}")
 
+
+# This implementation of sqrt will not be used through OpenCL, OpenMP,
+# or HIP. Compile it to be correctly rounded anyway, just in case.
+set_source_files_properties(
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/sqrtF.cl
+  PROPERTIES COMPILE_FLAGS -cl-fp32-correctly-rounded-divide-sqrt)
+
 opencl_bc_lib(NAME ocml SOURCES ${sources})
diff --git a/amd/device-libs/ocml/src/opts.h b/amd/device-libs/ocml/src/opts.h
@@ -11,7 +11,4 @@
 #define FINITE_ONLY_OPT() __oclc_finite_only_opt
 #define UNSAFE_MATH_OPT() __oclc_unsafe_math_opt
 
-
 #define DAZ_OPT() __builtin_isfpclass(__builtin_canonicalizef(0x1p-149f), __FPCLASS_POSZERO)
-
-#define CORRECTLY_ROUNDED_SQRT32() __oclc_correctly_rounded_sqrt32
diff --git a/amd/device-libs/ocml/src/sqrtF.cl b/amd/device-libs/ocml/src/sqrtF.cl
@@ -7,32 +7,17 @@
 
 #include "mathF.h"
 
-// 1ulp sqrt that handles denormals, should be used without
-// -cl-fp32-correctly-rounded-divide-sqrt
-static float sqrt_scale_denormal(float x) {
-    bool need_scale = x < 0x1p-126f;
-    float scaled = BUILTIN_FLDEXP_F32(x, need_scale ? 32 : 0);
-    float sqrt_scaled = BUILTIN_AMDGPU_SQRT_F32(scaled);
-    return BUILTIN_FLDEXP_F32(sqrt_scaled, need_scale ? -16 : 0);
-}
-
 CONSTATTR float
 MATH_MANGLE(sqrt)(float x)
 {
-    if (CORRECTLY_ROUNDED_SQRT32()) {
-        return MATH_SQRT(x);
-    } else {
-        if (DAZ_OPT())
-            return BUILTIN_AMDGPU_SQRT_F32(x);
-        return sqrt_scale_denormal(x);
-    }
+  return __builtin_elementwise_sqrt(x);
 }
 
 #define GEN(LN,UN) \
 CONSTATTR float \
 MATH_MANGLE(LN)(float x) \
 { \
-    return BUILTIN_##UN##_F32(x); \
+  return __builtin_elementwise_sqrt(x); \
 }
 
 // GEN(sqrt_rte,SQRT_RTE)