From 68df2622a3ca1b98a0cbf1fc9e6200e12fecbb2e Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Wed, 11 Dec 2024 17:28:38 +0000
Subject: [PATCH 1/2] [libclc] Optimize ceil/fabs/floor/rint/trunc

These functions all map to the corresponding LLVM intrinsics, but the
vector intrinsics weren't being generated. The intrinsic mapping from
CLC vector function to vector intrinsic was working correctly, but the
mapping from OpenCL builtin to CLC function was suboptimal: it
recursively split vectors in halves. For example, with this change,
`ceil(float16)` calls `llvm.ceil.v16f32` directly.

The CLC versions of each of these builtins are also now enabled for
SPIR-V targets. The LLVM -> SPIR-V translator maps the intrinsics to the
appropriate OpExtInst. As such, there is no diff to the SPIR-V binaries
before/after this change.

The clspv targets show a difference, but it's not expected to be a
problem:

> %call = tail call spir_func double @llvm.fabs.f64(double noundef %x) #9
< %call = tail call spir_func double @_Z4fabsd(double noundef %x) #9

The AMDGPU targets make use of the same _CLC_DEFINE_UNARY_BUILTIN macro
to override sqrt, so those functions are also better optimized with this
change, calling the vector `llvm.sqrt.vXf32` intrinsics directly.
--- libclc/clc/include/clc/clcmacro.h | 16 +++++++++++++++- libclc/clc/include/clc/math/clc_ceil.h | 7 ------- libclc/clc/include/clc/math/clc_fabs.h | 7 ------- libclc/clc/include/clc/math/clc_floor.h | 7 ------- libclc/clc/include/clc/math/clc_rint.h | 7 ------- libclc/clc/include/clc/math/clc_trunc.h | 7 ------- 6 files changed, 15 insertions(+), 36 deletions(-) diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h index 244239284ecab..c6583749eca66 100644 --- a/libclc/clc/include/clc/clcmacro.h +++ b/libclc/clc/include/clc/clcmacro.h @@ -191,7 +191,21 @@ #define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE) \ _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { return BUILTIN(x); } \ - _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE) + _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ + return BUILTIN(x); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ + return BUILTIN(x); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \ + return BUILTIN(x); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \ + return BUILTIN(x); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \ + return BUILTIN(x); \ + } #ifdef cl_khr_fp16 diff --git a/libclc/clc/include/clc/math/clc_ceil.h b/libclc/clc/include/clc/math/clc_ceil.h index 66590687c3422..905aef37e11c6 100644 --- a/libclc/clc/include/clc/math/clc_ceil.h +++ b/libclc/clc/include/clc/math/clc_ceil.h @@ -1,11 +1,6 @@ #ifndef __CLC_MATH_CLC_CEIL_H__ #define __CLC_MATH_CLC_CEIL_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible ceil -#define __clc_ceil ceil -#else - // Map the function to an LLVM intrinsic #define __CLC_FUNCTION __clc_ceil #define __CLC_INTRINSIC "llvm.ceil" @@ -14,6 +9,4 @@ #undef __CLC_INTRINSIC #undef __CLC_FUNCTION -#endif - #endif // __CLC_MATH_CLC_CEIL_H__ diff 
--git a/libclc/clc/include/clc/math/clc_fabs.h b/libclc/clc/include/clc/math/clc_fabs.h index 93367b5731371..525577ab98a38 100644 --- a/libclc/clc/include/clc/math/clc_fabs.h +++ b/libclc/clc/include/clc/math/clc_fabs.h @@ -1,11 +1,6 @@ #ifndef __CLC_MATH_CLC_FABS_H__ #define __CLC_MATH_CLC_FABS_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible fabs -#define __clc_fabs fabs -#else - // Map the function to an LLVM intrinsic #define __CLC_FUNCTION __clc_fabs #define __CLC_INTRINSIC "llvm.fabs" @@ -14,6 +9,4 @@ #undef __CLC_INTRINSIC #undef __CLC_FUNCTION -#endif - #endif // __CLC_MATH_CLC_FABS_H__ diff --git a/libclc/clc/include/clc/math/clc_floor.h b/libclc/clc/include/clc/math/clc_floor.h index 9919872ec633c..e2d9dbadb434d 100644 --- a/libclc/clc/include/clc/math/clc_floor.h +++ b/libclc/clc/include/clc/math/clc_floor.h @@ -1,11 +1,6 @@ #ifndef __CLC_MATH_CLC_FLOOR_H__ #define __CLC_MATH_CLC_FLOOR_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible floor -#define __clc_floor floor -#else - // Map the function to an LLVM intrinsic #define __CLC_FUNCTION __clc_floor #define __CLC_INTRINSIC "llvm.floor" @@ -14,6 +9,4 @@ #undef __CLC_INTRINSIC #undef __CLC_FUNCTION -#endif - #endif // __CLC_MATH_CLC_FLOOR_H__ diff --git a/libclc/clc/include/clc/math/clc_rint.h b/libclc/clc/include/clc/math/clc_rint.h index 3761407ad326d..7bb81100f221c 100644 --- a/libclc/clc/include/clc/math/clc_rint.h +++ b/libclc/clc/include/clc/math/clc_rint.h @@ -1,11 +1,6 @@ #ifndef __CLC_MATH_CLC_RINT_H__ #define __CLC_MATH_CLC_RINT_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible rint -#define __clc_rint rint -#else - // Map the function to an LLVM intrinsic #define __CLC_FUNCTION __clc_rint #define __CLC_INTRINSIC "llvm.rint" @@ -14,6 +9,4 @@ #undef __CLC_INTRINSIC #undef __CLC_FUNCTION -#endif - #endif 
// __CLC_MATH_CLC_RINT_H__ diff --git a/libclc/clc/include/clc/math/clc_trunc.h b/libclc/clc/include/clc/math/clc_trunc.h index c78c8899d8523..62467fa114471 100644 --- a/libclc/clc/include/clc/math/clc_trunc.h +++ b/libclc/clc/include/clc/math/clc_trunc.h @@ -1,11 +1,6 @@ #ifndef __CLC_MATH_CLC_TRUNC_H__ #define __CLC_MATH_CLC_TRUNC_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible trunc -#define __clc_trunc trunc -#else - // Map the function to an LLVM intrinsic #define __CLC_FUNCTION __clc_trunc #define __CLC_INTRINSIC "llvm.trunc" @@ -14,6 +9,4 @@ #undef __CLC_INTRINSIC #undef __CLC_FUNCTION -#endif - #endif // __CLC_MATH_CLC_TRUNC_H__ From c825dc0766cd38c4950259cad18098e9d9950e74 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 12 Dec 2024 12:36:44 +0000 Subject: [PATCH 2/2] move away from using intrinsics --- libclc/clc/include/clc/math/clc_ceil.h | 8 ++++---- libclc/clc/include/clc/math/clc_fabs.h | 8 ++++---- libclc/clc/include/clc/math/clc_floor.h | 8 ++++---- libclc/clc/include/clc/math/clc_rint.h | 8 ++++---- libclc/clc/include/clc/math/clc_trunc.h | 8 ++++---- .../lib => clc/include/clc}/math/unary_builtin.inc | 0 libclc/clc/lib/clspv/SOURCES | 6 +++++- libclc/clc/lib/clspv/dummy.cl | 1 - libclc/clc/lib/generic/SOURCES | 5 +++++ libclc/clc/lib/generic/math/clc_ceil.cl | 6 ++++++ libclc/clc/lib/generic/math/clc_fabs.cl | 6 ++++++ libclc/clc/lib/generic/math/clc_floor.cl | 6 ++++++ libclc/clc/lib/generic/math/clc_rint.cl | 6 ++++++ libclc/clc/lib/generic/math/clc_trunc.cl | 6 ++++++ libclc/clc/lib/spirv/SOURCES | 6 +++++- libclc/clc/lib/spirv64/SOURCES | 5 +++++ libclc/generic/lib/math/ceil.cl | 2 +- libclc/generic/lib/math/fabs.cl | 2 +- libclc/generic/lib/math/floor.cl | 2 +- libclc/generic/lib/math/rint.cl | 2 +- libclc/generic/lib/math/round.cl | 2 +- libclc/generic/lib/math/sqrt.cl | 2 +- libclc/generic/lib/math/trunc.cl | 2 +- 23 files changed, 77 insertions(+), 30 deletions(-) 
rename libclc/{generic/lib => clc/include/clc}/math/unary_builtin.inc (100%) delete mode 100644 libclc/clc/lib/clspv/dummy.cl create mode 100644 libclc/clc/lib/generic/math/clc_ceil.cl create mode 100644 libclc/clc/lib/generic/math/clc_fabs.cl create mode 100644 libclc/clc/lib/generic/math/clc_floor.cl create mode 100644 libclc/clc/lib/generic/math/clc_rint.cl create mode 100644 libclc/clc/lib/generic/math/clc_trunc.cl diff --git a/libclc/clc/include/clc/math/clc_ceil.h b/libclc/clc/include/clc/math/clc_ceil.h index 905aef37e11c6..20adc6d81d863 100644 --- a/libclc/clc/include/clc/math/clc_ceil.h +++ b/libclc/clc/include/clc/math/clc_ceil.h @@ -1,12 +1,12 @@ #ifndef __CLC_MATH_CLC_CEIL_H__ #define __CLC_MATH_CLC_CEIL_H__ -// Map the function to an LLVM intrinsic +#define __CLC_BODY #define __CLC_FUNCTION __clc_ceil -#define __CLC_INTRINSIC "llvm.ceil" -#include -#undef __CLC_INTRINSIC +#include + +#undef __CLC_BODY #undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_CEIL_H__ diff --git a/libclc/clc/include/clc/math/clc_fabs.h b/libclc/clc/include/clc/math/clc_fabs.h index 525577ab98a38..911d34f78c7d2 100644 --- a/libclc/clc/include/clc/math/clc_fabs.h +++ b/libclc/clc/include/clc/math/clc_fabs.h @@ -1,12 +1,12 @@ #ifndef __CLC_MATH_CLC_FABS_H__ #define __CLC_MATH_CLC_FABS_H__ -// Map the function to an LLVM intrinsic +#define __CLC_BODY #define __CLC_FUNCTION __clc_fabs -#define __CLC_INTRINSIC "llvm.fabs" -#include -#undef __CLC_INTRINSIC +#include + +#undef __CLC_BODY #undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FABS_H__ diff --git a/libclc/clc/include/clc/math/clc_floor.h b/libclc/clc/include/clc/math/clc_floor.h index e2d9dbadb434d..c311cc0edae15 100644 --- a/libclc/clc/include/clc/math/clc_floor.h +++ b/libclc/clc/include/clc/math/clc_floor.h @@ -1,12 +1,12 @@ #ifndef __CLC_MATH_CLC_FLOOR_H__ #define __CLC_MATH_CLC_FLOOR_H__ -// Map the function to an LLVM intrinsic +#define __CLC_BODY #define __CLC_FUNCTION __clc_floor -#define __CLC_INTRINSIC "llvm.floor" 
-#include -#undef __CLC_INTRINSIC +#include + +#undef __CLC_BODY #undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FLOOR_H__ diff --git a/libclc/clc/include/clc/math/clc_rint.h b/libclc/clc/include/clc/math/clc_rint.h index 7bb81100f221c..6faeed0b5696e 100644 --- a/libclc/clc/include/clc/math/clc_rint.h +++ b/libclc/clc/include/clc/math/clc_rint.h @@ -1,12 +1,12 @@ #ifndef __CLC_MATH_CLC_RINT_H__ #define __CLC_MATH_CLC_RINT_H__ -// Map the function to an LLVM intrinsic +#define __CLC_BODY #define __CLC_FUNCTION __clc_rint -#define __CLC_INTRINSIC "llvm.rint" -#include -#undef __CLC_INTRINSIC +#include + +#undef __CLC_BODY #undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_RINT_H__ diff --git a/libclc/clc/include/clc/math/clc_trunc.h b/libclc/clc/include/clc/math/clc_trunc.h index 62467fa114471..acfc9d5db4811 100644 --- a/libclc/clc/include/clc/math/clc_trunc.h +++ b/libclc/clc/include/clc/math/clc_trunc.h @@ -1,12 +1,12 @@ #ifndef __CLC_MATH_CLC_TRUNC_H__ #define __CLC_MATH_CLC_TRUNC_H__ -// Map the function to an LLVM intrinsic +#define __CLC_BODY #define __CLC_FUNCTION __clc_trunc -#define __CLC_INTRINSIC "llvm.trunc" -#include -#undef __CLC_INTRINSIC +#include + +#undef __CLC_BODY #undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_TRUNC_H__ diff --git a/libclc/generic/lib/math/unary_builtin.inc b/libclc/clc/include/clc/math/unary_builtin.inc similarity index 100% rename from libclc/generic/lib/math/unary_builtin.inc rename to libclc/clc/include/clc/math/unary_builtin.inc diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES index 75a3130357c34..393e8d773cda0 100644 --- a/libclc/clc/lib/clspv/SOURCES +++ b/libclc/clc/lib/clspv/SOURCES @@ -1 +1,5 @@ -dummy.cl +../generic/math/clc_ceil.cl +../generic/math/clc_fabs.cl +../generic/math/clc_floor.cl +../generic/math/clc_rint.cl +../generic/math/clc_trunc.cl diff --git a/libclc/clc/lib/clspv/dummy.cl b/libclc/clc/lib/clspv/dummy.cl deleted file mode 100644 index fab17ac780e37..0000000000000 --- 
a/libclc/clc/lib/clspv/dummy.cl +++ /dev/null @@ -1 +0,0 @@ -// Empty file diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index d7ffaaf6dc3f4..3916ea15f5c45 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,6 +1,11 @@ geometric/clc_dot.cl integer/clc_abs.cl integer/clc_abs_diff.cl +math/clc_ceil.cl +math/clc_fabs.cl +math/clc_floor.cl +math/clc_rint.cl +math/clc_trunc.cl relational/clc_all.cl relational/clc_any.cl relational/clc_bitselect.cl diff --git a/libclc/clc/lib/generic/math/clc_ceil.cl b/libclc/clc/lib/generic/math/clc_ceil.cl new file mode 100644 index 0000000000000..c712e5fd024d9 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_ceil.cl @@ -0,0 +1,6 @@ +#include + +#undef __CLC_FUNCTION +#define __CLC_FUNCTION __clc_ceil +#define __CLC_BUILTIN __builtin_elementwise_ceil +#include diff --git a/libclc/clc/lib/generic/math/clc_fabs.cl b/libclc/clc/lib/generic/math/clc_fabs.cl new file mode 100644 index 0000000000000..23ff3a7a187e1 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_fabs.cl @@ -0,0 +1,6 @@ +#include + +#undef __CLC_FUNCTION +#define __CLC_FUNCTION __clc_fabs +#define __CLC_BUILTIN __builtin_elementwise_abs +#include diff --git a/libclc/clc/lib/generic/math/clc_floor.cl b/libclc/clc/lib/generic/math/clc_floor.cl new file mode 100644 index 0000000000000..98345c768f227 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_floor.cl @@ -0,0 +1,6 @@ +#include + +#undef __CLC_FUNCTION +#define __CLC_FUNCTION __clc_floor +#define __CLC_BUILTIN __builtin_elementwise_floor +#include diff --git a/libclc/clc/lib/generic/math/clc_rint.cl b/libclc/clc/lib/generic/math/clc_rint.cl new file mode 100644 index 0000000000000..28ad321a7b4f6 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_rint.cl @@ -0,0 +1,6 @@ +#include + +#undef __CLC_FUNCTION +#define __CLC_FUNCTION __clc_rint +#define __CLC_BUILTIN __builtin_elementwise_rint +#include diff --git a/libclc/clc/lib/generic/math/clc_trunc.cl 
b/libclc/clc/lib/generic/math/clc_trunc.cl new file mode 100644 index 0000000000000..e62ae062e0502 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_trunc.cl @@ -0,0 +1,6 @@ +#include + +#undef __CLC_FUNCTION +#define __CLC_FUNCTION __clc_trunc +#define __CLC_BUILTIN __builtin_elementwise_trunc +#include diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES index d8effd19613c8..3b29fa0a91624 100644 --- a/libclc/clc/lib/spirv/SOURCES +++ b/libclc/clc/lib/spirv/SOURCES @@ -1,2 +1,6 @@ ../generic/geometric/clc_dot.cl - +../generic/math/clc_ceil.cl +../generic/math/clc_fabs.cl +../generic/math/clc_floor.cl +../generic/math/clc_rint.cl +../generic/math/clc_trunc.cl diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES index 9200810ace38e..3b29fa0a91624 100644 --- a/libclc/clc/lib/spirv64/SOURCES +++ b/libclc/clc/lib/spirv64/SOURCES @@ -1 +1,6 @@ ../generic/geometric/clc_dot.cl +../generic/math/clc_ceil.cl +../generic/math/clc_fabs.cl +../generic/math/clc_floor.cl +../generic/math/clc_rint.cl +../generic/math/clc_trunc.cl diff --git a/libclc/generic/lib/math/ceil.cl b/libclc/generic/lib/math/ceil.cl index e02789e694e06..8df864a06314d 100644 --- a/libclc/generic/lib/math/ceil.cl +++ b/libclc/generic/lib/math/ceil.cl @@ -4,4 +4,4 @@ #undef __CLC_FUNCTION #define __CLC_FUNCTION ceil -#include "unary_builtin.inc" +#include diff --git a/libclc/generic/lib/math/fabs.cl b/libclc/generic/lib/math/fabs.cl index 9644369d4a095..55701cb36a951 100644 --- a/libclc/generic/lib/math/fabs.cl +++ b/libclc/generic/lib/math/fabs.cl @@ -4,4 +4,4 @@ #undef __CLC_FUNCTION #define __CLC_FUNCTION fabs -#include "unary_builtin.inc" +#include diff --git a/libclc/generic/lib/math/floor.cl b/libclc/generic/lib/math/floor.cl index f5c36b73862a4..0854fa7efc458 100644 --- a/libclc/generic/lib/math/floor.cl +++ b/libclc/generic/lib/math/floor.cl @@ -4,4 +4,4 @@ #undef __CLC_FUNCTION #define __CLC_FUNCTION floor -#include "unary_builtin.inc" +#include diff 
--git a/libclc/generic/lib/math/rint.cl b/libclc/generic/lib/math/rint.cl index 185bbbbf8c91d..ecf7d5c1e6dde 100644 --- a/libclc/generic/lib/math/rint.cl +++ b/libclc/generic/lib/math/rint.cl @@ -3,4 +3,4 @@ #undef __CLC_FUNCTION #define __CLC_FUNCTION rint -#include "unary_builtin.inc" +#include <clc/math/unary_builtin.inc> diff --git a/libclc/generic/lib/math/round.cl b/libclc/generic/lib/math/round.cl index 285328aaa5d56..6344051820c79 100644 --- a/libclc/generic/lib/math/round.cl +++ b/libclc/generic/lib/math/round.cl @@ -7,4 +7,4 @@ #undef __CLC_FUNCTION #define __CLC_FUNCTION round -#include "unary_builtin.inc" +#include <clc/math/unary_builtin.inc> diff --git a/libclc/generic/lib/math/sqrt.cl b/libclc/generic/lib/math/sqrt.cl index 8df25dd45adb6..a9192a9493d17 100644 --- a/libclc/generic/lib/math/sqrt.cl +++ b/libclc/generic/lib/math/sqrt.cl @@ -24,4 +24,4 @@ #include "math/clc_sqrt.h" #define __CLC_FUNCTION sqrt -#include "unary_builtin.inc" +#include <clc/math/unary_builtin.inc> diff --git a/libclc/generic/lib/math/trunc.cl b/libclc/generic/lib/math/trunc.cl index 00c2a4a80015f..1d5f04a323054 100644 --- a/libclc/generic/lib/math/trunc.cl +++ b/libclc/generic/lib/math/trunc.cl @@ -3,4 +3,4 @@ #undef __CLC_FUNCTION #define __CLC_FUNCTION trunc -#include "unary_builtin.inc" +#include <clc/math/unary_builtin.inc>