Skip to content

Commit e99ee93

Browse files
EJainDev and CNugteren authored
cl_khr_expect_assume Extension Support (CNugteren#610)
* Added support for the cl_khr_expect_assume extension to enable some optimizations by the compiler.
* Undid 'use' of parameters with value 0 in gemv.
---------
Co-authored-by: Cedric Nugteren <web@cedricnugteren.nl>
1 parent be9922d commit e99ee93

File tree

8 files changed

+58
-0
lines changed

8 files changed

+58
-0
lines changed

src/kernels/common.opencl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@
1212
R"(
1313
// =================================================================================================
1414

15+
// Enables the cl_khr_expect_assume extension, but only when the cl_khr_expect_assume macro is
// defined, so the pragma is never emitted for a compiler that does not define it.
#if defined(cl_khr_expect_assume)
16+
#pragma OPENCL EXTENSION cl_khr_expect_assume : enable
17+
#endif
18+
19+
// Fallback for preprocessors that do not provide __has_builtin: define it to always yield 0, so
// that guards of the form '#if __has_builtin(__builtin_assume)' in the kernels evaluate to false
// and the guarded __builtin_assume hints are left out.
#if !defined(__has_builtin)
20+
#define __has_builtin(x) 0
21+
#endif
22+
1523
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
1624
// this file is used outside of the CLBlast library.
1725
#ifndef PRECISION

src/kernels/level1/xaxpy.opencl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ void Xaxpy(const int n, const real_arg arg_alpha,
4545
void XaxpyFaster(const int n, const real_arg arg_alpha,
4646
const __global realV* restrict xgm,
4747
__global realV* ygm) {
48+
#if __has_builtin(__builtin_assume)
49+
__builtin_assume(n % VW == 0);
50+
__builtin_assume(n % WPT == 0);
51+
#endif
52+
4853
const real alpha = GetRealArg(arg_alpha);
4954

5055
const int num_usefull_threads = n / (VW * WPT);
@@ -69,6 +74,12 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
6974
void XaxpyFastest(const int n, const real_arg arg_alpha,
7075
const __global realV* restrict xgm,
7176
__global realV* ygm) {
77+
#if __has_builtin(__builtin_assume)
78+
__builtin_assume(n % VW == 0);
79+
__builtin_assume(n % WPT == 0);
80+
__builtin_assume(n % WGS == 0);
81+
#endif
82+
7283
const real alpha = GetRealArg(arg_alpha);
7384

7485
#pragma unroll

src/kernels/level1/xcopy.opencl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ void Xcopy(const int n,
4545
void XcopyFast(const int n,
4646
const __global realV* restrict xgm,
4747
__global realV* ygm) {
48+
#if __has_builtin(__builtin_assume)
49+
__builtin_assume(n % VW == 0);
50+
__builtin_assume(n % WPT == 0);
51+
__builtin_assume(n % WGS == 0);
52+
#endif
4853
#pragma unroll
4954
for (int _w = 0; _w < WPT; _w += 1) {
5055
const int id = _w*get_global_size(0) + get_global_id(0);

src/kernels/level1/xhad.opencl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
9898
void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
9999
const __global realV* restrict xgm, const __global realV* restrict ygm,
100100
__global realV* zgm) {
101+
#if __has_builtin(__builtin_assume)
102+
__builtin_assume(n % VW == 0);
103+
__builtin_assume(n % WPT == 0);
104+
#endif
101105
const real alpha = GetRealArg(arg_alpha);
102106
const real beta = GetRealArg(arg_beta);
103107

@@ -129,6 +133,11 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
129133
void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
130134
const __global realV* restrict xgm, const __global realV* restrict ygm,
131135
__global realV* zgm) {
136+
#if __has_builtin(__builtin_assume)
137+
__builtin_assume(n % VW == 0);
138+
__builtin_assume(n % WPT == 0);
139+
__builtin_assume(n % WGS == 0);
140+
#endif
132141
const real alpha = GetRealArg(arg_alpha);
133142
const real beta = GetRealArg(arg_beta);
134143

src/kernels/level1/xscal.opencl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ void Xscal(const int n, const real_arg arg_alpha,
4747
#endif
4848
void XscalFast(const int n, const real_arg arg_alpha,
4949
__global realV* xgm) {
50+
#if __has_builtin(__builtin_assume)
51+
__builtin_assume(n % VW == 0);
52+
__builtin_assume(n % WPT == 0);
53+
__builtin_assume(n % WGS == 0);
54+
#endif
5055
const real alpha = GetRealArg(arg_alpha);
5156

5257
#pragma unroll

src/kernels/level1/xswap.opencl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ void Xswap(const int n,
4747
void XswapFast(const int n,
4848
__global realV* xgm,
4949
__global realV* ygm) {
50+
#if __has_builtin(__builtin_assume)
51+
__builtin_assume(n % VW == 0);
52+
__builtin_assume(n % WPT == 0);
53+
__builtin_assume(n % WGS == 0);
54+
#endif
5055
#pragma unroll
5156
for (int _w = 0; _w < WPT; _w += 1) {
5257
const int id = _w*get_global_size(0) + get_global_id(0);

src/kernels/level2/xgemv_fast.opencl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ void XgemvFast(const int m, const int n,
9898
__global real* ygm, const int y_offset, const int y_inc,
9999
const int do_conjugate, const int parameter,
100100
const int kl_unused, const int ku_unused) {
101+
#if __has_builtin(__builtin_assume)
102+
__builtin_assume(m % WGS2 == 0);
103+
__builtin_assume(n % WGS2 == 0);
104+
__builtin_assume(a_ld % VW2 == 0);
105+
#endif
101106
const real alpha = GetRealArg(arg_alpha);
102107
const real beta = GetRealArg(arg_beta);
103108

@@ -205,6 +210,12 @@ void XgemvFastRot(const int m, const int n,
205210
__global real* ygm, const int y_offset, const int y_inc,
206211
const int do_conjugate, const int parameter,
207212
const int kl_unused, const int ku_unused) {
213+
#if __has_builtin(__builtin_assume)
214+
__builtin_assume(m % WGS3 == 0);
215+
__builtin_assume(n % WGS3 == 0);
216+
__builtin_assume(a_ld % VW3 == 0);
217+
#endif
218+
208219
const real alpha = GetRealArg(arg_alpha);
209220
const real beta = GetRealArg(arg_beta);
210221

src/kernels/level3/copy_fast.opencl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ void CopyMatrixFast(const int ld,
4040
__global const realC* restrict src,
4141
__global realC* dest,
4242
const real_arg arg_alpha) {
43+
#if __has_builtin(__builtin_assume)
44+
__builtin_assume(ld % COPY_VW == 0);
45+
#endif
46+
4347
const real alpha = GetRealArg(arg_alpha);
4448
#pragma unroll
4549
for (int _w_one = 0; _w_one < COPY_WPT; _w_one += 1) {

0 commit comments

Comments
 (0)