@@ -3,13 +3,24 @@ string opencl_c_container() { return R( // ########################## begin of O
33
44
55
6+ int dp4a (const char4 a, const char4 b, const int c) { // 4-wide byte dot product and accumulate
7+ )+" #if cl_nv_compute_capability>=61" +R ( // use hardware-supported dp4a on Nvidia Pascal or newer GPUs with inline PTX assembly
8+ int d;)+" asm(\" dp4a.s32.s32\t %0,%1,%2,%3;\" :\" =r\" (d):\" r\" (as_int(a)),\" r\" (as_int(b)),\" r\" (c));" +R (
9+ return d;
10+ )+" #else" +R ( // fallback emulation (compilers will turn this into hardware-supported dp4a instruction if available)
11+ return c+a.x *b.x +a.y *b.y +a.z *b.z +a.w *b.w ;
12+ )+" #endif" +R (
13+ }
14+
15+
16+
617)+" #ifdef cl_khr_fp64" +R ( // OpenCL C defines don't work in R() stringification macro
718kernel void kernel_double (global float * data) {
819 double x = (double )get_global_id (0 );
920 double y = (double )get_local_id (0 );
1021 for (uint i=0u ; i<128u ; i++) {
11- x = fma (y, x, y);
12- y = fma (x, y, x);
22+ x = fma (y, x, y); // 2 operations
23+ y = fma (x, y, x); // 2 operations
1324 }
1425 data[get_global_id (0 )] = (float )y;
1526}
@@ -19,8 +30,8 @@ kernel void kernel_float(global float* data) {
1930 float x = (float )get_global_id (0 );
2031 float y = (float )get_local_id (0 );
2132 for (uint i=0u ; i<512u ; i++) {
22- x = fma (y, x, y);
23- y = fma (x, y, x);
33+ x = fma (y, x, y); // 2 operations
34+ y = fma (x, y, x); // 2 operations
2435 }
2536 data[get_global_id (0 )] = y;
2637}
@@ -30,8 +41,8 @@ kernel void kernel_half(global float* data) {
3041 half2 x = (half2)((float )get_global_id (0 ), (float )get_local_id (0 ));
3142 half2 y = (half2)((float )get_local_id (0 ), (float )get_global_id (0 ));
3243 for (uint i=0u ; i<512u ; i++) {
33- x = y*x+y;
34- y = x*y+x;
44+ x = y*x+y; // 4 operations
45+ y = x*y+x; // 4 operations
3546 }
3647 data[get_global_id (0 )] = (float )y.x +(float )y.y ;
3748}
@@ -41,8 +52,8 @@ kernel void kernel_long(global float* data) {
4152 long x = (long )get_global_id (0 );
4253 long y = (long )get_local_id (0 );
4354 for (uint i=0u ; i<8u ; i++) {
44- x = y*x+y;
45- y = x*y+x;
55+ x = y*x+y; // 2 operations
56+ y = x*y+x; // 2 operations
4657 }
4758 data[get_global_id (0 )] = as_float ((int )y);
4859}
@@ -51,28 +62,28 @@ kernel void kernel_int(global float* data) {
5162 int x = get_global_id (0 );
5263 int y = get_local_id (0 );
5364 for (uint i=0u ; i<512u ; i++) {
54- x = y*x+y;
55- y = x*y+x;
65+ x = y*x+y; // 2 operations
66+ y = x*y+x; // 2 operations
5667 }
5768 data[get_global_id (0 )] = as_float (y);
5869}
5970
6071kernel void kernel_short (global float * data) {
61- short2 x = as_short2 ((int )get_global_id (0 ));
62- short2 y = as_short2 ((int )get_local_id (0 ));
72+ short2 x = as_short2 ((uint )get_global_id (0 ));
73+ short2 y = as_short2 ((uint )get_local_id (0 ));
6374 for (uint i=0u ; i<128u ; i++) {
64- x = y*x+y;
65- y = x*y+x;
75+ x = y*x+y; // 4 operations
76+ y = x*y+x; // 4 operations
6677 }
6778 data[get_global_id (0 )] = as_float (y);
6879}
6980
7081kernel void kernel_char (global float * data) {
71- char4 x = as_char4 ((int )get_global_id (0 ));
72- char4 y = as_char4 ((int )get_local_id (0 ));
82+ char4 x = as_char4 ((uint )get_global_id (0 ));
83+ char4 y = as_char4 ((uint )get_local_id (0 ));
7384 for (uint i=0u ; i<64u ; i++) {
74- x = y*x+y;
75- y = x*y+x;
85+ x = as_char4 ( dp4a (y, x, as_int (y))); // 8 operations
86+ y = as_char4 ( dp4a (x, y, as_int (x))); // 8 operations
7687 }
7788 data[get_global_id (0 )] = as_float (y);
7889}
@@ -81,7 +92,7 @@ kernel void kernel_char(global float* data) {
8192
8293kernel void kernel_coalesced_write (global float * data) {
8394 const uint n = get_global_id (0 );
84- for (uint i=0u ; i<def_M; i++) data[i*def_N+n] = ( float )n ; // coalesced write
95+ for (uint i=0u ; i<def_M; i++) data[i*def_N+n] = as_float (n) ; // coalesced write
8596}
8697kernel void kernel_coalesced_read (global float * data) {
8798 const uint n = get_global_id (0 );
@@ -91,7 +102,7 @@ kernel void kernel_coalesced_read(global float* data) {
91102}
92103kernel void kernel_misaligned_write (global float * data) {
93104 const uint n = get_global_id (0 );
94- for (uint i=0u ; i<def_M; i++) data[n*def_M+i] = ( float )n ; // misaligned write
105+ for (uint i=0u ; i<def_M; i++) data[n*def_M+i] = as_float (n) ; // misaligned write
95106}
96107kernel void kernel_misaligned_read (global float * data) {
97108 const uint n = get_global_id (0 );
0 commit comments