Skip to content

Commit 513c52d

Browse files
committed
Fixed compiling on macOS with new OpenCL headers; the INT8 benchmark will now use the dp4a instruction if supported
1 parent c980082 commit 513c52d

File tree

2 files changed

+40
-21
lines changed

2 files changed

+40
-21
lines changed

src/kernel.cpp

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,24 @@ string opencl_c_container() { return R( // ########################## begin of O
33

44

55

6+
int dp4a(const char4 a, const char4 b, const int c) { // 4-wide byte dot product and accumulate
7+
)+"#if cl_nv_compute_capability>=61"+R( // use hardware-supported dp4a on Nvidia Pascal or newer GPUs with inline PTX assembly
8+
int d;)+"asm(\"dp4a.s32.s32\t%0,%1,%2,%3;\":\"=r\"(d):\"r\"(as_int(a)),\"r\"(as_int(b)),\"r\"(c));"+R(
9+
return d;
10+
)+"#else"+R( // fallback emulation (compilers will turn this into hardware-supported dp4a instruction if available)
11+
return c+a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
12+
)+"#endif"+R(
13+
}
14+
15+
16+
617
)+"#ifdef cl_khr_fp64"+R( // OpenCL C defines don't work in R() stringification macro
718
kernel void kernel_double(global float* data) {
819
double x = (double)get_global_id(0);
920
double y = (double)get_local_id(0);
1021
for(uint i=0u; i<128u; i++) {
11-
x = fma(y, x, y);
12-
y = fma(x, y, x);
22+
x = fma(y, x, y); // 2 operations
23+
y = fma(x, y, x); // 2 operations
1324
}
1425
data[get_global_id(0)] = (float)y;
1526
}
@@ -19,8 +30,8 @@ kernel void kernel_float(global float* data) {
1930
float x = (float)get_global_id(0);
2031
float y = (float)get_local_id(0);
2132
for(uint i=0u; i<512u; i++) {
22-
x = fma(y, x, y);
23-
y = fma(x, y, x);
33+
x = fma(y, x, y); // 2 operations
34+
y = fma(x, y, x); // 2 operations
2435
}
2536
data[get_global_id(0)] = y;
2637
}
@@ -30,8 +41,8 @@ kernel void kernel_half(global float* data) {
3041
half2 x = (half2)((float)get_global_id(0), (float)get_local_id(0));
3142
half2 y = (half2)((float)get_local_id(0), (float)get_global_id(0));
3243
for(uint i=0u; i<512u; i++) {
33-
x = y*x+y;
34-
y = x*y+x;
44+
x = y*x+y; // 4 operations
45+
y = x*y+x; // 4 operations
3546
}
3647
data[get_global_id(0)] = (float)y.x+(float)y.y;
3748
}
@@ -41,8 +52,8 @@ kernel void kernel_long(global float* data) {
4152
long x = (long)get_global_id(0);
4253
long y = (long)get_local_id(0);
4354
for(uint i=0u; i<8u; i++) {
44-
x = y*x+y;
45-
y = x*y+x;
55+
x = y*x+y; // 2 operations
56+
y = x*y+x; // 2 operations
4657
}
4758
data[get_global_id(0)] = as_float((int)y);
4859
}
@@ -51,28 +62,28 @@ kernel void kernel_int(global float* data) {
5162
int x = get_global_id(0);
5263
int y = get_local_id(0);
5364
for(uint i=0u; i<512u; i++) {
54-
x = y*x+y;
55-
y = x*y+x;
65+
x = y*x+y; // 2 operations
66+
y = x*y+x; // 2 operations
5667
}
5768
data[get_global_id(0)] = as_float(y);
5869
}
5970

6071
kernel void kernel_short(global float* data) {
61-
short2 x = as_short2((int)get_global_id(0));
62-
short2 y = as_short2((int)get_local_id(0));
72+
short2 x = as_short2((uint)get_global_id(0));
73+
short2 y = as_short2((uint)get_local_id(0));
6374
for(uint i=0u; i<128u; i++) {
64-
x = y*x+y;
65-
y = x*y+x;
75+
x = y*x+y; // 4 operations
76+
y = x*y+x; // 4 operations
6677
}
6778
data[get_global_id(0)] = as_float(y);
6879
}
6980

7081
kernel void kernel_char(global float* data) {
71-
char4 x = as_char4((int)get_global_id(0));
72-
char4 y = as_char4((int)get_local_id(0));
82+
char4 x = as_char4((uint)get_global_id(0));
83+
char4 y = as_char4((uint)get_local_id(0));
7384
for(uint i=0u; i<64u; i++) {
74-
x = y*x+y;
75-
y = x*y+x;
85+
x = as_char4(dp4a(y, x, as_int(y))); // 8 operations
86+
y = as_char4(dp4a(x, y, as_int(x))); // 8 operations
7687
}
7788
data[get_global_id(0)] = as_float(y);
7889
}
@@ -81,7 +92,7 @@ kernel void kernel_char(global float* data) {
8192

8293
kernel void kernel_coalesced_write(global float* data) {
8394
const uint n = get_global_id(0);
84-
for(uint i=0u; i<def_M; i++) data[i*def_N+n] = (float)n; // coalesced write
95+
for(uint i=0u; i<def_M; i++) data[i*def_N+n] = as_float(n); // coalesced write
8596
}
8697
kernel void kernel_coalesced_read(global float* data) {
8798
const uint n = get_global_id(0);
@@ -91,7 +102,7 @@ kernel void kernel_coalesced_read(global float* data) {
91102
}
92103
kernel void kernel_misaligned_write(global float* data) {
93104
const uint n = get_global_id(0);
94-
for(uint i=0u; i<def_M; i++) data[n*def_M+i] = (float)n; // misaligned write
105+
for(uint i=0u; i<def_M; i++) data[n*def_M+i] = as_float(n); // misaligned write
95106
}
96107
kernel void kernel_misaligned_read(global float* data) {
97108
const uint n = get_global_id(0);

src/opencl.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
// https://github.com/KhronosGroup/OpenCL-Headers
88
// https://github.com/KhronosGroup/OpenCL-CLHPP
99
#define CL_HPP_MINIMUM_OPENCL_VERSION 100
10-
#define CL_HPP_TARGET_OPENCL_VERSION 300
10+
#if !defined(__APPLE__) // Windows/Linux/Android
11+
#define CL_HPP_TARGET_OPENCL_VERSION 300 // Windows/Linux/Android can use OpenCL 3.0
12+
#else // macOS
13+
#define CL_HPP_TARGET_OPENCL_VERSION 120 // macOS only supports OpenCL 1.2
14+
#endif // macOS
1115
#include <CL/opencl.hpp>
1216
#include "utilities.hpp"
1317
using cl::Event;
@@ -133,7 +137,9 @@ struct Device_Info {
133137
const bool amd_128_cores_per_dualcu = contains(to_lower(name), "gfx10"); // identify RDNA/RDNA2 GPUs where dual CUs are reported
134138
const bool amd_256_cores_per_dualcu = contains(to_lower(name), "gfx11"); // identify RDNA3 GPUs where dual CUs are reported
135139
cores_per_cu = is_gpu ? (amd_256_cores_per_dualcu ? 256.0f : amd_128_cores_per_dualcu ? 128.0f : 64.0f) : 0.5f; // 64 cores/CU (GCN, CDNA), 128 cores/dualCU (RDNA, RDNA2), 256 cores/dualCU (RDNA3), 1/2 core/CU (CPUs)
140+
#if !defined(__APPLE__) // AMD OpenCL extensions are not supported on macOS
136141
if(is_gpu) name = trim(cl_device.getInfo<CL_DEVICE_BOARD_NAME_AMD>()); // for AMD GPUs, CL_DEVICE_NAME wrongly outputs chip codename, and CL_DEVICE_BOARD_NAME_AMD outputs actual device name
142+
#endif // macOS
137143
} else if(vendor_id==0x8086) { // Intel GPU/CPU
138144
const bool intel_16_cores_per_cu = contains_any(to_lower(name), {"gpu max", "140v", "130v", "b580", "b570"}); // identify PVC/Xe2 GPUs
139145
cores_per_cu = is_gpu ? (intel_16_cores_per_cu ? 16.0f : 8.0f) : 0.5f; // Intel GPUs have 16 cores/CU (PVC) or 8 cores/CU (integrated/Arc), Intel CPUs (with HT) have 1/2 core/CU
@@ -146,7 +152,9 @@ struct Device_Info {
146152
}
147153
patch_intel_gpu_above_4gb = patch_intel_gpu_above_4gb||(is_gpu&&memory>4096u); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
148154
} else if(vendor_id==0x10DE||vendor_id==0x13B5) { // Nvidia GPU/CPU
155+
#if !defined(__APPLE__) // Nvidia OpenCL extensions are not supported on macOS
149156
nvidia_compute_capability = 10u*(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>()+(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>();
157+
#endif // macOS
150158
const bool nvidia__32_cores_per_cu = (nvidia_compute_capability <30); // identify Fermi GPUs
151159
const bool nvidia_192_cores_per_cu = (nvidia_compute_capability>=30&&nvidia_compute_capability< 50); // identify Kepler GPUs
152160
const bool nvidia__64_cores_per_cu = (nvidia_compute_capability>=70&&nvidia_compute_capability<=80)||nvidia_compute_capability==60; // identify Volta, Turing, P100, A100, A30

0 commit comments

Comments (0)