Skip to content

Commit 513c52d

Browse files
committed
Fixed compiling on macOS with new OpenCL headers; the INT8 benchmark will now use the dp4a instruction if supported
1 parent c980082 commit 513c52d

File tree

2 files changed

+40
-21
lines changed

2 files changed

+40
-21
lines changed

src/kernel.cpp

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,24 @@ string opencl_c_container() { return R( // ########################## begin of O
33

44

55

6+
int dp4a(const char4 a, const char4 b, const int c) { // 4-wide byte dot product and accumulate
7+
)+"#if cl_nv_compute_capability>=61"+R( // use hardware-supported dp4a on Nvidia Pascal or newer GPUs with inline PTX assembly
8+
int d;)+"asm(\"dp4a.s32.s32\t%0,%1,%2,%3;\":\"=r\"(d):\"r\"(as_int(a)),\"r\"(as_int(b)),\"r\"(c));"+R(
9+
return d;
10+
)+"#else"+R( // fallback emulation (compilers will turn this into hardware-supported dp4a instruction if available)
11+
return c+a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
12+
)+"#endif"+R(
13+
}
14+
15+
16+
617
)+"#ifdef cl_khr_fp64"+R( // OpenCL C defines don't work in R() stringification macro
718
kernel void kernel_double(global float* data) {
819
double x = (double)get_global_id(0);
920
double y = (double)get_local_id(0);
1021
for(uint i=0u; i<128u; i++) {
11-
x = fma(y, x, y);
12-
y = fma(x, y, x);
22+
x = fma(y, x, y); // 2 operations
23+
y = fma(x, y, x); // 2 operations
1324
}
1425
data[get_global_id(0)] = (float)y;
1526
}
@@ -19,8 +30,8 @@ kernel void kernel_float(global float* data) {
1930
float x = (float)get_global_id(0);
2031
float y = (float)get_local_id(0);
2132
for(uint i=0u; i<512u; i++) {
22-
x = fma(y, x, y);
23-
y = fma(x, y, x);
33+
x = fma(y, x, y); // 2 operations
34+
y = fma(x, y, x); // 2 operations
2435
}
2536
data[get_global_id(0)] = y;
2637
}
@@ -30,8 +41,8 @@ kernel void kernel_half(global float* data) {
3041
half2 x = (half2)((float)get_global_id(0), (float)get_local_id(0));
3142
half2 y = (half2)((float)get_local_id(0), (float)get_global_id(0));
3243
for(uint i=0u; i<512u; i++) {
33-
x = y*x+y;
34-
y = x*y+x;
44+
x = y*x+y; // 4 operations
45+
y = x*y+x; // 4 operations
3546
}
3647
data[get_global_id(0)] = (float)y.x+(float)y.y;
3748
}
@@ -41,8 +52,8 @@ kernel void kernel_long(global float* data) {
4152
long x = (long)get_global_id(0);
4253
long y = (long)get_local_id(0);
4354
for(uint i=0u; i<8u; i++) {
44-
x = y*x+y;
45-
y = x*y+x;
55+
x = y*x+y; // 2 operations
56+
y = x*y+x; // 2 operations
4657
}
4758
data[get_global_id(0)] = as_float((int)y);
4859
}
@@ -51,28 +62,28 @@ kernel void kernel_int(global float* data) {
5162
int x = get_global_id(0);
5263
int y = get_local_id(0);
5364
for(uint i=0u; i<512u; i++) {
54-
x = y*x+y;
55-
y = x*y+x;
65+
x = y*x+y; // 2 operations
66+
y = x*y+x; // 2 operations
5667
}
5768
data[get_global_id(0)] = as_float(y);
5869
}
5970

6071
kernel void kernel_short(global float* data) {
61-
short2 x = as_short2((int)get_global_id(0));
62-
short2 y = as_short2((int)get_local_id(0));
72+
short2 x = as_short2((uint)get_global_id(0));
73+
short2 y = as_short2((uint)get_local_id(0));
6374
for(uint i=0u; i<128u; i++) {
64-
x = y*x+y;
65-
y = x*y+x;
75+
x = y*x+y; // 4 operations
76+
y = x*y+x; // 4 operations
6677
}
6778
data[get_global_id(0)] = as_float(y);
6879
}
6980

7081
kernel void kernel_char(global float* data) {
71-
char4 x = as_char4((int)get_global_id(0));
72-
char4 y = as_char4((int)get_local_id(0));
82+
char4 x = as_char4((uint)get_global_id(0));
83+
char4 y = as_char4((uint)get_local_id(0));
7384
for(uint i=0u; i<64u; i++) {
74-
x = y*x+y;
75-
y = x*y+x;
85+
x = as_char4(dp4a(y, x, as_int(y))); // 8 operations
86+
y = as_char4(dp4a(x, y, as_int(x))); // 8 operations
7687
}
7788
data[get_global_id(0)] = as_float(y);
7889
}
@@ -81,7 +92,7 @@ kernel void kernel_char(global float* data) {
8192

8293
kernel void kernel_coalesced_write(global float* data) {
8394
const uint n = get_global_id(0);
84-
for(uint i=0u; i<def_M; i++) data[i*def_N+n] = (float)n; // coalesced write
95+
for(uint i=0u; i<def_M; i++) data[i*def_N+n] = as_float(n); // coalesced write
8596
}
8697
kernel void kernel_coalesced_read(global float* data) {
8798
const uint n = get_global_id(0);
@@ -91,7 +102,7 @@ kernel void kernel_coalesced_read(global float* data) {
91102
}
92103
kernel void kernel_misaligned_write(global float* data) {
93104
const uint n = get_global_id(0);
94-
for(uint i=0u; i<def_M; i++) data[n*def_M+i] = (float)n; // misaligned write
105+
for(uint i=0u; i<def_M; i++) data[n*def_M+i] = as_float(n); // misaligned write
95106
}
96107
kernel void kernel_misaligned_read(global float* data) {
97108
const uint n = get_global_id(0);

src/opencl.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
// https://github.com/KhronosGroup/OpenCL-Headers
88
// https://github.com/KhronosGroup/OpenCL-CLHPP
99
#define CL_HPP_MINIMUM_OPENCL_VERSION 100
10-
#define CL_HPP_TARGET_OPENCL_VERSION 300
10+
#if !defined(__APPLE__) // Windows/Linux/Android
11+
#define CL_HPP_TARGET_OPENCL_VERSION 300 // Windows/Linux/Android can use OpenCL 3.0
12+
#else // macOS
13+
#define CL_HPP_TARGET_OPENCL_VERSION 120 // macOS only supports OpenCL 1.2
14+
#endif // macOS
1115
#include <CL/opencl.hpp>
1216
#include "utilities.hpp"
1317
using cl::Event;
@@ -133,7 +137,9 @@ struct Device_Info {
133137
const bool amd_128_cores_per_dualcu = contains(to_lower(name), "gfx10"); // identify RDNA/RDNA2 GPUs where dual CUs are reported
134138
const bool amd_256_cores_per_dualcu = contains(to_lower(name), "gfx11"); // identify RDNA3 GPUs where dual CUs are reported
135139
cores_per_cu = is_gpu ? (amd_256_cores_per_dualcu ? 256.0f : amd_128_cores_per_dualcu ? 128.0f : 64.0f) : 0.5f; // 64 cores/CU (GCN, CDNA), 128 cores/dualCU (RDNA, RDNA2), 256 cores/dualCU (RDNA3), 1/2 core/CU (CPUs)
140+
#if !defined(__APPLE__) // AMD OpenCL extensions are not supported on macOS
136141
if(is_gpu) name = trim(cl_device.getInfo<CL_DEVICE_BOARD_NAME_AMD>()); // for AMD GPUs, CL_DEVICE_NAME wrongly outputs chip codename, and CL_DEVICE_BOARD_NAME_AMD outputs actual device name
142+
#endif // macOS
137143
} else if(vendor_id==0x8086) { // Intel GPU/CPU
138144
const bool intel_16_cores_per_cu = contains_any(to_lower(name), {"gpu max", "140v", "130v", "b580", "b570"}); // identify PVC/Xe2 GPUs
139145
cores_per_cu = is_gpu ? (intel_16_cores_per_cu ? 16.0f : 8.0f) : 0.5f; // Intel GPUs have 16 cores/CU (PVC) or 8 cores/CU (integrated/Arc), Intel CPUs (with HT) have 1/2 core/CU
@@ -146,7 +152,9 @@ struct Device_Info {
146152
}
147153
patch_intel_gpu_above_4gb = patch_intel_gpu_above_4gb||(is_gpu&&memory>4096u); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
148154
} else if(vendor_id==0x10DE||vendor_id==0x13B5) { // Nvidia GPU/CPU
155+
#if !defined(__APPLE__) // Nvidia OpenCL extensions are not supported on macOS
149156
nvidia_compute_capability = 10u*(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>()+(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>();
157+
#endif // macOS
150158
const bool nvidia__32_cores_per_cu = (nvidia_compute_capability <30); // identify Fermi GPUs
151159
const bool nvidia_192_cores_per_cu = (nvidia_compute_capability>=30&&nvidia_compute_capability< 50); // identify Kepler GPUs
152160
const bool nvidia__64_cores_per_cu = (nvidia_compute_capability>=70&&nvidia_compute_capability<=80)||nvidia_compute_capability==60; // identify Volta, Turing, P100, A100, A30

0 commit comments

Comments (0)