Skip to content

Commit 3b66959

Browse files
committed
dp4a is now also supported on AMD RDNA3+ GPUs
1 parent fcde761 commit 3b66959

File tree

1 file changed

+5
-3
lines changed

1 file changed

+5
-3
lines changed

src/kernel.cpp

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,12 @@ int dp4a(const char4 a, const char4 b, const int c) { // 4-wide byte dot product
88
int d;)+"asm(\"dp4a.s32.s32\t%0,%1,%2,%3;\":\"=r\"(d):\"r\"(as_int(a)),\"r\"(as_int(b)),\"r\"(c));"+R(return d;
99
)+"#elif defined(__opencl_c_integer_dot_product_input_4x8bit)"+R( // use hardware-supported dp4a on some Intel GPUs
1010
return c+dot(a, b); // dot_acc_sat(a, b, c); is slow
11-
)+"#elif __has_builtin(__builtin_amdgcn_sdot4)"+R( // use hardware-supported dp4a on some AMD GPUs
12-
return __builtin_amdgcn_sdot4(as_int(a), as_int(b), c, false);
11+
)+"#elif __has_builtin(__builtin_amdgcn_sdot4)"+R( // use hardware-supported dp4a on older AMD GPUs
12+
return __builtin_amdgcn_sdot4(as_int(a), as_int(b), c, false);
13+
)+"#elif __has_builtin(__builtin_amdgcn_sudot4)"+R( // use hardware-supported dp4a on newer AMD GPUs
14+
return __builtin_amdgcn_sudot4(true, as_int(a), true, as_int(b), c, false);
1315
)+"#elif defined(cl_arm_integer_dot_product_accumulate_int8)"+R( // use hardware-supported dp4a on some ARM GPUs
14-
return arm_dot_acc(a, b, c);
16+
return arm_dot_acc(a, b, c);
1517
)+"#else"+R( // fallback emulation (compilers will turn this into hardware-supported dp4a instruction if available)
1618
return c+a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
1719
)+"#endif"+R(

0 commit comments

Comments
 (0)