Skip to content

Commit ee3e739

Browse files
committed
Added hardware-supported dp4a on ARM GPUs
1 parent ef438e4 commit ee3e739

File tree

1 file changed

+2
-0
lines changed

1 file changed

+2
-0
lines changed

src/kernel.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,8 @@ int dp4a(const char4 a, const char4 b, const int c) { // 4-wide byte dot product
10 10
return c+dot(a, b); // dot_acc_sat(a, b, c); is slow
11 11
)+"#elif __has_builtin(__builtin_amdgcn_sdot4)"+R( // use hardware-supported dp4a on some AMD GPUs
12 12
return __builtin_amdgcn_sdot4(as_int(a), as_int(b), c, false);
13+
)+"#elif defined(cl_arm_integer_dot_product_accumulate_int8)"+R( // use hardware-supported dp4a on some ARM GPUs
14+
return arm_dot_acc(a, b, c);
13 15
)+"#else"+R( // fallback emulation (compilers will turn this into hardware-supported dp4a instruction if available)
14 16
return c+a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
15 17
)+"#endif"+R(

0 commit comments

Comments
 (0)