|
259 | 259 | #define cublasComputeType_t cudaDataType_t |
260 | 260 |
|
261 | 261 | // XXX: Clang builtins mapping |
262 | | -#define __vsubss4 __vsubss4_musa |
263 | 262 | #define __vsub4 __vsub4_musa |
264 | 263 | #define __vcmpeq4 __vcmpeq4_musa |
265 | 264 | #define __vcmpne4 __vcmpne4_musa |
@@ -372,30 +371,10 @@ typedef float2 dfloat2; |
372 | 371 | #define __has_builtin(x) 0 |
373 | 372 | #endif |
374 | 373 |
|
375 | | -typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); |
376 | 374 | typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); |
377 | | -static __device__ __forceinline__ int __vsubss4_musa(const int a, const int b) { |
378 | | - const int8x4_t va = reinterpret_cast<const int8x4_t&>(a); |
379 | | - const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b); |
380 | | -#if __has_builtin(__builtin_elementwise_sub_sat) |
381 | | - const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); |
382 | | - return reinterpret_cast<const int &>(c); |
383 | | -#else |
384 | | - int8x4_t c; |
385 | | - int16_t tmp; |
386 | | -#pragma unroll |
387 | | - for (int i = 0; i < 4; i++) { |
388 | | - tmp = va[i] - vb[i]; |
389 | | - if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max(); |
390 | | - if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min(); |
391 | | - c[i] = tmp; |
392 | | - } |
393 | | - return reinterpret_cast<int &>(c); |
394 | | -#endif // __has_builtin(__builtin_elementwise_sub_sat) |
395 | | -} |
396 | 375 |
|
397 | 376 | static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { |
398 | | - return __vsubss4_musa(a, b); |
| 377 | + return __vsubss4(a, b); |
399 | 378 | } |
400 | 379 |
|
401 | 380 | static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { |
|
0 commit comments