|
25 | 25 | __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \ |
26 | 26 | __min_vector_width__(256))) |
27 | 27 |
|
| 28 | +#ifndef __SYCL_DEVICE_ONLY__ |
28 | 29 | /// Convert scalar BF16 (16-bit) floating-point element |
29 | 30 | /// stored at memory locations starting at location \a __A to a |
30 | 31 | /// single-precision (32-bit) floating-point, broadcast it to packed |
@@ -90,6 +91,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
90 | 91 | _mm256_bcstnebf16_ps(const void *__A) { |
91 | 92 | return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A); |
92 | 93 | } |
| 94 | +#endif |
93 | 95 |
|
94 | 96 | /// Convert scalar half-precision (16-bit) floating-point element |
95 | 97 | /// stored at memory locations starting at location \a __A to a |
@@ -157,6 +159,7 @@ _mm256_bcstnesh_ps(const void *__A) { |
157 | 159 | return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A); |
158 | 160 | } |
159 | 161 |
|
| 162 | +#ifndef __SYCL_DEVICE_ONLY__ |
160 | 163 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements |
161 | 164 | /// stored at memory locations starting at location \a __A to packed |
162 | 165 | /// single-precision (32-bit) floating-point elements, and store the results in |
@@ -222,6 +225,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
222 | 225 | _mm256_cvtneebf16_ps(const __m256bh *__A) { |
223 | 226 | return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A); |
224 | 227 | } |
| 228 | +#endif |
225 | 229 |
|
226 | 230 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements |
227 | 231 | /// stored at memory locations starting at location \a __A to packed |
@@ -289,6 +293,7 @@ _mm256_cvtneeph_ps(const __m256h *__A) { |
289 | 293 | return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A); |
290 | 294 | } |
291 | 295 |
|
| 296 | +#ifndef __SYCL_DEVICE_ONLY__ |
292 | 297 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements |
293 | 298 | /// stored at memory locations starting at location \a __A to packed |
294 | 299 | /// single-precision (32-bit) floating-point elements, and store the results in |
@@ -354,6 +359,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
354 | 359 | _mm256_cvtneobf16_ps(const __m256bh *__A) { |
355 | 360 | return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A); |
356 | 361 | } |
| 362 | +#endif |
357 | 363 |
|
358 | 364 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements |
359 | 365 | /// stored at memory locations starting at location \a __A to packed |
@@ -421,6 +427,7 @@ _mm256_cvtneoph_ps(const __m256h *__A) { |
421 | 427 | return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A); |
422 | 428 | } |
423 | 429 |
|
| 430 | +#ifndef __SYCL_DEVICE_ONLY__ |
424 | 431 | /// Convert packed single-precision (32-bit) floating-point elements in \a __A |
425 | 432 | /// to packed BF16 (16-bit) floating-point elements, and store the results in \a |
426 | 433 | /// dst. |
@@ -476,6 +483,7 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS256 |
476 | 483 | _mm256_cvtneps_avx_pbh(__m256 __A) { |
477 | 484 | return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A); |
478 | 485 | } |
| 486 | +#endif |
479 | 487 |
|
480 | 488 | #undef __DEFAULT_FN_ATTRS128 |
481 | 489 | #undef __DEFAULT_FN_ATTRS256 |
|
0 commit comments