|
16 | 16 | #include <arm_sve.h> |
17 | 17 | #endif // __ARM_FEATURE_SVE |
18 | 18 |
|
| 19 | +#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) |
| 20 | +// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
| 21 | +// |
| 22 | +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
| 23 | +// |
| 24 | +#include <arm_neon.h> |
| 25 | +#endif |
| 26 | + |
19 | 27 | #if defined(__F16C__) |
20 | 28 | #include <immintrin.h> |
21 | 29 | #endif |
@@ -303,35 +311,29 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); |
303 | 311 |
|
304 | 312 | // FP16 to FP32 conversion |
305 | 313 |
|
306 | | -// 16-bit float |
307 | | -// on Arm, we use __fp16 |
308 | | -// on x86, we use uint16_t |
309 | | -// |
310 | | -// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 |
311 | | -// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 |
312 | | -// |
313 | | -#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) |
314 | | - |
315 | | - // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
316 | | - // |
317 | | - // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
318 | | - // |
319 | | - #include <arm_neon.h> |
| 314 | +#if defined(__ARM_NEON) |
| 315 | + #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) |
| 316 | + typedef uint16_t ggml_fp16_internal_t; |
| 317 | + #else |
| 318 | + typedef __fp16 ggml_fp16_internal_t; |
| 319 | + #endif |
| 320 | +#endif |
320 | 321 |
|
| 322 | +#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) |
321 | 323 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) |
322 | 324 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) |
323 | 325 |
|
324 | 326 | #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) |
325 | 327 |
|
326 | 328 | static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { |
327 | | - __fp16 tmp; |
| 329 | + ggml_fp16_internal_t tmp; |
328 | 330 | memcpy(&tmp, &h, sizeof(ggml_fp16_t)); |
329 | 331 | return (float)tmp; |
330 | 332 | } |
331 | 333 |
|
332 | 334 | static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { |
333 | 335 | ggml_fp16_t res; |
334 | | - __fp16 tmp = f; |
| 336 | + ggml_fp16_internal_t tmp = f; |
335 | 337 | memcpy(&res, &tmp, sizeof(ggml_fp16_t)); |
336 | 338 | return res; |
337 | 339 | } |
@@ -483,7 +485,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); |
483 | 485 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) |
484 | 486 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) |
485 | 487 |
|
486 | | -#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) |
| 488 | +#endif // defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) |
487 | 489 |
|
488 | 490 | // precomputed f32 table for f16 (256 KB) |
489 | 491 | // defined in ggml.c, initialized in ggml_init() |
|
0 commit comments