|
16 | 16 | #include <arm_sve.h> |
17 | 17 | #endif // __ARM_FEATURE_SVE |
18 | 18 |
|
19 | | -#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) |
20 | | -// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
21 | | -// |
22 | | -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
23 | | -// |
24 | | -#include <arm_neon.h> |
25 | | -#endif |
26 | | - |
27 | 19 | #if defined(__F16C__) |
28 | 20 | #include <immintrin.h> |
29 | 21 | #endif |
@@ -311,29 +303,35 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); |
311 | 303 |
|
312 | 304 | // FP16 to FP32 conversion |
313 | 305 |
|
314 | | -#if defined(__ARM_NEON) |
315 | | - #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) |
316 | | - typedef uint16_t ggml_fp16_internal_t; |
317 | | - #else |
318 | | - typedef __fp16 ggml_fp16_internal_t; |
319 | | - #endif |
320 | | -#endif |
| 306 | +// 16-bit float |
| 307 | +// on Arm, we use __fp16 |
| 308 | +// on x86, we use uint16_t |
| 309 | +// |
| 310 | +// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 |
| 311 | +// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 |
| 312 | +// |
| 313 | +#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) |
| 314 | + |
| 315 | + // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
| 316 | + // |
| 317 | + // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
| 318 | + // |
| 319 | + #include <arm_neon.h> |
321 | 320 |
|
322 | | -#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) |
323 | 321 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) |
324 | 322 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) |
325 | 323 |
|
326 | 324 | #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) |
327 | 325 |
|
328 | 326 | static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { |
329 | | - ggml_fp16_internal_t tmp; |
| 327 | + __fp16 tmp; |
330 | 328 | memcpy(&tmp, &h, sizeof(ggml_fp16_t)); |
331 | 329 | return (float)tmp; |
332 | 330 | } |
333 | 331 |
|
334 | 332 | static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { |
335 | 333 | ggml_fp16_t res; |
336 | | - ggml_fp16_internal_t tmp = f; |
| 334 | + __fp16 tmp = f; |
337 | 335 | memcpy(&res, &tmp, sizeof(ggml_fp16_t)); |
338 | 336 | return res; |
339 | 337 | } |
@@ -485,7 +483,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); |
485 | 483 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) |
486 | 484 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) |
487 | 485 |
|
488 | | -#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) |
| 486 | +#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) |
489 | 487 |
|
490 | 488 | // precomputed f32 table for f16 (256 KB) |
491 | 489 | // defined in ggml.c, initialized in ggml_init() |
|
0 commit comments