Skip to content

Commit c764fc5

Browse files
committed
ggml : simlpify Arm fp16 CPU logic
ggml-ci
1 parent 819b7d7 commit c764fc5

File tree

3 files changed

+13
-35
lines changed

3 files changed

+13
-35
lines changed

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44

55
#include "ggml.h"
66
#include "ggml-impl.h"
7+
78
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
89
//#include <stddef.h>
910
#include <stdbool.h>
1011
#include <string.h> // memcpy
1112
#include <math.h> // fabsf
1213

13-
1414
#ifdef __cplusplus
1515
extern "C" {
1616
#endif
@@ -69,33 +69,16 @@ struct ggml_compute_params {
6969
#endif
7070

7171
#if defined(__ARM_FEATURE_SVE)
72-
#include <arm_sve.h>
7372
#include <sys/prctl.h>
7473
#endif
7574

76-
// 16-bit float
77-
// on Arm, we use __fp16
78-
// on x86, we use uint16_t
7975
#if defined(__ARM_NEON)
8076

81-
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
82-
//
83-
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
84-
//
85-
#include <arm_neon.h>
86-
77+
// ref: https://github.com/ggml-org/llama.cpp/pull/5404
8778
#ifdef _MSC_VER
88-
89-
typedef uint16_t ggml_fp16_internal_t;
90-
9179
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
92-
9380
#else
94-
95-
typedef __fp16 ggml_fp16_internal_t;
96-
9781
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
98-
9982
#endif // _MSC_VER
10083

10184
#if !defined(__aarch64__)

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@
7171
#define GGML_F16x8 float16x8_t
7272
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
7373
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
74-
#define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
74+
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
7575
#define GGML_F16x8_STORE vst1q_f16
7676
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
7777
#define GGML_F16x8_ADD vaddq_f16
@@ -99,7 +99,7 @@
9999
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
100100
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
101101
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
102-
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
102+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
103103
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
104104
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
105105
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
@@ -114,7 +114,7 @@
114114
#define GGML_F32Cx4 float32x4_t
115115
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
116116
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
117-
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
117+
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
118118
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
119119
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
120120
#define GGML_F32Cx4_ADD vaddq_f32
@@ -125,7 +125,7 @@
125125
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
126126
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
127127
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
128-
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
128+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
129129
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
130130
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
131131
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL

ggml/src/ggml-impl.h

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include <arm_sve.h>
1717
#endif // __ARM_FEATURE_SVE
1818

19-
#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
19+
#if defined(__ARM_NEON)
2020
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
2121
//
2222
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
@@ -311,29 +311,24 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
311311

312312
// FP16 to FP32 conversion
313313

314+
// 16-bit float
315+
// on Arm, we use __fp16
316+
// on x86, we use uint16_t
314317
#if defined(__ARM_NEON)
315-
#if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
316-
typedef uint16_t ggml_fp16_internal_t;
317-
#else
318-
typedef __fp16 ggml_fp16_internal_t;
319-
#endif
320-
#endif
321-
322-
#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
323318
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
324319
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
325320

326321
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
327322

328323
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
329-
ggml_fp16_internal_t tmp;
324+
__fp16 tmp;
330325
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
331326
return (float)tmp;
332327
}
333328

334329
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
335330
ggml_fp16_t res;
336-
ggml_fp16_internal_t tmp = f;
331+
__fp16 tmp = f;
337332
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
338333
return res;
339334
}
@@ -485,7 +480,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
485480
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
486481
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
487482

488-
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
483+
#endif // defined(__ARM_NEON)
489484

490485
// precomputed f32 table for f16 (256 KB)
491486
// defined in ggml.c, initialized in ggml_init()

0 commit comments

Comments
 (0)