Skip to content

Commit f9c7ee7

Browse files
committed
Reapply "ggml-cpu: drop support for nnpa intrinsics (ggml-org#15821)"
1 parent bd57526 commit f9c7ee7

File tree

5 files changed

+85
-0
lines changed

5 files changed

+85
-0
lines changed

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ extern "C" {
101101
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102102
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103103
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104+
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
104105
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105106
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106107

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ struct ggml_compute_params {
6868
#endif // __VXE2__
6969
#endif // __s390x__ && __VEC__
7070

71+
#if defined(__s390x__) && defined(GGML_NNPA)
72+
#ifndef __NNPA__
73+
#define __NNPA__
74+
#endif // __NNPA__
75+
#endif // __s390x__ && GGML_NNPA
76+
7177
#if defined(__ARM_FEATURE_SVE)
7278
#include <sys/prctl.h>
7379
#endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4077,6 +4077,21 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
40774077
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
40784078
_mm_storel_epi64((__m128i *)(y + i), y_vec);
40794079
}
4080+
#elif defined(__NNPA__)
4081+
for (; i + 7 < n; i += 8) {
4082+
float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
4083+
float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
4084+
uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
4085+
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
4086+
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
4087+
}
4088+
for (; i + 3 < n; i += 4) {
4089+
float32x4_t v_x = vec_xl(0, (const float *)(x + i));
4090+
float32x4_t v_zero = vec_splats(0.0f);
4091+
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
4092+
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
4093+
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
4094+
}
40804095
#elif defined(__riscv_zvfh)
40814096
for (int vl; i < n; i += vl) {
40824097
vl = __riscv_vsetvl_e32m2(n - i);
@@ -4110,6 +4125,21 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
41104125
__m128 y_vec = _mm_cvtph_ps(x_vec);
41114126
_mm_storeu_ps(y + i, y_vec);
41124127
}
4128+
#elif defined(__NNPA__)
4129+
for (; i + 7 < n; i += 8) {
4130+
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
4131+
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
4132+
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
4133+
float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
4134+
vec_xst(v_yh, 0, (float *)(y + i + 0));
4135+
vec_xst(v_yl, 0, (float *)(y + i + 4));
4136+
}
4137+
for (; i + 3 < n; i += 4) {
4138+
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
4139+
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
4140+
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
4141+
vec_xst(v_yh, 0, (float *)(y + i));
4142+
}
41134143
#endif
41144144

41154145
for (; i < n; ++i) {
@@ -4313,6 +4343,14 @@ int ggml_cpu_has_vxe(void) {
43134343
#endif
43144344
}
43154345

4346+
int ggml_cpu_has_nnpa(void) {
4347+
#if defined(GGML_NNPA)
4348+
return 1;
4349+
#else
4350+
return 0;
4351+
#endif
4352+
}
4353+
43164354
int ggml_cpu_has_neon(void) {
43174355
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
43184356
return 1;

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
579579
if (ggml_cpu_has_vxe()) {
580580
features.push_back({ "VXE", "1" });
581581
}
582+
if (ggml_cpu_has_nnpa()) {
583+
features.push_back({ "NNPA", "1" });
584+
}
582585
if (ggml_cpu_has_wasm_simd()) {
583586
features.push_back({ "WASM_SIMD", "1" });
584587
}

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,26 @@ extern "C" {
114114
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
115115
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
116116
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
117+
#elif defined(__NNPA__)
118+
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
119+
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
120+
121+
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
122+
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
123+
124+
static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
125+
uint16x8_t v_h = vec_splats(h);
126+
uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
127+
return vec_extend_to_fp32_hi(v_hd, 0)[0];
128+
}
129+
130+
static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
131+
float32x4_t v_f = vec_splats(f);
132+
float32x4_t v_zero = vec_splats(0.0f);
133+
uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
134+
uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
135+
return vec_extract(v_h, 0);
136+
}
117137
#endif
118138

119139
// precomputed f32 table for f16 (256 KB)
@@ -1136,6 +1156,11 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
11361156
#define GGML_F16_EPR GGML_F32_EPR
11371157

11381158
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
1159+
#if defined(__NNPA__)
1160+
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
1161+
uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
1162+
return vec_extend_to_fp32_hi(v_xd, 0);
1163+
#else
11391164
float tmp[4];
11401165

11411166
for (int i = 0; i < 4; i++) {
@@ -1145,9 +1170,20 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
11451170
// note: keep type-cast here to prevent compiler bugs
11461171
// see: https://github.com/ggml-org/llama.cpp/issues/12846
11471172
return vec_xl(0, (const float *)(tmp));
1173+
#endif
11481174
}
11491175

11501176
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
1177+
#if defined(__NNPA__)
1178+
float32x4_t v_zero = vec_splats(0.0f);
1179+
uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
1180+
uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
1181+
1182+
x[0] = vec_extract(v_x, 0);
1183+
x[1] = vec_extract(v_x, 1);
1184+
x[2] = vec_extract(v_x, 2);
1185+
x[3] = vec_extract(v_x, 3);
1186+
#else
11511187
float arr[4];
11521188

11531189
// note: keep type-cast here to prevent compiler bugs
@@ -1157,6 +1193,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
11571193
for (int i = 0; i < 4; i++) {
11581194
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
11591195
}
1196+
#endif
11601197
}
11611198

11621199
#define GGML_F16_VEC GGML_F32x4

0 commit comments

Comments
 (0)