Skip to content

Commit 5834dee

Browse files
committed
ggml-cpu: move nnpa fp16->fp32 and fp32->fp16 to simd-mappings
Signed-off-by: Aaron Teo <[email protected]>
1 parent 5004e43 commit 5834dee

File tree

2 files changed

+39
-37
lines changed

2 files changed

+39
-37
lines changed

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,6 +1013,45 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
10131013
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
10141014
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
10151015

1016+
#if defined(__NNPA__)
1017+
// NNPA override of the fp16 <-> fp32 conversion macros.
// Any definition of the four macros that is already in effect at this point
// (e.g. one supplied by a header included earlier) is dropped first, so the
// #define lines further down do not trigger macro-redefinition diagnostics.
#ifdef GGML_FP16_TO_FP32
1018+
#undef GGML_FP16_TO_FP32
1019+
#endif
1020+
1021+
#ifdef GGML_FP32_TO_FP16
1022+
#undef GGML_FP32_TO_FP16
1023+
#endif
1024+
1025+
#ifdef GGML_COMPUTE_FP16_TO_FP32
1026+
#undef GGML_COMPUTE_FP16_TO_FP32
1027+
#endif
1028+
1029+
#ifdef GGML_COMPUTE_FP32_TO_FP16
1030+
#undef GGML_COMPUTE_FP32_TO_FP16
1031+
#endif
1032+
1033+
// Route the low-level conversion macros to the NNPA helper functions.
// The helpers are defined after these macros; that is fine because a macro
// is only expanded at the point where it is used.
#define GGML_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
1034+
#define GGML_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
1035+
1036+
// The public-facing macros simply forward to the COMPUTE variants above.
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
1037+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
1038+
1039+
// Converts a single ggml_fp16_t value to float using the NNPA vector built-ins.
// The scalar is broadcast into every lane of a vector, passed through
// vec_convert_from_fp16 and vec_extend_to_fp32_hi, and element 0 of the
// widened vector is returned.
// NOTE(review): v_hd presumably holds the NNPA-internal 16-bit format
// (DLFLOAT16) - confirm against the vector built-in documentation.
// NOTE(review): the meaning of the trailing 0 operand of both built-ins is
// not visible here - TODO confirm.
static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
1040+
uint16x8_t v_h = vec_splats(h);
1041+
uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
1042+
return vec_extend_to_fp32_hi(v_hd, 0)[0];
1043+
}
1044+
1045+
// Converts a single float to ggml_fp16_t using the NNPA vector built-ins.
// The scalar is broadcast into a float vector, narrowed with
// vec_round_from_fp32, passed through vec_convert_to_fp16, and element 0 of
// the result is returned.
// vec_round_from_fp32 takes two float vectors; only v_f carries data here and
// an all-zero vector is supplied as the second operand.
// NOTE(review): v_hd presumably holds the NNPA-internal 16-bit format
// (DLFLOAT16) - confirm against the vector built-in documentation.
// NOTE(review): the meaning of the trailing 0 operand of both built-ins is
// not visible here - TODO confirm.
static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
1046+
float32x4_t v_f = vec_splats(f);
1047+
float32x4_t v_zero = vec_splats(0.0f);
1048+
uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
1049+
uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
1050+
return vec_extract(v_h, 0);
1051+
}
1052+
1053+
#endif // __NNPA__
1054+
10161055
#endif
10171056

10181057
// GGML_F32_ARR / GGML_F16_ARR

ggml/src/ggml-impl.h

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -417,43 +417,6 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
417417
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
418418
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
419419

420-
#elif defined(__NNPA__)
421-
/*
422-
* Note: This functionality is ready for use, but the compiler macros
423-
* defined for the s390x platform are defined in ggml-cpu while
424-
* this file is 1 step behind, in ggml-src. I currently have no
425-
* idea how to fix this, so I am leaving it as is.
426-
*
427-
* CMake chain: ggml -> ggml-src -> ggml-cpu
428-
* ^^^^^^^^ ^^^^^^^^
429-
* | | ggml-cpu defines the macros
430-
* | | needed for s390x detection.
431-
* | this file is here, where the s390x
432-
* | detection macros are not defined.
433-
*
434-
* TODO: Fix s390x platform detection in this file.
435-
*/
436-
437-
// Map the low-level conversion macros to the NNPA-based helper functions
// defined further down in this branch.
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
438-
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
439-
440-
// The public-facing macros forward to the COMPUTE variants.
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
441-
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
442-
443-
// Converts a single ggml_fp16_t value to float using the NNPA vector built-ins:
// broadcast the scalar to all lanes, apply vec_convert_from_fp16 and
// vec_extend_to_fp32_hi, then return element 0 of the widened vector.
// NOTE(review): the intermediate v_hd is presumably in the NNPA-internal
// 16-bit format (DLFLOAT16) - confirm against the built-in documentation.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
444-
uint16x8_t v_h = vec_splats(h);
445-
uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
446-
return vec_extend_to_fp32_hi(v_hd, 0)[0];
447-
}
448-
449-
// Converts a single float to ggml_fp16_t using the NNPA vector built-ins:
// broadcast the scalar, narrow with vec_round_from_fp32 (an all-zero vector
// fills its second float operand), apply vec_convert_to_fp16, then return
// element 0 of the result.
// NOTE(review): the intermediate v_hd is presumably in the NNPA-internal
// 16-bit format (DLFLOAT16) - confirm against the built-in documentation.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
450-
float32x4_t v_f = vec_splats(f);
451-
float32x4_t v_zero = vec_splats(0.0f);
452-
uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
453-
uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
454-
return vec_extract(v_h, 0);
455-
}
456-
457420
#else
458421

459422
// FP16 <-> FP32

0 commit comments

Comments
 (0)