From c5e3b52cd3ee1611d93e42a4225e957d85decf32 Mon Sep 17 00:00:00 2001
From: SXX
Date: Fri, 25 Apr 2025 08:27:40 +0800
Subject: [PATCH 1/3] ggml: dynamic x86_64 feature detection for FP32 <->
 FP16/BF16 conversion

---
 ggml/src/ggml.c | 223 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 177 insertions(+), 46 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 2a39dc7bfd125..9d8f0598ab568 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -42,6 +42,13 @@
 #include <syscall.h>
 #endif
 
+#if defined(__x86_64__)
+#include <immintrin.h>
+#if defined(_MSC_VER)
+# include <intrin.h>
+#endif
+#endif
+
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
@@ -382,62 +389,186 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
-// currently, the ggml_cpu_has_* functions are entirely compile-time
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__F16C__)
-    //if (ggml_cpu_has_f16c()) {
-        for (; i + 7 < n; i += 8) {
-            __m256 x_vec = _mm256_loadu_ps(x + i);
-            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storeu_si128((__m128i *)(y + i), y_vec);
-        }
-        for(; i + 3 < n; i += 4) {
-            __m128 x_vec = _mm_loadu_ps(x + i);
-            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storel_epi64((__m128i *)(y + i), y_vec);
-        }
-    //}
+#if defined(__x86_64__)
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    int regs[4];
+    __cpuidex(regs, leaf, subleaf);
+    *eax = regs[0];
+    *ebx = regs[1];
+    *ecx = regs[2];
+    *edx = regs[3];
+}
+#elif defined(__GNUC__) || defined(__clang__)
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    __asm__ volatile (
+        "cpuid"
+        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
+        : "a"(leaf), "c"(subleaf)
+    );
+}
+#else
+    #error Unsupported compiler
 #endif
-    for (; i < n; i++) {
+
+static bool x86_64_supports_f16c(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(1, 0, &eax, &ebx, &ecx, &edx);
+    return (ecx & (1 << 29)) != 0;
+}
+
+static bool x86_64_supports_avx2(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
+    if (eax < 7)
+        return 0;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return (ebx & (1 << 5)) != 0;
+}
+
+static bool x86_64_supports_avx512f(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
+    if (eax < 7) return 0;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return (ebx & (1 << 16)) != 0;
+}
+
+static struct ggml_type_traits type_traits[GGML_TYPE_COUNT];
+
+static inline void ggml_fp32_to_fp16_generic(const float * x, ggml_fp16_t * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
-void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+static inline void __attribute__((target("f16c"))) ggml_fp32_to_fp16_row_f16c(const float * x, ggml_fp16_t * y, int64_t n) {
     int64_t i = 0;
-#if defined(__AVX512F__)
-    //if (ggml_cpu_has_avx512()) {
-        for (; i + 16 <= n; i += 16) {
-            _mm512_storeu_ps(y + i,
-                             _mm512_castsi512_ps(
-                                 _mm512_slli_epi32(
-                                     _mm512_cvtepu16_epi32(
-                                         _mm256_loadu_si256(
-                                             (const __m256i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-#if defined(__AVX2__)
-    //if (ggml_cpu_has_avx2()) {
-        for (; i + 8 <= n; i += 8) {
-            _mm256_storeu_ps(y + i,
-                             _mm256_castsi256_ps(
-                                 _mm256_slli_epi32(
-                                     _mm256_cvtepu16_epi32(
-                                         _mm_loadu_si128(
-                                             (const __m128i *)(x + i))),
-                                     16)));
-        }
-    //}
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+    ggml_fp32_to_fp16_generic(x + i, y + i, n - i);
+}
+
+static inline void __attribute__((target("avx512f"))) ggml_fp32_to_fp16_row_avx512f(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+    ggml_fp32_to_fp16_row_f16c(x + i, y + i, n - i);
+}
+
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    static ggml_from_float_t from_float_ref = NULL;
+    if (from_float_ref != NULL) {
+        from_float_ref(x, y, n);
+        return;
+    }
+
+    bool has_avx512f = x86_64_supports_avx512f();
+    bool has_f16c = x86_64_supports_f16c();
+    if (has_avx512f && has_f16c) {
+        // use AVX512F
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_avx512f;
+    } else if (has_f16c) {
+        // use F16C
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_f16c;
+    } else {
+        // fallback to generic implementation
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_generic;
+    }
+    type_traits[GGML_TYPE_F16].from_float_ref = from_float_ref;
+    from_float_ref(x, y, n);
+}
+
+#else
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+#endif
+
+#if defined(__x86_64__)
+
+
+static inline void ggml_bf16_to_fp32_generic(const ggml_bf16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
 
+static inline void __attribute__((target("avx2"))) ggml_bf16_to_fp32_row_avx2(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+    ggml_bf16_to_fp32_generic(x + i, y + i, n - i);
+}
+
+static inline void __attribute__((target("avx512f"))) ggml_bf16_to_fp32_row_avx512f(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+    ggml_bf16_to_fp32_row_avx2(x + i, y + i, n - i);
+}
+
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    static ggml_to_float_t to_float = NULL;
+    if (to_float != NULL) {
+        to_float(x, y, n);
+        return;
+    }
+    bool has_avx512f = x86_64_supports_avx512f();
+    bool has_avx2 = x86_64_supports_avx2();
+    if (has_avx512f) {
+        // use AVX512F
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx512f;
+    } else if (has_avx2) {
+        // use AVX2
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx2;
+    } else {
+        // fallback to generic implementation
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_generic;
+    }
+    type_traits[GGML_TYPE_BF16].to_float = to_float;
+    to_float(x, y, n);
+}
+
+#else
+
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+#endif
+
 void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
     for (int i = 0; i < n; i++) {
         y[i] = ggml_compute_fp32_to_bf16(x[i]);
@@ -569,7 +700,7 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl
 static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
-static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
+static struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,

From 3efb0e7327e5cf877da407647645cdf22b1fdd73 Mon Sep 17 00:00:00 2001
From: SXX
Date: Sat, 26 Apr 2025 11:13:22 +0800
Subject: [PATCH 2/3] move fp converter to ggml-cpu

---
 ggml/include/ggml-cpu.h      |   5 +
 ggml/src/ggml-cpu/ggml-cpu.c |  91 ++++++++++++++++-
 ggml/src/ggml.c              | 184 ++---------------------------------
 3 files changed, 100 insertions(+), 180 deletions(-)

diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index f5e11f1e10002..de77a875ec533 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -133,6 +133,11 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index dbad8f61a1e92..64405449e2467 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows = 1,
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }
 
+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9d8f0598ab568..7654ae1779b1d 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4,6 +4,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-threading.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
 
 // FIXME: required here for quantization functions
@@ -42,13 +43,6 @@
 #include <syscall.h>
 #endif
 
-#if defined(__x86_64__)
-#include <immintrin.h>
-#if defined(_MSC_VER)
-# include <intrin.h>
-#endif
-#endif
-
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
@@ -389,185 +383,19 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-#if defined(__x86_64__)
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
-    int regs[4];
-    __cpuidex(regs, leaf, subleaf);
-    *eax = regs[0];
-    *ebx = regs[1];
-    *ecx = regs[2];
-    *edx = regs[3];
-}
-#elif defined(__GNUC__) || defined(__clang__)
-static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
-    __asm__ volatile (
-        "cpuid"
-        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
-        : "a"(leaf), "c"(subleaf)
-    );
-}
-#else
-    #error Unsupported compiler
-#endif
-
-static bool x86_64_supports_f16c(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(1, 0, &eax, &ebx, &ecx, &edx);
-    return (ecx & (1 << 29)) != 0;
-}
-
-static bool x86_64_supports_avx2(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
-    if (eax < 7)
-        return 0;
-    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
-    return (ebx & (1 << 5)) != 0;
-}
-
-static bool x86_64_supports_avx512f(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
-    if (eax < 7) return 0;
-    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
-    return (ebx & (1 << 16)) != 0;
-}
-
-static struct ggml_type_traits type_traits[GGML_TYPE_COUNT];
-
-static inline void ggml_fp32_to_fp16_generic(const float * x, ggml_fp16_t * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
-        y[i] = GGML_FP32_TO_FP16(x[i]);
-    }
-}
-
-static inline void __attribute__((target("f16c"))) ggml_fp32_to_fp16_row_f16c(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 7 < n; i += 8) {
-        __m256 x_vec = _mm256_loadu_ps(x + i);
-        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storeu_si128((__m128i *)(y + i), y_vec);
-    }
-    for (; i + 3 < n; i += 4) {
-        __m128 x_vec = _mm_loadu_ps(x + i);
-        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storel_epi64((__m128i *)(y + i), y_vec);
-    }
-    ggml_fp32_to_fp16_generic(x + i, y + i, n - i);
-}
-
-static inline void __attribute__((target("avx512f"))) ggml_fp32_to_fp16_row_avx512f(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 15 < n; i += 16) {
-        __m512 x_vec = _mm512_loadu_ps(x + i);
-        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
-    }
-    ggml_fp32_to_fp16_row_f16c(x + i, y + i, n - i);
-}
-
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    static ggml_from_float_t from_float_ref = NULL;
-    if (from_float_ref != NULL) {
-        from_float_ref(x, y, n);
-        return;
-    }
-
-    bool has_avx512f = x86_64_supports_avx512f();
-    bool has_f16c = x86_64_supports_f16c();
-    if (has_avx512f && has_f16c) {
-        // use AVX512F
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_avx512f;
-    } else if (has_f16c) {
-        // use F16C
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_f16c;
-    } else {
-        // fallback to generic implementation
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_generic;
-    }
-    type_traits[GGML_TYPE_F16].from_float_ref = from_float_ref;
-    from_float_ref(x, y, n);
-}
-
-#else
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
-#endif
-
-#if defined(__x86_64__)
-
-
-static inline void ggml_bf16_to_fp32_generic(const ggml_bf16_t * x, float * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
-        y[i] = GGML_BF16_TO_FP32(x[i]);
-    }
-}
-
-static inline void __attribute__((target("avx2"))) ggml_bf16_to_fp32_row_avx2(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i,
-                         _mm256_castsi256_ps(
-                             _mm256_slli_epi32(
-                                 _mm256_cvtepu16_epi32(
-                                     _mm_loadu_si128(
-                                         (const __m128i *)(x + i))),
-                                 16)));
-    }
-    ggml_bf16_to_fp32_generic(x + i, y + i, n - i);
-}
-
-static inline void __attribute__((target("avx512f"))) ggml_bf16_to_fp32_row_avx512f(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i,
-                         _mm512_castsi512_ps(
-                             _mm512_slli_epi32(
-                                 _mm512_cvtepu16_epi32(
-                                     _mm256_loadu_si256(
-                                         (const __m256i *)(x + i))),
-                                 16)));
-    }
-    ggml_bf16_to_fp32_row_avx2(x + i, y + i, n - i);
-}
-
-void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    static ggml_to_float_t to_float = NULL;
-    if (to_float != NULL) {
-        to_float(x, y, n);
-        return;
-    }
-    bool has_avx512f = x86_64_supports_avx512f();
-    bool has_avx2 = x86_64_supports_avx2();
-    if (has_avx512f) {
-        // use AVX512F
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx512f;
-    } else if (has_avx2) {
-        // use AVX2
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx2;
-    } else {
-        // fallback to generic implementation
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_generic;
-    }
-    type_traits[GGML_TYPE_BF16].to_float = to_float;
-    to_float(x, y, n);
-}
-
-#else
-
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
-#endif
 
 void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
     for (int i = 0; i < n; i++) {
         y[i] = ggml_compute_fp32_to_bf16(x[i]);
@@ -700,7 +528,7 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl
 static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
-static struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
+static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,

From 82f8630acf2c2cb2743e7b9c4296be07687b2403 Mon Sep 17 00:00:00 2001
From: SXX
Date: Sat, 26 Apr 2025 19:49:48 +0800
Subject: [PATCH 3/3] Switch ggml_compute_forward_get_rows_f16/bf16 to new
 ggml_cpu_fp16/bf16_to_fp32

---
 ggml/src/ggml-cpu/ops.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 3c2adb217267b..7413192b746b6 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
 
         GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-        ggml_fp16_to_fp32_row(
+        ggml_cpu_fp16_to_fp32(
                 (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
 
         GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-        ggml_bf16_to_fp32_row(
+        ggml_cpu_bf16_to_fp32(
                 (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
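
For reference, a minimal round-trip harness for the conversion entry points this series exports from ggml-cpu.h might look as follows. This is a sketch, not part of the patches: it assumes a build linked against ggml and the CPU backend, and the sample values and tolerances are illustrative only.

    // Hypothetical smoke test for the ggml_cpu_* conversion helpers.
    #include <math.h>
    #include <stdio.h>

    #include "ggml.h"
    #include "ggml-cpu.h"

    int main(void) {
        const float src[8] = { 0.0f, -0.5f, 1.0f, 3.140625f, -2.75f, 1024.0f, 1.0e-3f, -65504.0f };
        ggml_fp16_t h[8];
        ggml_bf16_t b[8];
        float back[8];

        // FP32 -> FP16 -> FP32: takes the AVX512F/F16C paths when the CPU
        // backend was compiled with them, the scalar tail loop otherwise.
        ggml_cpu_fp32_to_fp16(src, h, 8);
        ggml_cpu_fp16_to_fp32(h, back, 8);
        for (int i = 0; i < 8; i++) {
            // half keeps ~11 significant bits, so 1e-2 relative slack is generous
            if (fabsf(back[i] - src[i]) > 1e-2f * (1.0f + fabsf(src[i]))) {
                printf("fp16 round-trip mismatch at %d: %g -> %g\n", i, src[i], back[i]);
                return 1;
            }
        }

        // FP32 -> BF16 -> FP32: bf16 keeps only 8 significant bits, hence wider slack.
        ggml_cpu_fp32_to_bf16(src, b, 8);
        ggml_cpu_bf16_to_fp32(b, back, 8);
        for (int i = 0; i < 8; i++) {
            if (fabsf(back[i] - src[i]) > 1e-1f * (1.0f + fabsf(src[i]))) {
                printf("bf16 round-trip mismatch at %d: %g -> %g\n", i, src[i], back[i]);
                return 1;
            }
        }

        printf("ok\n");
        return 0;
    }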