From c5e3b52cd3ee1611d93e42a4225e957d85decf32 Mon Sep 17 00:00:00 2001
From: SXX
Date: Fri, 25 Apr 2025 08:27:40 +0800
Subject: [PATCH 1/3] ggml: dynamic x86_64 feature detection for FP32 <->
 FP16/BF16 conversion

---
 ggml/src/ggml.c | 223 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 177 insertions(+), 46 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 2a39dc7bfd125..9d8f0598ab568 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -42,6 +42,13 @@
 #include <syscall.h>
 #endif
 
+#if defined(__x86_64__)
+#include <immintrin.h>
+#if defined(_MSC_VER)
+# include <intrin.h>
+#endif
+#endif
+
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
@@ -382,62 +389,186 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
-// currently, the ggml_cpu_has_* functions are entirely compile-time
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__F16C__)
-    //if (ggml_cpu_has_f16c()) {
-        for (; i + 7 < n; i += 8) {
-            __m256 x_vec = _mm256_loadu_ps(x + i);
-            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storeu_si128((__m128i *)(y + i), y_vec);
-        }
-        for(; i + 3 < n; i += 4) {
-            __m128 x_vec = _mm_loadu_ps(x + i);
-            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storel_epi64((__m128i *)(y + i), y_vec);
-        }
-    //}
+#if defined(__x86_64__)
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    int regs[4];
+    __cpuidex(regs, leaf, subleaf);
+    *eax = regs[0];
+    *ebx = regs[1];
+    *ecx = regs[2];
+    *edx = regs[3];
+}
+#elif defined(__GNUC__) || defined(__clang__)
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    __asm__ volatile (
+        "cpuid"
+        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
+        : "a"(leaf), "c"(subleaf)
+    );
+}
+#else
+    #error Unsupported compiler
 #endif
-    for (; i < n; i++) {
+
+static bool x86_64_supports_f16c(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(1, 0, &eax, &ebx, &ecx, &edx);
+    return (ecx & (1 << 29)) != 0;
+}
+
+static bool x86_64_supports_avx2(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
+    if (eax < 7)
+        return 0;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return (ebx & (1 << 5)) != 0;
+}
+
+static bool x86_64_supports_avx512f(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
+    if (eax < 7) return 0;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return (ebx & (1 << 16)) != 0;
+}
+
+static struct ggml_type_traits type_traits[GGML_TYPE_COUNT];
+
+static inline void ggml_fp32_to_fp16_generic(const float * x, ggml_fp16_t * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
-void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+static inline void __attribute__((target("f16c"))) ggml_fp32_to_fp16_row_f16c(const float * x, ggml_fp16_t * y, int64_t n) {
     int64_t i = 0;
-#if defined(__AVX512F__)
-    //if (ggml_cpu_has_avx512()) {
-        for (; i + 16 <= n; i += 16) {
-            _mm512_storeu_ps(y + i,
-                             _mm512_castsi512_ps(
-                                 _mm512_slli_epi32(
-                                     _mm512_cvtepu16_epi32(
-                                         _mm256_loadu_si256(
-                                             (const __m256i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-#if defined(__AVX2__)
-    //if (ggml_cpu_has_avx2()) {
-        for (; i + 8 <= n; i += 8) {
-            _mm256_storeu_ps(y + i,
-                             _mm256_castsi256_ps(
-                                 _mm256_slli_epi32(
-                                     _mm256_cvtepu16_epi32(
-                                         _mm_loadu_si128(
-                                             (const __m128i *)(x + i))),
-                                     16)));
-        }
-    //}
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+    ggml_fp32_to_fp16_generic(x + i, y + i, n - i);
+}
+
+static inline void __attribute__((target("avx512f"))) ggml_fp32_to_fp16_row_avx512f(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+    ggml_fp32_to_fp16_row_f16c(x + i, y + i, n - i);
+}
+
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    static ggml_from_float_t from_float_ref = NULL;
+    if (from_float_ref != NULL) {
+        from_float_ref(x, y, n);
+        return;
+    }
+
+    bool has_avx512f = x86_64_supports_avx512f();
+    bool has_f16c = x86_64_supports_f16c();
+    if (has_avx512f && has_f16c) {
+        // use AVX512F
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_avx512f;
+    } else if (has_f16c) {
+        // use F16C
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_f16c;
+    } else {
+        // fallback to generic implementation
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_generic;
+    }
+    type_traits[GGML_TYPE_F16].from_float_ref = from_float_ref;
+    from_float_ref(x, y, n);
+}
+
+#else
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+#endif
+
+#if defined(__x86_64__)
+
+
+static inline void ggml_bf16_to_fp32_generic(const ggml_bf16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
 
+static inline void __attribute__((target("avx2"))) ggml_bf16_to_fp32_row_avx2(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+    ggml_bf16_to_fp32_generic(x + i, y + i, n - i);
+}
+
+static inline void __attribute__((target("avx512f"))) ggml_bf16_to_fp32_row_avx512f(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+    ggml_bf16_to_fp32_row_avx2(x + i, y + i, n - i);
+}
+
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    static ggml_to_float_t to_float = NULL;
+    if (to_float != NULL) {
+        to_float(x, y, n);
+        return;
+    }
+    bool has_avx512f = x86_64_supports_avx512f();
+    bool has_avx2 = x86_64_supports_avx2();
+    if (has_avx512f) {
+        // use AVX512F
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx512f;
+    } else if (has_avx2) {
+        // use AVX2
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx2;
+    } else {
+        // fallback to generic implementation
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_generic;
+    }
+    type_traits[GGML_TYPE_BF16].to_float = to_float;
+    to_float(x, y, n);
+}
+
+#else
+
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+#endif
+
 void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
     for (int i = 0; i < n; i++) {
         y[i] = ggml_compute_fp32_to_bf16(x[i]);
@@ -569,7 +700,7 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl
 static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
-static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
+static struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,

From 3efb0e7327e5cf877da407647645cdf22b1fdd73 Mon Sep 17 00:00:00 2001
From: SXX
Date: Sat, 26 Apr 2025 11:13:22 +0800
Subject: [PATCH 2/3] move fp converter to ggml-cpu

---
 ggml/include/ggml-cpu.h      |   5 +
 ggml/src/ggml-cpu/ggml-cpu.c |  91 ++++++++++++++++-
 ggml/src/ggml.c              | 184 ++---------------------------------
 3 files changed, 100 insertions(+), 180 deletions(-)

diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index f5e11f1e10002..de77a875ec533 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -133,6 +133,11 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index dbad8f61a1e92..64405449e2467 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows = 1,
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }
 
+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9d8f0598ab568..7654ae1779b1d 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4,6 +4,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-threading.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
 
 // FIXME: required here for quantization functions
@@ -42,13 +43,6 @@
 #include <syscall.h>
 #endif
 
-#if defined(__x86_64__)
-#include <immintrin.h>
-#if defined(_MSC_VER)
-# include <intrin.h>
-#endif
-#endif
-
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
@@ -389,185 +383,19 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-#if defined(__x86_64__)
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
-    int regs[4];
-    __cpuidex(regs, leaf, subleaf);
-    *eax = regs[0];
-    *ebx = regs[1];
-    *ecx = regs[2];
-    *edx = regs[3];
-}
-#elif defined(__GNUC__) || defined(__clang__)
-static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
-    __asm__ volatile (
-        "cpuid"
-        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
-        : "a"(leaf), "c"(subleaf)
-    );
-}
-#else
-    #error Unsupported compiler
-#endif
-
-static bool x86_64_supports_f16c(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(1, 0, &eax, &ebx, &ecx, &edx);
-    return (ecx & (1 << 29)) != 0;
-}
-
-static bool x86_64_supports_avx2(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
-    if (eax < 7)
-        return 0;
-    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
-    return (ebx & (1 << 5)) != 0;
-}
-
-static bool x86_64_supports_avx512f(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
-    if (eax < 7) return 0;
-    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
-    return (ebx & (1 << 16)) != 0;
-}
-
-static struct ggml_type_traits type_traits[GGML_TYPE_COUNT];
-
-static inline void ggml_fp32_to_fp16_generic(const float * x, ggml_fp16_t * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
-        y[i] = GGML_FP32_TO_FP16(x[i]);
-    }
-}
-
-static inline void __attribute__((target("f16c"))) ggml_fp32_to_fp16_row_f16c(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 7 < n; i += 8) {
-        __m256 x_vec = _mm256_loadu_ps(x + i);
-        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storeu_si128((__m128i *)(y + i), y_vec);
-    }
-    for (; i + 3 < n; i += 4) {
-        __m128 x_vec = _mm_loadu_ps(x + i);
-        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storel_epi64((__m128i *)(y + i), y_vec);
-    }
-    ggml_fp32_to_fp16_generic(x + i, y + i, n - i);
-}
-
-static inline void __attribute__((target("avx512f"))) ggml_fp32_to_fp16_row_avx512f(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 15 < n; i += 16) {
-        __m512 x_vec = _mm512_loadu_ps(x + i);
-        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
-    }
-    ggml_fp32_to_fp16_row_f16c(x + i, y + i, n - i);
-}
-
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    static ggml_from_float_t from_float_ref = NULL;
-    if (from_float_ref != NULL) {
-        from_float_ref(x, y, n);
-        return;
-    }
-
-    bool has_avx512f = x86_64_supports_avx512f();
-    bool has_f16c = x86_64_supports_f16c();
-    if (has_avx512f && has_f16c) {
-        // use AVX512F
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_avx512f;
-    } else if (has_f16c) {
-        // use F16C
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_f16c;
-    } else {
-        // fallback to generic implementation
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_generic;
-    }
-    type_traits[GGML_TYPE_F16].from_float_ref = from_float_ref;
-    from_float_ref(x, y, n);
-}
-
-#else
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
-#endif
-
-#if defined(__x86_64__)
-
-
-static inline void ggml_bf16_to_fp32_generic(const ggml_bf16_t * x, float * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
-        y[i] = GGML_BF16_TO_FP32(x[i]);
-    }
-}
-
-static inline void __attribute__((target("avx2"))) ggml_bf16_to_fp32_row_avx2(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i,
-                         _mm256_castsi256_ps(
-                             _mm256_slli_epi32(
-                                 _mm256_cvtepu16_epi32(
-                                     _mm_loadu_si128(
-                                         (const __m128i *)(x + i))),
-                                 16)));
-    }
-    ggml_bf16_to_fp32_generic(x + i, y + i, n - i);
-}
-
-static inline void __attribute__((target("avx512f"))) ggml_bf16_to_fp32_row_avx512f(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i,
-                         _mm512_castsi512_ps(
-                             _mm512_slli_epi32(
-                                 _mm512_cvtepu16_epi32(
-                                     _mm256_loadu_si256(
-                                         (const __m256i *)(x + i))),
-                                 16)));
-    }
-    ggml_bf16_to_fp32_row_avx2(x + i, y + i, n - i);
-}
-
-void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    static ggml_to_float_t to_float = NULL;
-    if (to_float != NULL) {
-        to_float(x, y, n);
-        return;
-    }
-    bool has_avx512f = x86_64_supports_avx512f();
-    bool has_avx2 = x86_64_supports_avx2();
-    if (has_avx512f) {
-        // use AVX512F
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx512f;
-    } else if (has_avx2) {
-        // use AVX2
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx2;
-    } else {
-        // fallback to generic implementation
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_generic;
-    }
-    type_traits[GGML_TYPE_BF16].to_float = to_float;
-    to_float(x, y, n);
-}
-
-#else
-
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
-#endif
 
 void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
     for (int i = 0; i < n; i++) {
         y[i] = ggml_compute_fp32_to_bf16(x[i]);
@@ -700,7 +528,7 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl
 static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
-static struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
+static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,

From 82f8630acf2c2cb2743e7b9c4296be07687b2403 Mon Sep 17 00:00:00 2001
From: SXX
Date: Sat, 26 Apr 2025 19:49:48 +0800
Subject: [PATCH 3/3] Switch ggml_compute_forward_get_rows_f16/bf16 to new
 ggml_cpu_fp16/bf16_to_fp32

---
 ggml/src/ggml-cpu/ops.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 3c2adb217267b..7413192b746b6 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
 
         GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-        ggml_fp16_to_fp32_row(
+        ggml_cpu_fp16_to_fp32(
                 (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
 
         GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-        ggml_bf16_to_fp32_row(
+        ggml_cpu_bf16_to_fp32(
                 (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
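
For reference, a minimal round-trip harness for the conversion entry points this series exports from ggml-cpu.h might look as follows. This is a sketch, not part of the patches: it assumes a build linked against ggml and the CPU backend, and the sample values and tolerances are illustrative only.

    // Hypothetical smoke test for the ggml_cpu_* conversion helpers.
    #include <math.h>
    #include <stdio.h>

    #include "ggml.h"
    #include "ggml-cpu.h"

    int main(void) {
        const float src[8] = { 0.0f, -0.5f, 1.0f, 3.140625f, -2.75f, 1024.0f, 1.0e-3f, -65504.0f };
        ggml_fp16_t h[8];
        ggml_bf16_t b[8];
        float back[8];

        // FP32 -> FP16 -> FP32: takes the AVX512F/F16C paths when the CPU
        // backend was compiled with them, the scalar tail loop otherwise.
        ggml_cpu_fp32_to_fp16(src, h, 8);
        ggml_cpu_fp16_to_fp32(h, back, 8);
        for (int i = 0; i < 8; i++) {
            // half keeps ~11 significant bits, so 1e-2 relative slack is generous
            if (fabsf(back[i] - src[i]) > 1e-2f * (1.0f + fabsf(src[i]))) {
                printf("fp16 round-trip mismatch at %d: %g -> %g\n", i, src[i], back[i]);
                return 1;
            }
        }

        // FP32 -> BF16 -> FP32: bf16 keeps only 8 significant bits, hence wider slack.
        ggml_cpu_fp32_to_bf16(src, b, 8);
        ggml_cpu_bf16_to_fp32(b, back, 8);
        for (int i = 0; i < 8; i++) {
            if (fabsf(back[i] - src[i]) > 1e-1f * (1.0f + fabsf(src[i]))) {
                printf("bf16 round-trip mismatch at %d: %g -> %g\n", i, src[i], back[i]);
                return 1;
            }
        }

        printf("ok\n");
        return 0;
    }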