ggml-org · sirus20x6 · Oct 11, 2025 · Oct 11, 2025 · Oct 13, 2025 · Oct 15, 2025
@@ -77,16 +77,85 @@ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
-inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
+inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vv = GGML_F32_VEC_SET1(v);
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            GGML_F32_VEC az = GGML_F32_VEC_ADD(ax, vv);
+            GGML_F32_VEC_STORE(z + i + j*GGML_F32_EPR, az);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i] + v;
+    }
+}
+inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay = GGML_F32_VEC_ADD(ay, ax);
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] += x[i];
+    }
+}
+inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vv = GGML_F32_VEC_SET1(v);
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay = GGML_F32_VEC_ADD(ay, vv);
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] += v;
+    }
+}
 inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
 inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
+inline static void ggml_vec_set_f32 (const int n, float * x, const float v) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC_STORE(x + i + j*GGML_F32_EPR, vx);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        x[i] = v;
+    }
+}
 inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
 inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
@@ -95,7 +164,24 @@ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp
     }
 }
 
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            GGML_F32_VEC az = GGML_F32_VEC_MUL(ax, ay);
+            GGML_F32_VEC_STORE(z + i + j*GGML_F32_EPR, az);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i]*y[i];
+    }
+}
 inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));

@@ -428,20 +428,46 @@ ggml_bf16_t ggml_fp32_to_bf16(float x) {
 }
 
 void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
+    int i = 0;
+#if defined(__F16C__)
+    for (; i + 7 < n; i += 8) {
+        __m128i x_i = _mm_loadu_si128((__m128i *)(x + i));
+        __m256 y_v = _mm256_cvtph_ps(x_i);
+        _mm256_storeu_ps(y + i, y_v);
+    }
+#endif
+    for (; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
     }
 }
 
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
     int i = 0;
+#if defined(__F16C__)
+    for (; i + 7 < n; i += 8) {
+        __m256 x_v = _mm256_loadu_ps(x + i);
+        __m128i y_v = _mm256_cvtps_ph(x_v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        _mm_storeu_si128((__m128i *)(y + i), y_v);
+    }
+#endif
     for (; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
     int i = 0;
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 y_v = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i *)(x + i))), 16));
+        _mm512_storeu_ps(y + i, y_v);
+    }
+#elif defined(__AVX2__)
+    for (; i + 7 < n; i += 8) {
+        __m256 y_v = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(x + i))), 16));
+        _mm256_storeu_ps(y + i, y_v);
+    }
+#endif
     for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }