Skip to content

Commit 1deee0f

Browse files
dudu, ggerganov, and taronaeo
authored
cpu : optimize the ggml NORM operation (ggml-org#15953)
* ggml-cpu: optimize norm operation to use intrinsics or Accelerate; rename function; add endif macro comment
  Co-authored-by: Georgi Gerganov <[email protected]>
  Co-authored-by: Aaron Teo <[email protected]>
* implement s390x SIMD suggested by @taronaeo
* add TODO comment
* tidy up spaces
---------
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Aaron Teo <[email protected]>
1 parent d00cbea commit 1deee0f

File tree

3 files changed

+77
-14
lines changed

3 files changed

+77
-14
lines changed

ggml/src/ggml-cpu/ops.cpp

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3467,31 +3467,27 @@ static void ggml_compute_forward_norm_f32(
34673467

34683468
GGML_ASSERT(eps >= 0.0f);
34693469

3470-
// TODO: optimize
34713470
for (int64_t i03 = 0; i03 < ne03; i03++) {
34723471
for (int64_t i02 = 0; i02 < ne02; i02++) {
34733472
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
34743473
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
34753474

3476-
ggml_float sum = 0.0;
3477-
for (int64_t i00 = 0; i00 < ne00; i00++) {
3478-
sum += (ggml_float)x[i00];
3479-
}
3480-
3475+
float sum = 0.0;
3476+
ggml_vec_sum_f32(ne00, &sum, x);
34813477
float mean = sum/ne00;
34823478

34833479
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
3480+
float variance = 0;
34843481

3485-
ggml_float sum2 = 0.0;
3486-
for (int64_t i00 = 0; i00 < ne00; i00++) {
3487-
float v = x[i00] - mean;
3488-
y[i00] = v;
3489-
sum2 += (ggml_float)(v*v);
3490-
}
3482+
#ifdef GGML_USE_ACCELERATE
3483+
mean = -mean;
3484+
vDSP_vsadd(x, 1, &mean, y, 1, ne00);
3485+
vDSP_measqv(y, 1, &variance, ne00);
3486+
#else
3487+
variance = ggml_vec_cvar_f32(ne00, y, x, mean);
3488+
#endif //GGML_USE_ACCELERATE
34913489

3492-
float variance = sum2/ne00;
34933490
const float scale = 1.0f/sqrtf(variance + eps);
3494-
34953491
ggml_vec_scale_f32(ne00, y, scale);
34963492
}
34973493
}

ggml/src/ggml-cpu/vec.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,72 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
404404
}
405405
}
406406

407+
// Computes the variance of x around the given mean and, as a side effect,
// writes the centered values into y: y[i] = x[i] - mean.
// Returns sum((x[i] - mean)^2) / n accumulated in ggml_float precision.
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) {
    int i = 0;
    ggml_float sum = 0;
    // TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
    // ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
#if defined(__AVX512F__) && defined(__AVX512DQ__)
    for (; i + 15 < n; i += 16) {
        __m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
                                   _mm512_set1_ps(mean));
        _mm512_storeu_ps(y + i, val);
        sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
    }
#elif defined(__AVX2__) && defined(__FMA__)
    for (; i + 7 < n; i += 8) {
        __m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
                                   _mm256_set1_ps(mean));
        _mm256_storeu_ps(y + i, val);
        val = _mm256_mul_ps(val, val);
        // horizontal sum of the 8 squared lanes: fold high 128 into low 128, then reduce
        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
                                 _mm256_castps256_ps128(val));
        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
        sum += (ggml_float)_mm_cvtss_f32(val2);
    }
#elif defined(__SSE2__)
    for (; i + 3 < n; i += 4) {
        __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
                                _mm_set1_ps(mean));
        _mm_storeu_ps(y + i, val);
        val = _mm_mul_ps(val, val);
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
        // SSE3 movehdup is available when AVX+ is enabled
        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
        val = _mm_add_ss(val, _mm_movehdup_ps(val));
#else
        // plain SSE2 horizontal sum (no movehdup)
        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
        val = _mm_add_ps(val, tmp);
        tmp = _mm_movehl_ps(tmp, val);
        val = _mm_add_ss(val, tmp);
#endif // __AVX__ || __AVX2__ || __AVX512F__
        sum += (ggml_float)_mm_cvtss_f32(val);
    }
#elif defined(__ARM_NEON) && defined(__aarch64__)
    for (; i + 3 < n; i += 4) {
        float32x4_t val = vsubq_f32(vld1q_f32(x + i),
                                    vdupq_n_f32(mean));
        vst1q_f32(y + i, val);
        val = vmulq_f32(val, val);
        sum += (ggml_float)vaddvq_f32(val);
    }
#elif defined(__VXE__) || defined(__VXE2__)
    for (; i + 3 < n; i += 4) {
        float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean));
        vec_xst(val, 0, y + i);
        val = vec_mul(val, val);
        sum += (ggml_float)vec_hsum_f32x4(val);
    }
#endif
    // scalar tail: store the CENTERED value in y before squaring, matching the
    // SIMD paths above (previously y received (x[i]-mean)^2, corrupting the
    // trailing elements passed on to ggml_vec_scale_f32)
    for (; i < n; ++i) {
        float val = x[i] - mean;
        y[i] = val;
        val *= val;
        sum += (ggml_float)val;
    }
    return sum/n;
}
472+
407473
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
408474
int i = 0;
409475
ggml_float sum = 0;

ggml/src/ggml-cpu/vec.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
4444
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
4545

4646
void ggml_vec_silu_f32(const int n, float * y, const float * x);
47+
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); // also centers y: y[i] = x[i] - mean
4748
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
4849
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
4950

0 commit comments

Comments
 (0)