Skip to content

Commit 421ff7b

Browse files
author
Aaron
committed
feat(ggml): vectorize row conversion functions
Vectorized the following functions in ggml.c for improved performance on x86 architectures:

- ggml_fp16_to_fp32_row: using F16C intrinsics.
- ggml_fp32_to_fp16_row: using F16C intrinsics.
- ggml_bf16_to_fp32_row: using AVX2 and AVX512F intrinsics.

This change follows the existing pattern of using direct SIMD intrinsic checks in this file.
1 parent fa882fd commit 421ff7b

File tree

1 file changed

+27
-1
lines changed

1 file changed

+27
-1
lines changed

ggml/src/ggml.c

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,20 +428,46 @@ ggml_bf16_t ggml_fp32_to_bf16(float x) {
428428
}
429429

430430
// Convert a contiguous row of n fp16 values to fp32.
// When F16C is available, 8 elements are converted per iteration with
// _mm256_cvtph_ps; the scalar tail loop handles the remainder (and the
// whole row on targets without F16C).
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
    int64_t i = 0; // int64_t, not int: n is int64_t and a plain int counter would overflow for rows > INT_MAX
#if defined(__F16C__)
    for (; i + 7 < n; i += 8) {
        __m128i x_i = _mm_loadu_si128((const __m128i *)(x + i)); // const cast: x is read-only
        __m256  y_v = _mm256_cvtph_ps(x_i);
        _mm256_storeu_ps(y + i, y_v);
    }
#endif
    for (; i < n; ++i) {
        y[i] = GGML_FP16_TO_FP32(x[i]);
    }
}
435443

436444
// Convert a contiguous row of n fp32 values to fp16.
// When F16C is available, 8 floats are converted per iteration with
// _mm256_cvtps_ph using round-to-nearest-even and suppressed FP
// exceptions; the scalar tail loop handles the remainder.
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
    int64_t i = 0; // int64_t matches n; a plain int counter would overflow for rows > INT_MAX
#if defined(__F16C__)
    for (; i + 7 < n; i += 8) {
        __m256  x_v = _mm256_loadu_ps(x + i);
        __m128i y_v = _mm256_cvtps_ph(x_v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm_storeu_si128((__m128i *)(y + i), y_v);
    }
#endif
    for (; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(x[i]);
    }
}
442457

443458
// Convert a contiguous row of n bf16 values to fp32.
// bf16 is the upper 16 bits of an IEEE-754 binary32, so each conversion is a
// zero-extend of the 16-bit value to 32 bits followed by a left shift of 16.
// AVX-512F processes 16 elements per iteration, AVX2 processes 8; the scalar
// tail loop handles the remainder (and the whole row without either).
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
    int64_t i = 0; // int64_t, not int: n is int64_t and a plain int counter would overflow for rows > INT_MAX
#if defined(__AVX512F__)
    for (; i + 15 < n; i += 16) {
        _mm512_storeu_ps(y + i,
                         _mm512_castsi512_ps(
                             _mm512_slli_epi32(
                                 _mm512_cvtepu16_epi32(
                                     _mm256_loadu_si256((const __m256i *)(x + i))), // const cast: x is read-only
                                 16)));
    }
#elif defined(__AVX2__)
    for (; i + 7 < n; i += 8) {
        _mm256_storeu_ps(y + i,
                         _mm256_castsi256_ps(
                             _mm256_slli_epi32(
                                 _mm256_cvtepu16_epi32(
                                     _mm_loadu_si128((const __m128i *)(x + i))), // const cast: x is read-only
                                 16)));
    }
#endif
    for (; i < n; ++i) {
        y[i] = GGML_BF16_TO_FP32(x[i]);
    }
}

0 commit comments

Comments
 (0)