diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2bce1375ba3c0..35c1677e708ae 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -428,13 +428,28 @@ ggml_bf16_t ggml_fp32_to_bf16(float x) { } void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { - for (int64_t i = 0; i < n; i++) { + int i = 0; +#if defined(__F16C__) + for (; i + 7 < n; i += 8) { + __m128i x_i = _mm_loadu_si128((__m128i *)(x + i)); + __m256 y_v = _mm256_cvtph_ps(x_i); + _mm256_storeu_ps(y + i, y_v); + } +#endif + for (; i < n; i++) { y[i] = GGML_FP16_TO_FP32(x[i]); } } void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { int i = 0; +#if defined(__F16C__) + for (; i + 7 < n; i += 8) { + __m256 x_v = _mm256_loadu_ps(x + i); + __m128i y_v = _mm256_cvtps_ph(x_v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + _mm_storeu_si128((__m128i *)(y + i), y_v); + } +#endif for (; i < n; ++i) { y[i] = GGML_FP32_TO_FP16(x[i]); } @@ -442,6 +457,17 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { int i = 0; +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m512 y_v = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i *)(x + i))), 16)); + _mm512_storeu_ps(y + i, y_v); + } +#elif defined(__AVX2__) + for (; i + 7 < n; i += 8) { + __m256 y_v = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(x + i))), 16)); + _mm256_storeu_ps(y + i, y_v); + } +#endif for (; i < n; ++i) { y[i] = GGML_BF16_TO_FP32(x[i]); }