@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
215215 .nrows = 1 ,
216216 },
217217 [GGML_TYPE_F16 ] = {
218- .from_float = (ggml_from_float_t ) ggml_fp32_to_fp16_row ,
218+ .from_float = (ggml_from_float_t ) ggml_cpu_fp32_to_fp16 ,
219219 .vec_dot = (ggml_vec_dot_t ) ggml_vec_dot_f16 ,
220220 .vec_dot_type = GGML_TYPE_F16 ,
221221 .nrows = 1 ,
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
356356 .from_float = quantize_row_q8_K ,
357357 },
358358 [GGML_TYPE_BF16 ] = {
359- .from_float = (ggml_from_float_t ) ggml_fp32_to_bf16_row ,
359+ .from_float = (ggml_from_float_t ) ggml_cpu_fp32_to_bf16 ,
360360 .vec_dot = (ggml_vec_dot_t ) ggml_vec_dot_bf16 ,
361361 .vec_dot_type = GGML_TYPE_BF16 ,
362362 .nrows = 1 ,
@@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
31663166 return ggml_graph_compute (cgraph , & cplan );
31673167}
31683168
3169+ void ggml_cpu_fp32_to_fp16 (const float * x , ggml_fp16_t * y , int64_t n ) {
3170+ int64_t i = 0 ;
3171+ #if defined(__F16C__ )
3172+ #if defined(__AVX512F__ )
3173+ for (; i + 15 < n ; i += 16 ) {
3174+ __m512 x_vec = _mm512_loadu_ps (x + i );
3175+ __m256i y_vec = _mm512_cvtps_ph (x_vec , _MM_FROUND_TO_NEAREST_INT );
3176+ _mm256_storeu_si256 ((__m256i * )(y + i ), y_vec );
3177+ }
3178+ #endif
3179+ for (; i + 7 < n ; i += 8 ) {
3180+ __m256 x_vec = _mm256_loadu_ps (x + i );
3181+ __m128i y_vec = _mm256_cvtps_ph (x_vec , _MM_FROUND_TO_NEAREST_INT );
3182+ _mm_storeu_si128 ((__m128i * )(y + i ), y_vec );
3183+ }
3184+ for (; i + 3 < n ; i += 4 ) {
3185+ __m128 x_vec = _mm_loadu_ps (x + i );
3186+ __m128i y_vec = _mm_cvtps_ph (x_vec , _MM_FROUND_TO_NEAREST_INT );
3187+ _mm_storel_epi64 ((__m128i * )(y + i ), y_vec );
3188+ }
3189+ #endif
3190+ for (; i < n ; ++ i ) {
3191+ y [i ] = GGML_FP32_TO_FP16 (x [i ]);
3192+ }
3193+ }
3194+
3195+ void ggml_cpu_fp16_to_fp32 (const ggml_fp16_t * x , float * y , int64_t n ) {
3196+ int64_t i = 0 ;
3197+ #if defined(__F16C__ )
3198+ #if defined(__AVX512F__ )
3199+ for (; i + 15 < n ; i += 16 ) {
3200+ __m256i x_vec = _mm256_loadu_si256 ((const __m256i * )(x + i ));
3201+ __m512 y_vec = _mm512_cvtph_ps (x_vec );
3202+ _mm512_storeu_ps (y + i , y_vec );
3203+ }
3204+ #endif
3205+ for (; i + 7 < n ; i += 8 ) {
3206+ __m128i x_vec = _mm_loadu_si128 ((const __m128i * )(x + i ));
3207+ __m256 y_vec = _mm256_cvtph_ps (x_vec );
3208+ _mm256_storeu_ps (y + i , y_vec );
3209+ }
3210+ for (; i + 3 < n ; i += 4 ) {
3211+ __m128i x_vec = _mm_loadl_epi64 ((const __m128i * )(x + i ));
3212+ __m128 y_vec = _mm_cvtph_ps (x_vec );
3213+ _mm_storeu_ps (y + i , y_vec );
3214+ }
3215+ #endif
3216+ for (; i < n ; ++ i ) {
3217+ y [i ] = GGML_FP16_TO_FP32 (x [i ]);
3218+ }
3219+ }
3220+
3221+ void ggml_cpu_fp32_to_bf16 (const float * x , ggml_bf16_t * y , int64_t n ) {
3222+ int64_t i = 0 ;
3223+ for (; i < n ; ++ i ) {
3224+ y [i ] = GGML_FP32_TO_BF16 (x [i ]);
3225+ }
3226+ }
3227+
3228+ void ggml_cpu_bf16_to_fp32 (const ggml_bf16_t * x , float * y , int64_t n ) {
3229+ int64_t i = 0 ;
3230+ #if defined(__AVX2__ )
3231+ #if defined(__AVX512F__ )
3232+ for (; i + 15 < n ; i += 16 ) {
3233+ _mm512_storeu_ps (y + i ,
3234+ _mm512_castsi512_ps (
3235+ _mm512_slli_epi32 (
3236+ _mm512_cvtepu16_epi32 (
3237+ _mm256_loadu_si256 (
3238+ (const __m256i * )(x + i ))),
3239+ 16 )));
3240+ }
3241+ #endif
3242+ for (; i + 7 < n ; i += 8 ) {
3243+ _mm256_storeu_ps (y + i ,
3244+ _mm256_castsi256_ps (
3245+ _mm256_slli_epi32 (
3246+ _mm256_cvtepu16_epi32 (
3247+ _mm_loadu_si128 (
3248+ (const __m128i * )(x + i ))),
3249+ 16 )));
3250+ }
3251+ #endif
3252+ for (; i < n ; i ++ ) {
3253+ y [i ] = GGML_BF16_TO_FP32 (x [i ]);
3254+ }
3255+ }
31693256
31703257int ggml_cpu_has_avx (void ) {
31713258#if defined(__AVX__ )
0 commit comments