Commit 3efb0e7

move fp converter to ggml-cpu
1 parent c5e3b52 commit 3efb0e7

3 files changed: +100 -180 lines changed

ggml/include/ggml-cpu.h

Lines changed: 5 additions & 0 deletions
@@ -133,6 +133,11 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
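
The four converters above become part of the public CPU backend API. A minimal sketch of calling them from user code follows; the buffer contents and sizes are illustrative and not taken from this commit:

#include <stdio.h>
#include "ggml-cpu.h"

int main(void) {
    float src[4] = {1.0f, 0.5f, -2.25f, 3.14159f};
    ggml_fp16_t half[4];
    float back[4];

    // round-trip FP32 -> FP16 -> FP32 through the new CPU backend helpers
    ggml_cpu_fp32_to_fp16(src, half, 4);
    ggml_cpu_fp16_to_fp32(half, back, 4);

    for (int i = 0; i < 4; ++i) {
        printf("%f -> %f\n", src[i], back[i]);
    }
    return 0;
}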

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 89 additions & 2 deletions
@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows = 1,
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }
 
+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
 
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
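
The AVX paths in ggml_cpu_bf16_to_fp32 above exploit the fact that a bf16 value is simply the upper 16 bits of the corresponding fp32 bit pattern: each element is zero-extended to 32 bits, shifted left by 16, and reinterpreted as float. A scalar sketch of the same bit manipulation, for illustration only (ggml itself covers the tail with the GGML_BF16_TO_FP32 macro):

#include <stdint.h>
#include <string.h>

// bf16 -> fp32: move the 16 stored bits into the high half of a 32-bit word,
// then bit-cast the result to float.
static float bf16_bits_to_fp32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}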

ggml/src/ggml.c

Lines changed: 6 additions & 178 deletions
@@ -4,6 +4,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-threading.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
 
 // FIXME: required here for quantization functions
@@ -42,13 +43,6 @@
 #include <TargetConditionals.h>
 #endif
 
-#if defined(__x86_64__)
-#include <immintrin.h>
-#if defined(_MSC_VER)
-# include <intrin.h>
-#endif
-#endif
-
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
@@ -389,185 +383,19 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-#if defined(__x86_64__)
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
-    int regs[4];
-    __cpuidex(regs, leaf, subleaf);
-    *eax = regs[0];
-    *ebx = regs[1];
-    *ecx = regs[2];
-    *edx = regs[3];
-}
-#elif defined(__GNUC__) || defined(__clang__)
-static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
-    __asm__ volatile (
-        "cpuid"
-        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
-        : "a"(leaf), "c"(subleaf)
-    );
-}
-#else
-#error Unsupported compiler
-#endif
-
-static bool x86_64_supports_f16c(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(1, 0, &eax, &ebx, &ecx, &edx);
-    return (ecx & (1 << 29)) != 0;
-}
-
-static bool x86_64_supports_avx2(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
-    if (eax < 7)
-        return 0;
-    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
-    return (ebx & (1 << 5)) != 0;
-}
-
-static bool x86_64_supports_avx512f(void) {
-    int eax, ebx, ecx, edx;
-    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
-    if (eax < 7) return 0;
-    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
-    return (ebx & (1 << 16)) != 0;
-}
-
-static struct ggml_type_traits type_traits[GGML_TYPE_COUNT];
-
-static inline void ggml_fp32_to_fp16_generic(const float * x, ggml_fp16_t * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
-        y[i] = GGML_FP32_TO_FP16(x[i]);
-    }
-}
-
-static inline void __attribute__((target("f16c"))) ggml_fp32_to_fp16_row_f16c(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 7 < n; i += 8) {
-        __m256 x_vec = _mm256_loadu_ps(x + i);
-        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storeu_si128((__m128i *)(y + i), y_vec);
-    }
-    for (; i + 3 < n; i += 4) {
-        __m128 x_vec = _mm_loadu_ps(x + i);
-        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storel_epi64((__m128i *)(y + i), y_vec);
-    }
-    ggml_fp32_to_fp16_generic(x + i, y + i, n - i);
-}
-
-static inline void __attribute__((target("avx512f"))) ggml_fp32_to_fp16_row_avx512f(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 15 < n; i += 16) {
-        __m512 x_vec = _mm512_loadu_ps(x + i);
-        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
-    }
-    ggml_fp32_to_fp16_row_f16c(x + i, y + i, n - i);
-}
-
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    static ggml_from_float_t from_float_ref = NULL;
-    if (from_float_ref != NULL) {
-        from_float_ref(x, y, n);
-        return;
-    }
-
-    bool has_avx512f = x86_64_supports_avx512f();
-    bool has_f16c = x86_64_supports_f16c();
-    if (has_avx512f && has_f16c) {
-        // use AVX512F
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_avx512f;
-    } else if (has_f16c) {
-        // use F16C
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_f16c;
-    } else {
-        // fallback to generic implementation
-        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_generic;
-    }
-    type_traits[GGML_TYPE_F16].from_float_ref = from_float_ref;
-    from_float_ref(x, y, n);
-}
-
-#else
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
-#endif
-
-#if defined(__x86_64__)
-
-
-static inline void ggml_bf16_to_fp32_generic(const ggml_bf16_t * x, float * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
-        y[i] = GGML_BF16_TO_FP32(x[i]);
-    }
-}
-
-static inline void __attribute__((target("avx2"))) ggml_bf16_to_fp32_row_avx2(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i,
-                         _mm256_castsi256_ps(
-                             _mm256_slli_epi32(
-                                 _mm256_cvtepu16_epi32(
-                                     _mm_loadu_si128(
-                                         (const __m128i *)(x + i))),
-                                 16)));
-    }
-    ggml_bf16_to_fp32_generic(x + i, y + i, n - i);
-}
-
-static inline void __attribute__((target("avx512f"))) ggml_bf16_to_fp32_row_avx512f(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i,
-                         _mm512_castsi512_ps(
-                             _mm512_slli_epi32(
-                                 _mm512_cvtepu16_epi32(
-                                     _mm256_loadu_si256(
-                                         (const __m256i *)(x + i))),
-                                 16)));
-    }
-    ggml_bf16_to_fp32_row_avx2(x + i, y + i, n - i);
-}
-
-void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    static ggml_to_float_t to_float = NULL;
-    if (to_float != NULL) {
-        to_float(x, y, n);
-        return;
-    }
-    bool has_avx512f = x86_64_supports_avx512f();
-    bool has_avx2 = x86_64_supports_avx2();
-    if (has_avx512f) {
-        // use AVX512F
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx512f;
-    } else if (has_avx2) {
-        // use AVX2
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx2;
-    } else {
-        // fallback to generic implementation
-        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_generic;
-    }
-    type_traits[GGML_TYPE_BF16].to_float = to_float;
-    to_float(x, y, n);
-}
-
-#else
-
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    for (int64_t i = 0; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
-#endif
 
 void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
     for (int i = 0; i < n; i++) {
@@ -700,7 +528,7 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl
 static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
-static struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
+static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,
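
With type_traits now const, the table in ggml.c is read-only and no longer patched at runtime; callers read it through the existing accessor. A small sketch of such a lookup, assuming the ggml_get_type_traits accessor and the type_name/blck_size/type_size fields exported by ggml.h:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // the accessor returns a pointer to const traits; entries cannot be mutated
    const struct ggml_type_traits * traits = ggml_get_type_traits(GGML_TYPE_BF16);
    printf("%s: block size %d, type size %zu\n",
           traits->type_name, (int) traits->blck_size, traits->type_size);
    return 0;
}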
