|
42 | 42 | #include <TargetConditionals.h> |
43 | 43 | #endif |
44 | 44 |
|
| 45 | +#if defined(__x86_64__) |
| 46 | +#include <immintrin.h> |
| 47 | +#endif |
| 48 | + |
45 | 49 | #if defined(_WIN32) |
46 | 50 | #define WIN32_LEAN_AND_MEAN |
47 | 51 | #ifndef NOMINMAX |
@@ -382,61 +386,185 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { |
382 | 386 | } |
383 | 387 | } |
384 | 388 |
|
385 | | -// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library |
386 | | -// currently, the ggml_cpu_has_* functions are entirely compile-time |
387 | | -void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { |
388 | | - int64_t i = 0; |
389 | | -#if defined(__F16C__) |
390 | | - //if (ggml_cpu_has_f16c()) { |
391 | | - for (; i + 7 < n; i += 8) { |
392 | | - __m256 x_vec = _mm256_loadu_ps(x + i); |
393 | | - __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); |
394 | | - _mm_storeu_si128((__m128i *)(y + i), y_vec); |
395 | | - } |
396 | | - for(; i + 3 < n; i += 4) { |
397 | | - __m128 x_vec = _mm_loadu_ps(x + i); |
398 | | - __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); |
399 | | - _mm_storel_epi64((__m128i *)(y + i), y_vec); |
400 | | - } |
401 | | - //} |
| 389 | +#if defined(__x86_64__) |
| 390 | + |
| 391 | +#if defined(_MSC_VER) |
| 392 | +#include <intrin.h> |
| 393 | +static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) { |
| 394 | + int regs[4]; |
| 395 | + __cpuidex(regs, leaf, subleaf); |
| 396 | + *eax = regs[0]; |
| 397 | + *ebx = regs[1]; |
| 398 | + *ecx = regs[2]; |
| 399 | + *edx = regs[3]; |
| 400 | +} |
| 401 | +#elif defined(__GNUC__) || defined(__clang__) |
| 402 | +static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) { |
| 403 | + __asm__ volatile ( |
| 404 | + "cpuid" |
| 405 | + : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) |
| 406 | + : "a"(leaf), "c"(subleaf) |
| 407 | + ); |
| 408 | +} |
| 409 | +#else |
| 410 | + #error Unsupported compiler |
402 | 411 | #endif |
403 | | - for (; i < n; i++) { |
| 412 | + |
| 413 | +static bool x86_64_supports_f16c(void) { |
| 414 | + int eax, ebx, ecx, edx; |
| 415 | + cpuid(1, 0, &eax, &ebx, &ecx, &edx); |
|     416 | +    return (ecx & (1 << 29)) != 0;  // CPUID.(EAX=1):ECX bit 29 = F16C |
| 417 | +} |
| 418 | + |
| 419 | +static bool x86_64_supports_avx2(void) { |
| 420 | + int eax, ebx, ecx, edx; |
| 421 | + cpuid(0, 0, &eax, &ebx, &ecx, &edx); |
|     422 | +    if (eax < 7) |
|     423 | +        return false; |
|     424 | +    cpuid(7, 0, &eax, &ebx, &ecx, &edx); |
|     425 | +    return (ebx & (1 << 5)) != 0;  // CPUID.(EAX=7,ECX=0):EBX bit 5 = AVX2 |
| 426 | +} |
| 427 | + |
| 428 | +static bool x86_64_supports_avx512f(void) { |
| 429 | + int eax, ebx, ecx, edx; |
| 430 | + cpuid(0, 0, &eax, &ebx, &ecx, &edx); |
|     431 | +    if (eax < 7) return false; |
|     432 | +    cpuid(7, 0, &eax, &ebx, &ecx, &edx); |
|     433 | +    return (ebx & (1 << 16)) != 0;  // CPUID.(EAX=7,ECX=0):EBX bit 16 = AVX512F |
| 434 | +} |
| 435 | + |
|     436 | +static struct ggml_type_traits type_traits[GGML_TYPE_COUNT];  // forward declaration; the table is defined with its initializers below |
| 437 | + |
| 438 | +static inline void ggml_fp32_to_fp16_generic(const float * x, ggml_fp16_t * y, int64_t n) { |
| 439 | + for (int64_t i = 0; i < n; i++) { |
404 | 440 | y[i] = GGML_FP32_TO_FP16(x[i]); |
405 | 441 | } |
406 | 442 | } |
407 | 443 |
|
408 | | -void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { |
| 444 | +static inline void __attribute__((target("f16c"))) ggml_fp32_to_fp16_row_f16c(const float * x, ggml_fp16_t * y, int64_t n) { |
409 | 445 | int64_t i = 0; |
410 | | -#if defined(__AVX512F__) |
411 | | - //if (ggml_cpu_has_avx512()) { |
412 | | - for (; i + 16 <= n; i += 16) { |
413 | | - _mm512_storeu_ps(y + i, |
414 | | - _mm512_castsi512_ps( |
415 | | - _mm512_slli_epi32( |
416 | | - _mm512_cvtepu16_epi32( |
417 | | - _mm256_loadu_si256( |
418 | | - (const __m256i *)(x + i))), |
419 | | - 16))); |
420 | | - } |
421 | | - //} |
422 | | -#endif |
423 | | -#if defined(__AVX2__) |
424 | | - //if (ggml_cpu_has_avx2()) { |
425 | | - for (; i + 8 <= n; i += 8) { |
426 | | - _mm256_storeu_ps(y + i, |
427 | | - _mm256_castsi256_ps( |
428 | | - _mm256_slli_epi32( |
429 | | - _mm256_cvtepu16_epi32( |
430 | | - _mm_loadu_si128( |
431 | | - (const __m128i *)(x + i))), |
432 | | - 16))); |
433 | | - } |
434 | | - //} |
| 446 | + for (; i + 7 < n; i += 8) { |
| 447 | + __m256 x_vec = _mm256_loadu_ps(x + i); |
| 448 | + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); |
| 449 | + _mm_storeu_si128((__m128i *)(y + i), y_vec); |
| 450 | + } |
| 451 | + for (; i + 3 < n; i += 4) { |
| 452 | + __m128 x_vec = _mm_loadu_ps(x + i); |
| 453 | + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); |
| 454 | + _mm_storel_epi64((__m128i *)(y + i), y_vec); |
| 455 | + } |
| 456 | + ggml_fp32_to_fp16_generic(x + i, y + i, n - i); |
| 457 | +} |
| 458 | + |
| 459 | +static inline void __attribute__((target("avx512f"))) ggml_fp32_to_fp16_row_avx512f(const float * x, ggml_fp16_t * y, int64_t n) { |
| 460 | + int64_t i = 0; |
| 461 | + for (; i + 15 < n; i += 16) { |
| 462 | + __m512 x_vec = _mm512_loadu_ps(x + i); |
| 463 | + __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); |
| 464 | + _mm256_storeu_si256((__m256i *)(y + i), y_vec); |
| 465 | + } |
| 466 | + ggml_fp32_to_fp16_row_f16c(x + i, y + i, n - i); |
| 467 | +} |
|     468 | +// FP32 -> FP16 rows: detect CPU features on the first call, then cache and reuse the selected kernel |
| 469 | +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { |
|     470 | +    static ggml_from_float_t from_float_ref = NULL; |
| 471 | + if (from_float_ref != NULL) { |
| 472 | + from_float_ref(x, y, n); |
| 473 | + return; |
| 474 | + } |
| 475 | + |
| 476 | + bool has_avx512f = x86_64_supports_avx512f(); |
| 477 | + bool has_f16c = x86_64_supports_f16c(); |
| 478 | + if (has_avx512f && has_f16c) { |
| 479 | + // use AVX512F |
| 480 | + from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_avx512f; |
| 481 | + } else if (has_f16c) { |
| 482 | + // use F16C |
| 483 | + from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_f16c; |
| 484 | + } else { |
| 485 | + // fallback to generic implementation |
| 486 | + from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_generic; |
| 487 | + } |
| 488 | + type_traits[GGML_TYPE_F16].from_float_ref = from_float_ref; |
| 489 | + from_float_ref(x, y, n); |
| 490 | +} |
| 491 | + |
| 492 | +#else |
| 493 | +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { |
| 494 | + for (int64_t i = 0; i < n; i++) { |
| 495 | + y[i] = GGML_FP32_TO_FP16(x[i]); |
| 496 | + } |
| 497 | +} |
| 498 | + |
435 | 499 | #endif |
436 | | - for (; i < n; i++) { |
| 500 | + |
| 501 | +#if defined(__x86_64__) |
| 502 | + |
| 503 | + |
| 504 | +static inline void ggml_bf16_to_fp32_generic(const ggml_bf16_t * x, float * y, int64_t n) { |
| 505 | + for (int64_t i = 0; i < n; i++) { |
| 506 | + y[i] = GGML_BF16_TO_FP32(x[i]); |
| 507 | + } |
| 508 | +} |
| 509 | + |
| 510 | +static inline void __attribute__((target("avx2"))) ggml_bf16_to_fp32_row_avx2(const ggml_bf16_t * x, float * y, int64_t n) { |
| 511 | + int64_t i = 0; |
| 512 | + for (; i + 7 < n; i += 8) { |
| 513 | + _mm256_storeu_ps(y + i, |
| 514 | + _mm256_castsi256_ps( |
| 515 | + _mm256_slli_epi32( |
| 516 | + _mm256_cvtepu16_epi32( |
| 517 | + _mm_loadu_si128( |
| 518 | + (const __m128i *)(x + i))), |
| 519 | + 16))); |
| 520 | + } |
| 521 | + ggml_bf16_to_fp32_generic(x + i, y + i, n - i); |
| 522 | +} |
| 523 | + |
| 524 | +static inline void __attribute__((target("avx512f"))) ggml_bf16_to_fp32_row_avx512f(const ggml_bf16_t * x, float * y, int64_t n) { |
| 525 | + int64_t i = 0; |
| 526 | + for (; i + 15 < n; i += 16) { |
| 527 | + _mm512_storeu_ps(y + i, |
| 528 | + _mm512_castsi512_ps( |
| 529 | + _mm512_slli_epi32( |
| 530 | + _mm512_cvtepu16_epi32( |
| 531 | + _mm256_loadu_si256( |
| 532 | + (const __m256i *)(x + i))), |
| 533 | + 16))); |
| 534 | + } |
| 535 | + ggml_bf16_to_fp32_row_avx2(x + i, y + i, n - i); |
| 536 | +} |
|     537 | +// BF16 -> FP32 rows: same pattern, select the widest supported kernel on the first call |
| 538 | +void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { |
| 539 | + static ggml_to_float_t to_float = NULL; |
| 540 | + if (to_float != NULL) { |
| 541 | + to_float(x, y, n); |
| 542 | + return; |
| 543 | + } |
| 544 | + bool has_avx512f = x86_64_supports_avx512f(); |
| 545 | + bool has_avx2 = x86_64_supports_avx2(); |
| 546 | + if (has_avx512f) { |
| 547 | + // use AVX512F |
| 548 | + to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx512f; |
| 549 | + } else if (has_avx2) { |
| 550 | + // use AVX2 |
| 551 | + to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx2; |
| 552 | + } else { |
| 553 | + // fallback to generic implementation |
| 554 | + to_float = (ggml_to_float_t)ggml_bf16_to_fp32_generic; |
| 555 | + } |
| 556 | + type_traits[GGML_TYPE_BF16].to_float = to_float; |
| 557 | + to_float(x, y, n); |
| 558 | +} |
| 559 | + |
| 560 | +#else |
| 561 | + |
| 562 | +void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { |
| 563 | + for (int64_t i = 0; i < n; i++) { |
437 | 564 | y[i] = GGML_BF16_TO_FP32(x[i]); |
438 | 565 | } |
439 | 566 | } |
| 567 | +#endif |
440 | 568 |
|
441 | 569 | void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) { |
442 | 570 | for (int i = 0; i < n; i++) { |
@@ -569,7 +697,7 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl |
569 | 697 | static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); |
570 | 698 | static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); |
571 | 699 |
|
572 | | -static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { |
| 700 | +static struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { |
573 | 701 | [GGML_TYPE_I8] = { |
574 | 702 | .type_name = "i8", |
575 | 703 | .blck_size = 1, |
|