diff --git a/llamafile/iqk_mul_mat.inc b/llamafile/iqk_mul_mat.inc index 5b9c7b1a5d..5e9d688ce4 100644 --- a/llamafile/iqk_mul_mat.inc +++ b/llamafile/iqk_mul_mat.inc @@ -1,3 +1,8 @@ +// Adapted from +// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc +// Copyright 2024 Iwan Kawrakow. +// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. + // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- // vi: set et ft=cpp fenc=utf-8 :vi // @@ -17,7 +22,7 @@ #include #include -#if defined __x86_64__ || defined __aarch64__ +#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64) #include "llama.cpp/ggml-impl.h" #include "llama.cpp/ggml-quants.h" @@ -220,7 +225,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi return true; } -#if defined __x86_64__ +#if defined __x86_64__ || defined(_M_X64) #if defined HAVE_FANCY_SIMD #undef HAVE_FANCY_SIMD @@ -1407,7 +1412,8 @@ template void MulMat::set_functions(MulMat& m) { bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) { - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); + if (ne00 % ggml_blck_size(GGML_TYPE_Q8_K) == 0) + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); switch (typeA) { case GGML_TYPE_Q2_K: diff --git a/llamafile/sgemm.cpp b/llamafile/sgemm.cpp index a379112332..5b6eb503bc 100644 --- a/llamafile/sgemm.cpp +++ b/llamafile/sgemm.cpp @@ -1,3 +1,8 @@ +// Adapted from +// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp +// Copyright 2024 Mozilla Foundation. +// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. + // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi // @@ -16,88 +21,87 @@ // limitations under the License. 
#include "sgemm.h" -#include "llamafile.h" -#include -#include -#include -#include +// #include +// #include +// #include +#include #include +#include +// #include "llamafile.h" static const struct GemmFuncs { - typeof(llamafile_sgemm) *sgemm; - typeof(llamafile_mixmul) *mixmul; - typeof(llamafile_mixmul_iqk) *iqk_mixmul = iqk_mul_mat_moe_unsupported; + bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int); + bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*); + bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int); + // typeof(llamafile_sgemm)* sgemm; + // typeof(llamafile_mixmul)* mixmul; + // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported; GemmFuncs() { -#ifdef __x86_64__ - if (X86_HAVE(AVX)) { - if (X86_HAVE(FMA)) { - if (X86_HAVE(AVX2)) { - if (X86_HAVE(AVX512F)) { - if (X86_HAVE(AVX512VL) && // - X86_HAVE(AVX512BW) && // - X86_HAVE(AVX512DQ) && // - X86_HAVE(AVX512_VNNI) && // - X86_HAVE(AVX512_BF16)) { - // AMD Zen4+ (2023-) - sgemm = llamafile_sgemm_amd_zen4; - mixmul = llamafile_mixmul_amd_zen4; - iqk_mixmul = iqk_mul_mat_moe_zen4; - } else { - // Intel Xeon Skylake+ (2015-) - sgemm = llamafile_sgemm_amd_avx512f; - mixmul = llamafile_mixmul_amd_avx512f; - iqk_mixmul = iqk_mul_mat_moe; - } - } else if (X86_HAVE(AVXVNNI)) { - // Intel Alderlake (2021-) - sgemm = llamafile_sgemm_amd_avxvnni; - mixmul = llamafile_mixmul_amd_avxvnni; - iqk_mixmul = iqk_mul_mat_moe; - } else { - // Intel Haswell/Broadwell/Skylake (2013-2020) - // AMD Excavator (2015-2022) - sgemm = llamafile_sgemm_amd_avx2; - mixmul = llamafile_mixmul_amd_avx2; - if (X86_HAVE(F16C)) - iqk_mixmul = iqk_mul_mat_moe; - } - } else { - // AMD Piledriver (2011-2014) - sgemm = llamafile_sgemm_amd_fma; - mixmul = llamafile_mixmul_amd_fma; - if 
(X86_HAVE(F16C)) - iqk_mixmul = iqk_mul_mat_moe; - } - } else { - // Intel Sandybridge/Ivybridge (2010-2012) - // AMD Bulldozer (2011) - sgemm = llamafile_sgemm_amd_avx; - mixmul = llamafile_mixmul_amd_avx; - } - } else { - // AMD K8/Barcelona (2003-2010) - // Intel Core/Nehalem (2006-2009) - sgemm = llamafile_sgemm_unsupported; - mixmul = llamafile_mixmul_unsupported; - } -#elif defined(__aarch64__) - long hwcap = getauxval(AT_HWCAP); - if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1) - (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1) - (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1) +//#if defined(__x86_64__) || defined(_M_X64) + // if (X86_HAVE(AVX)) { + // if (X86_HAVE(FMA)) { + // if (X86_HAVE(AVX2)) { + // if (X86_HAVE(AVX512F)) { + // if (X86_HAVE(AVX512VL) && // + // X86_HAVE(AVX512BW) && // + // X86_HAVE(AVX512DQ) && // + // X86_HAVE(AVX512_VNNI) && // + // X86_HAVE(AVX512_BF16)) { + // // AMD Zen4+ (2023-) + // sgemm = llamafile_sgemm_amd_zen4; + // mixmul = llamafile_mixmul_amd_zen4; + // iqk_mixmul = iqk_mul_mat_moe_zen4; + // } else { + // // Intel Xeon Skylake+ (2015-) + // sgemm = llamafile_sgemm_amd_avx512f; + // mixmul = llamafile_mixmul_amd_avx512f; + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else if (X86_HAVE(AVXVNNI)) { + // // Intel Alderlake (2021-) + // sgemm = llamafile_sgemm_amd_avxvnni; + // mixmul = llamafile_mixmul_amd_avxvnni; + // iqk_mixmul = iqk_mul_mat_moe; + // } else { + // // Intel Haswell/Broadwell/Skylake (2013-2020) + // // AMD Excavator (2015-2022) + // sgemm = llamafile_sgemm_amd_avx2; + // mixmul = llamafile_mixmul_amd_avx2; + // if (X86_HAVE(F16C)) + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else { + // // AMD Piledriver (2011-2014) + // sgemm = llamafile_sgemm_amd_fma; + // mixmul = llamafile_mixmul_amd_fma; + // if (X86_HAVE(F16C)) + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else { + // // Intel Sandybridge/Ivybridge (2010-2012) + // 
// AMD Bulldozer (2011) + // sgemm = llamafile_sgemm_amd_avx; + // mixmul = llamafile_mixmul_amd_avx; + // } + // } else { + // // AMD K8/Barcelona (2003-2010) + // // Intel Core/Nehalem (2006-2009) + // sgemm = llamafile_sgemm_unsupported; + // mixmul = llamafile_mixmul_unsupported; + // } + + +//#elif defined(__aarch64__) + //long hwcap = getauxval(AT_HWCAP); + //if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1) + // (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1) + // (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1) // e.g. Apple M1, Raspberry Pi 5 sgemm = llamafile_sgemm_arm82; mixmul = llamafile_mixmul_arm82; iqk_mixmul = iqk_mul_mat_moe_arm82; - } else { - // ARM64 baseline ISA - sgemm = llamafile_sgemm_arm80; - mixmul = llamafile_mixmul_arm80; - } -#else - sgemm = llamafile_sgemm_unsupported; - mixmul = llamafile_mixmul_unsupported; -#endif + +//#endif } } funcs; @@ -120,26 +124,25 @@ static const struct GemmFuncs { * @param ldc is row stride of `C` * @param ith is thread id (must be less than `nth`) * @param nth is number of threads (must be greater than zero) + * @param task is GGML task type * @param Atype is GGML data type of `A` * @param Btype is GGML data type of `B` * @param Ctype is GGML data type of `C` + * @param precision may be used to control the internal compute type * @return true if this function was able to service the matmul request */ -bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void *B, long ldb, - void *C, long ldc, int ith, int nth, int Atype, int Btype, int Ctype) { - return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, Atype, Btype, Ctype); +bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { + return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype, + 
precision); } /** * Performs "mixture of experts" tensor multiplication on CPU. */ -bool llamafile_mixmul(const ggml_compute_params *params, const ggml_tensor *weights, - const ggml_tensor *thought, const ggml_tensor *plan, ggml_tensor *result) { +bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) { return funcs.mixmul(params, weights, thought, plan, result); } -bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void *A, - const void *B, float *C, long nb1, long nb2, const void *vrow_mapping, - int ith, int nth) { +bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) { return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth); }