Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 51 additions & 125 deletions ggml/src/ggml-cpu/llamafile/sgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,7 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
#endif

#if defined(__MMA__)
typedef vector unsigned char vec_t;
typedef __vector_quad acc_t;
#include "tinyBLAS_ppc.h"
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED FUSED MULTIPLY ADD
Expand Down Expand Up @@ -1573,95 +1572,35 @@ class tinyBLAS_BF16_PPC {
const int nth;
};

template <typename TA>
class tinyBLAS_Q0_PPC {
public:
tinyBLAS_Q0_PPC(int64_t k,
const TA *A, int64_t lda,
const block_q8_0 *B, int64_t ldb,
float *C, int64_t ldc,
int ith, int nth)
template <typename TA>
tinyBLAS_Q0_PPC<TA>::tinyBLAS_Q0_PPC(int64_t k,
const TA *A, int64_t lda,
const block_q8_0 *B, int64_t ldb,
float *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
kc = 64;
}

void matmul(int64_t m, int64_t n) {
mnpack(0, m, 0, n);
}

private:

inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
for (int I = 0; I < RM; I++) {
for (int J = 0; J < RN; J++) {
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
}
}
}

template<int size>
inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
vector signed int vec_C[4];
vector float CA[4] = {0};
vector float res[4] = {0};
__builtin_mma_disassemble_acc(vec_C, ACC);
for (int i = 0; i < 4; i++) {
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
}
}
/* This function processes quantized data from block_q4_0 elements.
* First the we try to extract the two int4 values stored in single int8_t into two signed int8.
* And then we subtract each of the resultant element with 8, to convert signed int8 to unsigned int8.
* Also compute the rowsum which is required to compensate the above conversion. */
inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
const vector signed char v8 = vec_splats((signed char)0x8);
vector signed int vsum = {0};
vector signed int vsum2 = {0};
c[0] = vec_and(c[1], lowMask);
c[1] = vec_sr(c[1], v4);
c[0] = vec_sub(c[0], v8);
c[1] = vec_sub(c[1], v8);
vsum = vec_sum4s(c[0], vsum);
vsum2 = vec_sum4s(c[1], vsum2);
vsum = vec_add(vsum, vsum2);
*(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
}

template <typename V1, typename V2>
inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
V2 t1, t2, t3, t4, t5, t6, t7, t8;
vector unsigned char xor_vector;
uint8_t flip_vec = 0x80;
xor_vector = vec_splats(flip_vec);
t1 = vec_perm(s1, s2, swiz1);
t2 = vec_perm(s1, s2, swiz2);
t3 = vec_perm(s3, s4, swiz1);
t4 = vec_perm(s3, s4, swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::matmul(int64_t m, int64_t n) {
int mc = 64; int nc = 64;
if (n % 8 == 0 && n < nc) {
nc = n;
mc = 32 ;
kc = 32;
}
const bool is_aligned = ((m & (mc - 1)) == 0) & ((n & (nc - 1)) == 0) & ((k & (kc - 1)) == 0);
if (is_aligned) {
this->matmul_tiled_q0(m, n, mc, nc, kc);
} else {
mnpack(0, m, 0, n);
}
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
}

template<int size>
void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
template<typename TA>
template<int size>
void tinyBLAS_Q0_PPC<TA>::packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
int64_t i, j;
TA *aoffset = NULL;
int8_t *vecOffset = NULL;
Expand Down Expand Up @@ -1781,8 +1720,10 @@ class tinyBLAS_Q0_PPC {
}
}
}

template<typename TA>
template<typename VA, typename VB>
void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
void tinyBLAS_Q0_PPC<TA>::packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
int64_t i, j;
block_q8_0 *aoffset = NULL;
VA *vecOffset = NULL;
Expand Down Expand Up @@ -1822,7 +1763,6 @@ class tinyBLAS_Q0_PPC {
j--;
} while(j > 0);
}

if (rows & 4) {
aoffsets[0] = aoffset;
for (int it = 1; it < 4; it++ )
Expand Down Expand Up @@ -1878,7 +1818,8 @@ class tinyBLAS_Q0_PPC {
}
}

void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int m_rem = MIN(m - m0, 16);
int n_rem = MIN(n - n0, 16);

Expand Down Expand Up @@ -1915,7 +1856,8 @@ class tinyBLAS_Q0_PPC {
}


void KERNEL_4x8(int64_t ii, int64_t jj) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::KERNEL_4x8(int64_t ii, int64_t jj) {
vec_t vec_A[8], vec_B[16] = {0};
acc_t acc_0, acc_1;
std::array<int, 4> comparray {};
Expand Down Expand Up @@ -1953,14 +1895,15 @@ class tinyBLAS_Q0_PPC {
aoffset += lda;
}
}
compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
compute(&acc_0, 0, 0, comparray, vs, fin_res);
compute(&acc_1, 0, 4, comparray, vs, fin_res);
}
save_res(ii, jj, 0, fin_res);
save_res(ii, jj+4, 4, fin_res);
}

void KERNEL_8x4(int64_t ii, int64_t jj) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::KERNEL_8x4(int64_t ii, int64_t jj) {
vec_t vec_A[16], vec_B[8] = {0};
acc_t acc_0, acc_1;
std::array<int, 8> comparray {};
Expand Down Expand Up @@ -1997,16 +1940,18 @@ class tinyBLAS_Q0_PPC {
aoffset += lda;
}
}
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
compute(&acc_0, 0, 0, comparray, vs, fin_res);
compute(&acc_1, 4, 4, comparray, vs, fin_res);
}
save_res(ii, jj, 0, fin_res);
save_res(ii+4, jj, 4, fin_res);
}

void KERNEL_8x8(int64_t ii, int64_t jj) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::KERNEL_8x8(int64_t ii, int64_t jj) {
vec_t vec_A[16], vec_B[16] = {0};
acc_t acc_0, acc_1, acc_2, acc_3;
acc_t acc_4, acc_5, acc_6, acc_7;
std::array<int, 8> comparray {};
vector float fin_res[16] = {0};
vector float vs[16] = {0};
Expand Down Expand Up @@ -2046,18 +1991,19 @@ class tinyBLAS_Q0_PPC {
aoffset += lda;
}
}
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
compute(&acc_0, 0, 0, comparray, vs, fin_res);
compute(&acc_1, 4, 4, comparray, vs, fin_res);
compute(&acc_2, 0, 8, comparray, vs, fin_res);
compute(&acc_3, 4, 12, comparray, vs, fin_res);
}
save_res(ii, jj, 0, fin_res);
save_res(ii+4, jj, 4, fin_res);
save_res(ii, jj+4, 8, fin_res);
save_res(ii+4, jj+4, 12, fin_res);
}

void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
Expand Down Expand Up @@ -2125,21 +2071,9 @@ class tinyBLAS_Q0_PPC {
}
}

template<int RM, int RN>
inline void kernel(int64_t ii, int64_t jj) {
if constexpr(RM == 4 && RN == 8) {
KERNEL_4x8(ii,jj);
} else if constexpr(RM == 8 && RN == 4) {
KERNEL_8x4(ii,jj);
} else if constexpr(RM == 8 && RN == 8) {
KERNEL_8x8(ii,jj);
} else {
assert(false && "RN/RM values not supported");
}
}

template<typename TA>
template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
NOINLINE void tinyBLAS_Q0_PPC<TA>::gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
Expand All @@ -2151,20 +2085,12 @@ class tinyBLAS_Q0_PPC {
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
kernel<RM, RN>(ii, jj);
this->kernel<RM, RN>(ii, jj);
}
}

const TA *const A;
const block_q8_0 *const B;
float *C;
const int64_t k;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};
template class tinyBLAS_Q0_PPC<block_q4_0>;
template class tinyBLAS_Q0_PPC<block_q8_0>;

class tinyBLAS_PPC {
public:
Expand Down
6 changes: 6 additions & 0 deletions ggml/src/ggml-cpu/llamafile/sgemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
#include <vecintrin.h>
#endif

#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE __attribute__((__noinline__))
#endif

#ifdef __cplusplus
extern "C" {
#endif
Expand Down
Loading