Commit 01013af

Trellis quants with CPU inference LostRuins#441 part 2
1 parent c4a827a commit 01013af

File tree

10 files changed: +3623 -3707 lines changed


ggml/include/ggml.h

Lines changed: 9 additions & 8 deletions
@@ -421,6 +421,10 @@ extern "C" {
         GGML_TYPE_Q8_K128 = 150,
         GGML_TYPE_Q8_KV   = 151,
         GGML_TYPE_IQ5_KS  = 152,
+        GGML_TYPE_IQ2_KT  = 153,
+        GGML_TYPE_IQ3_KT  = 154,
+        GGML_TYPE_IQ4_KT  = 155,
+
         GGML_TYPE_IQ3_KS  = 195,
 
         GGML_TYPE_Q4_0_R8 = 202,
@@ -452,10 +456,6 @@ extern "C" {
         GGML_TYPE_Q8_KV_R8 = 398,
         GGML_TYPE_Q8_K_R8  = 399,
 
-        GGML_TYPE_IQ2_KT   = 947,
-        GGML_TYPE_IQ3_KT   = 948,
-        GGML_TYPE_IQ4_KT   = 949,
-
         GGML_TYPE_COUNT,
     };
 
@@ -510,7 +510,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_KSS = 139, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_KV   = 140, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ5_KS  = 141, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ3_KS  = 188, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_KT  = 142, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ3_KT  = 143, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_KT  = 144, // except 1d tensors
+
+        GGML_FTYPE_MOSTLY_IQ3_KS  = 185, // except 1d tensors
         //
         GGML_FTYPE_MOSTLY_Q4_0_R8 = 202, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0_R8 = 207, // except 1d tensors
@@ -541,9 +545,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q8_KV_R8 = 398, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_K_R8  = 399, // except 1d tensors
 
-        GGML_FTYPE_MOSTLY_IQ2_KT   = 947, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ3_KT   = 948, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ4_KT   = 949, // except 1d tensors
     };
 
     // available tensor operations:

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 6 additions & 6 deletions
@@ -706,20 +706,20 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows        = 1,
     },
     [GGML_TYPE_IQ2_KT] = {
-        // .from_float   = quantize_row_iq2_kt,
-        // .vec_dot      = vec_dot_iq2_kt_q8_k,
+        .from_float   = quantize_row_iq2_kt,
+        .vec_dot      = vec_dot_iq2_kt_q8_k,
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows        = 1,
     },
     [GGML_TYPE_IQ3_KT] = {
-        // .from_float   = quantize_row_iq3_kt,
-        // .vec_dot      = vec_dot_iq3_kt_q8_k,
+        .from_float   = quantize_row_iq3_kt,
+        .vec_dot      = vec_dot_iq3_kt_q8_k,
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows        = 1,
     },
     [GGML_TYPE_IQ4_KT] = {
-        // .from_float   = quantize_row_iq4_kt,
-        // .vec_dot      = vec_dot_iq4_kt_q8_k,
+        .from_float   = quantize_row_iq4_kt,
+        .vec_dot      = vec_dot_iq4_kt_q8_k,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
     },
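
Note: un-commenting .from_float / .vec_dot (with .vec_dot_type = GGML_TYPE_Q8_K) is what turns CPU inference on for the trellis types: the CPU mat-mul path can quantize the activations into the vec_dot_type once per row and then call the per-row dot kernel against the IQx_KT weights. A rough sketch of that flow follows; the struct and signatures are simplified stand-ins, not ggml's real ggml_type_traits_cpu / ggml_vec_dot_t.

    // Simplified sketch -- NOT ggml's actual mat-mul code or signatures.
    // It only illustrates how a traits entry with .from_float, .vec_dot and
    // .vec_dot_type is typically consumed on the CPU side.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    typedef void (*from_float_fn)(const float * x, void * y, int64_t n);                   // stand-in
    typedef void (*vec_dot_fn)(int n, float * s, const void * x_row, const void * y_row);  // stand-in

    struct cpu_type_traits_sketch {
        from_float_fn from_float;    // quantize a row of floats into this type
        vec_dot_fn    vec_dot;       // dot(this-type row, row in vec_dot_type format)
        int           vec_dot_type;  // e.g. GGML_TYPE_Q8_K: format the activations are quantized to
    };

    // y = W * x over nrows rows of an IQx_KT weight matrix W (hypothetical helper).
    // quantize_activation stands for the from_float of the vec_dot_type (Q8_K here).
    static void matvec_sketch(const cpu_type_traits_sketch & traits,
                              from_float_fn quantize_activation, size_t q8_row_bytes,
                              const void * W, size_t w_row_bytes, int64_t nrows, int64_t ncols,
                              const float * x, float * y) {
        std::vector<char> xq(q8_row_bytes);        // activation row, quantized once and reused
        quantize_activation(x, xq.data(), ncols);
        const char * w = (const char *) W;
        for (int64_t r = 0; r < nrows; ++r) {
            traits.vec_dot((int) ncols, &y[r], w + r*w_row_bytes, xq.data());
        }
    }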

ggml/src/ggml-cpu/ops.cpp

Lines changed: 3 additions & 3 deletions
@@ -1340,6 +1340,9 @@ void ggml_compute_forward_add(
         case GGML_TYPE_Q8_K_R8:
         case GGML_TYPE_Q8_KV:
         case GGML_TYPE_BF16_R16:
+        case GGML_TYPE_IQ2_KT:
+        case GGML_TYPE_IQ3_KT:
+        case GGML_TYPE_IQ4_KT:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q4_K:
@@ -1359,9 +1362,6 @@ void ggml_compute_forward_add(
             {
                 ggml_compute_forward_add_q_f32(params, dst);
             } break;
-        case GGML_TYPE_IQ2_KT:
-        case GGML_TYPE_IQ3_KT:
-        case GGML_TYPE_IQ4_KT: break;
         default:
             {
                 GGML_ABORT("fatal error");
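
Note: GGML_OP_ADD on IQ2_KT/IQ3_KT/IQ4_KT tensors previously fell through to an empty break; it now goes through ggml_compute_forward_add_q_f32 like the other quantized types, which is possible because the row quantizers are now wired up in type_traits_cpu above. A hedged sketch of the dequantize-add-requantize row loop such a path performs (stand-in signatures, not the actual ops.cpp code):

    // Hedged sketch of a generic quantized-add row loop; the function-pointer
    // types are stand-ins for the type's dequantize/quantize hooks.
    #include <cstdint>
    #include <vector>

    typedef void (*to_float_fn)(const void * x, float * y, int64_t n);    // dequantize hook (stand-in)
    typedef void (*from_float_fn)(const float * x, void * y, int64_t n);  // quantize hook (stand-in)

    // dst_row = src0_row + src1_row, where src0/dst are quantized and src1 is f32.
    static void add_q_f32_row_sketch(to_float_fn dequantize, from_float_fn quantize,
                                     const void * src0_row, const float * src1_row,
                                     void * dst_row, int64_t ncols) {
        std::vector<float> tmp(ncols);
        dequantize(src0_row, tmp.data(), ncols);               // quantized -> f32
        for (int64_t i = 0; i < ncols; ++i) tmp[i] += src1_row[i];
        quantize(tmp.data(), dst_row, ncols);                  // f32 -> quantized (needs .from_float)
    }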

ggml/src/ggml-cuda/convert.cu

Lines changed: 0 additions & 33 deletions
@@ -366,11 +366,6 @@ float __device__ __forceinline__ trellis_next(uint32_t& val) {
     const half * h = (const half *)&s;
     val = ka*val + kb;
     s = (val & kmask) ^ km32;
-    //float r = (float)(h[0] +h[1]);
-    //val = ka*val + kb;
-    //s = (val & kmask) ^ km32;
-    //r += (float)(h[0]+h[1]);
-    //return r;
     return (float)(h[0]+h[1]);
 }
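
Note: trellis_next above is the core of the KT ("trellis") decoder: one multiply-add step on the 32-bit state, a mask/xor that reshapes the word into two half-precision bit patterns, and the sum of those two halves is the generated sample. A host-side C++ sketch of the same step is below; the ka/kb/kmask/km32 values are placeholders (the real constants live alongside trellis_next in convert.cu), and fp16 is decoded with a small helper instead of CUDA's half type.

    // Host-side sketch of the generator step shown above.  The constants below
    // are PLACEHOLDERS so the sketch compiles; take the real values from
    // ggml/src/ggml-cuda/convert.cu.
    #include <cmath>
    #include <cstdint>

    static const uint32_t ka    = 0xDEADBEEFu;  // placeholder multiplier
    static const uint32_t kb    = 0x00000001u;  // placeholder increment
    static const uint32_t kmask = 0x8fff8fffu;  // placeholder mask
    static const uint32_t km32  = 0x3b603b60u;  // placeholder xor pattern

    // minimal IEEE fp16 bits -> float (stands in for CUDA's half)
    static float half_bits_to_float(uint16_t h) {
        uint32_t sign = (h >> 15) & 1, exp = (h >> 10) & 0x1f, mant = h & 0x3ff;
        float v;
        if      (exp == 0)  v = std::ldexp((float) mant, -24);            // subnormal
        else if (exp == 31) v = mant ? NAN : INFINITY;                    // inf / nan
        else                v = std::ldexp((float)(mant | 0x400), (int) exp - 25);
        return sign ? -v : v;
    }

    // Same structure as the device trellis_next: one LCG-style step, then the
    // masked/xored state is read as two fp16 values whose sum is the sample.
    static float trellis_next_sketch(uint32_t & val) {
        val = ka*val + kb;
        const uint32_t s = (val & kmask) ^ km32;
        return half_bits_to_float((uint16_t)(s & 0xffff)) +
               half_bits_to_float((uint16_t)(s >> 16));
    }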

@@ -417,34 +412,6 @@ static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst
     }
 }
 
-//template<typename dst_t>
-
-//static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, const int64_t nrows, const int64_t n_per_row) { next
-
-//static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) { orig
-
-//
-//    int64_t ii = blockIdx.x;
-//    int64_t row = (QK_K * ii) / n_per_row;
-//    const float * dptr = (const float *)((const char *)vx + row * row_size);
-//    float scale = dptr[0];
-//    float alpha = dptr[1];
-//    const block_iq3_kt * x = (const block_iq3_kt *)(dptr + 2);
-//    const int64_t i = ii - (row*n_per_row)/QK_K;
-//
-//    const int64_t tid = threadIdx.x;
-//    const int64_t ib = tid; // 0...31
-//    dst_t * y = yy + ii*QK_K + 8*ib;
-//    const uint16_t * ql = (const uint16_t *)x[i].ql;
-//    uint32_t idx = ql[ib] + 4096;
-//    const float dl = scale * ((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf) * 31.75f * 1.01f; //1.015f;
-//    uint8_t mask = 1 << (ib/4);
-//    for (int j = 0; j < 8; ++j) {
-//        float ay = std::abs(trellis_next(idx));
-//        y[j] = dl * ay/(1 - alpha*ay) * (x[i].qh[(8*ib+j)%32] & mask ? -1.f : 1.f);
-//    }
-//}
-
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {

ggml/src/ggml-cuda/dmmv.cu

Lines changed: 0 additions & 26 deletions
@@ -41,30 +41,6 @@ static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& v
 #endif
 }
 
-//static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
-//    const half * h = (const half *)s;
-//    s[0] = trellis_next(val1);
-//    s[1] = trellis_next(val1);
-//    s[2] = trellis_next(val1);
-//    s[3] = trellis_next(val1);
-//#ifdef GGML_CUDA_F16
-//    bdot1 = __hfma2(y[ 0], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot1);
-//#else
-//    bdot1.x += y[ 0].x * (float)(h[0] + h[1] + h[2] + h[3]);
-//    bdot1.y += y[ 0].y * (float)(h[4] + h[5] + h[6] + h[7]);
-//#endif
-//    s[0] = trellis_next(val2);
-//    s[1] = trellis_next(val2);
-//    s[2] = trellis_next(val2);
-//    s[3] = trellis_next(val2);
-//#ifdef GGML_CUDA_F16
-//    bdot2 = __hfma2(y[64], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot2);
-//#else
-//    bdot2.x += y[64].x * (float)(h[0] + h[1] + h[2] + h[3]);
-//    bdot2.y += y[64].y * (float)(h[4] + h[5] + h[6] + h[7]);
-//#endif
-//}
-
 static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t signs2, uint8_t mask1, uint8_t mask2,
         uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
     const half * h = (const half *)s;
@@ -77,8 +53,6 @@ static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t
     half h10 = __habs(h[4]+h[5]), h11 = __habs(h[6]+h[7]);
     half2 h1 = {signs1 & mask1 ? -h00 : h00, signs2 & mask1 ? -h01 : h01};
     half2 h2 = {signs1 & mask2 ? -h10 : h10, signs2 & mask2 ? -h11 : h11};
-    //half2 h1 = __hmul2(__habs2({h[0]+h[1], h[2]+h[3]}), {signs1 & mask1 ? -1 : 1, signs2 & mask1 ? -1 : 1});
-    //half2 h2 = __hmul2(__habs2({h[4]+h[5], h[6]+h[7]}), {signs1 & mask2 ? -1 : 1, signs2 & mask2 ? -1 : 1});
     bdot1 = __hfma2(y[ 0], h1, bdot1);
     bdot2 = __hfma2(y[64], h2, bdot2);
 #else
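
Note: trellis_accum_abs keeps only the magnitude of each generated pair-sum and re-applies the sign taken from the packed sign bytes before the fused multiply-add; the deleted __hmul2/__habs2 lines were an equivalent formulation of the same thing. A scalar sketch of that arithmetic, with float standing in for half/half2 and plain 2-element arrays for dfloat2 (h00/h01 are formed from h[0..3] just above the shown context):

    // Scalar sketch of the half2 path above; h holds the 8 generated half values.
    #include <cmath>
    #include <cstdint>

    static void trellis_accum_abs_sketch(uint8_t signs1, uint8_t signs2,
                                         uint8_t mask1,  uint8_t mask2,
                                         const float h[8], const float y0[2], const float y64[2],
                                         float bdot1[2], float bdot2[2]) {
        const float h00 = std::fabs(h[0] + h[1]), h01 = std::fabs(h[2] + h[3]);
        const float h10 = std::fabs(h[4] + h[5]), h11 = std::fabs(h[6] + h[7]);
        // re-apply the packed sign bits to the magnitudes
        const float h1x = (signs1 & mask1) ? -h00 : h00, h1y = (signs2 & mask1) ? -h01 : h01;
        const float h2x = (signs1 & mask2) ? -h10 : h10, h2y = (signs2 & mask2) ? -h11 : h11;
        bdot1[0] += y0[0]  * h1x;  bdot1[1] += y0[1]  * h1y;   // __hfma2(y[ 0], h1, bdot1)
        bdot2[0] += y64[0] * h2x;  bdot2[1] += y64[1] * h2y;   // __hfma2(y[64], h2, bdot2)
    }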
