Skip to content

Commit a982c99

Browse files
Iwan Kawrakow authored and Nexesenex committed
iq3_ks: basics, with an offset problem again.
iq3_ks: CUDA works iq3_ks: Fix CUDA dot product iq3_ks: Zen4 iq3_ks: ARM_NEON iq3_ks: slightly faster ARM_NEON iq3_ks: Metal - partially working Sequantize works, but not the dot product. Don't see what is wrong with it. iq3_ks: AVX2
1 parent ee3c9cc commit a982c99

24 files changed

+618
-1
lines changed

ggml/include/ggml.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ extern "C" {
421421
GGML_TYPE_Q8_K128 = 150,
422422
GGML_TYPE_Q8_KV = 151,
423423
GGML_TYPE_IQ5_KS = 152,
424+
GGML_TYPE_IQ3_KS = 195,
424425

425426
GGML_TYPE_Q4_0_R8 = 202,
426427
GGML_TYPE_Q5_0_R4 = 206,
@@ -509,6 +510,7 @@ extern "C" {
509510
GGML_FTYPE_MOSTLY_IQ4_KSS = 139, // except 1d tensors
510511
GGML_FTYPE_MOSTLY_Q8_KV = 140, // except 1d tensors
511512
GGML_FTYPE_MOSTLY_IQ5_KS = 141, // except 1d tensors
513+
GGML_FTYPE_MOSTLY_IQ3_KS = 188, // except 1d tensors
512514
//
513515
GGML_FTYPE_MOSTLY_Q4_0_R8 = 202, // except 1d tensors
514516
GGML_FTYPE_MOSTLY_Q8_0_R8 = 207, // except 1d tensors

ggml/src/ggml-common.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,13 @@ typedef struct {
737737
} block_iq3_k_r4;
738738
static_assert(sizeof(block_iq3_k_r4) == 4*sizeof(block_iq3_k), "wrong iq3_k_r4 block size/padding");
739739

740+
typedef struct {
741+
uint8_t scales[QK_K/32];
742+
uint8_t qs[QK_K/4];
743+
uint8_t qh[QK_K/8];
744+
} block_iq3_ks;
745+
static_assert(sizeof(block_iq3_ks) == QK_K/32 + QK_K/4 + QK_K/8, "wrong iq3_ks block size/padding");
746+
740747
typedef struct {
741748
ggml_half d;
742749
uint16_t extra;

ggml/src/ggml-cpu/ggml-cpu-quants.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ void quantize_row_iq4_ks (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
4444
void quantize_row_iq4_k (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
4545
void quantize_row_iq5_k (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
4646
void quantize_row_iq5_ks (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
47+
void quantize_row_iq3_ks (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
4748
void quantize_row_iq6_k (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
4849
void quantize_row_iq2_kt (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
4950
void quantize_row_iq3_kt (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -87,6 +88,7 @@ void vec_dot_iq4_ks_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void
8788
void vec_dot_iq4_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
8889
void vec_dot_iq5_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
8990
void vec_dot_iq5_ks_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
91+
void vec_dot_iq3_ks_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
9092
void vec_dot_iq6_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
9193
void vec_dot_iq2_kt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
9294
void vec_dot_iq3_kt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
505505
.vec_dot_type = GGML_TYPE_Q8_K,
506506
.nrows = 1,
507507
},
508+
[GGML_TYPE_IQ3_KS] = {
509+
.from_float = quantize_row_iq3_ks,
510+
.vec_dot = vec_dot_iq3_ks_q8_K,
511+
.vec_dot_type = GGML_TYPE_Q8_K,
512+
.nrows = 1,
513+
},
508514
[GGML_TYPE_Q8_K64] = {
509515
.from_float = quantize_row_q8_K64,
510516
},

ggml/src/ggml-cpu/ops.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1307,6 +1307,7 @@ void ggml_compute_forward_add(
13071307
case GGML_TYPE_IQ2_K:
13081308
case GGML_TYPE_IQ2_KS:
13091309
case GGML_TYPE_IQ3_K:
1310+
case GGML_TYPE_IQ3_KS:
13101311
case GGML_TYPE_IQ4_K:
13111312
case GGML_TYPE_IQ5_K:
13121313
case GGML_TYPE_IQ5_KS:
@@ -1730,6 +1731,7 @@ void ggml_compute_forward_add1(
17301731
case GGML_TYPE_IQ3_KT:
17311732
case GGML_TYPE_IQ4_KT:
17321733
case GGML_TYPE_IQ3_K:
1734+
case GGML_TYPE_IQ3_KS:
17331735
case GGML_TYPE_IQ4_K:
17341736
case GGML_TYPE_IQ5_K:
17351737
case GGML_TYPE_IQ5_KS:
@@ -1900,6 +1902,7 @@ void ggml_compute_forward_acc(
19001902
case GGML_TYPE_IQ3_KT:
19011903
case GGML_TYPE_IQ4_KT:
19021904
case GGML_TYPE_IQ3_K:
1905+
case GGML_TYPE_IQ3_KS:
19031906
case GGML_TYPE_IQ4_K:
19041907
case GGML_TYPE_IQ5_K:
19051908
case GGML_TYPE_IQ5_KS:
@@ -3929,6 +3932,7 @@ void ggml_compute_forward_out_prod(
39293932
case GGML_TYPE_IQ3_KT:
39303933
case GGML_TYPE_IQ4_KT:
39313934
case GGML_TYPE_IQ3_K:
3935+
case GGML_TYPE_IQ3_KS:
39323936
case GGML_TYPE_IQ4_K:
39333937
case GGML_TYPE_IQ5_K:
39343938
case GGML_TYPE_IQ5_KS:
@@ -4428,6 +4432,7 @@ void ggml_compute_forward_set(
44284432
case GGML_TYPE_IQ3_KT:
44294433
case GGML_TYPE_IQ4_KT:
44304434
case GGML_TYPE_IQ3_K:
4435+
case GGML_TYPE_IQ3_KS:
44314436
case GGML_TYPE_IQ4_K:
44324437
case GGML_TYPE_IQ5_K:
44334438
case GGML_TYPE_IQ5_KS:
@@ -4732,6 +4737,7 @@ void ggml_compute_forward_get_rows(
47324737
case GGML_TYPE_IQ3_KT:
47334738
case GGML_TYPE_IQ4_KT:
47344739
case GGML_TYPE_IQ3_K:
4740+
case GGML_TYPE_IQ3_KS:
47354741
case GGML_TYPE_IQ4_K:
47364742
case GGML_TYPE_IQ5_K:
47374743
case GGML_TYPE_IQ5_KS:
@@ -5408,6 +5414,7 @@ void ggml_compute_forward_clamp(
54085414
case GGML_TYPE_IQ3_KT:
54095415
case GGML_TYPE_IQ4_KT:
54105416
case GGML_TYPE_IQ3_K:
5417+
case GGML_TYPE_IQ3_KS:
54115418
case GGML_TYPE_IQ4_K:
54125419
case GGML_TYPE_IQ5_K:
54135420
case GGML_TYPE_IQ5_KS:

ggml/src/ggml-cuda/common.cuh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_KSS> {
695695
static constexpr int qi = QI4_XS;
696696
};
697697

698+
template<>
699+
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_KS> {
700+
static constexpr int qk = QK_K;
701+
static constexpr int qr = QR4_XS;
702+
static constexpr int qi = QI4_XS;
703+
};
704+
698705
template<>
699706
struct ggml_cuda_type_traits<GGML_TYPE_IQ5_K> {
700707
static constexpr int qk = QK_K;

ggml/src/ggml-cuda/convert.cu

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,37 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
706706
}
707707
}
708708

709+
template<typename dst_t>
710+
static __global__ void dequantize_block_iq3_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
711+
712+
int64_t ii = blockIdx.x;
713+
int64_t row = (QK_K * ii) / n_per_row;
714+
const char * cx = (const char *)vx + row * row_size;
715+
float scale = *(const float *)cx;
716+
const block_iq3_ks * x = (const block_iq3_ks *)(cx + sizeof(float));
717+
const int64_t i = ii - (row*n_per_row)/QK_K;
718+
719+
const int tid = threadIdx.x;
720+
int ib128 = tid/16; // 0 or 1
721+
int il = tid%16; // 0...15
722+
dst_t * y = yy + ii*QK_K + 128*ib128 + 2*il;
723+
//uint32_t sc = ((const uint32_t *)x[i].scales)[ib128];
724+
//uint32_t aux32 =
725+
const float dl1 = scale * ((x[i].scales[4*ib128+0] & 254) - 127);
726+
const float dl2 = scale * ((x[i].scales[4*ib128+1] & 254) - 127);
727+
const float dl3 = scale * ((x[i].scales[4*ib128+2] & 254) - 127);
728+
const float dl4 = scale * ((x[i].scales[4*ib128+3] & 254) - 127);
729+
const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
730+
const uint8_t * qh = x[i].qh + 2*il;
731+
for (int j = 0; j < 2; ++j) {
732+
const uint8_t h = qh[j] >> (4*(ib128%2));
733+
y[j+ 0] = dl1 * iq3nl_values[(((qs[j] >> 0) & 0x03) | ((h & 0x01) << 2)) + ((x[i].scales[4*ib128+0] & 1) << 3)];
734+
y[j+32] = dl2 * iq3nl_values[(((qs[j] >> 2) & 0x03) | ((h & 0x02) << 1)) + ((x[i].scales[4*ib128+1] & 1) << 3)];
735+
y[j+64] = dl3 * iq3nl_values[(((qs[j] >> 4) & 0x03) | ((h & 0x04) >> 0)) + ((x[i].scales[4*ib128+2] & 1) << 3)];
736+
y[j+96] = dl4 * iq3nl_values[(((qs[j] >> 6) & 0x03) | ((h & 0x08) >> 1)) + ((x[i].scales[4*ib128+3] & 1) << 3)];
737+
}
738+
}
739+
709740
template<typename dst_t>
710741
static __global__ void dequantize_block_iq4_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
711742

@@ -1257,6 +1288,14 @@ static void dequantize_row_iq2_ks_cuda(const void * vx, dst_t * y, const int64_t
12571288
dequantize_block_iq2_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
12581289
}
12591290

1291+
template<typename dst_t>
1292+
static void dequantize_row_iq3_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
1293+
const int64_t k = nrows * n_per_row;
1294+
const int64_t row_size = ggml_row_size(GGML_TYPE_IQ3_KS, n_per_row);
1295+
const int nb = (k + QK_K - 1) / QK_K;
1296+
dequantize_block_iq3_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
1297+
}
1298+
12601299
template<typename dst_t>
12611300
static void dequantize_row_iq2_k_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
12621301
const int64_t k = nrows * n_per_row;
@@ -1383,6 +1422,8 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
13831422
return dequantize_row_iq2_k_cuda<nv_bfloat16>;
13841423
case GGML_TYPE_IQ3_K:
13851424
return dequantize_row_iq3_k_cuda<nv_bfloat16>;
1425+
case GGML_TYPE_IQ3_KS:
1426+
return dequantize_row_iq3_ks_cuda<nv_bfloat16>;
13861427
case GGML_TYPE_IQ4_KSS:
13871428
return dequantize_row_iq4_kss_cuda<nv_bfloat16>;
13881429
case GGML_TYPE_IQ4_KS:
@@ -1461,6 +1502,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
14611502
return dequantize_row_iq4_xs_cuda;
14621503
case GGML_TYPE_IQ2_KS:
14631504
return dequantize_row_iq2_ks_cuda;
1505+
case GGML_TYPE_IQ3_KS:
1506+
return dequantize_row_iq3_ks_cuda;
14641507
case GGML_TYPE_IQ2_K:
14651508
return dequantize_row_iq2_k_cuda;
14661509
case GGML_TYPE_IQ3_K:
@@ -1547,6 +1590,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
15471590
return dequantize_row_iq4_kss_cuda;
15481591
case GGML_TYPE_IQ2_KS:
15491592
return dequantize_row_iq2_ks_cuda;
1593+
case GGML_TYPE_IQ3_KS:
1594+
return dequantize_row_iq3_ks_cuda;
15501595
case GGML_TYPE_IQ2_K:
15511596
return dequantize_row_iq2_k_cuda;
15521597
case GGML_TYPE_IQ3_K:

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3320,6 +3320,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
33203320
case GGML_TYPE_IQ4_KSS:
33213321
case GGML_TYPE_IQ2_K:
33223322
case GGML_TYPE_IQ2_KS:
3323+
case GGML_TYPE_IQ3_KS:
33233324
case GGML_TYPE_IQ2_KT:
33243325
case GGML_TYPE_IQ3_KT:
33253326
case GGML_TYPE_IQ4_KT:

ggml/src/ggml-cuda/iqk_mmvq.cu

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,69 @@ __device__ __forceinline__ float vec_dot_iq3_k_q8_1(
664664

665665
}
666666

667+
__device__ __forceinline__ float vec_dot_iq3_ks_q8_1(
668+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iiqs) {
669+
670+
const float d = *(const float *)vbq;
671+
const block_iq3_ks * bq3 = (const block_iq3_ks *)((const char *)vbq + sizeof(float)) + kbx;
672+
673+
int iqs = iiqs/4;
674+
const int ib128 = iqs/4; // 0 or 1. 0 works on quants 0...127, 1 on quants 128...255
675+
// Each thread processes 8 quants in each of the 4 32-blocks
676+
const int il8 = iqs%4; // 0...3. 0 works on quants 0...7, 1 on quants 8...15, 2 on 16...23, 3 on 24...31
677+
678+
const uint32_t * ql = (const uint32_t *)bq3->qs + 8*ib128 + 2*il8;
679+
const uint32_t * qh = (const uint32_t *)bq3->qh + 2*il8;
680+
681+
uint32_t aux32;
682+
const uint8_t * aux8 = (const uint8_t *)&aux32;
683+
684+
const int hshift = 4*(1-ib128);
685+
686+
const uint16_t * values1 = iq3k_table + ((bq3->scales[4*ib128+0] << 6) & 0x40);
687+
const uint16_t * values2 = iq3k_table + ((bq3->scales[4*ib128+1] << 6) & 0x40);
688+
const uint16_t * values3 = iq3k_table + ((bq3->scales[4*ib128+2] << 6) & 0x40);
689+
const uint16_t * values4 = iq3k_table + ((bq3->scales[4*ib128+3] << 6) & 0x40);
690+
691+
const int * q8;
692+
int sumi[4] = {0, 0, 0, 0};
693+
int v;
694+
for (int i = 0; i < 2; ++i) {
695+
uint32_t vl = ql[i];
696+
uint32_t vh = (qh[i] << hshift) >> 2;
697+
698+
q8 = (const int *)bq8_1[4*ib128+0].qs + 2*il8;
699+
aux32 = (vl & 0x03030303) | (vh & 0x04040404);
700+
v = int_from_table_2(aux8, values1);
701+
sumi[0] = ggml_cuda_dp4a(v, q8[i], sumi[0]);
702+
vl >>= 2; vh >>= 1;
703+
704+
q8 += sizeof(block_q8_1)/4;
705+
aux32 = (vl & 0x03030303) | (vh & 0x04040404);
706+
v = int_from_table_2(aux8, values2);
707+
sumi[1] = ggml_cuda_dp4a(v, q8[i], sumi[1]);
708+
vl >>= 2; vh >>= 1;
709+
710+
q8 += sizeof(block_q8_1)/4;
711+
aux32 = (vl & 0x03030303) | (vh & 0x04040404);
712+
v = int_from_table_2(aux8, values3);
713+
sumi[2] = ggml_cuda_dp4a(v, q8[i], sumi[2]);
714+
vl >>= 2; vh >>= 1;
715+
716+
q8 += sizeof(block_q8_1)/4;
717+
aux32 = (vl & 0x03030303) | (vh & 0x04040404);
718+
v = int_from_table_2(aux8, values4);
719+
sumi[3] = ggml_cuda_dp4a(v, q8[i], sumi[3]);
720+
721+
}
722+
aux32 = ((const uint32_t *)bq3->scales)[ib128] & 0xfefefefe;
723+
return d * (__low2float(bq8_1[4*ib128+0].ds) * ((int)aux8[0] - 127) * sumi[0] +
724+
__low2float(bq8_1[4*ib128+1].ds) * ((int)aux8[1] - 127) * sumi[1] +
725+
__low2float(bq8_1[4*ib128+2].ds) * ((int)aux8[2] - 127) * sumi[2] +
726+
__low2float(bq8_1[4*ib128+3].ds) * ((int)aux8[3] - 127) * sumi[3]);
727+
728+
}
729+
667730
/* __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
668731
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
669732
@@ -780,6 +843,13 @@ void mul_mat_vec_iq3_k_q8_1_cuda(
780843
iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ3_K, VDR_IQ3_K_Q8_1_MMVQ, vec_dot_iq3_k_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
781844
}
782845

846+
void mul_mat_vec_iq3_ks_q8_1_cuda(
847+
const void * vx, const void * vy, float * dst,
848+
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
849+
850+
iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ3_KS, VDR_IQ3_K_Q8_1_MMVQ, vec_dot_iq3_ks_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
851+
}
852+
783853
void mul_mat_vec_iq4_k_q8_1_cuda(
784854
const void * vx, const void * vy, float * dst,
785855
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

ggml/src/ggml-cuda/iqk_mmvq.cuh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ void mul_mat_vec_iq2_ks_q8_1_cuda(
3232
const void * vx, const void * vy, float * dst,
3333
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
3434

35+
void mul_mat_vec_iq3_ks_q8_1_cuda(
36+
const void * vx, const void * vy, float * dst,
37+
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
38+
3539
void mul_mat_vec_iq5_ks_q8_1_cuda(
3640
const void * vx, const void * vy, float * dst,
3741
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
@@ -47,3 +51,4 @@ void mul_mat_vec_iq2_kt_q8_1_cuda(
4751
// void mul_mat_vec_iq2_bn_q8_1_cuda(
4852
// const void * vx, const void * vy, float * dst,
4953
// const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
54+

0 commit comments

Comments
 (0)