
Commit eada78d

Iwan Kawrakow authored and Nexesenex committed
iq3_ks_v2: basics
iq3_ks: CUDA dequantize
iq3_ks: CUDA mmvq
iq3_ks: mmq
iq3_ks: faster mmq
iq3_ks: Zen4
iq3_ks: AVX2 convert to q8_k_r8. This gives us PP-512 = 360 t/s.
iq3_ks: AVX2 GEMM/GEMV
iq3_ks: NEON GEMM/GEMV
iq3_ks: NEON convert to q8_k_r8. This gives us PP-512 = 164 t/s.
iq3_ks: Metal dequantize
iq3_ks: Metal gemv - pathetic performance
Update ops.cpp and constants
1 parent 4f2f1b7 commit eada78d

Showing 24 changed files with 891 additions and 67 deletions.

ggml/include/ggml.h

Lines changed: 2 additions & 0 deletions
@@ -424,6 +424,7 @@ extern "C" {
         GGML_TYPE_IQ2_KT = 153,
         GGML_TYPE_IQ3_KT = 154,
         GGML_TYPE_IQ4_KT = 155,
+        GGML_TYPE_IQ3_KS = 156,

         GGML_TYPE_IQ3_KS_V1 = 196,

@@ -513,6 +514,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_KT = 142, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_KT = 143, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_KT = 144, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ3_KS = 145, // except 1d tensors

         GGML_FTYPE_MOSTLY_IQ3_KS_V1 = 185, // except 1d tensors
         //

ggml/src/ggml-common.h

Lines changed: 8 additions & 0 deletions
@@ -748,6 +748,14 @@ typedef struct {
 } block_iq3_k;
 static_assert(sizeof(block_iq3_k) == sizeof(ggml_half) + 2*sizeof(uint16_t) + QK_K/32 + QK_K/4 + QK_K/8, "wrong iq3_k block size/padding");

+typedef struct {
+    uint16_t extra;
+    uint8_t scales[QK_K/64];
+    uint8_t qs[QK_K/4];
+    uint8_t qh[QK_K/8];
+} block_iq3_ks;
+static_assert(sizeof(block_iq3_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4 + QK_K/8, "wrong iq3_ks block size/padding");
+
 typedef struct {
     ggml_half d[4];
     uint8_t extra[8];
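For orientation, a quick size check (a sketch, not part of the commit): with QK_K = 256, each block_iq3_ks packs 256 weights into 2 + 4 + 64 + 32 = 102 bytes: the two low bits of every weight in qs, the third bit in qh, eight 4-bit block scales in scales, and 16 bits in extra (scale high bits plus value-table selectors). On top of that, the CUDA kernel further below reads one ggml_half super-scale per row.

#include <stdint.h>
#include <stdio.h>

#define QK_K 256

int main(void) {
    // mirrors the block_iq3_ks layout above: extra + scales + qs + qh
    size_t bytes = sizeof(uint16_t) + QK_K/64 + QK_K/4 + QK_K/8;
    printf("bytes per block: %zu\n", bytes);            // 102
    printf("bits per weight: %.4f\n", 8.0*bytes/QK_K);  // 3.1875, plus the fp16 row scale
    return 0;
}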

ggml/src/ggml-cpu/ggml-cpu-quants.h

Lines changed: 2 additions & 0 deletions
@@ -55,6 +55,7 @@ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
 // void quantize_row_iq5_k (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 // void quantize_row_iq5_ks (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 // void quantize_row_iq3_ks_v1 (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+// void quantize_row_iq3_ks (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 // void quantize_row_iq6_k (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 // void quantize_row_iq2_kt (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 // void quantize_row_iq3_kt (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -99,6 +100,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 // void vec_dot_iq5_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 // void vec_dot_iq5_ks_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 // void vec_dot_iq3_ks_v1_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+// void vec_dot_iq3_ks_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 // void vec_dot_iq6_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 // void vec_dot_iq2_kt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 // void vec_dot_iq3_kt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 12 additions & 6 deletions
@@ -632,12 +632,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
-    [GGML_TYPE_IQ3_KS_V1] = {
-        .from_float = quantize_row_iq3_ks_v1,
-        .vec_dot = vec_dot_iq3_ks_v1_q8_k,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-        .nrows = 1,
-    },
     [GGML_TYPE_Q8_K] = {
         .from_float = quantize_row_q8_K,
     },
@@ -798,6 +792,18 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_IQ3_KS_V1] = {
+        .from_float = quantize_row_iq3_ks_v1,
+        .vec_dot = vec_dot_iq3_ks_v1_q8_k,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
+    },
+    [GGML_TYPE_IQ3_KS] = {
+        .from_float = quantize_row_iq3_ks,
+        .vec_dot = vec_dot_iq3_ks_q8_k,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
+    },
     [GGML_TYPE_IQ4_K] = {
         .from_float = quantize_row_iq4_k,
         .vec_dot = vec_dot_iq4_k_q8_k,
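For context, these entries drive ggml's CPU mat-mul: from_float quantizes activation rows to the entry's vec_dot_type (Q8_K here), and vec_dot is then called once per quantized weight row. A self-contained toy of that function-pointer pattern (all names below are invented stand-ins; the real signatures carry extra stride and nrc parameters):

#include <stdint.h>
#include <stdio.h>

typedef void (*from_float_t)(const float * x, void * y, int64_t k);
typedef void (*vec_dot_t)(int n, float * s, const void * vx, const void * vy);

struct toy_traits { from_float_t from_float; vec_dot_t vec_dot; };

// toy stand-in for quantize_row_q8_K: scale floats into int8
static void toy_from_float(const float * x, void * y, int64_t k) {
    int8_t * q = (int8_t *)y;
    for (int64_t i = 0; i < k; ++i) q[i] = (int8_t)(x[i] * 32.0f);
}

// toy stand-in for vec_dot_iq3_ks_q8_k: integer dot product, rescaled
static void toy_vec_dot(int n, float * s, const void * vx, const void * vy) {
    const int8_t * a = (const int8_t *)vx;
    const int8_t * b = (const int8_t *)vy;
    int acc = 0;
    for (int i = 0; i < n; ++i) acc += a[i] * b[i];
    *s = acc / (32.0f * 32.0f);
}

int main(void) {
    struct toy_traits tr = { toy_from_float, toy_vec_dot };  // cf. type_traits_cpu[GGML_TYPE_IQ3_KS]
    float act[4] = { 0.5f, -0.25f, 1.0f, 0.0f };
    int8_t q_act[4], q_row[4] = { 16, 16, 16, 16 };          // pretend-quantized weight row
    tr.from_float(act, q_act, 4);                            // activations -> vec_dot_type
    float out;
    tr.vec_dot(4, &out, q_row, q_act);                       // one output of the mat-mul
    printf("dot = %f\n", out);                               // 0.625
    return 0;
}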

ggml/src/ggml-cpu/ops.cpp

Lines changed: 7 additions & 0 deletions
@@ -1745,6 +1745,7 @@ void ggml_compute_forward_add(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_IQ3_KS_V1:
+        case GGML_TYPE_IQ3_KS:
         case GGML_TYPE_Q6_K_R4:
         case GGML_TYPE_Q8_K_R8:
         case GGML_TYPE_Q8_KR8:
@@ -2227,6 +2228,7 @@ void ggml_compute_forward_add1(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_IQ3_KS_V1:
+        case GGML_TYPE_IQ3_KS:
         case GGML_TYPE_Q6_K_R4:
         case GGML_TYPE_Q8_K_R8:
         case GGML_TYPE_Q8_KR8:
@@ -2406,6 +2408,7 @@ void ggml_compute_forward_acc(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_IQ3_KS_V1:
+        case GGML_TYPE_IQ3_KS:
         case GGML_TYPE_Q6_K_R4:
         case GGML_TYPE_Q8_K_R8:
         case GGML_TYPE_Q8_KR8:
@@ -5114,6 +5117,7 @@ void ggml_compute_forward_out_prod(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_IQ3_KS_V1:
+        case GGML_TYPE_IQ3_KS:
         case GGML_TYPE_Q6_K_R4:
         case GGML_TYPE_Q8_K_R8:
         case GGML_TYPE_Q8_KR8:
@@ -5622,6 +5626,7 @@ void ggml_compute_forward_set(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_IQ3_KS_V1:
+        case GGML_TYPE_IQ3_KS:
         case GGML_TYPE_Q6_K_R4:
         case GGML_TYPE_Q8_K_R8:
         case GGML_TYPE_Q8_KR8:
@@ -5935,6 +5940,7 @@ void ggml_compute_forward_get_rows(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_IQ3_KS_V1:
+        case GGML_TYPE_IQ3_KS:
         case GGML_TYPE_Q6_K_R4:
         case GGML_TYPE_Q8_K_R8:
         case GGML_TYPE_Q8_KR8:
@@ -6689,6 +6695,7 @@ void ggml_compute_forward_clamp(
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_IQ3_KS_V1:
+        case GGML_TYPE_IQ3_KS:
         case GGML_TYPE_Q6_K_R4:
         case GGML_TYPE_Q8_K_R8:
         case GGML_TYPE_Q8_KR8:

ggml/src/ggml-cuda/common.cuh

Lines changed: 7 additions & 0 deletions
@@ -708,6 +708,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_K> {
     static constexpr int qi = QI4_XS;
 };

+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_KS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_XS;
+    static constexpr int qi = QI4_XS;
+};
+
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_IQ4_K> {
     static constexpr int qk = QK_K;

ggml/src/ggml-cuda/convert.cu

Lines changed: 95 additions & 36 deletions
@@ -683,37 +683,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
     }
 }

-template<typename dst_t>
-static __global__ void dequantize_block_iq3_ks_v1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
-
-    int64_t ii = blockIdx.x;
-    int64_t row = (QK_K * ii) / n_per_row;
-    const char * cx = (const char *)vx + row * row_size;
-    float scale = *(const float *)cx;
-    const block_iq3_ks_v1 * x = (const block_iq3_ks_v1 *)(cx + sizeof(float));
-    const int64_t i = ii - (row*n_per_row)/QK_K;
-
-    const int tid = threadIdx.x;
-    int ib128 = tid/16; // 0 or 1
-    int il = tid%16; // 0...15
-    dst_t * y = yy + ii*QK_K + 128*ib128 + 2*il;
-    //uint32_t sc = ((const uint32_t *)x[i].scales)[ib128];
-    //uint32_t aux32 =
-    const float dl1 = scale * ((x[i].scales[4*ib128+0] & 254) - 127);
-    const float dl2 = scale * ((x[i].scales[4*ib128+1] & 254) - 127);
-    const float dl3 = scale * ((x[i].scales[4*ib128+2] & 254) - 127);
-    const float dl4 = scale * ((x[i].scales[4*ib128+3] & 254) - 127);
-    const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
-    const uint8_t * qh = x[i].qh + 2*il;
-    for (int j = 0; j < 2; ++j) {
-        const uint8_t h = qh[j] >> (4*(ib128%2));
-        y[j+ 0] = dl1 * iq3nl_values[(((qs[j] >> 0) & 0x03) | ((h & 0x01) << 2)) + ((x[i].scales[4*ib128+0] & 1) << 3)];
-        y[j+32] = dl2 * iq3nl_values[(((qs[j] >> 2) & 0x03) | ((h & 0x02) << 1)) + ((x[i].scales[4*ib128+1] & 1) << 3)];
-        y[j+64] = dl3 * iq3nl_values[(((qs[j] >> 4) & 0x03) | ((h & 0x04) >> 0)) + ((x[i].scales[4*ib128+2] & 1) << 3)];
-        y[j+96] = dl4 * iq3nl_values[(((qs[j] >> 6) & 0x03) | ((h & 0x08) >> 1)) + ((x[i].scales[4*ib128+3] & 1) << 3)];
-    }
-}
-
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {

@@ -1056,6 +1025,82 @@ static __global__ void dequantize_block_iq3_k(const void * __restrict__ vx, dst_
     }
 }

+template<typename dst_t>
+static __global__ void dequantize_block_iq3_ks_v1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
+
+    int64_t ii = blockIdx.x;
+    int64_t row = (QK_K * ii) / n_per_row;
+    const char * cx = (const char *)vx + row * row_size;
+    float scale = *(const float *)cx;
+    const block_iq3_ks_v1 * x = (const block_iq3_ks_v1 *)(cx + sizeof(float));
+    const int64_t i = ii - (row*n_per_row)/QK_K;
+
+    const int tid = threadIdx.x;
+    int ib128 = tid/16; // 0 or 1
+    int il = tid%16; // 0...15
+    dst_t * y = yy + ii*QK_K + 128*ib128 + 2*il;
+    //uint32_t sc = ((const uint32_t *)x[i].scales)[ib128];
+    //uint32_t aux32 =
+    const float dl1 = scale * ((x[i].scales[4*ib128+0] & 254) - 127);
+    const float dl2 = scale * ((x[i].scales[4*ib128+1] & 254) - 127);
+    const float dl3 = scale * ((x[i].scales[4*ib128+2] & 254) - 127);
+    const float dl4 = scale * ((x[i].scales[4*ib128+3] & 254) - 127);
+    const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
+    const uint8_t * qh = x[i].qh + 2*il;
+    for (int j = 0; j < 2; ++j) {
+        const uint8_t h = qh[j] >> (4*(ib128%2));
+        y[j+ 0] = dl1 * iq3nl_values[(((qs[j] >> 0) & 0x03) | ((h & 0x01) << 2)) + ((x[i].scales[4*ib128+0] & 1) << 3)];
+        y[j+32] = dl2 * iq3nl_values[(((qs[j] >> 2) & 0x03) | ((h & 0x02) << 1)) + ((x[i].scales[4*ib128+1] & 1) << 3)];
+        y[j+64] = dl3 * iq3nl_values[(((qs[j] >> 4) & 0x03) | ((h & 0x04) >> 0)) + ((x[i].scales[4*ib128+2] & 1) << 3)];
+        y[j+96] = dl4 * iq3nl_values[(((qs[j] >> 6) & 0x03) | ((h & 0x08) >> 1)) + ((x[i].scales[4*ib128+3] & 1) << 3)];
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq3_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
+
+    int64_t ii = blockIdx.x;
+    int64_t row = (QK_K * ii) / n_per_row;
+    const char * cx = (const char *)vx + row * row_size;
+    float scale = *(const ggml_half *)cx;
+    const block_iq3_ks * x = (const block_iq3_ks *)(cx + sizeof(ggml_half));
+    const int64_t i = ii - (row*n_per_row)/QK_K;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t is = tid/16;
+    const int64_t il = tid%16;
+    dst_t * y = yy + ii*QK_K + 128*is + 2*il;
+    const uint8_t * qs = x[i].qs + 32*is + 2*il;
+    const uint8_t * qh = x[i].qh + 2*il;
+    uint16_t extra = x[i].extra >> 4*is;
+    const float d0 = scale * (int(((x[i].scales[0] >> 4*is) & 0xf) | ((extra << 4) & 0x10)) - 16);
+    const float d1 = scale * (int(((x[i].scales[1] >> 4*is) & 0xf) | ((extra << 3) & 0x10)) - 16);
+    const float d2 = scale * (int(((x[i].scales[2] >> 4*is) & 0xf) | ((extra << 2) & 0x10)) - 16);
+    const float d3 = scale * (int(((x[i].scales[3] >> 4*is) & 0xf) | ((extra << 1) & 0x10)) - 16);
+    extra >>= 8;
+    const int8_t * values0 = iq3nl_values + ((extra & 1) << 3);
+    const int8_t * values1 = iq3nl_values + ((extra & 2) << 2);
+    const int8_t * values2 = iq3nl_values + ((extra & 4) << 1);
+    const int8_t * values3 = iq3nl_values + ((extra & 8) << 0);
+    if constexpr (std::is_same_v<dst_t, nv_bfloat16>) {
+        for (int j = 0; j < 2; ++j) {
+            uint8_t h = qh[j] >> 4*is;
+            y[j+ 0] = __float2bfloat16(d0 * values0[((qs[j] >> 0) & 3) | ((h << 2) & 4)]);
+            y[j+32] = __float2bfloat16(d1 * values1[((qs[j] >> 2) & 3) | ((h << 1) & 4)]);
+            y[j+64] = __float2bfloat16(d2 * values2[((qs[j] >> 4) & 3) | ((h >> 0) & 4)]);
+            y[j+96] = __float2bfloat16(d3 * values3[((qs[j] >> 6) & 3) | ((h >> 1) & 4)]);
+        }
+    } else {
+        for (int j = 0; j < 2; ++j) {
+            uint8_t h = qh[j] >> 4*is;
+            y[j+ 0] = d0 * values0[((qs[j] >> 0) & 3) | ((h << 2) & 4)];
+            y[j+32] = d1 * values1[((qs[j] >> 2) & 3) | ((h << 1) & 4)];
+            y[j+64] = d2 * values2[((qs[j] >> 4) & 3) | ((h >> 0) & 4)];
+            y[j+96] = d3 * values3[((qs[j] >> 6) & 3) | ((h >> 1) & 4)];
+        }
+    }
+}
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq1_s_r4(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {

@@ -1615,6 +1660,14 @@ static void dequantize_row_iq2_ks_cuda(const void * vx, dst_t * y, const int64_t
     dequantize_block_iq2_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
 }

+template<typename dst_t>
+static void dequantize_row_iq2_k_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
+    const int64_t k = nrows * n_per_row;
+    // const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_K, n_per_row);
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq2_k<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq3_ks_v1_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
     const int64_t k = nrows * n_per_row;
@@ -1624,11 +1677,11 @@ static void dequantize_row_iq3_ks_v1_cuda(const void * vx, dst_t * y, const int6
 }

 template<typename dst_t>
-static void dequantize_row_iq2_k_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
+static void dequantize_row_iq3_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
     const int64_t k = nrows * n_per_row;
-    // const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_K, n_per_row);
+    const int64_t row_size = ggml_row_size(GGML_TYPE_IQ3_KS, n_per_row);
     const int nb = (k + QK_K - 1) / QK_K;
-    dequantize_block_iq2_k<<<nb, 32, 0, stream>>>(vx, y);
+    dequantize_block_iq3_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
 }

 template<typename dst_t>
@@ -1816,10 +1869,12 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
         return dequantize_row_iq2_ks_cuda<nv_bfloat16>;
     case GGML_TYPE_IQ2_K:
         return dequantize_row_iq2_k_cuda<nv_bfloat16>;
-    case GGML_TYPE_IQ3_K:
-        return dequantize_row_iq3_k_cuda<nv_bfloat16>;
     case GGML_TYPE_IQ3_KS_V1:
         return dequantize_row_iq3_ks_v1_cuda<nv_bfloat16>;
+    case GGML_TYPE_IQ3_KS:
+        return dequantize_row_iq3_ks_cuda<nv_bfloat16>;
+    case GGML_TYPE_IQ3_K:
+        return dequantize_row_iq3_k_cuda<nv_bfloat16>;
     case GGML_TYPE_IQ4_KSS:
         return dequantize_row_iq4_kss_cuda<nv_bfloat16>;
     case GGML_TYPE_IQ4_KS:
@@ -1918,6 +1973,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         return dequantize_row_iq3_ks_v1_cuda;
     case GGML_TYPE_IQ2_K:
         return dequantize_row_iq2_k_cuda;
+    case GGML_TYPE_IQ3_KS:
+        return dequantize_row_iq3_ks_cuda;
     case GGML_TYPE_IQ3_K:
         return dequantize_row_iq3_k_cuda;
     case GGML_TYPE_IQ4_KSS:
@@ -2024,6 +2081,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         return dequantize_row_iq2_k_cuda;
     case GGML_TYPE_IQ3_K:
         return dequantize_row_iq3_k_cuda;
+    case GGML_TYPE_IQ3_KS:
+        return dequantize_row_iq3_ks_cuda;
     case GGML_TYPE_IQ4_K:
         return dequantize_row_iq4_k_cuda;
     case GGML_TYPE_IQ5_K:
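Taken together with the block_iq3_ks struct, dequantize_block_iq3_ks fixes the bit layout: a ggml_half super-scale per row; per 32-weight sub-block a 4-bit scale nibble from scales whose fifth bit sits in the low byte of extra (offset so the range is -16..15); a bit in the high byte of extra selecting one of the two 8-entry halves of iq3nl_values; and per weight two low bits from qs plus a third bit from qh. The same decode as scalar C, with helper names invented here and the 16-entry iq3nl_values table passed in rather than reproduced:

#include <stdint.h>

#define QK_K 256

typedef struct {             // mirror of block_iq3_ks from ggml-common.h
    uint16_t extra;
    uint8_t  scales[QK_K/64];
    uint8_t  qs[QK_K/4];
    uint8_t  qh[QK_K/8];
} block_iq3_ks_mirror;

// scale of sub-block s (0..7; each covers 32 weights)
static inline float iq3_ks_sub_scale(const block_iq3_ks_mirror * b, float row_scale, int s) {
    int is = s / 4, k = s % 4;                     // 128-half of the block, 32-block within it
    int lo = (b->scales[k] >> 4*is) & 0xf;         // 4-bit scale nibble
    int hi = (b->extra >> (4*is + k)) & 1;         // 5th scale bit from the low byte of extra
    return row_scale * ((lo | (hi << 4)) - 16);    // signed range -16..15
}

// weight l (0..31) of sub-block s; values = iq3nl_values (16 entries)
static inline float iq3_ks_weight(const block_iq3_ks_mirror * b, const int8_t values[16],
                                  float row_scale, int s, int l) {
    int is = s / 4, k = s % 4;
    int qlo = (b->qs[32*is + l] >> 2*k) & 3;       // 2 low bits
    int qhi = (b->qh[l] >> (4*is + k)) & 1;        // 3rd bit
    int sel = (b->extra >> (8 + 4*is + k)) & 1;    // half-table selector (high byte of extra)
    return iq3_ks_sub_scale(b, row_scale, s) * values[8*sel + (qlo | (qhi << 2))];
}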

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 0 deletions
@@ -3429,6 +3429,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_KS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_K:

Comments (0)