Commit bf9aecf

Activate IQ1_BN and IQ2_BN
And update the ik_llama.cpp IQK files for reference. Credits: Iwan Kawrakow (@ikawrakow)
1 parent 79cde2c commit bf9aecf

File tree

15 files changed, +11976 −4920 lines

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 26 additions & 26 deletions
@@ -397,18 +397,18 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
-    // [GGML_TYPE_IQ1_BN] = {
-    //     .from_float = quantize_row_iq1_bn,
-    //     .vec_dot = vec_dot_iq1_bn_q8_0,
-    //     .vec_dot_type = GGML_TYPE_IQ1_BN,
-    //     .nrows = 1,
-    // },
-    // [GGML_TYPE_IQ2_BN] = {
-    //     .from_float = quantize_row_iq2_bn,
-    //     .vec_dot = vec_dot_iq2_bn_q8_0,
-    //     .vec_dot_type = GGML_TYPE_IQ2_BN,
-    //     .nrows = 1,
-    // },
+    [GGML_TYPE_IQ1_BN] = {
+        .from_float = quantize_row_iq1_bn,
+        .vec_dot = vec_dot_iq1_bn_q8_K64,
+        .vec_dot_type = GGML_TYPE_IQ1_BN,
+        .nrows = 1,
+    },
+    [GGML_TYPE_IQ2_BN] = {
+        .from_float = quantize_row_iq2_bn,
+        .vec_dot = vec_dot_iq2_bn_q8_K64,
+        .vec_dot_type = GGML_TYPE_IQ2_BN,
+        .nrows = 1,
+    },
     [GGML_TYPE_IQ4_NL] = {
         .from_float = quantize_row_iq4_nl,
         .vec_dot = ggml_vec_dot_iq4_nl_q8_0,

@@ -5254,8 +5254,8 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
-        // case GGML_TYPE_IQ1_BN:
-        // case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ1_BN:
+        case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:

@@ -5645,8 +5645,8 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
-        // case GGML_TYPE_IQ1_BN:
-        // case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ1_BN:
+        case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:

@@ -5786,8 +5786,8 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
-        // case GGML_TYPE_IQ1_BN:
-        // case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ1_BN:
+        case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:

@@ -8959,8 +8959,8 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
-        // case GGML_TYPE_IQ1_BN:
-        // case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ1_BN:
+        case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:

@@ -9429,8 +9429,8 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
-        // case GGML_TYPE_IQ1_BN:
-        // case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ1_BN:
+        case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:

@@ -9704,8 +9704,8 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
-        // case GGML_TYPE_IQ1_BN:
-        // case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ1_BN:
+        case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:

@@ -10306,8 +10306,8 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
-        // case GGML_TYPE_IQ1_BN:
-        // case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ1_BN:
+        case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
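Aside (not part of the commit): the type_traits_cpu table above is what drives CPU dispatch, so uncommenting the IQ1_BN/IQ2_BN entries is what makes these types usable in the compute paths. The sketch below is a minimal, self-contained illustration of how such a traits table is typically consumed; every name in it is a hypothetical stand-in, not ggml's real API.

// Hypothetical illustration only: a toy traits table in the spirit of the
// type_traits_cpu entries activated above. It shows how a weight type's
// .vec_dot_type and .vec_dot fields drive a matrix-vector product.
#include <cstdio>

typedef float (*toy_vec_dot_t)(int n, const void * x, const void * y);

struct toy_type_traits_cpu {
    const char *  name;
    int           vec_dot_type;  // type the activations must be quantized to first
    toy_vec_dot_t vec_dot;       // kernel computing one row of the dot product
    int           nrows;         // output rows produced per vec_dot call
};

enum { TOY_TYPE_IQ1_BN, TOY_TYPE_Q8, TOY_TYPE_COUNT };

static float toy_dot_stub(int n, const void *, const void *) {
    return (float) n;            // placeholder: a real kernel decodes the quantized blocks
}

static const toy_type_traits_cpu toy_traits[TOY_TYPE_COUNT] = {
    { "toy_iq1_bn", TOY_TYPE_Q8, toy_dot_stub, 1 },
    { "toy_q8",     TOY_TYPE_Q8, toy_dot_stub, 1 },
};

int main() {
    const toy_type_traits_cpu & tr = toy_traits[TOY_TYPE_IQ1_BN];
    // Step 1: quantize the activations to tr.vec_dot_type (omitted here).
    // Step 2: call tr.vec_dot once per group of tr.nrows output rows.
    float s = tr.vec_dot(64, nullptr, nullptr);
    std::printf("%s: rhs type %d, dot = %.1f\n", tr.name, tr.vec_dot_type, s);
    return 0;
}

The point of the pattern: the weight type's entry supplies both the dot-product kernel (.vec_dot) and the type the activations are converted to before it runs (.vec_dot_type), which is exactly the pair of fields the hunk above fills in for IQ1_BN and IQ2_BN.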

ggml/src/ggml-cuda/common.cuh

Lines changed: 14 additions & 0 deletions
@@ -502,6 +502,20 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
     static constexpr int qi = QI1_M;
 };
 
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_BN> {
+    static constexpr int qk = QK_IQ1BN;
+    static constexpr int qr = QR1_BN;
+    static constexpr int qi = QI1_BN;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_BN> {
+    static constexpr int qk = QK_IQ1BN;
+    static constexpr int qr = QR1_BN;
+    static constexpr int qi = QI1_BN;
+};
+
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
     static constexpr int qk = QK4_NL;
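Aside: the two new specializations follow the same compile-time traits pattern as the existing types. The toy sketch below mirrors that pattern only; the struct, the numeric values, and the reading of qk/qr/qi as values per block, packing ratio, and 32-bit words per block are assumptions for illustration, not definitions taken from this diff.

// Stand-in for ggml_cuda_type_traits, showing the compile-time lookup pattern
// used by the specializations added above. All names and numbers are toys.
#include <cstdio>

enum toy_type { TOY_IQ1_BN, TOY_IQ2_BN };

template <toy_type T> struct toy_cuda_type_traits;

template <> struct toy_cuda_type_traits<TOY_IQ1_BN> {
    static constexpr int qk = 64;  // values covered by one block (stand-in for QK_IQ1BN)
    static constexpr int qr = 2;   // assumed packing ratio       (stand-in for QR1_BN)
    static constexpr int qi = 8;   // assumed 32-bit words/block  (stand-in for QI1_BN)
};

template <> struct toy_cuda_type_traits<TOY_IQ2_BN> {
    static constexpr int qk = 64;
    static constexpr int qr = 2;
    static constexpr int qi = 8;
};

// Kernels usually read such constants at compile time, e.g. to size their loops:
template <toy_type T> constexpr int toy_values_per_word() {
    return toy_cuda_type_traits<T>::qk / toy_cuda_type_traits<T>::qi;
}

int main() {
    std::printf("toy iq1_bn: qk=%d qr=%d qi=%d -> %d values per 32-bit word\n",
                toy_cuda_type_traits<TOY_IQ1_BN>::qk,
                toy_cuda_type_traits<TOY_IQ1_BN>::qr,
                toy_cuda_type_traits<TOY_IQ1_BN>::qi,
                toy_values_per_word<TOY_IQ1_BN>());
    return 0;
}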

ggml/src/ggml-cuda/convert.cu

Lines changed: 15 additions & 15 deletions
@@ -1105,21 +1105,21 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
     dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
 }
 
-// template<typename dst_t>
-// static void dequantize_row_iq1_bn_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
-//     const int64_t k = nrows * n_per_row;
-//     const int64_t row_size = ggml_row_size(GGML_TYPE_IQ1_BN, n_per_row);
-//     const int nb = (k + 255) / 256;
-//     dequantize_block_iq1_bn<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size, nrows);
-// }
-
-// template<typename dst_t>
-// static void dequantize_row_iq2_bn_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
-//     const int64_t k = nrows * n_per_row;
-//     const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row);
-//     const int nb = (k + 255) / 256;
-//     dequantize_block_iq2_bn<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size, nrows);
-// }
+template<typename dst_t>
+static void dequantize_row_iq1_bn_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
+    const int64_t k = nrows * n_per_row;
+    const int64_t row_size = ggml_row_size(GGML_TYPE_IQ1_BN, n_per_row);
+    const int nb = (k + 255) / 256;
+    dequantize_block_iq1_bn<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size, nrows);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq2_bn_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
+    const int64_t k = nrows * n_per_row;
+    const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row);
+    const int nb = (k + 255) / 256;
+    dequantize_block_iq2_bn<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size, nrows);
+}
 
 template<typename dst_t>
 static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
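Aside: the launch geometry in both re-enabled helpers above is plain ceiling division. With k = nrows * n_per_row values to produce and each 32-thread block covering 256 values, the grid needs (k + 255) / 256 blocks. A host-only sketch of just that arithmetic (the helper name below is made up; no CUDA is needed to check it):

// Host-side sketch of the grid sizing used by dequantize_row_iq{1,2}_bn_cuda
// above: (k + 255) / 256 is the number of 256-value blocks needed to cover k values.
#include <cstdint>
#include <cstdio>

static int64_t iq_bn_dequant_blocks(int64_t nrows, int64_t n_per_row) {
    const int64_t k = nrows * n_per_row;  // total values to dequantize
    return (k + 255) / 256;               // ceiling division, as in the diff
}

int main() {
    std::printf("%lld blocks for 1 row of 4096 values\n", (long long) iq_bn_dequant_blocks(1, 4096)); // 16
    std::printf("%lld blocks for 3 rows of 100 values\n", (long long) iq_bn_dequant_blocks(3, 100));  // 2
    return 0;
}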

ggml/src/ggml-cuda/iqk_mmvq.cu

Lines changed: 6 additions & 4 deletions
@@ -631,7 +631,7 @@ __device__ __forceinline__ float vec_dot_iq3_k_q8_1(
 
 }
 
-/* __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
+__device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
     half d16; memcpy(&d16, vbq, sizeof(d16));

@@ -729,7 +729,7 @@ __device__ __forceinline__ float vec_dot_iq2_bn_q8_1(
     auto d8h = __half22float2(bq8_1[1].ds);
     return scale * (d8l.x * (sumi1 + 0.25f*sumi2) + 0.0625f * d8h.x*(sumi3 + 0.25f*sumi4) - 0.5f*d8l.y - 0.5f*d8h.y);
 #endif
-} */
+}
 
 } // namespace
 

@@ -796,14 +796,16 @@
     iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ6_K, VDR_IQ6_K_Q8_1_MMVQ, vec_dot_iq6_k_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }
 
-/* void mul_mat_vec_iq1_bn_q8_1_cuda(
+void mul_mat_vec_iq1_bn_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
     iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ1_BN, 1, vec_dot_iq1_bn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }
 
 void mul_mat_vec_iq2_bn_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
     iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_BN, 1, vec_dot_iq2_bn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
-} */
+}

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 2 additions & 2 deletions
@@ -428,12 +428,12 @@ void ggml_cuda_op_mul_mat_vec_q(
         case GGML_TYPE_IQ1_M:
             mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
-        /* case GGML_TYPE_IQ1_BN:
+        case GGML_TYPE_IQ1_BN:
             mul_mat_vec_iq1_bn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
         case GGML_TYPE_IQ2_BN:
             mul_mat_vec_iq2_bn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break; */
+            break;
         case GGML_TYPE_IQ4_NL:
             mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;

ggml/src/ggml-quants.c

Lines changed: 2 additions & 2 deletions
@@ -5458,8 +5458,8 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         // {
         //     VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_kt, data, nb);
         // } break;
-        // case GGML_TYPE_IQ1_BN: break;
-        // case GGML_TYPE_IQ2_BN: break;
+        case GGML_TYPE_IQ1_BN: break;
+        case GGML_TYPE_IQ2_BN: break;
         // case GGML_TYPE_IQ2_K: break;
         case GGML_TYPE_IQ2_KS: break;
         case GGML_TYPE_IQ2_KT: break;

ggml/src/ggml.c

Lines changed: 30 additions & 0 deletions
@@ -837,6 +837,32 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
         .row_meta_size = 0,
     },
+    [GGML_TYPE_IQ1_BN] = {
+        .type_name = "iq1_bn",
+        .blck_size = QK_IQ1BN,
+        .type_size = sizeof(block_iq1_bn),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq1_bn,
+        // .from_float = quantize_row_iq1_bn,
+        .from_float_ref = (ggml_from_float_t)quantize_row_iq1_bn_ref,
+        // .vec_dot = ggml_vec_dot_iq1_bn_q8_K64,
+        // .vec_dot_type = GGML_TYPE_Q8_K64,
+        // .nrows = 1,
+        .row_meta_size = 2,
+    },
+    [GGML_TYPE_IQ2_BN] = {
+        .type_name = "iq2_bn",
+        .blck_size = QK_IQ1BN,
+        .type_size = sizeof(block_iq2_bn),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq2_bn,
+        // .from_float = quantize_row_iq2_bn,
+        .from_float_ref = (ggml_from_float_t)quantize_row_iq2_bn_ref,
+        // .vec_dot = vec_dot_iq2_bn_q8_K64,
+        // .vec_dot_type = GGML_TYPE_Q8_K64,
+        // .nrows = 1,
+        .row_meta_size = 4,
+    },
     [GGML_TYPE_IQ4_KS] = {
         .type_name = "iq4_ks",
         .blck_size = QK_K,

@@ -1522,6 +1548,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
         case GGML_FTYPE_MOSTLY_IQ1_S:   wtype = GGML_TYPE_IQ1_S;   break;
         case GGML_FTYPE_MOSTLY_IQ1_M:   wtype = GGML_TYPE_IQ1_M;   break;
+        case GGML_FTYPE_MOSTLY_IQ1_BN:  wtype = GGML_TYPE_IQ1_BN;  break;
+        case GGML_FTYPE_MOSTLY_IQ2_BN:  wtype = GGML_TYPE_IQ2_BN;  break;
         case GGML_FTYPE_MOSTLY_IQ4_NL:  wtype = GGML_TYPE_IQ4_NL;  break;
         case GGML_FTYPE_MOSTLY_IQ4_XS:  wtype = GGML_TYPE_IQ4_XS;  break;
         case GGML_FTYPE_MOSTLY_IQ4_KS:  wtype = GGML_TYPE_IQ4_KS;  break;

@@ -6899,6 +6927,8 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ2_S:  result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ1_S:  result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ1_M:  result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ2_BN: result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_KS: result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
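Aside: in the type_traits additions above, IQ1_BN and IQ2_BN are registered with per-row metadata (row_meta_size of 2 and 4 bytes), unlike most types. The sketch below shows the row-size arithmetic that implies, assuming ggml's usual convention of (n / blck_size) * type_size bytes per row plus the per-row metadata; the struct and the concrete numbers are placeholders, not the real QK_IQ1BN or block layouts.

// Hedged sketch: bytes occupied by one quantized row of n values for a type
// described by the fields registered above, under the stated assumption.
#include <cstdint>
#include <cstdio>

struct toy_type_desc {
    int64_t blck_size;      // values per block      (cf. QK_IQ1BN)
    int64_t type_size;      // bytes per block       (cf. sizeof(block_iq1_bn))
    int64_t row_meta_size;  // extra bytes per row   (2 for iq1_bn, 4 for iq2_bn above)
};

static int64_t toy_row_size(const toy_type_desc & t, int64_t n_per_row) {
    // n_per_row is assumed to be a multiple of blck_size, as ggml requires.
    return t.row_meta_size + (n_per_row / t.blck_size) * t.type_size;
}

int main() {
    const toy_type_desc toy_iq1_bn = { 64, 13, 2 };  // placeholder sizes only
    std::printf("toy row of 4096 values: %lld bytes\n",
                (long long) toy_row_size(toy_iq1_bn, 4096));
    return 0;
}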
