Skip to content

Commit 09021f3

Browse files
committed
Try to activate IQ1_BN and IQ2_BN
1 parent 9b033b8 commit 09021f3

File tree

10 files changed

+52
-38
lines changed

10 files changed

+52
-38
lines changed

ggml/src/ggml-cuda/convert.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
568568
}
569569
}
570570

571-
/* template<typename dst_t>
571+
template<typename dst_t>
572572
static __global__ void dequantize_block_iq1_bn(const void * __restrict__ vx, dst_t * __restrict__ yy,
573573
int64_t n_per_row, int64_t row_size, int64_t nrows) {
574574

@@ -610,9 +610,9 @@ static __global__ void dequantize_block_iq1_bn(const void * __restrict__ vx, dst
610610
y[7] = d*(vs - 1);
611611

612612
#undef COMPUTE_VS
613-
} */
613+
}
614614

615-
/* template<typename dst_t>
615+
template<typename dst_t>
616616
static __global__ void dequantize_block_iq2_bn(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size, int64_t nrows) {
617617

618618
int64_t ii = 256*blockIdx.x;
@@ -636,7 +636,7 @@ static __global__ void dequantize_block_iq2_bn(const void * __restrict__ vx, dst
636636
y[j+32] = d * ((qs[j] >> 4) & 3) + m;
637637
y[j+48] = d * ((qs[j] >> 6) & 3) + m;
638638
}
639-
} */
639+
}
640640

641641
template<typename dst_t>
642642
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3333,8 +3333,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
33333333
case GGML_TYPE_IQ5_K:
33343334
case GGML_TYPE_IQ5_KS:
33353335
case GGML_TYPE_IQ6_K:
3336-
// case GGML_TYPE_IQ1_BN:
3337-
// case GGML_TYPE_IQ2_BN:
3336+
case GGML_TYPE_IQ1_BN:
3337+
case GGML_TYPE_IQ2_BN:
33383338
#ifdef GGML_USE_MUSA
33393339
if (a->type == GGML_TYPE_Q3_K) {
33403340
return false;

ggml/src/ggml-cuda/iqk_mmvq.cu

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,7 @@ __device__ __forceinline__ float vec_dot_iq3_ks_q8_1(
727727

728728
}
729729

730-
/* __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
730+
__device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
731731
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
732732

733733
half d16; memcpy(&d16, vbq, sizeof(d16));
@@ -784,9 +784,9 @@ __device__ __forceinline__ float vec_dot_iq3_ks_q8_1(
784784
}
785785
return scale * __low2float(bq8_1[iqs].ds) * sumi;
786786
#endif
787-
} */
787+
}
788788

789-
/* __device__ __forceinline__ float vec_dot_iq2_bn_q8_1(
789+
__device__ __forceinline__ float vec_dot_iq2_bn_q8_1(
790790
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
791791

792792
float scale = *(const float *)vbq;
@@ -825,7 +825,7 @@ __device__ __forceinline__ float vec_dot_iq3_ks_q8_1(
825825
auto d8h = __half22float2(bq8_1[1].ds);
826826
return scale * (d8l.x * (sumi1 + 0.25f*sumi2) + 0.0625f * d8h.x*(sumi3 + 0.25f*sumi4) - 0.5f*d8l.y - 0.5f*d8h.y);
827827
#endif
828-
} */
828+
}
829829

830830
} // namespace
831831

@@ -906,16 +906,16 @@ void mul_mat_vec_iq6_k_q8_1_cuda(
906906
iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ6_K, VDR_IQ6_K_Q8_1_MMVQ, vec_dot_iq6_k_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
907907
}
908908

909-
// void mul_mat_vec_iq1_bn_q8_1_cuda(
910-
// const void * vx, const void * vy, float * dst,
911-
// const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
909+
void mul_mat_vec_iq1_bn_q8_1_cuda(
910+
const void * vx, const void * vy, float * dst,
911+
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
912912

913-
// iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ1_BN, 1, vec_dot_iq1_bn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
914-
// }
913+
iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ1_BN, 1, vec_dot_iq1_bn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
914+
}
915915

916-
// void mul_mat_vec_iq2_bn_q8_1_cuda(
917-
// const void * vx, const void * vy, float * dst,
918-
// const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
916+
void mul_mat_vec_iq2_bn_q8_1_cuda(
917+
const void * vx, const void * vy, float * dst,
918+
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
919919

920-
// iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_BN, 1, vec_dot_iq2_bn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
921-
// }
920+
iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_BN, 1, vec_dot_iq2_bn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
921+
}

ggml/src/ggml-cuda/iqk_mmvq.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,11 @@ void mul_mat_vec_iq2_kt_q8_1_cuda(
4444
const void * vx, const void * vy, float * dst,
4545
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
4646

47-
// void mul_mat_vec_iq1_bn_q8_1_cuda(
48-
// const void * vx, const void * vy, float * dst,
49-
// const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
47+
void mul_mat_vec_iq1_bn_q8_1_cuda(
48+
const void * vx, const void * vy, float * dst,
49+
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
5050

51-
// void mul_mat_vec_iq2_bn_q8_1_cuda(
52-
// const void * vx, const void * vy, float * dst,
53-
// const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
51+
void mul_mat_vec_iq2_bn_q8_1_cuda(
52+
const void * vx, const void * vy, float * dst,
53+
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
5454

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -518,12 +518,12 @@ void ggml_cuda_op_mul_mat_vec_q(
518518
case GGML_TYPE_IQ1_M:
519519
mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
520520
break;
521-
// case GGML_TYPE_IQ1_BN:
522-
// mul_mat_vec_iq1_bn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
523-
// break;
524-
// case GGML_TYPE_IQ2_BN:
525-
// mul_mat_vec_iq2_bn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
526-
// break;
521+
case GGML_TYPE_IQ1_BN:
522+
mul_mat_vec_iq1_bn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
523+
break;
524+
case GGML_TYPE_IQ2_BN:
525+
mul_mat_vec_iq2_bn_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
526+
break;
527527
case GGML_TYPE_IQ4_NL:
528528
mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
529529
break;
@@ -597,8 +597,8 @@ bool ggml_cuda_mmvq_type_supported(ggml_type src0_type) {
597597
case GGML_TYPE_IQ3_XXS:
598598
case GGML_TYPE_IQ1_S:
599599
case GGML_TYPE_IQ1_M:
600-
// case GGML_TYPE_IQ1_BN:
601-
// case GGML_TYPE_IQ2_BN:
600+
case GGML_TYPE_IQ1_BN:
601+
case GGML_TYPE_IQ2_BN:
602602
case GGML_TYPE_IQ4_NL:
603603
case GGML_TYPE_IQ4_XS:
604604
case GGML_TYPE_IQ2_K:

ggml/src/ggml-quants.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5747,10 +5747,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
57475747
case GGML_TYPE_I16:
57485748
case GGML_TYPE_I32:
57495749
case GGML_TYPE_I64:
5750-
// case GGML_TYPE_IQ1_BN:
5751-
// case GGML_TYPE_IQ2_BN:
5750+
case GGML_TYPE_IQ1_BN:
5751+
case GGML_TYPE_IQ2_BN:
57525752
// nothing to validate
5753-
// break;
5753+
break;
57545754
default:
57555755
{
57565756
fprintf(stderr, "%s: invalid type %d\n", __func__, type);

ggml/src/iqk/iqk_quantize.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ size_t quantize_iq4_xs_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT
115115
void dequantize_row_iq4_xs_r8(const block_iq4_xs_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
116116
void vec_dot_iq4_xs_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
117117

118+
void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
119+
void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
120+
void dequantize_row_iq1_bn (const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
121+
size_t quantize_iq1_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
122+
void vec_dot_iq1_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
123+
118124
void quantize_row_iq2_bn_ref (const float * GGML_RESTRICT x, block_iq2_bn * GGML_RESTRICT y, int64_t k);
119125
void quantize_row_iq2_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
120126
void dequantize_row_iq2_bn (const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

ggml/src/iqk_croco/iqk_quantize_croco.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,12 @@ size_t quantize_iq4_xs_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT
133133
void dequantize_row_iq4_xs_r8(const block_iq4_xs_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
134134
void vec_dot_iq4_xs_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
135135

136+
void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
137+
void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
138+
void dequantize_row_iq1_bn (const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
139+
size_t quantize_iq1_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
140+
void vec_dot_iq1_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
141+
136142
void quantize_row_iq2_bn_ref (const float * GGML_RESTRICT x, block_iq2_bn * GGML_RESTRICT y, int64_t k);
137143
void quantize_row_iq2_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
138144
void dequantize_row_iq2_bn (const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

koboldcpp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -885,7 +885,7 @@ def dump_gguf_metadata(file_path): #if you're gonna copy this into your own proj
885885
data = None
886886
fptr = 0
887887
dt_table = ["u8","i8","u16","i16","u32","i32","f32","bool","str","arr","u64","i64","f64"] #13 types, else error
888-
tt_table = ["f32","f16","q4_0","q4_1","q4_2","q4_3","q5_0","q5_1","q6_0","q8_0","q8_1","q2_k","q3_k","q4_k","q5_k","q6_k","q8_k","iq2_xxs","iq2_xs","iq3_xxs","iq1_s","iq4_nl","iq3_s","iq2_s","iq4_xs","iq2_k","iq3_k","iq4_k","iq5_k","iq6_k","iq2_ks","iq4_kss","iq4_ks","iq5_ks","iq3_ks","iq2_kt","iq3_kt","iq4_kt","i8","i16","i32","i64","f64","iq1_m","bf16","q4_0_4_4","q4_0_4_8","q4_0_8_8","tq1_0","tq2_0","iq4_nl_4_4","unknown","unknown","unknown","unknown","unknown"]
888+
tt_table = ["f32","f16","q4_0","q4_1","q4_2","q4_3","q5_0","q5_1","q6_0","q8_0","q8_1","q2_k","q3_k","q4_k","q5_k","q6_k","q8_k","iq2_xxs","iq2_xs","iq3_xxs","iq1_s","iq4_nl","iq3_s","iq2_s","iq4_xs","iq1_bn","iq2_bn","iq2_k","iq3_k","iq4_k","iq5_k","iq6_k","iq2_ks","iq4_kss","iq4_ks","iq5_ks","iq3_ks","iq2_kt","iq3_kt","iq4_kt","iq2_k_r4","iq3_k_r4","iq4_k_r4","iq5_k_r4","i8","i16","i32","i64","f64","iq1_m","bf16","q4_0_4_4","q4_0_4_8","q4_0_8_8","tq1_0","tq2_0","iq4_nl_4_4","unknown","unknown","unknown","unknown","unknown"]
889889
def read_data(datatype):
890890
nonlocal fptr, data, dt_table
891891
if datatype=="u32":

tools/quantize/quantize.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
3232
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
3333
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
3434
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
35+
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", },
36+
{ "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", },
3537
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
3638
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
3739
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },

0 commit comments

Comments
 (0)