examples/quantize/quantize.cpp (1 change: 1 addition & 0 deletions)

@@ -28,6 +28,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", },
{ "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", },
{ "IQ1_TN", LLAMA_FTYPE_MOSTLY_IQ1_TN, " 1.69 bpw quantization (TriLM)", },
{ "IQ2_TN", LLAMA_FTYPE_MOSTLY_IQ2_TN, " 2.06 bpw quantization (TriLM)", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
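With this entry in QUANT_OPTIONS, the new type can be selected by name in the quantize tool. A hypothetical invocation (model paths are placeholders; the binary name assumes this repo's llama-quantize build target):

    ./llama-quantize trilm-3.9b-f16.gguf trilm-3.9b-iq1_tn.gguf IQ1_TN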
ggml/include/ggml.h (38 changes: 21 additions & 17 deletions)

@@ -391,15 +391,17 @@ extern "C" {
         GGML_TYPE_Q4_0_4_4 = 31,
         GGML_TYPE_Q4_0_4_8 = 32,
         GGML_TYPE_Q4_0_8_8 = 33,
-        GGML_TYPE_IQ1_BN = 34,
-        GGML_TYPE_IQ2_BN = 35,
-        GGML_TYPE_Q8_K64 = 36,
-        GGML_TYPE_IQ2_K = 37,
-        GGML_TYPE_IQ3_K = 38,
-        GGML_TYPE_IQ4_K = 39,
-        GGML_TYPE_IQ5_K = 40,
-        GGML_TYPE_IQ6_K = 41,
-        GGML_TYPE_IQ2_TN = 42,
+        //
+        GGML_TYPE_IQ1_BN = 134,
+        GGML_TYPE_IQ2_BN = 135,
+        GGML_TYPE_Q8_K64 = 136,
+        GGML_TYPE_IQ2_K = 137,
+        GGML_TYPE_IQ3_K = 138,
+        GGML_TYPE_IQ4_K = 139,
+        GGML_TYPE_IQ5_K = 140,
+        GGML_TYPE_IQ6_K = 141,
+        GGML_TYPE_IQ2_TN = 142,
+        GGML_TYPE_IQ1_TN = 143,
         GGML_TYPE_COUNT,
     };

@@ -444,14 +446,16 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ1_BN = 28, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_BN = 29, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_K = 30, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ3_K = 31, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ4_K = 32, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ5_K = 33, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ6_K = 34, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_TN = 35, // except 1d tensors
+        //
+        GGML_FTYPE_MOSTLY_IQ1_BN = 128, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_BN = 129, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_K = 130, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ3_K = 131, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_K = 132, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ5_K = 133, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ6_K = 134, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_TN = 135, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_TN = 136, // except 1d tensors
     };
 
     // available tensor operations:
ggml/src/ggml-common.h (4 changes: 4 additions & 0 deletions)

@@ -435,6 +435,10 @@ static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/padding");
 //
 // TriLM - implemented as 2.0625 bpw
 //
+typedef struct {
+    uint8_t qs[54];
+} block_iq1_tn;
+static_assert(sizeof(block_iq1_tn) == 54, "wrong iq1_tn block size/padding");
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
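The advertised 1.69 bpw follows from this struct: with QK_K = 256 weights per super-block (the block size the type_traits entry below assigns to this type), 54 bytes is 432 bits, i.e. 1.6875 bits per weight, which quantize.cpp rounds to 1.69. A minimal sketch of that arithmetic, assuming QK_K == 256 as usual in ggml:

// Sketch: bits per weight implied by block_iq1_tn, assuming QK_K == 256.
#include <stdio.h>

int main(void) {
    const int QK_K        = 256; // weights per super-block
    const int block_bytes = 54;  // sizeof(block_iq1_tn), per the static_assert above
    printf("iq1_tn: %.4f bpw\n", 8.0 * block_bytes / QK_K); // prints 1.6875
    return 0;
}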
ggml/src/ggml-quants.c (1 change: 1 addition & 0 deletions)

@@ -15015,6 +15015,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
         case GGML_TYPE_IQ5_K: break;
         case GGML_TYPE_IQ6_K: break;
         case GGML_TYPE_IQ2_TN: break;
+        case GGML_TYPE_IQ1_TN: break;
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
             {
ggml/src/ggml.c (35 changes: 27 additions & 8 deletions)

@@ -985,6 +985,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_IQ1_TN] = {
+        .type_name = "iq1_tn",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq1_tn),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq1_tn,
+        .from_float = quantize_row_iq1_tn,
+        .from_float_ref = (ggml_from_float_t)quantize_row_iq1_tn_ref,
+        .vec_dot = vec_dot_iq1_tn_q8_k,
+        .vec_dot_type = GGML_TYPE_Q8_K64,
+        .nrows = 1,
+    },
     [GGML_TYPE_IQ4_NL] = {
         .type_name = "iq4_nl",
         .blck_size = QK4_NL,
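The traits entry added above is what wires IQ1_TN into ggml's generic dispatch: dequantization goes through .to_float, quantization through .from_float, and matrix multiplication dots each row against activations converted to .vec_dot_type (Q8_K64 rather than the more common Q8_K). A minimal sketch of reaching those hooks from outside, assuming the ggml_internal_get_type_traits() accessor exported by ggml.h of this vintage:

// Hedged sketch: dequantize one IQ1_TN row through the traits table.
// n_per_row is assumed to be a multiple of the block size (QK_K).
#include "ggml.h"

static void dequantize_iq1_tn_row(const void * qrow, float * out, int64_t n_per_row) {
    ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_IQ1_TN);
    tt.to_float(qrow, out, n_per_row); // ends up in dequantize_row_iq1_tn
}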
@@ -3705,6 +3717,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ1_BN: wtype = GGML_TYPE_IQ1_BN; break;
         case GGML_FTYPE_MOSTLY_IQ2_BN: wtype = GGML_TYPE_IQ2_BN; break;
         case GGML_FTYPE_MOSTLY_IQ2_TN: wtype = GGML_TYPE_IQ2_TN; break;
+        case GGML_FTYPE_MOSTLY_IQ1_TN: wtype = GGML_TYPE_IQ1_TN; break;
         case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
         case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
         case GGML_FTYPE_MOSTLY_IQ2_K: wtype = GGML_TYPE_IQ2_K; break;
@@ -10133,6 +10146,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -10519,6 +10533,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -10655,6 +10670,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -13078,14 +13094,14 @@ UseGgmlGemm1:;
         int64_t t2 = ggml_time_us();
         if (ith == 0) printf("quantize(%s): %d us\n", dst->name, (int)(t2 - t1));
 #endif
-    }
 
-    if (ith == 0) {
-        // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-        atomic_store(&params->shared->current_chunk, nth);
-    }
+        if (ith == 0) {
+            // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+            atomic_store(&params->shared->current_chunk, nth);
+        }
 
-    ggml_barrier(params->shared);
+        ggml_barrier(params->shared);
+    }
 
     const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
 
@@ -13104,8 +13120,6 @@ UseGgmlGemm1:;
 IQK_MulMat_Not_Available2:;
 #endif
 
-    ggml_barrier(params->shared);
-
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -13692,6 +13706,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -14068,6 +14083,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -14338,6 +14354,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -14935,6 +14952,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -21722,6 +21740,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_BN: result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_TN: result = quantize_iq2_tn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ1_TN: result = quantize_iq1_tn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
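With the case added above, the public ggml_quantize_chunk() entry point can produce IQ1_TN data like any other quantized type. A hedged usage sketch for a single row (buffer handling is illustrative; no importance matrix is passed, on the assumption that this ternary type does not require one):

// Hedged sketch: quantize one f32 row to IQ1_TN via the entry point patched above.
#include "ggml.h"
#include <stdlib.h>

void * quantize_one_row_iq1_tn(const float * src, int64_t n_per_row) {
    const size_t row_size = ggml_row_size(GGML_TYPE_IQ1_TN, n_per_row);
    void * dst = malloc(row_size);
    if (!dst) return NULL;
    // start = 0, nrows = 1, imatrix = NULL
    ggml_quantize_chunk(GGML_TYPE_IQ1_TN, src, dst, 0, 1, n_per_row, NULL);
    return dst;
}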