5 changes: 4 additions & 1 deletion ggml/include/ggml.h
@@ -417,7 +417,8 @@ extern "C" {
     //  GGML_TYPE_IQ4_NL_4_8 = 37,
     //  GGML_TYPE_IQ4_NL_8_8 = 38,
         GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
-        GGML_TYPE_COUNT   = 40,
+        GGML_TYPE_BC6H_0  = 40,
+        GGML_TYPE_COUNT   = 41,
     };
 
     // precision
@@ -716,6 +717,7 @@ extern "C" {
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
     GGML_API bool ggml_is_quantized(enum ggml_type type);
+    GGML_API bool ggml_allows_empty_border(enum ggml_type type);
 
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
@@ -2525,6 +2527,7 @@ extern "C" {
         int64_t blck_size_interleave; // interleave elements in blocks
         size_t  type_size;
         bool    is_quantized;
+        bool    allows_empty_border;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float_ref;
     };
7 changes: 7 additions & 0 deletions ggml/src/ggml-common.h
@@ -427,6 +427,13 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
+// Texture compression quants
+#define BC6H_BITS_PER_BLOCK (128)
+#define BC6H_WEIGHTS_PER_BLOCK (16*3)
+typedef struct {
+    uint8_t block[BC6H_BITS_PER_BLOCK/8];
+} block_bc6h_0;
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
 
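BC6H packs a 4x4 tile of HDR RGB texels into a single 128-bit block, so each 16-byte block_bc6h_0 carries 16 texels x 3 channels = 48 half-float weights; that is where BC6H_WEIGHTS_PER_BLOCK = 16*3 comes from. A compile-time sanity check in the style of the surrounding static_asserts might look like this (illustrative sketch, not part of the diff):

static_assert(sizeof(block_bc6h_0) == BC6H_BITS_PER_BLOCK/8, "wrong bc6h_0 block size/padding");
static_assert(BC6H_WEIGHTS_PER_BLOCK == 48, "bc6h_0 block must cover a 4x4 tile of RGB values");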
1 change: 1 addition & 0 deletions ggml/src/ggml-cpu/ops.cpp
@@ -5428,6 +5428,7 @@ void ggml_compute_forward_clamp(
         case GGML_TYPE_I32:
         case GGML_TYPE_I64:
         case GGML_TYPE_F64:
+        case GGML_TYPE_BC6H_0:
         case GGML_TYPE_COUNT:
             {
                 GGML_ABORT("fatal error");
35 changes: 35 additions & 0 deletions ggml/src/ggml-quants.c
@@ -2550,6 +2550,11 @@ void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
     }
 }
 
+void dequantize_row_bc6h_0(const block_bc6h_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
+    fprintf(stderr, "dequantize_row_bc6h_0(x=%p, y=%p, k=%lld)\n", (const void *) x, (void *) y, (long long) k);
+    exit(1); // TODO: BC6H decoding is not implemented yet
+}
+
 //===================================== Q8_K ==============================================
 
 void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) {
@@ -4997,6 +5002,28 @@ void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) {
     quantize_iq2_s(x, y, 1, k, NULL);
 }
 
+// BC6H_0 quantization
+static int64_t roundup(int64_t value, int64_t to) {
+    int64_t rem = value % to;
+    if (rem == 0) {
+        return value;
+    } else {
+        return value - rem + to;
+    }
+}
+
+size_t quantize_bc6h_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    int64_t blocks_per_row = roundup(n_per_row, BC6H_WEIGHTS_PER_BLOCK)/BC6H_WEIGHTS_PER_BLOCK; // round the row up to whole 48-weight blocks
+
+    size_t bytes_per_row = blocks_per_row * sizeof(block_bc6h_0);
+    memset(dst, 0, bytes_per_row * nrow); // placeholder: real BC6H encoding not implemented yet, emit zeroed blocks
+    return nrow * bytes_per_row;
+}
+
+void quantize_row_bc6h_0_ref(const float * GGML_RESTRICT x, block_bc6h_0 * GGML_RESTRICT y, int64_t k) {
+    quantize_bc6h_0(x, y, 1, k, NULL);
+}
+
 // =============================== data validation
 
 static bool validate_float(float f, size_t i) {
@@ -5044,6 +5071,10 @@ static bool validate_e_e8m0(uint8_t e, size_t i) {
     return true;
 }
 
+static bool validate_bc6h_0(const block_bc6h_0 * block) {
+    return true; // TODO: no structural validation for BC6H blocks yet
+}
+
 #define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
     const type * q = (const type *) (data); \
     for (size_t i = 0; i < (nb); ++i) { \
@@ -5307,6 +5338,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
             {
                 VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
             } break;
+        case GGML_TYPE_BC6H_0:
+            {
+                return validate_bc6h_0(data);
+            } break;
 
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
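The roundup() helper is what makes ragged rows legal for this type: a row that is not a multiple of 48 weights gets padded up to the next whole block, and the unused tail is the "empty border" the new trait refers to. A self-contained sketch of the arithmetic (illustrative only; the constants mirror ggml-common.h above):

#include <stdio.h>
#include <stdint.h>

#define BC6H_WEIGHTS_PER_BLOCK (16*3)  // 48 weights per block
#define BC6H_BLOCK_BYTES       (128/8) // 16 bytes per block

static int64_t roundup(int64_t value, int64_t to) {
    int64_t rem = value % to;
    return rem == 0 ? value : value - rem + to;
}

int main(void) {
    int64_t n_per_row = 4097; // deliberately not a multiple of 48
    int64_t blocks = roundup(n_per_row, BC6H_WEIGHTS_PER_BLOCK)/BC6H_WEIGHTS_PER_BLOCK;
    // prints: 4097 weights -> 86 blocks -> 1376 bytes per row
    printf("%lld weights -> %lld blocks -> %lld bytes per row\n",
           (long long) n_per_row, (long long) blocks, (long long) (blocks * BC6H_BLOCK_BYTES));
    return 0;
}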
6 changes: 6 additions & 0 deletions ggml/src/ggml-quants.h
@@ -39,6 +39,8 @@ GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
 
+GGML_API void quantize_row_bc6h_0_ref(const float * GGML_RESTRICT x, block_bc6h_0 * GGML_RESTRICT y, int64_t k);
+
 // Dequantization
 GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -69,6 +71,8 @@ GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_iq3_s  (const block_iq3_s  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
+GGML_API void dequantize_row_bc6h_0(const block_bc6h_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
 GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@@ -96,6 +100,8 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
 GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
+GGML_API size_t quantize_bc6h_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
 GGML_API void iq2xs_init_impl(enum ggml_type type);
 GGML_API void iq2xs_free_impl(enum ggml_type type);
 GGML_API void iq3xs_init_impl(int grid_size);
37 changes: 33 additions & 4 deletions ggml/src/ggml.c
@@ -873,6 +873,15 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size    = 0,
         .is_quantized = false,
     },
+    [GGML_TYPE_BC6H_0] = {
+        .type_name           = "bc6h_0",
+        .blck_size           = BC6H_WEIGHTS_PER_BLOCK,
+        .type_size           = sizeof(block_bc6h_0),
+        .is_quantized        = true,
+        .allows_empty_border = true,
+        .to_float            = (ggml_to_float_t) dequantize_row_bc6h_0,
+        .from_float_ref      = (ggml_from_float_t) quantize_row_bc6h_0_ref,
+    },
 };
 
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -1233,9 +1242,22 @@ size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }
 
+static int64_t roundup(int64_t value, int64_t to) {
+    int64_t rem = value % to;
+    if (rem == 0) {
+        return value;
+    } else {
+        return value - rem + to;
+    }
+}
+
 size_t ggml_row_size(enum ggml_type type, int64_t ne) {
-    assert(ne % ggml_blck_size(type) == 0);
-    return ggml_type_size(type)*ne/ggml_blck_size(type);
+    if (type_traits[type].allows_empty_border) {
+        return roundup(ne, ggml_blck_size(type))/ggml_blck_size(type)*ggml_type_size(type); // pad to whole blocks; the tail stays empty
+    } else {
+        assert(ne % ggml_blck_size(type) == 0);
+        return ggml_type_size(type)*ne/ggml_blck_size(type);
+    }
 }
 
 double ggml_type_sizef(enum ggml_type type) {
@@ -1250,6 +1272,10 @@ bool ggml_is_quantized(enum ggml_type type) {
     return type_traits[type].is_quantized;
 }
 
+bool ggml_allows_empty_border(enum ggml_type type) {
+    return type_traits[type].allows_empty_border;
+}
+
 const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
@@ -7151,7 +7177,8 @@ size_t ggml_quantize_chunk(
         GGML_ASSERT(imatrix != NULL);
     }
 
-    GGML_ASSERT(start % type_traits[type].blck_size == 0);
+    // TURBOLLAMA-TODO: calculate this better rather than just disabling the assert
+    GGML_ASSERT(start % type_traits[type].blck_size == 0 || type_traits[type].allows_empty_border);
     GGML_ASSERT(start % n_per_row == 0);
 
     ggml_quantize_init(type); // this is noop if already initialized
@@ -7184,6 +7211,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:  result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_BC6H_0: result = quantize_bc6h_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
@@ -7206,7 +7234,8 @@ size_t ggml_quantize_chunk(
         assert(false);
     }
 
-    GGML_ASSERT(result == nrows * row_size);
+    // TURBOLLAMA-TODO: calculate this better rather than just disabling the assert
+    GGML_ASSERT(result == nrows * row_size || ggml_allows_empty_border(type));
 
     return result;
 }
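With the trait wired into ggml_row_size(), padded row sizes fall out of the public API without the caller doing anything special. A usage sketch (assumes a build with this patch applied; ggml_allows_empty_border() is the function added above):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    const int64_t ne = 4097; // not a multiple of the 48-weight bc6h_0 block
    if (ggml_allows_empty_border(GGML_TYPE_BC6H_0)) {
        // 4097 weights -> 86 blocks of 16 bytes = 1376 bytes
        printf("bc6h_0 row of %lld elems: %zu bytes\n",
               (long long) ne, ggml_row_size(GGML_TYPE_BC6H_0, ne));
    }
    // types without the trait still require exact divisibility:
    printf("q8_0 row of 4096 elems: %zu bytes\n", ggml_row_size(GGML_TYPE_Q8_0, 4096));
    return 0;
}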
4 changes: 3 additions & 1 deletion ggml/src/gguf.cpp
@@ -1142,7 +1142,9 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
     const int64_t blck_size = ggml_blck_size(type);
 
     tensor->type = type;
-    GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
+
+    // TURBOLLAMA-TODO: calculate this better rather than just disabling the assert
+    GGML_ASSERT((tensor->ne[0] % blck_size == 0 || ggml_allows_empty_border(type)) && "tensor row size not divisible by block size of new type");
 
     tensor->nb[0] = type_size;
     tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
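Note that the unchanged line below the relaxed assert still computes nb[1] as nb[0]*(ne[0]/blck_size), which truncates when blck_size does not divide ne[0]; for padded bc6h_0 rows the stride and the rounded-up row size can therefore disagree, which is presumably part of what the TURBOLLAMA-TODO refers to. The arithmetic, spelled out (illustrative sketch using the bc6h_0 constants):

#include <stdio.h>
#include <stdint.h>

int main(void) {
    const int64_t ne0 = 4097, blck = 48, type_size = 16; // bc6h_0 numbers
    int64_t nb1    = type_size * (ne0 / blck);              // truncating: 16 * 85 = 1360
    int64_t padded = type_size * ((ne0 + blck - 1) / blck); // rounded up: 16 * 86 = 1376
    printf("nb[1] = %lld bytes vs padded row = %lld bytes\n", (long long) nb1, (long long) padded);
    return 0;
}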
1 change: 1 addition & 0 deletions include/llama.h
@@ -151,6 +151,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_TQ1_0     = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0     = 37, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BC6H_0    = 39, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
5 changes: 3 additions & 2 deletions src/llama-quant.cpp
@@ -439,7 +439,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const int64_t ny = tensor->ne[1];
     const int64_t qk_k = ggml_blck_size(new_type);
 
-    if (nx % qk_k != 0) {
+    if (nx % qk_k != 0 && !ggml_allows_empty_border(new_type)) {
         LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
         convert_incompatible_tensor = true;
     } else {
@@ -571,6 +571,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  default_type = GGML_TYPE_IQ3_S;  break;
+        case LLAMA_FTYPE_MOSTLY_BC6H_0: default_type = GGML_TYPE_BC6H_0; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -1030,7 +1031,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
 
             // update the gguf meta data as we go
             gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
-            GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+            GGML_ASSERT((gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size) || ggml_allows_empty_border(new_type));
             gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
 
             // write tensor data + padding
1 change: 1 addition & 0 deletions tools/quantize/quantize.cpp
@@ -55,6 +55,7 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, +0.0020 ppl @ Mistral-7B",  },
     { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B",  },
+    { "BC6H_0", LLAMA_FTYPE_MOSTLY_BC6H_0, "BC6H texture compression",          },
     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B",          },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing",  },
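With the QUANT_OPTIONS entry in place, the new type is selectable from the quantize tool, e.g. ./llama-quantize model-f16.gguf model-bc6h_0.gguf BC6H_0 (filenames are placeholders). The same ftype can also be requested through the public C API; a minimal sketch, assuming a build with this patch applied:

#include "llama.h"

int main(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_BC6H_0; // the ftype added in include/llama.h
    // hypothetical file names; returns 0 on success
    return (int) llama_model_quantize("model-f16.gguf", "model-bc6h_0.gguf", &params);
}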