Skip to content

Commit 45508b4

Browse files
committed
Remove ggml changes, fix mmq pipeline picker
1 parent 2c086fd commit 45508b4

File tree

4 files changed, with 119 additions and 138 deletions.

4 files changed, with 119 additions and 138 deletions.

ggml/src/ggml-quants.c

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2020,13 +2020,6 @@ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
20202020
return nrow * row_size;
20212021
}
20222022

2023-
size_t quantize_q8_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2024-
(void)quant_weights; // not used
2025-
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_1, n_per_row);
2026-
quantize_row_q8_1_ref(src, dst, (int64_t)nrow*n_per_row);
2027-
return nrow * row_size;
2028-
}
2029-
20302023
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
20312024

20322025
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {

ggml/src/ggml-quants.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,6 @@ GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 *
1919
GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
2020
GGML_API void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
2121
GGML_API void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
22-
GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
2322

2423
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
2524
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 119 additions & 129 deletions
Original file line number | Diff line number | Diff line change
@@ -1965,11 +1965,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
19651965
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
19661966
}
19671967

1968-
// if (device->coopmat_int_support) {
1969-
// CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
1970-
// CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
1971-
// }
1972-
19731968
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
19741969
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
19751970
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
@@ -2077,7 +2072,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
20772072
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
20782073
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
20792074

2080-
if (device->coopmat_int_support) {
2075+
if (device->integer_dot_product) {
20812076
CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
20822077
CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
20832078
CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
@@ -2162,7 +2157,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
21622157
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
21632158
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
21642159

2165-
if (device->coopmat_int_support) {
2160+
if (device->integer_dot_product) {
21662161
CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
21672162
CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
21682163
CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
@@ -2203,7 +2198,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
22032198
uint32_t rm_stdq = 1;
22042199
uint32_t rm_kq = 2;
22052200
if (device->vendor_id == VK_VENDOR_ID_AMD) {
2206-
if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN
2201+
if (device->architecture == AMD_GCN) {
22072202
rm_stdq = 2;
22082203
rm_kq = 4;
22092204
}
@@ -3409,18 +3404,13 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
34093404

34103405
// MMQ
34113406
if (src1_type == GGML_TYPE_Q8_1) {
3412-
switch (src0_type) {
3413-
case GGML_TYPE_Q4_0:
3414-
case GGML_TYPE_Q4_1:
3415-
case GGML_TYPE_Q5_0:
3416-
case GGML_TYPE_Q5_1:
3417-
case GGML_TYPE_Q8_0:
3418-
break;
3419-
default:
3420-
return nullptr;
3407+
vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc;
3408+
3409+
if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
3410+
return nullptr;
34213411
}
34223412

3423-
return ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc;
3413+
return pipelines;
34243414
}
34253415

34263416
if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) {
@@ -7402,115 +7392,117 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
74027392
free(x_chk);
74037393
}
74047394

7405-
typedef uint16_t ggml_half;
7406-
typedef uint32_t ggml_half2;
7407-
7408-
#define QK8_1 32
7409-
typedef struct {
7410-
union {
7411-
struct {
7412-
ggml_half d; // delta
7413-
ggml_half s; // d * sum(qs[i])
7414-
} GGML_COMMON_AGGR_S;
7415-
ggml_half2 ds;
7416-
} GGML_COMMON_AGGR_U;
7417-
int8_t qs[QK8_1]; // quants
7418-
} block_q8_1;
7419-
7420-
static void ggml_vk_test_quantize(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
7421-
VK_LOG_DEBUG("ggml_vk_test_quantize(" << ne << ")");
7422-
GGML_ASSERT(quant == GGML_TYPE_Q8_1);
7423-
7424-
const size_t x_sz = sizeof(float) * ne;
7425-
const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
7426-
float * x = (float *) malloc(x_sz);
7427-
block_q8_1 * qx = (block_q8_1 *)malloc(qx_sz);
7428-
block_q8_1 * qx_res = (block_q8_1 *)malloc(qx_sz);
7429-
vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
7430-
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
7431-
7432-
for (size_t i = 0; i < ne; i++) {
7433-
x[i] = rand() / (float)RAND_MAX;
7434-
}
7435-
7436-
vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
7437-
7438-
ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
7439-
7440-
if (ctx->device->need_compiles) {
7441-
ggml_vk_load_shaders(ctx->device);
7442-
}
7443-
7444-
ggml_pipeline_allocate_descriptor_sets(ctx->device);
7445-
7446-
ggml_vk_buffer_write(x_buf, 0, x, x_sz);
7447-
7448-
vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
7449-
ggml_vk_ctx_begin(ctx->device, subctx);
7450-
ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
7451-
ggml_vk_ctx_end(subctx);
7452-
7453-
auto begin = std::chrono::high_resolution_clock::now();
7454-
7455-
ggml_vk_submit(subctx, ctx->fence);
7456-
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
7457-
ctx->device->device.resetFences({ ctx->fence });
7458-
7459-
auto end = std::chrono::high_resolution_clock::now();
7460-
7461-
double ms_quant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
7462-
ggml_vk_buffer_read(qx_buf, 0, qx, qx_sz);
7463-
7464-
ggml_vk_quantize_data(x, qx_res, ne, quant);
7465-
7466-
int first_err = -1;
7467-
7468-
for (size_t i = 0; i < ne / 32; i++) {
7469-
double error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d));
7470-
7471-
if (first_err < 0 && error > 0.1) {
7472-
first_err = i;
7473-
}
7474-
7475-
error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s));
7476-
7477-
if (first_err < 0 && error > 0.1) {
7478-
first_err = i;
7479-
}
7480-
7481-
for (size_t j = 0; j < 32; j++) {
7482-
uint64_t error = std::abs(qx_res[i].qs[j] - qx[i].qs[j]);
7483-
7484-
if (first_err < 0 && error > 1) {
7485-
first_err = i;
7486-
}
7487-
}
7488-
}
7489-
7490-
std::cerr << "TEST QUANTIZE " << ggml_type_name(quant) << " time=" << ms_quant << "ms " << (first_err == -1 ? "CORRECT" : "INCORRECT") << std::endl;
7491-
7492-
if (first_err != -1) {
7493-
std::cerr << "first_error = " << first_err << std::endl;
7494-
std::cerr << "Actual result: " << std::endl << std::endl;
7495-
std::cout << "d=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
7496-
for (size_t j = 0; j < 32; j++) {
7497-
std::cout << " qs" << j << "=" << (uint32_t)qx[first_err].qs[j] << " ";
7498-
}
7499-
std::cerr << std::endl << std::endl << "Expected result: " << std::endl << std::endl;
7500-
std::cout << "d=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
7501-
for (size_t j = 0; j < 32; j++) {
7502-
std::cout << " qs" << j << "=" << (uint32_t)qx_res[first_err].qs[j] << " ";
7503-
}
7504-
std::cerr << std::endl;
7505-
}
7506-
7507-
ggml_vk_destroy_buffer(x_buf);
7508-
ggml_vk_destroy_buffer(qx_buf);
7509-
7510-
free(x);
7511-
free(qx);
7512-
free(qx_res);
7513-
}
7395+
// This does not work without ggml q8_1 quantization support
7396+
//
7397+
// typedef uint16_t ggml_half;
7398+
// typedef uint32_t ggml_half2;
7399+
//
7400+
// #define QK8_1 32
7401+
// typedef struct {
7402+
// union {
7403+
// struct {
7404+
// ggml_half d; // delta
7405+
// ggml_half s; // d * sum(qs[i])
7406+
// } GGML_COMMON_AGGR_S;
7407+
// ggml_half2 ds;
7408+
// } GGML_COMMON_AGGR_U;
7409+
// int8_t qs[QK8_1]; // quants
7410+
// } block_q8_1;
7411+
//
7412+
// static void ggml_vk_test_quantize(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
7413+
// VK_LOG_DEBUG("ggml_vk_test_quantize(" << ne << ")");
7414+
// GGML_ASSERT(quant == GGML_TYPE_Q8_1);
7415+
//
7416+
// const size_t x_sz = sizeof(float) * ne;
7417+
// const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
7418+
// float * x = (float *) malloc(x_sz);
7419+
// block_q8_1 * qx = (block_q8_1 *)malloc(qx_sz);
7420+
// block_q8_1 * qx_res = (block_q8_1 *)malloc(qx_sz);
7421+
// vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
7422+
// vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
7423+
//
7424+
// for (size_t i = 0; i < ne; i++) {
7425+
// x[i] = rand() / (float)RAND_MAX;
7426+
// }
7427+
//
7428+
// vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
7429+
//
7430+
// ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
7431+
//
7432+
// if (ctx->device->need_compiles) {
7433+
// ggml_vk_load_shaders(ctx->device);
7434+
// }
7435+
//
7436+
// ggml_pipeline_allocate_descriptor_sets(ctx->device);
7437+
//
7438+
// ggml_vk_buffer_write(x_buf, 0, x, x_sz);
7439+
//
7440+
// vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
7441+
// ggml_vk_ctx_begin(ctx->device, subctx);
7442+
// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
7443+
// ggml_vk_ctx_end(subctx);
7444+
//
7445+
// auto begin = std::chrono::high_resolution_clock::now();
7446+
//
7447+
// ggml_vk_submit(subctx, ctx->fence);
7448+
// VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
7449+
// ctx->device->device.resetFences({ ctx->fence });
7450+
//
7451+
// auto end = std::chrono::high_resolution_clock::now();
7452+
//
7453+
// double ms_quant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
7454+
// ggml_vk_buffer_read(qx_buf, 0, qx, qx_sz);
7455+
//
7456+
// ggml_vk_quantize_data(x, qx_res, ne, quant);
7457+
//
7458+
// int first_err = -1;
7459+
//
7460+
// for (size_t i = 0; i < ne / 32; i++) {
7461+
// double error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d));
7462+
//
7463+
// if (first_err < 0 && error > 0.1) {
7464+
// first_err = i;
7465+
// }
7466+
//
7467+
// error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s));
7468+
//
7469+
// if (first_err < 0 && error > 0.1) {
7470+
// first_err = i;
7471+
// }
7472+
//
7473+
// for (size_t j = 0; j < 32; j++) {
7474+
// uint64_t error = std::abs(qx_res[i].qs[j] - qx[i].qs[j]);
7475+
//
7476+
// if (first_err < 0 && error > 1) {
7477+
// first_err = i;
7478+
// }
7479+
// }
7480+
// }
7481+
//
7482+
// std::cerr << "TEST QUANTIZE " << ggml_type_name(quant) << " time=" << ms_quant << "ms " << (first_err == -1 ? "CORRECT" : "INCORRECT") << std::endl;
7483+
//
7484+
// if (first_err != -1) {
7485+
// std::cerr << "first_error = " << first_err << std::endl;
7486+
// std::cerr << "Actual result: " << std::endl << std::endl;
7487+
// std::cout << "d=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
7488+
// for (size_t j = 0; j < 32; j++) {
7489+
// std::cout << " qs" << j << "=" << (uint32_t)qx[first_err].qs[j] << " ";
7490+
// }
7491+
// std::cerr << std::endl << std::endl << "Expected result: " << std::endl << std::endl;
7492+
// std::cout << "d=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
7493+
// for (size_t j = 0; j < 32; j++) {
7494+
// std::cout << " qs" << j << "=" << (uint32_t)qx_res[first_err].qs[j] << " ";
7495+
// }
7496+
// std::cerr << std::endl;
7497+
// }
7498+
//
7499+
// ggml_vk_destroy_buffer(x_buf);
7500+
// ggml_vk_destroy_buffer(qx_buf);
7501+
//
7502+
// free(x);
7503+
// free(qx);
7504+
// free(qx_res);
7505+
// }
75147506

75157507
static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant, bool mmq = false) {
75167508
VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
@@ -7752,8 +7744,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
77527744

77537745
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
77547746
#if defined(GGML_VULKAN_RUN_TESTS)
7755-
ggml_vk_test_quantize(ctx, 1024*1024, GGML_TYPE_Q8_1);
7756-
77577747
const std::vector<size_t> vals {
77587748
512, 512, 128,
77597749
128, 512, 512,

ggml/src/ggml.c

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6525,7 +6525,6 @@ size_t ggml_quantize_chunk(
65256525
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
65266526
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
65276527
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6528-
case GGML_TYPE_Q8_1: result = quantize_q8_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
65296528
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
65306529
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
65316530
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

0 commit comments

Comments (0)