@@ -1965,11 +1965,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
19651965 CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
19661966 }
19671967
1968- // if (device->coopmat_int_support) {
1969- // CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
1970- // CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
1971- // }
1972-
19731968 CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
19741969 CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
19751970 CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
@@ -2077,7 +2072,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
20772072 CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
20782073 CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
20792074
2080- if (device->coopmat_int_support ) {
2075+ if (device->integer_dot_product ) {
20812076 CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
20822077 CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
20832078 CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
@@ -2162,7 +2157,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
21622157 CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
21632158 CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
21642159
2165- if (device->coopmat_int_support ) {
2160+ if (device->integer_dot_product ) {
21662161 CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
21672162 CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
21682163 CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
@@ -2203,7 +2198,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
22032198 uint32_t rm_stdq = 1;
22042199 uint32_t rm_kq = 2;
22052200 if (device->vendor_id == VK_VENDOR_ID_AMD) {
2206- if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64 ) { // GCN
2201+ if (device->architecture == AMD_GCN ) {
22072202 rm_stdq = 2;
22082203 rm_kq = 4;
22092204 }
@@ -3409,18 +3404,13 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
34093404
34103405 // MMQ
34113406 if (src1_type == GGML_TYPE_Q8_1) {
3412- switch (src0_type) {
3413- case GGML_TYPE_Q4_0:
3414- case GGML_TYPE_Q4_1:
3415- case GGML_TYPE_Q5_0:
3416- case GGML_TYPE_Q5_1:
3417- case GGML_TYPE_Q8_0:
3418- break;
3419- default:
3420- return nullptr;
3407+ vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc;
3408+
3409+ if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
3410+ return nullptr;
34213411 }
34223412
3423- return ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc ;
3413+ return pipelines ;
34243414 }
34253415
34263416 if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) {
@@ -7402,115 +7392,117 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
74027392 free(x_chk);
74037393}
74047394
7405- typedef uint16_t ggml_half;
7406- typedef uint32_t ggml_half2;
7407-
7408- #define QK8_1 32
7409- typedef struct {
7410- union {
7411- struct {
7412- ggml_half d; // delta
7413- ggml_half s; // d * sum(qs[i])
7414- } GGML_COMMON_AGGR_S;
7415- ggml_half2 ds;
7416- } GGML_COMMON_AGGR_U;
7417- int8_t qs[QK8_1]; // quants
7418- } block_q8_1;
7419-
7420- static void ggml_vk_test_quantize(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
7421- VK_LOG_DEBUG("ggml_vk_test_quantize(" << ne << ")");
7422- GGML_ASSERT(quant == GGML_TYPE_Q8_1);
7423-
7424- const size_t x_sz = sizeof(float) * ne;
7425- const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
7426- float * x = (float *) malloc(x_sz);
7427- block_q8_1 * qx = (block_q8_1 *)malloc(qx_sz);
7428- block_q8_1 * qx_res = (block_q8_1 *)malloc(qx_sz);
7429- vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
7430- vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
7431-
7432- for (size_t i = 0; i < ne; i++) {
7433- x[i] = rand() / (float)RAND_MAX;
7434- }
7435-
7436- vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
7437-
7438- ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
7439-
7440- if (ctx->device->need_compiles) {
7441- ggml_vk_load_shaders(ctx->device);
7442- }
7443-
7444- ggml_pipeline_allocate_descriptor_sets(ctx->device);
7445-
7446- ggml_vk_buffer_write(x_buf, 0, x, x_sz);
7447-
7448- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
7449- ggml_vk_ctx_begin(ctx->device, subctx);
7450- ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
7451- ggml_vk_ctx_end(subctx);
7452-
7453- auto begin = std::chrono::high_resolution_clock::now();
7454-
7455- ggml_vk_submit(subctx, ctx->fence);
7456- VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
7457- ctx->device->device.resetFences({ ctx->fence });
7458-
7459- auto end = std::chrono::high_resolution_clock::now();
7460-
7461- double ms_quant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
7462- ggml_vk_buffer_read(qx_buf, 0, qx, qx_sz);
7463-
7464- ggml_vk_quantize_data(x, qx_res, ne, quant);
7465-
7466- int first_err = -1;
7467-
7468- for (size_t i = 0; i < ne / 32; i++) {
7469- double error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d));
7470-
7471- if (first_err < 0 && error > 0.1) {
7472- first_err = i;
7473- }
7474-
7475- error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s));
7476-
7477- if (first_err < 0 && error > 0.1) {
7478- first_err = i;
7479- }
7480-
7481- for (size_t j = 0; j < 32; j++) {
7482- uint64_t error = std::abs(qx_res[i].qs[j] - qx[i].qs[j]);
7483-
7484- if (first_err < 0 && error > 1) {
7485- first_err = i;
7486- }
7487- }
7488- }
7489-
7490- std::cerr << "TEST QUANTIZE " << ggml_type_name(quant) << " time=" << ms_quant << "ms " << (first_err == -1 ? "CORRECT" : "INCORRECT") << std::endl;
7491-
7492- if (first_err != -1) {
7493- std::cerr << "first_error = " << first_err << std::endl;
7494- std::cerr << "Actual result: " << std::endl << std::endl;
7495- std::cout << "d=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
7496- for (size_t j = 0; j < 32; j++) {
7497- std::cout << " qs" << j << "=" << (uint32_t)qx[first_err].qs[j] << " ";
7498- }
7499- std::cerr << std::endl << std::endl << "Expected result: " << std::endl << std::endl;
7500- std::cout << "d=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
7501- for (size_t j = 0; j < 32; j++) {
7502- std::cout << " qs" << j << "=" << (uint32_t)qx_res[first_err].qs[j] << " ";
7503- }
7504- std::cerr << std::endl;
7505- }
7506-
7507- ggml_vk_destroy_buffer(x_buf);
7508- ggml_vk_destroy_buffer(qx_buf);
7509-
7510- free(x);
7511- free(qx);
7512- free(qx_res);
7513- }
7395+ // Disabled: the quantize test below does not work without q8_1 quantization support in ggml.
7396+ //
7397+ // typedef uint16_t ggml_half;
7398+ // typedef uint32_t ggml_half2;
7399+ //
7400+ // #define QK8_1 32
7401+ // typedef struct {
7402+ // union {
7403+ // struct {
7404+ // ggml_half d; // delta
7405+ // ggml_half s; // d * sum(qs[i])
7406+ // } GGML_COMMON_AGGR_S;
7407+ // ggml_half2 ds;
7408+ // } GGML_COMMON_AGGR_U;
7409+ // int8_t qs[QK8_1]; // quants
7410+ // } block_q8_1;
7411+ //
7412+ // static void ggml_vk_test_quantize(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
7413+ // VK_LOG_DEBUG("ggml_vk_test_quantize(" << ne << ")");
7414+ // GGML_ASSERT(quant == GGML_TYPE_Q8_1);
7415+ //
7416+ // const size_t x_sz = sizeof(float) * ne;
7417+ // const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
7418+ // float * x = (float *) malloc(x_sz);
7419+ // block_q8_1 * qx = (block_q8_1 *)malloc(qx_sz);
7420+ // block_q8_1 * qx_res = (block_q8_1 *)malloc(qx_sz);
7421+ // vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
7422+ // vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
7423+ //
7424+ // for (size_t i = 0; i < ne; i++) {
7425+ // x[i] = rand() / (float)RAND_MAX;
7426+ // }
7427+ //
7428+ // vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
7429+ //
7430+ // ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
7431+ //
7432+ // if (ctx->device->need_compiles) {
7433+ // ggml_vk_load_shaders(ctx->device);
7434+ // }
7435+ //
7436+ // ggml_pipeline_allocate_descriptor_sets(ctx->device);
7437+ //
7438+ // ggml_vk_buffer_write(x_buf, 0, x, x_sz);
7439+ //
7440+ // vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
7441+ // ggml_vk_ctx_begin(ctx->device, subctx);
7442+ // ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
7443+ // ggml_vk_ctx_end(subctx);
7444+ //
7445+ // auto begin = std::chrono::high_resolution_clock::now();
7446+ //
7447+ // ggml_vk_submit(subctx, ctx->fence);
7448+ // VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
7449+ // ctx->device->device.resetFences({ ctx->fence });
7450+ //
7451+ // auto end = std::chrono::high_resolution_clock::now();
7452+ //
7453+ // double ms_quant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
7454+ // ggml_vk_buffer_read(qx_buf, 0, qx, qx_sz);
7455+ //
7456+ // ggml_vk_quantize_data(x, qx_res, ne, quant);
7457+ //
7458+ // int first_err = -1;
7459+ //
7460+ // for (size_t i = 0; i < ne / 32; i++) {
7461+ // double error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d));
7462+ //
7463+ // if (first_err < 0 && error > 0.1) {
7464+ // first_err = i;
7465+ // }
7466+ //
7467+ // error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s));
7468+ //
7469+ // if (first_err < 0 && error > 0.1) {
7470+ // first_err = i;
7471+ // }
7472+ //
7473+ // for (size_t j = 0; j < 32; j++) {
7474+ // uint64_t error = std::abs(qx_res[i].qs[j] - qx[i].qs[j]);
7475+ //
7476+ // if (first_err < 0 && error > 1) {
7477+ // first_err = i;
7478+ // }
7479+ // }
7480+ // }
7481+ //
7482+ // std::cerr << "TEST QUANTIZE " << ggml_type_name(quant) << " time=" << ms_quant << "ms " << (first_err == -1 ? "CORRECT" : "INCORRECT") << std::endl;
7483+ //
7484+ // if (first_err != -1) {
7485+ // std::cerr << "first_error = " << first_err << std::endl;
7486+ // std::cerr << "Actual result: " << std::endl << std::endl;
7487+ // std::cout << "d=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
7488+ // for (size_t j = 0; j < 32; j++) {
7489+ // std::cout << " qs" << j << "=" << (uint32_t)qx[first_err].qs[j] << " ";
7490+ // }
7491+ // std::cerr << std::endl << std::endl << "Expected result: " << std::endl << std::endl;
7492+ // std::cout << "d=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " ";
7493+ // for (size_t j = 0; j < 32; j++) {
7494+ // std::cout << " qs" << j << "=" << (uint32_t)qx_res[first_err].qs[j] << " ";
7495+ // }
7496+ // std::cerr << std::endl;
7497+ // }
7498+ //
7499+ // ggml_vk_destroy_buffer(x_buf);
7500+ // ggml_vk_destroy_buffer(qx_buf);
7501+ //
7502+ // free(x);
7503+ // free(qx);
7504+ // free(qx_res);
7505+ // }
75147506
75157507static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant, bool mmq = false) {
75167508 VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
@@ -7752,8 +7744,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
77527744
77537745static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
77547746#if defined(GGML_VULKAN_RUN_TESTS)
7755- ggml_vk_test_quantize(ctx, 1024*1024, GGML_TYPE_Q8_1);
7756-
77577747 const std::vector<size_t> vals {
77587748 512, 512, 128,
77597749 128, 512, 512,
0 commit comments