Skip to content

Commit 1e0c4cf

Browse files
Rbiessy authored and Alcpz committed
dbg
1 parent fc768f3 commit 1e0c4cf

File tree

5 files changed

+77
-21
lines changed

5 files changed

+77
-21
lines changed

ggml/src/ggml-sycl/common.hpp

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ void ggml_sycl_host_free(void* ptr);
4242

4343
extern int g_ggml_sycl_debug;
4444
extern int g_ggml_sycl_disable_optimize;
45+
extern int g_ggml_sycl_disable_mmvq;
4546

4647
#define GGML_SYCL_DEBUG(...) \
4748
do { \
@@ -285,25 +286,11 @@ struct ggml_tensor_extra_gpu {
285286

286287
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
287288

288-
inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
289+
inline optimize_feature check_gpu_optimize_feature(syclex::architecture &/*arch*/) {
289290
optimize_feature opt;
290291

291-
opt.reorder =
292-
(arch == syclex::architecture::intel_gpu_dg1 ||
293-
arch == syclex::architecture::intel_gpu_acm_g10 ||
294-
arch == syclex::architecture::intel_gpu_acm_g11 ||
295-
arch == syclex::architecture::intel_gpu_acm_g12 ||
296-
arch == syclex::architecture::intel_gpu_pvc ||
297-
arch == syclex::architecture::intel_gpu_pvc_vg ||
298-
arch == syclex::architecture::intel_gpu_mtl_u ||
299-
arch == syclex::architecture::intel_gpu_mtl_s ||
300-
arch == syclex::architecture::intel_gpu_mtl_h ||
301-
arch == syclex::architecture::intel_gpu_arl_u ||
302-
arch == syclex::architecture::intel_gpu_arl_s ||
303-
arch == syclex::architecture::intel_gpu_arl_h ||
304-
arch == syclex::architecture::intel_gpu_bmg_g21 ||
305-
arch == syclex::architecture::intel_gpu_lnl_m
306-
);
292+
// TODO: Romain change to Intel vendor?
293+
opt.reorder = true;
307294

308295
return opt;
309296
}

ggml/src/ggml-sycl/dmmv.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,8 +1105,10 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
11051105
case GGML_TYPE_Q4_0:
11061106
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
11071107
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
1108+
GGML_SYCL_DEBUG("Calling dequantize_mul_mat_vec_q4_0_sycl_reorder\n");
11081109
dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
11091110
} else {
1111+
GGML_SYCL_DEBUG("Calling dequantize_mul_mat_vec_q4_0_sycl\n");
11101112
dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
11111113
}
11121114
break;

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ static bool g_sycl_loaded = false;
4848
int g_ggml_sycl_debug = 0;
4949
int g_ggml_sycl_disable_optimize = 0;
5050
int g_ggml_sycl_disable_graph = 0;
51+
int g_ggml_sycl_disable_mmvq = 0;
5152

5253
static ggml_sycl_device_info ggml_sycl_init() {
5354
ggml_sycl_device_info info = {};
@@ -194,11 +195,13 @@ static void ggml_check_sycl() try {
194195
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
195196
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
196197
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
198+
g_ggml_sycl_disable_mmvq = get_sycl_env("GGML_SYCL_DISABLE_MMVQ", 0);
197199
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
198200
GGML_LOG_INFO("Running with Environment Variables:\n");
199201
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
200202
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
201203
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
204+
GGML_LOG_INFO(" GGML_SYCL_DISABLE_MMVQ: %d\n", g_ggml_sycl_disable_mmvq);
202205
GGML_LOG_INFO("Build with Macros:\n");
203206
#if defined(GGML_SYCL_FORCE_MMQ)
204207
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
@@ -2917,6 +2920,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
29172920

29182921
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
29192922
ggml_tensor * dst) {
2923+
GGML_SYCL_DEBUG("call %s\n", __func__);
29202924
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
29212925
int64_t min_compute_capability = INT_MAX;
29222926

@@ -2961,14 +2965,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
29612965
static_cast<ggml_tensor_extra_gpu *>(dst->src[0]->extra)->optimized_feature.reorder;
29622966

29632967
// mmvq path is faster in the CUDA backend.
2964-
if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
2968+
if (!g_ggml_sycl_disable_mmvq && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
29652969
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
29662970
// is enabled takes precedence over DMMV, the current if-else implementation
29672971
// requires disabling DMMV if both conditions are met
2968-
|| (reorder && ggml_sycl_supports_reorder_mmvq(src0->type))) {
2972+
|| (reorder && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
29692973
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
29702974
}
29712975

2976+
// TODO: Romain
2977+
GGML_SYCL_DEBUG("mul_mat use_dequantize_mul_mat_vec=%d use_mul_mat_vec_q=%d use_mul_mat_q=%d reorder=%d split=%d m=%ld n=%ld k=%ld batch0=%ld batch1=%ld\n", use_dequantize_mul_mat_vec, use_mul_mat_vec_q, use_mul_mat_q, reorder, split, src0->ne[1], src1->ne[1], src0->ne[0], src0->ne[3], src1->ne[3]);
2978+
29722979
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
29732980
// TODO: Refactor and cleanup of mul mat dispatching.
29742981
if (src0->ne[3] == 1 && src1->ne[3] == 1) {
@@ -2998,6 +3005,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
29983005
constexpr bool convert_src1_to_q8_1 = false;
29993006
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
30003007
}
3008+
GGML_SYCL_DEBUG("call %s done\n", __func__);
30013009
}
30023010

30033011

ggml/src/ggml-sycl/mmvq.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,8 +1008,10 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
10081008
case GGML_TYPE_Q4_0:
10091009
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
10101010
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
1011+
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n");
10111012
reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
10121013
} else {
1014+
GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n");
10131015
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
10141016
}
10151017
break;

tests/test-backend-ops.cpp

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
#include <thread>
3737
#include <vector>
3838

39+
#include <iostream>
40+
3941
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
4042
size_t nels = ggml_nelements(tensor);
4143
std::vector<float> data(nels);
@@ -47,8 +49,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
4749
std::random_device rd;
4850
std::vector<std::default_random_engine> vec;
4951
vec.reserve(n_threads);
50-
//for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
51-
for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
52+
for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
53+
//for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
5254
return vec;
5355
}();
5456

@@ -551,6 +553,54 @@ struct test_case {
551553
}
552554
}
553555

556+
struct err_t {
557+
float a_val, b_val, err;
558+
size_t i;
559+
};
560+
std::vector<err_t> top_k_abs_err;
561+
std::vector<err_t> top_k_rel_err;
562+
size_t k = 10;
563+
auto a = f1.data();
564+
auto b = f2.data(); // ref (cpu backend)
565+
auto save_top_k_err = [=](size_t i, float a_i, float b_i, float err, std::vector<err_t>& top_k_err) {
566+
if (top_k_err.size() < k) {
567+
top_k_err.push_back({a_i, b_i, err, i});
568+
if (top_k_err.size() == k) {
569+
std::sort(top_k_err.begin(), top_k_err.end(), [](const err_t& x, const err_t& y) {
570+
return x.err > y.err;
571+
});
572+
}
573+
} else if (top_k_err.back().err < err) {
574+
top_k_err.back() = {a_i, b_i, err, i};
575+
std::sort(top_k_err.begin(), top_k_err.end(), [](const err_t& x, const err_t& y) {
576+
return x.err > y.err;
577+
});
578+
}
579+
};
580+
double avg_abs_err = 0.f;
581+
double avg_rel_err = 0.f;
582+
for (size_t i = 0; i < f1.size(); i++) {
583+
float a_i = a[i];
584+
float b_i = b[i];
585+
float abs_err = std::fabs(a_i - b_i);
586+
float rel_err = (a_i - b_i) / std::fabs(b_i);
587+
save_top_k_err(i, a_i, b_i, abs_err, top_k_abs_err);
588+
save_top_k_err(i, a_i, b_i, rel_err, top_k_rel_err);
589+
avg_abs_err += abs_err;
590+
avg_rel_err += rel_err;
591+
}
592+
avg_abs_err /= f1.size();
593+
avg_rel_err /= f1.size();
594+
std::cout << "\nAvg abs err=" << avg_abs_err << " Top " << k << " abs err:\n";
595+
for (const auto& err : top_k_abs_err) {
596+
std::cout << "i=" << err.i << " a=" << err.a_val << " b=" << err.b_val << " abs err=" << err.err << "\n";
597+
}
598+
std::cout << "\nAvg rel err=" << avg_rel_err << " Top " << k << " rel err:\n";
599+
for (const auto& err : top_k_rel_err) {
600+
std::cout << "i=" << err.i << " a=" << err.a_val << " b=" << err.b_val << " rel err=" << err.err << "\n";
601+
}
602+
std::cout << std::endl;
603+
554604
double err = nmse(f1.data(), f2.data(), f1.size());
555605
if (err > ud->max_err) {
556606
printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
@@ -4134,6 +4184,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
41344184
test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
41354185
}
41364186
}
4187+
//TODO: Romain
4188+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 11008, 1, 4096, {1, 1}, {1, 1}));
4189+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 11008, 2, 4096, {1, 1}, {1, 1}));
4190+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 1, 11008, {1, 1}, {1, 1}));
4191+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 1, 4096, {1, 1}, {1, 1}));
4192+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 2, 11008, {1, 1}, {1, 1}));
4193+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 2, 4096, {1, 1}, {1, 1}));
41374194

41384195
#if 1
41394196
for (ggml_type type_a : base_types) {

0 commit comments

Comments (0)