Skip to content

Commit 7a6b48d

Browse files
committed
Detect hardware type, save the detected optimization features, and print them
1 parent 78e232a commit 7a6b48d

File tree

8 files changed

+166
-74
lines changed

8 files changed

+166
-74
lines changed

ggml/src/ggml-sycl/CMakeLists.txt

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,6 @@ if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
44
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
55
endif()
66

7-
if (GGML_SYCL_TARGET STREQUAL "INTEL")
8-
add_compile_definitions(GGML_SYCL_INTEL_TARGET)
9-
endif()
10-
11-
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
12-
add_compile_definitions(GGML_SYCL_NVIDIA_TARGET)
13-
endif()
14-
15-
if (GGML_SYCL_TARGET STREQUAL "AMD")
16-
add_compile_definitions(GGML_SYCL_AMD_TARGET)
17-
endif()
18-
19-
207
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
218

229
if (DEFINED ENV{ONEAPI_ROOT})

ggml/src/ggml-sycl/common.hpp

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include "dpct/helper.hpp"
2020
#include "ggml-sycl.h"
2121
#include "presets.hpp"
22+
#include "sycl_hw.hpp"
23+
2224
#if GGML_SYCL_DNNL
2325
#include "dnnl.hpp"
2426
#include "dnnl_sycl.hpp"
@@ -182,18 +184,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
182184
}
183185

184186
//////////////////////
187+
struct optimize_feature {
188+
bool reorder=false;
189+
};
190+
191+
struct sycl_device_info {
192+
int cc; // compute capability
193+
// int nsm; // number of streaming multiprocessors
194+
// size_t smpb; // max. shared memory per block
195+
bool vmm; // virtual memory support
196+
size_t total_vram;
197+
sycl_hw_info hw_info;
198+
optimize_feature opt_feature;
199+
};
200+
185201

186202
struct ggml_sycl_device_info {
187203
int device_count;
188204

189-
struct sycl_device_info {
190-
int cc; // compute capability
191-
// int nsm; // number of streaming multiprocessors
192-
// size_t smpb; // max. shared memory per block
193-
bool vmm; // virtual memory support
194-
size_t total_vram;
195-
};
196-
197205
sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
198206

199207
std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
@@ -262,15 +270,26 @@ struct ggml_tensor_extra_gpu {
262270
[GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
263271
};
264272

273+
inline optimize_feature check_gpu_optimize_feature(int hw_family) {
274+
optimize_feature opt;
275+
opt.reorder = ( hw_family==SYCL_HW_FAMILY_INTEL_PVC ||
276+
hw_family==SYCL_HW_FAMILY_INTEL_MTL_ARL ||
277+
hw_family==SYCL_HW_FAMILY_INTEL_LNL ||
278+
hw_family==SYCL_HW_FAMILY_INTEL_ARC);
279+
return opt;
280+
}
281+
265282
struct ggml_backend_sycl_context {
266283
int device;
267284
std::string name;
285+
optimize_feature opt_feature;
268286

269287
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
270288

271289
explicit ggml_backend_sycl_context(int device) :
272290
device(device),
273291
name(GGML_SYCL_NAME + std::to_string(device)) {
292+
opt_feature = ggml_sycl_info().devices[device].opt_feature;
274293
}
275294

276295
queue_ptr stream(int device, int stream) {
@@ -680,5 +699,4 @@ bool gpu_has_xmx(sycl::device &dev);
680699
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
681700
const ggml_tensor *src1, ggml_tensor *dst,
682701
const ggml_sycl_op_flatten_t op);
683-
684702
#endif // GGML_SYCL_COMMON_HPP

ggml/src/ggml-sycl/convert.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -472,14 +472,14 @@ static void convert_unary_sycl(const void *__restrict__ vx,
472472
}
473473
}
474474

475-
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
475+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_backend_sycl_context & ctx) {
476476
switch (type) {
477477
case GGML_TYPE_Q4_0:
478-
#if defined(GGML_SYCL_INTEL_TARGET)
479-
return dequantize_row_q4_0_sycl_reorder;
480-
#else
481-
return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
482-
#endif
478+
if (ctx.opt_feature.reorder) {
479+
return dequantize_row_q4_0_sycl_reorder;
480+
} else {
481+
return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
482+
}
483483
case GGML_TYPE_Q4_1:
484484
return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
485485
case GGML_TYPE_Q5_0:
@@ -523,14 +523,14 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
523523
}
524524
}
525525

526-
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
526+
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_backend_sycl_context & ctx) {
527527
switch (type) {
528528
case GGML_TYPE_Q4_0:
529-
#if defined(GGML_SYCL_INTEL_TARGET)
530-
return dequantize_row_q4_0_sycl_reorder;
531-
#else
532-
return dequantize_row_q4_0_sycl;
533-
#endif
529+
if (ctx.opt_feature.reorder) {
530+
return dequantize_row_q4_0_sycl_reorder;
531+
} else {
532+
return dequantize_row_q4_0_sycl;
533+
}
534534
case GGML_TYPE_Q4_1:
535535
return dequantize_row_q4_1_sycl;
536536
case GGML_TYPE_Q5_0:

ggml/src/ggml-sycl/convert.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
2121
typedef to_t_sycl_t<float> to_fp32_sycl_t;
2222
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
2323

24-
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
25-
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
24+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_backend_sycl_context & ctx);
25+
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_backend_sycl_context & ctx);
2626

2727
#endif // GGML_SYCL_CONVERT_HPP

ggml/src/ggml-sycl/dmmv.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,7 +1096,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
10961096

10971097
if (src1_convert_f16) {
10981098
src1_dfloat = src1_dfloat_a.alloc(ne00);
1099-
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
1099+
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, ctx);
11001100
GGML_ASSERT(to_fp16_sycl != nullptr);
11011101
to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
11021102
}
@@ -1106,11 +1106,11 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
11061106

11071107
switch (src0->type) {
11081108
case GGML_TYPE_Q4_0:
1109-
#if defined(GGML_SYCL_INTEL_TARGET)
1110-
dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1111-
#else
1112-
dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1113-
#endif
1109+
if (ctx.opt_feature.reorder) {
1110+
dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1111+
} else {
1112+
dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1113+
}
11141114
break;
11151115
case GGML_TYPE_Q4_1:
11161116
dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 49 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "ggml-sycl/backend.hpp"
4040
#include "ggml-sycl/presets.hpp"
4141
#include "ggml-sycl/gemm.hpp"
42+
#include "ggml-sycl/sycl_hw.hpp"
4243

4344
static bool g_sycl_loaded = false;
4445

@@ -63,14 +64,18 @@ static ggml_sycl_device_info ggml_sycl_init() {
6364
for (int i = 0; i < info.device_count; ++i) {
6465
info.devices[i].vmm = 0;
6566
dpct::device_info prop;
67+
sycl::device device = dpct::dev_mgr::instance().get_device(i);
68+
6669
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
67-
prop, dpct::dev_mgr::instance().get_device(i))));
70+
prop, device)));
6871

6972
info.default_tensor_split[i] = total_vram;
7073
total_vram += prop.get_global_mem_size();
7174

7275
info.devices[i].cc =
7376
100 * prop.get_major_version() + 10 * prop.get_minor_version();
77+
info.devices[i].hw_info = get_device_hw_info(&device);
78+
info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.family);
7479

7580
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
7681
}
@@ -109,6 +114,27 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
109114
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
110115
}
111116

117+
void print_device_opt_feature(int device_count) {
118+
GGML_LOG_INFO("SYCL Optimization Feature:\n");
119+
GGML_LOG_INFO(
120+
"|ID| Device Type|Reorder|\n");
121+
GGML_LOG_INFO(
122+
"|--|-------------------|-------|\n");
123+
std::map<std::string, size_t> DeviceNums;
124+
for (int id = 0; id < device_count; ++id) {
125+
sycl::device device = dpct::dev_mgr::instance().get_device(id);
126+
std::string backend_type = get_device_backend_and_type(device);
127+
int type_id = DeviceNums[backend_type]++;
128+
std::stringstream device_type;
129+
device_type << "[" << backend_type << ":" << std::to_string(type_id)
130+
<< "]";
131+
std::string device_type_s = device_type.str();
132+
device_type_s = std::regex_replace(device_type_s, std::regex("ext_oneapi_"), "");
133+
GGML_LOG_INFO("|%2d|%19s|%7s|\n", id, device_type_s.c_str(),
134+
ggml_sycl_info().devices[id].opt_feature.reorder ? "Y": "N");
135+
}
136+
137+
}
112138
void ggml_backend_sycl_print_sycl_devices() {
113139
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
114140
int device_count = dpct::dev_mgr::instance().device_count();
@@ -137,6 +163,8 @@ void ggml_backend_sycl_print_sycl_devices() {
137163
<< "]";
138164
print_device_detail(id, device, device_type.str());
139165
}
166+
167+
print_device_opt_feature(device_count);
140168
}
141169

142170
static inline int get_sycl_env(const char *env_name, int default_val) {
@@ -172,18 +200,6 @@ static void ggml_check_sycl() try {
172200
GGML_LOG_INFO(" GGML_SYCL_F16: no\n");
173201
#endif
174202

175-
#if defined(GGML_SYCL_INTEL_TARGET)
176-
GGML_LOG_INFO(" GGML_SYCL_INTEL_TARGET: yes\n");
177-
#endif
178-
179-
#if defined(GGML_SYCL_NVIDIA_TARGET)
180-
GGML_LOG_INFO(" GGML_SYCL_NVIDIA_TARGET: yes\n");
181-
#endif
182-
183-
#if defined(GGML_SYCL_AMD_TARGET)
184-
GGML_LOG_INFO(" GGML_SYCL_AMD_TARGET: yes\n");
185-
#endif
186-
187203
/* NOT REMOVE, keep it for next optimize for XMX.
188204
#if defined(SYCL_USE_XMX)
189205
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
@@ -253,11 +269,13 @@ struct ggml_backend_sycl_buffer_context {
253269
void * dev_ptr = nullptr;
254270
queue_ptr stream;
255271
std::string name;
272+
optimize_feature opt_feature;
256273

257274
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
258275
device(device), dev_ptr(dev_ptr), stream(stream) {
259276
check_allow_gpu_index(device);
260277
name = (GGML_SYCL_NAME + std::to_string(device));
278+
opt_feature = ggml_sycl_info().devices[device].opt_feature;
261279
}
262280

263281

@@ -368,14 +386,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
368386
.wait()));
369387
free(host_buf);
370388

371-
#if defined(GGML_SYCL_INTEL_TARGET)
372-
if (tensor->type == GGML_TYPE_Q4_0) {
373-
size_t ncols = tensor->ne[0];
374-
size_t nrows = tensor->ne[1];
375-
reorder_qw((char *)tensor->data + offset, ncols, nrows, size, stream);
389+
if (ctx->opt_feature.reorder) {
390+
if (tensor->type == GGML_TYPE_Q4_0) {
391+
size_t ncols = tensor->ne[0];
392+
size_t nrows = tensor->ne[1];
393+
reorder_qw((char *)tensor->data + offset, ncols, nrows, size, stream);
394+
}
376395
}
377-
#endif
378-
379396
}
380397
catch (sycl::exception const &exc) {
381398
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2655,11 +2672,11 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
26552672
get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
26562673
break;
26572674
case GGML_TYPE_Q4_0:
2658-
#if defined(GGML_SYCL_INTEL_TARGET)
2659-
get_rows_sycl_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2660-
#else
2661-
get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2662-
#endif
2675+
if (ctx.opt_feature.reorder) {
2676+
get_rows_sycl_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2677+
} else {
2678+
get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2679+
}
26632680
break;
26642681
case GGML_TYPE_Q4_1:
26652682
get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
@@ -2735,7 +2752,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
27352752
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
27362753
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
27372754
if (src0->type != GGML_TYPE_F16) {
2738-
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type);
2755+
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, ctx);
27392756
GGML_ASSERT(to_fp16_sycl != nullptr);
27402757
size_t ne = row_diff*ne00;
27412758
src0_as_f16.alloc(ne);
@@ -2747,7 +2764,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
27472764

27482765
ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
27492766
if (src1->type != GGML_TYPE_F16) {
2750-
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
2767+
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, ctx);
27512768
GGML_ASSERT(to_fp16_sycl != nullptr);
27522769
size_t ne = src1_ncols*ne10;
27532770
src1_as_f16.alloc(ne);
@@ -2768,13 +2785,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
27682785
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
27692786
dst_f16.get(), dpct::library_data_t::real_half, ldc,
27702787
dpct::library_data_t::real_half)));
2771-
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
2788+
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, ctx);
27722789
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
27732790
#else
27742791
auto dnnl_stream = ctx.stream_dnnl(stream);
27752792
DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
27762793
src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
2777-
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
2794+
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, ctx);
27782795
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
27792796
#endif
27802797
}
@@ -2783,13 +2800,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
27832800
ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
27842801
ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
27852802
if (src0->type != GGML_TYPE_F32) {
2786-
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type);
2803+
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, ctx);
27872804
GGML_ASSERT(to_fp32_sycl != nullptr);
27882805
src0_ddq_as_f32.alloc(row_diff*ne00);
27892806
to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
27902807
}
27912808
if (src1->type != GGML_TYPE_F32) {
2792-
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type);
2809+
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, ctx);
27932810
GGML_ASSERT(to_fp32_sycl != nullptr);
27942811
src1_ddq_as_f32.alloc(src1_ncols*ne10);
27952812
to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
@@ -3535,7 +3552,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
35353552
// convert src1 to fp16
35363553
ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
35373554
if (src1->type != GGML_TYPE_F16) {
3538-
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
3555+
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, ctx);
35393556
const int64_t ne_src1 = ggml_nelements(src1);
35403557
src1_f16_alloc.alloc(ne_src1);
35413558
GGML_ASSERT(to_fp16_sycl != nullptr);

0 commit comments

Comments
 (0)