Skip to content

Commit 806861f

Browse files
committed
wip: q6_k_q8_1_tiled_gemv
1 parent 73e53dc commit 806861f

File tree

6 files changed

+425
-18
lines changed

6 files changed

+425
-18
lines changed

ggml/src/ggml-sycl/builtins.hpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#ifndef GGML_SYCL_BUILTINS_HPP
2+
#define GGML_SYCL_BUILTINS_HPP
3+
4+
#include <sys/types.h>
5+
6+
#include <cstdint>
7+
8+
#include "cacheopts.hpp"
9+
10+
#define GGML_SYCL_UNREACHABLE(x) \
11+
assert(0 && x); \
12+
printf(x);
13+
14+
#ifdef __SYCL_DEVICE_ONLY__
15+
template <class T, int N> using vector_t = T __attribute__((ext_vector_type(N)));
16+
#else
17+
template <class T, int N> using vector_t = sycl::marray<T, N>;
18+
#endif
19+
20+
#ifdef __SYCL_DEVICE_ONLY__
21+
# define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x
22+
#else
23+
# define SYCL_DEVICE_BUILTIN(x)
24+
#endif
25+
26+
#ifdef __SYCL_DEVICE_ONLY__
27+
# define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL extern "C" x
28+
#else
29+
# define SYCL_DEVICE_OCL(x)
30+
#endif
31+
32+
using uint8 = vector_t<uint, 8>;
33+
using uint2 = vector_t<uint, 2>;
34+
35+
using short16 = vector_t<short, 16>;
36+
using short8 = vector_t<unsigned short, 8>;
37+
using short2 = vector_t<unsigned short, 2>;
38+
39+
using uint8_32 = vector_t<uint8_t, 32>;
40+
using char16 = vector_t<char, 16>;
41+
42+
// loads
43+
SYCL_DEVICE_BUILTIN(short16 __builtin_IB_subgroup_block_read_flat_u8_m16k32v1(intptr_t baseoffset, int width_minus_one,
44+
int height_minus_one, int pitch_minus_one,
45+
uint2 coord));
46+
SYCL_DEVICE_BUILTIN(char16 __builtin_IB_subgroup_block_read_flat_u8_m16k16v1(intptr_t baseoffset, int width_minus_one,
47+
int height_minus_one, int pitch_minus_one,
48+
uint2 coord));
49+
SYCL_DEVICE_BUILTIN(int __builtin_IB_subgroup_block_read_flat_u8_m1k64v1(intptr_t baseoffset, int width_minus_one,
50+
int height_minus_one, int pitch_minus_one,
51+
uint2 coord));
52+
53+
//stores
54+
SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(intptr_t baseoffset, int width_minus_one,
55+
int height_minus_one, int pitch_minus_one,
56+
uint2 coord, uint data));
57+
58+
// prefetches
59+
SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m16k32v1(intptr_t baseoffset, int width_minus_one,
60+
int height_minus_one,
61+
int pitch_minus_one, uint2 coord,
62+
LSC_LDCC cache_control));
63+
SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m16k16v1(intptr_t baseoffset, int width_minus_one,
64+
int height_minus_one,
65+
int pitch_minus_one, uint2 coord,
66+
LSC_LDCC cache_control));
67+
SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m1k64v1(intptr_t baseoffset, int width_minus_one,
68+
int height_minus_one, int pitch_minus_one,
69+
uint2 coord, LSC_LDCC cache_control));
70+
71+
SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u32_m1k16(intptr_t baseoffset, int width_minus_one,
72+
int height_minus_one, int pitch_minus_one,
73+
uint2 coord, LSC_LDCC cache_control));
74+
75+
//DP4A instructions
76+
SYCL_DEVICE_BUILTIN(int __builtin_IB_dp4a_ss(int c, int a, int b, bool isSaturated));
77+
78+
#endif

ggml/src/ggml-sycl/cacheopts.hpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
2+
#ifndef GGML_SYCL_CACHEOPTS_HPP
3+
#define GGML_SYCL_CACHEOPTS_HPP
4+
5+
enum LSC_LDCC {
6+
LSC_LDCC_DEFAULT = 0,
7+
LSC_LDCC_L1UC_L3UC = 1, // Override to L1 uncached and L3 uncached
8+
LSC_LDCC_L1UC_L3C = 2, // Override to L1 uncached and L3 cached
9+
LSC_LDCC_L1C_L3UC = 3, // Override to L1 cached and L3 uncached
10+
LSC_LDCC_L1C_L3C = 4, // Override to L1 cached and L3 cached
11+
LSC_LDCC_L1S_L3UC = 5, // Override to L1 streaming load and L3 uncached
12+
LSC_LDCC_L1S_L3C = 6, // Override to L1 streaming load and L3 cached
13+
LSC_LDCC_L1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached
14+
};
15+
16+
// Store message caching control (also used for atomics)
17+
enum LSC_STCC {
18+
LSC_STCC_DEFAULT = 0,
19+
LSC_STCC_L1UC_L3UC = 1, // Override to L1 uncached and L3 uncached
20+
LSC_STCC_L1UC_L3WB = 2, // Override to L1 uncached and L3 written back
21+
LSC_STCC_L1WT_L3UC = 3, // Override to L1 written through and L3 uncached
22+
LSC_STCC_L1WT_L3WB = 4, // Override to L1 written through and L3 written back
23+
LSC_STCC_L1S_L3UC = 5, // Override to L1 streaming and L3 uncached
24+
LSC_STCC_L1S_L3WB = 6, // Override to L1 streaming and L3 written back
25+
LSC_STCC_L1WB_L3WB = 7, // Override to L1 written through and L3 written back
26+
};
27+
28+
#endif

ggml/src/ggml-sycl/common.hpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#define GGML_SYCL_COMMON_HPP
1515

1616
#include <cstddef>
17+
#include <cstdlib>
1718
#include <fstream>
1819
#include <iostream>
1920
#include <string>
@@ -191,6 +192,7 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
191192
//////////////////////
192193
struct optimize_feature {
193194
bool reorder=false;
195+
bool can_use_intel_builtins = false;
194196
};
195197

196198
struct sycl_device_info {
@@ -309,6 +311,28 @@ inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
309311
return opt;
310312
}
311313

314+
inline int get_sycl_env(const char *env_name, int default_val) {
315+
char *user_device_string = getenv(env_name);
316+
int user_number = default_val;
317+
318+
unsigned n;
319+
if (user_device_string != NULL &&
320+
sscanf(user_device_string, " %u", &n) == 1) {
321+
user_number = (int)n;
322+
} else {
323+
user_number = default_val;
324+
}
325+
return user_number;
326+
}
327+
328+
inline void can_enable_intel_builtins(syclex::architecture & arch, optimize_feature & opt_feature_struct) {
329+
int can_use_intel_builtins_env_var_val = get_sycl_env("GGML_SYCL_USE_INTEL_BUILTINS", 0);
330+
if (can_use_intel_builtins_env_var_val &&
331+
(arch == syclex::architecture::intel_gpu_bmg_g21 || arch == syclex::architecture::intel_gpu_lnl_m)) {
332+
opt_feature_struct.can_use_intel_builtins = true;
333+
}
334+
}
335+
312336
namespace sycl_ex = sycl::ext::oneapi::experimental;
313337
struct ggml_backend_sycl_context {
314338
int device;

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 109 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,16 @@
2121
#include <limits>
2222
#include <stdint.h>
2323
#include <stdio.h>
24+
#include <sycl/usm.hpp>
2425
#include <vector>
2526
#include <cmath>
2627
#include <iostream>
2728
#include <fstream>
2829
#include <stdio.h>
2930
#include <stdlib.h>
31+
#include <sys/types.h>
3032
#include <regex>
33+
#include <random>
3134

3235
#include <sycl/sycl.hpp>
3336
#include <sycl/half_type.hpp>
@@ -36,6 +39,7 @@
3639
#include "ggml-impl.h"
3740
#include "ggml-backend-impl.h"
3841

42+
3943
#include "ggml-sycl/backend.hpp"
4044
#include "ggml-sycl/common.hpp"
4145
#include "ggml-sycl/element_wise.hpp"
@@ -45,12 +49,15 @@
4549
#include "ggml-sycl/getrows.hpp"
4650
#include "ggml.h"
4751

52+
#include "ggml-quants.h"
53+
4854
static bool g_sycl_loaded = false;
4955
int g_ggml_sycl_debug = 0;
5056
int g_ggml_sycl_disable_optimize = 0;
5157
int g_ggml_sycl_disable_graph = 0;
5258
int g_ggml_sycl_disable_dnn = 0;
5359
int g_ggml_sycl_prioritize_dmmv = 0;
60+
int g_ggml_sycl_use_intel_builtins = 0;
5461

5562
static ggml_sycl_device_info ggml_sycl_init() {
5663
ggml_sycl_device_info info = {};
@@ -85,6 +92,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
8592
100 * prop.get_major_version() + 10 * prop.get_minor_version();
8693
info.devices[i].hw_info = get_device_hw_info(&device);
8794
info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch);
95+
can_enable_intel_builtins(info.devices[i].hw_info.arch, info.devices[i].opt_feature);
8896

8997
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
9098
}
@@ -176,20 +184,6 @@ void ggml_backend_sycl_print_sycl_devices() {
176184
print_device_opt_feature(device_count);
177185
}
178186

179-
static inline int get_sycl_env(const char *env_name, int default_val) {
180-
char *user_device_string = getenv(env_name);
181-
int user_number = default_val;
182-
183-
unsigned n;
184-
if (user_device_string != NULL &&
185-
sscanf(user_device_string, " %u", &n) == 1) {
186-
user_number = (int)n;
187-
} else {
188-
user_number = default_val;
189-
}
190-
return user_number;
191-
}
192-
193187
static void ggml_check_sycl() try {
194188
static bool initialized = false;
195189

@@ -199,10 +193,14 @@ static void ggml_check_sycl() try {
199193
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
200194
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
201195
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
196+
g_ggml_sycl_use_intel_builtins = get_sycl_env("GGML_SYCL_USE_INTEL_BUILTINS", 0);
197+
202198
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
203199
GGML_LOG_INFO("Running with Environment Variables:\n");
204200
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
205201
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
202+
GGML_LOG_INFO(" GGML_SYCL_USE_INTEL_BUILTINS: %d\n", g_ggml_sycl_use_intel_builtins);
203+
206204
#ifdef GGML_SYCL_GRAPH
207205
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
208206
#else
@@ -3131,6 +3129,97 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
31313129
sycl::free(tmp_buf, *stream);
31323130
}
31333131

3132+
static void reorder_qw_q6_k_contiguous(uint8_t * data_device, size_t rows, size_t cols, size_t offset,
3133+
dpct::queue_ptr stream) {
3134+
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
3135+
GGML_ASSERT(cols % QK_K == 0);
3136+
const std::size_t nblocks = rows * (cols / QK_K);
3137+
const std::size_t size = nblocks * sizeof(block_q6_K);
3138+
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
3139+
3140+
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
3141+
3142+
auto * ql_ptr = data_device;
3143+
auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
3144+
auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
3145+
sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
3146+
3147+
stream
3148+
->parallel_for(nblocks,
3149+
[=](auto i) {
3150+
const block_q6_K * x = (const block_q6_K *) tmp_buf;
3151+
auto row = i / rows;
3152+
auto col = i % rows;
3153+
auto blocks_per_col = cols / QK_K;
3154+
auto block_offset = row * blocks_per_col + col;
3155+
3156+
const uint8_t * ql = x[block_offset].ql;
3157+
const uint8_t * qh = x[block_offset].qh;
3158+
uint8_t * base_ql_ptr = ql_ptr + row * ((QK_K / 2) * blocks_per_col) + (QK_K / 2) * col;
3159+
uint8_t * base_qh_ptr = qh_ptr + row * ((QK_K / 4) * blocks_per_col) + (QK_K / 4) * col;
3160+
auto * base_scales_ptr = scales_ptr + row * ((QK_K / 16) * blocks_per_col) + (QK_K / 16) * col;
3161+
3162+
uint8_t ql_reordered[QK_K / 2];
3163+
uint8_t qh_reordered[QK_K / 4];
3164+
int8_t temp[QK_K];
3165+
3166+
// zero out these intermediate reordered buffers
3167+
for (int j = 0; j < QK_K / 2; j++) {
3168+
ql_reordered[j] = 0;
3169+
}
3170+
3171+
for (int j = 0; j < QK_K / 4; j++) {
3172+
qh_reordered[j] = 0;
3173+
}
3174+
3175+
// first collate and pack ql and qh belonging to the same quant together
3176+
int chunk_offset = 0;
3177+
for (int n = 0; n < QK_K; n += 128) {
3178+
for (int l = 0; l < 32; ++l) {
3179+
const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4));
3180+
const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4));
3181+
const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4));
3182+
const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4));
3183+
temp[chunk_offset + l + 0] = q1;
3184+
temp[chunk_offset + l + 32] = q2;
3185+
temp[chunk_offset + l + 64] = q3;
3186+
temp[chunk_offset + l + 96] = q4;
3187+
}
3188+
chunk_offset += 128;
3189+
ql += 64;
3190+
qh += 32;
3191+
}
3192+
3193+
// Now separate them again
3194+
for (int j = 0; j < QK_K; j++) {
3195+
int8_t low_bits = temp[j] & 0x0F;
3196+
ql_reordered[j / 2] = ql_reordered[j / 2] | (low_bits << (4 * (j % 2)));
3197+
}
3198+
3199+
for (int j = 0; j < QK_K; j++) {
3200+
int8_t high_bits = temp[j] >> 4;
3201+
qh_reordered[j / 4] = qh_reordered[j / 4] | (high_bits << (2 * (j % 4)));
3202+
}
3203+
3204+
for(int j = 0; j < QK_K / 2; j++) {
3205+
base_ql_ptr[j] = ql_reordered[j];
3206+
}
3207+
3208+
for(int j = 0; j < QK_K / 4; j++) {
3209+
base_qh_ptr[j] = qh_reordered[j];
3210+
}
3211+
3212+
for (int j = 0; j < QK_K / 16; ++j) {
3213+
base_scales_ptr[j] = x[block_offset].scales[j];
3214+
}
3215+
3216+
dm_ptr[block_offset] = x[block_offset].d;
3217+
3218+
})
3219+
.wait_and_throw();
3220+
sycl::free(tmp_buf, *stream);
3221+
}
3222+
31343223
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
31353224
uint8_t * data_device = (uint8_t *) src0->data;
31363225
size_t ncols = src0->ne[0];
@@ -3145,7 +3234,12 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
31453234
reorder_qw_q4_k(data_device, size, 0, stream);
31463235
break;
31473236
case GGML_TYPE_Q6_K:
3148-
reorder_qw_q6_k(data_device, size, 0, stream);
3237+
std::cout << "g_ggml_sycl_use_intel_builtins: " << g_ggml_sycl_use_intel_builtins << std::endl;
3238+
if (g_ggml_sycl_use_intel_builtins) {
3239+
reorder_qw_q6_k_contiguous(data_device, nrows, ncols, 0, stream);
3240+
} else {
3241+
reorder_qw_q6_k(data_device, size, 0, stream);
3242+
}
31493243
break;
31503244
default:
31513245
GGML_ABORT("reorder_qw() called with unsupported type");

0 commit comments

Comments
 (0)