 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
+#include <sycl/usm.hpp>
 #include <vector>
 #include <cmath>
 #include <iostream>
 #include <fstream>
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/types.h>
 #include <regex>
+#include <random>

 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>

 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"

+
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/common.hpp"
 #include "ggml-sycl/element_wise.hpp"
 #include "ggml-sycl/getrows.hpp"
 #include "ggml.h"

+#include "ggml-quants.h"
+
 static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
 int g_ggml_sycl_disable_dnn = 0;
 int g_ggml_sycl_prioritize_dmmv = 0;
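+// read from GGML_SYCL_USE_INTEL_BUILTINS in ggml_check_sycl(); gates the
+// contiguous Q6_K reorder path added below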
+int g_ggml_sycl_use_intel_builtins = 0;

 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -85,6 +92,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
             100 * prop.get_major_version() + 10 * prop.get_minor_version();
         info.devices[i].hw_info = get_device_hw_info(&device);
         info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch);
+        can_enable_intel_builtins(info.devices[i].hw_info.arch, info.devices[i].opt_feature);

         info.max_work_group_sizes[i] = prop.get_max_work_group_size();
     }
@@ -176,20 +184,6 @@ void ggml_backend_sycl_print_sycl_devices() {
     print_device_opt_feature(device_count);
 }

-static inline int get_sycl_env(const char *env_name, int default_val) {
-    char *user_device_string = getenv(env_name);
-    int user_number = default_val;
-
-    unsigned n;
-    if (user_device_string != NULL &&
-        sscanf(user_device_string, "%u", &n) == 1) {
-        user_number = (int)n;
-    } else {
-        user_number = default_val;
-    }
-    return user_number;
-}
-
 static void ggml_check_sycl() try {
     static bool initialized = false;

@@ -199,10 +193,14 @@ static void ggml_check_sycl() try {
     g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
     g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
     g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
+    g_ggml_sycl_use_intel_builtins = get_sycl_env("GGML_SYCL_USE_INTEL_BUILTINS", 0);
+
     GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
     GGML_LOG_INFO("Running with Environment Variables:\n");
     GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
     GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
+    GGML_LOG_INFO("  GGML_SYCL_USE_INTEL_BUILTINS: %d\n", g_ggml_sycl_use_intel_builtins);
+
 #ifdef GGML_SYCL_GRAPH
     GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
 #else
@@ -3131,6 +3129,97 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
     sycl::free(tmp_buf, *stream);
 }

+static void reorder_qw_q6_k_contiguous(uint8_t * data_device, size_t rows, size_t cols, size_t offset,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
+    GGML_ASSERT(cols % QK_K == 0);
+    const std::size_t nblocks = rows * (cols / QK_K);
+    const std::size_t size = nblocks * sizeof(block_q6_K);
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
+
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+
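+    // The blocks are rewritten in place into four contiguous regions:
+    // [all ql | all qh | all scales | all d]. Per block that is
+    // QK_K/2 + QK_K/4 + QK_K/16 bytes plus one sycl::half, i.e.
+    // 128 + 64 + 16 + 2 = 210 bytes with QK_K == 256 -- exactly
+    // sizeof(block_q6_K), so the reordered data fits the original allocation.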
+    auto * ql_ptr = data_device;
+    auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
+    auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
+    sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
+
+    stream
+        ->parallel_for(nblocks,
+                       [=](auto i) {
+                           const block_q6_K * x = (const block_q6_K *) tmp_buf;
+                           const auto blocks_per_col = cols / QK_K;
+                           // Map the flat index to (row, block-within-row). Dividing by
+                           // blocks_per_col (not rows) keeps this a bijection over all
+                           // blocks when rows != cols / QK_K.
+                           auto row = i / blocks_per_col;
+                           auto col = i % blocks_per_col;
+                           auto block_offset = row * blocks_per_col + col;
+
+                           const uint8_t * ql = x[block_offset].ql;
+                           const uint8_t * qh = x[block_offset].qh;
+                           uint8_t * base_ql_ptr = ql_ptr + row * ((QK_K / 2) * blocks_per_col) + (QK_K / 2) * col;
+                           uint8_t * base_qh_ptr = qh_ptr + row * ((QK_K / 4) * blocks_per_col) + (QK_K / 4) * col;
+                           auto * base_scales_ptr = scales_ptr + row * ((QK_K / 16) * blocks_per_col) + (QK_K / 16) * col;
+
+                           uint8_t ql_reordered[QK_K / 2];
+                           uint8_t qh_reordered[QK_K / 4];
+                           int8_t temp[QK_K];
+
+                           // zero out these intermediate reordered buffers
+                           for (int j = 0; j < QK_K / 2; j++) {
+                               ql_reordered[j] = 0;
+                           }
+
+                           for (int j = 0; j < QK_K / 4; j++) {
+                               qh_reordered[j] = 0;
+                           }
+
+                           // first collate and pack ql and qh belonging to the same quant together
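+                           // Each block_q6_K stores 256 6-bit quants: ql holds the low
+                           // 4 bits (two quants per byte) and qh the high 2 bits (four
+                           // quants per byte). For l = 0 in the first 128-quant chunk,
+                           // for example, q1 = (ql[0] & 0xF) | (((qh[0] >> 0) & 3) << 4)
+                           // reassembles quant 0 as a value in [0, 63].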
+                           int chunk_offset = 0;
+                           for (int n = 0; n < QK_K; n += 128) {
+                               for (int l = 0; l < 32; ++l) {
+                                   const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4));
+                                   const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4));
+                                   const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4));
+                                   const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4));
+                                   temp[chunk_offset + l + 0] = q1;
+                                   temp[chunk_offset + l + 32] = q2;
+                                   temp[chunk_offset + l + 64] = q3;
+                                   temp[chunk_offset + l + 96] = q4;
+                               }
+                               chunk_offset += 128;
+                               ql += 64;
+                               qh += 32;
+                           }
+
+                           // Now separate them again
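+                           // temp[j] now holds quant j in collated order; repack its low
+                           // nibble into ql_reordered (two quants per byte) and its high
+                           // two bits into qh_reordered (four quants per byte), so a
+                           // consumer can walk the quants of a block sequentially.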
+                           for (int j = 0; j < QK_K; j++) {
+                               int8_t low_bits = temp[j] & 0x0F;
+                               ql_reordered[j / 2] = ql_reordered[j / 2] | (low_bits << (4 * (j % 2)));
+                           }
+
+                           for (int j = 0; j < QK_K; j++) {
+                               int8_t high_bits = temp[j] >> 4;
+                               qh_reordered[j / 4] = qh_reordered[j / 4] | (high_bits << (2 * (j % 4)));
+                           }
+
+                           for (int j = 0; j < QK_K / 2; j++) {
+                               base_ql_ptr[j] = ql_reordered[j];
+                           }
+
+                           for (int j = 0; j < QK_K / 4; j++) {
+                               base_qh_ptr[j] = qh_reordered[j];
+                           }
+
+                           for (int j = 0; j < QK_K / 16; ++j) {
+                               base_scales_ptr[j] = x[block_offset].scales[j];
+                           }
+
+                           dm_ptr[block_offset] = x[block_offset].d;
+
+                       })
+        .wait_and_throw();
+    sycl::free(tmp_buf, *stream);
+}
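+
+// Illustrative sketch only (the inverse of the packing above, not a kernel in
+// this patch): reading quant j of block b back out of the contiguous layout:
+//     const uint8_t lo = (ql_ptr[(QK_K / 2) * b + j / 2] >> (4 * (j % 2))) & 0xF;
+//     const uint8_t hi = (qh_ptr[(QK_K / 4) * b + j / 4] >> (2 * (j % 4))) & 3;
+//     const int8_t  q  = (int8_t) (lo | (hi << 4));
+//     const float   v  = (float) dm_ptr[b] * (int8_t) scales_ptr[(QK_K / 16) * b + j / 16] * (q - 32);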
+
 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
     uint8_t * data_device = (uint8_t *) src0->data;
     size_t ncols = src0->ne[0];
@@ -3145,7 +3234,12 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
             reorder_qw_q4_k(data_device, size, 0, stream);
             break;
         case GGML_TYPE_Q6_K:
-            reorder_qw_q6_k(data_device, size, 0, stream);
+            GGML_SYCL_DEBUG("[SYCL] g_ggml_sycl_use_intel_builtins: %d\n", g_ggml_sycl_use_intel_builtins);
+            if (g_ggml_sycl_use_intel_builtins) {
+                reorder_qw_q6_k_contiguous(data_device, nrows, ncols, 0, stream);
+            } else {
+                reorder_qw_q6_k(data_device, size, 0, stream);
+            }
             break;
         default:
             GGML_ABORT("reorder_qw() called with unsupported type");