 #ifndef GGML_SYCL_COMMON_HPP
 #define GGML_SYCL_COMMON_HPP

+#include <array>
+#include <cstddef>
 #include <fstream>
 #include <iostream>
+#include <sstream>
+#include <string>
+#include <string_view>

 #include "dpct/helper.hpp"
 #include "ggml-sycl.h"
@@ -44,11 +46,20 @@ extern int g_ggml_sycl_debug;
 extern int g_ggml_sycl_disable_optimize;
 extern int g_ggml_sycl_prioritize_dmmv;

-#define GGML_SYCL_DEBUG(...)              \
-    do {                                  \
-        if (g_ggml_sycl_debug)            \
-            fprintf(stderr, __VA_ARGS__); \
-    } while (0)
+#if defined(__clang__) && __has_builtin(__builtin_expect)
+// Hint the optimizer which way a branch is expected to go,
+// so it can schedule the likely path first
+#    define LIKELY(expr)   __builtin_expect(expr, true)
+#    define UNLIKELY(expr) __builtin_expect(expr, false)
+#else
+#    define LIKELY(expr)   (expr)
+#    define UNLIKELY(expr) (expr)
+#endif
+
+#define GGML_SYCL_DEBUG(...)                  \
+    do {                                      \
+        if (UNLIKELY(g_ggml_sycl_debug))      \
+            fprintf(stderr, __VA_ARGS__);     \
+    } while (0)

 #define CHECK_TRY_ERROR(expr) \
     [&]() {                   \
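
For illustration only (not part of the patch): with this header included, a run with logging disabled pays a single well-predicted branch per call site, because the UNLIKELY hint moves the fprintf path out of the expected flow. The caller below is a made-up example.

    // Hypothetical call site; GGML_SYCL_DEBUG and g_ggml_sycl_debug come from the header above.
    static void launch_example(int n_blocks) {
        // Expands to: do { if (UNLIKELY(g_ggml_sycl_debug)) fprintf(stderr, ...); } while (0)
        GGML_SYCL_DEBUG("[SYCL] launching %d blocks\n", n_blocks);
    }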
@@ -280,7 +291,22 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> str
 inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
     optimize_feature opt;

-    opt.reorder = true;
+    opt.reorder =
+        (arch == syclex::architecture::intel_gpu_dg1     ||
+         arch == syclex::architecture::intel_gpu_acm_g10 ||
+         arch == syclex::architecture::intel_gpu_acm_g11 ||
+         arch == syclex::architecture::intel_gpu_acm_g12 ||
+         arch == syclex::architecture::intel_gpu_pvc     ||
+         arch == syclex::architecture::intel_gpu_pvc_vg  ||
+         arch == syclex::architecture::intel_gpu_mtl_u   ||
+         arch == syclex::architecture::intel_gpu_mtl_s   ||
+         arch == syclex::architecture::intel_gpu_mtl_h   ||
+         arch == syclex::architecture::intel_gpu_arl_u   ||
+         arch == syclex::architecture::intel_gpu_arl_s   ||
+         arch == syclex::architecture::intel_gpu_arl_h   ||
+         arch == syclex::architecture::intel_gpu_bmg_g21 ||
+         arch == syclex::architecture::intel_gpu_lnl_m);

     return opt;
 }
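
A hedged sketch of how this helper is consumed; the real call site sits in the device-info setup elsewhere in ggml-sycl, not in this diff, and `dev` here is a hypothetical sycl::device:

    // Illustrative only: query the architecture via the
    // sycl_ext_oneapi_device_architecture extension and cache the flags.
    syclex::architecture arch = dev.get_info<syclex::info::device::architecture>();
    optimize_feature     opt  = check_gpu_optimize_feature(arch);
    if (opt.reorder) {
        // take the reorder fast path on the architectures listed above
    }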
@@ -456,6 +482,19 @@ static __dpct_inline__ float warp_reduce_max(float x,
     return x;
 }

+/* Helper for computing the linear offset of a ggml_tensor element given
+   per-dimension strides and indices */
+template <int N>
+__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
+    size_t offset = 0;
+#pragma unroll
+    for (int i = 0; i < N; i++) {
+        auto index_i = indices[i];
+        offset += strides[i] * index_i;
+    }
+    return offset;
+}
+
 // Helper for vec loading aligned data
 template <typename Tp, int n>
 inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
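
A small usage sketch for calculate_offset (illustrative values, assuming N == GGML_MAX_DIMS == 4): the offset is the dot product of strides and indices, with the loop unrolled at compile time.

    // Illustrative only: offset of element (3, 1, 0, 0) given made-up strides.
    std::array<int, 4> strides = {4, 64, 4096, 262144};  // e.g. tensor->nb narrowed to int
    std::array<int, 4> indices = {3, 1, 0, 0};
    size_t offset = calculate_offset<4>(strides, indices);  // 3*4 + 1*64 = 76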
@@ -475,4 +514,76 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
 }

 bool gpu_has_xmx(sycl::device &dev);
+
+template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
+    if (LIKELY(!g_ggml_sycl_debug)) {
+        return;
+    }
+    std::stringstream ss;
+    ss << prefix << "=[";
+    // signed loop index so that N == 0 cannot underflow the bound
+    for (int i = 0; i < N - 1; ++i) {
+        ss << array[i] << ", ";
+    }
+    if constexpr (N > 0) {
+        ss << array[N - 1];
+    }
+    ss << "]";
+    GGML_SYCL_DEBUG("%s", ss.str().c_str());
+}
+
+inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
+                               const std::string & suffix = "") {
+    if (LIKELY(!g_ggml_sycl_debug)) {
+        return;
+    }
+    GGML_SYCL_DEBUG("%s=", prefix.c_str());
+    if (tensor) {
+        GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
+        debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
+        debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
+        if (!ggml_is_contiguous(tensor)) {
+            GGML_SYCL_DEBUG(";strided");
+        }
+        if (ggml_is_permuted(tensor)) {
+            GGML_SYCL_DEBUG(";permuted");
+        }
+    } else {
+        GGML_SYCL_DEBUG("nullptr");
+    }
+    GGML_SYCL_DEBUG("%s", suffix.c_str());
+}
+
+// Use scope_op_debug_print to log operations coming from running a model
+struct scope_op_debug_print {
+    // Use string_views to avoid the cost of creating and concatenating strings.
+    // The string_views must stay alive for as long as the object is alive;
+    // in practice scope_op_debug_print is constructed from string literals,
+    // which live in static storage and therefore remain accessible.
+    scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
+                         std::size_t num_src, const std::string_view & suffix = "") :
+        func(func),
+        func_suffix(func_suffix) {
+        if (LIKELY(!g_ggml_sycl_debug)) {
+            return;
+        }
+        GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
+        debug_print_tensor("dst", dst);
+        if (dst) {
+            for (std::size_t i = 0; i < num_src; ++i) {
+                debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
+            }
+        }
+        GGML_SYCL_DEBUG("%s\n", suffix.data());
+    }
+
+    scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
+                         const std::string_view & suffix = "") :
+        scope_op_debug_print(func, "", dst, num_src, suffix) {}
+
+    ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }
+
+  private:
+    std::string_view func;
+    std::string_view func_suffix;
+};
+
 #endif // GGML_SYCL_COMMON_HPP
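
For illustration (not part of the patch), an op implementation would create the scope logger at the top of its body: the constructor prints the call header plus the dst tensor and its sources via debug_print_tensor, and the destructor prints the matching "done" line when the scope unwinds, even on early return.

    // Illustrative only: ggml_sycl_op_example is a made-up op whose signature
    // mirrors the usual ggml-sycl op handlers.
    static void ggml_sycl_op_example(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        scope_op_debug_print scope(__func__, dst, /*num_src=*/1);
        // ... enqueue kernels; "call ... done" is logged when `scope` is destroyed ...
    }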