@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {
 
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
+        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
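Note on the default flip above: with `get_sycl_env("GGML_SYCL_DISABLE_OPT", 0)` the reorder optimization is now enabled by default and is only turned off when the user exports `GGML_SYCL_DISABLE_OPT=1`. The body of `get_sycl_env` is not part of this diff, so the helper below is only an assumed sketch of the usual "unset variable falls back to the default" behaviour:

```cpp
// Hypothetical sketch of an integer env-var reader with the same shape as
// get_sycl_env(name, default) used above; the real helper may differ.
#include <cstdlib>
#include <string>

static int get_int_env_sketch(const char * name, int default_value) {
    const char * value = std::getenv(name);        // nullptr when the variable is unset
    return value ? std::stoi(std::string(value))   // user-provided override
                 : default_value;                  // compiled-in default (now 0 => optimization on)
}
```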
@@ -2852,6 +2852,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
+                size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t *)data_device + offset_blks * QK4_0 / 2;
+    auto d_ptr = (sycl::half *)(qs_ptr + ncols * nrows / 2) + offset_blks;
+
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0 * x = (const block_q4_0 *)tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0 / 2; j++) {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
+    char * data_device = (char *)src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+/*
+* This function can be called when the OP (mul_mat) function supports the reorder optimization.
+*/
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+    ggml_tensor * dst) {
+    if (!g_ggml_sycl_disable_optimize &&  // optimization is allowed, controlled by $GGML_SYCL_DISABLE_OPT
+        ctx->opt_feature.reorder &&       // the device benefits from reorder; skip devices where it hurts perf
+        dst->op == GGML_OP_MUL_MAT &&     // limited to the supported Q4_0 cases for now; more cases to be added
+        src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2] == 1 && src1->ne[3] == 1) {
+
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)src0->extra;
+        if (!extra) return; // only happens in the CI/UT permute case.
+
+        if (extra->optimized_feature.reorder) return; // the tensor has already been reordered.
+
+        reorder_qw(src0, ctx->stream());
+        extra->optimized_feature.reorder = true; // used by the decode/dequantize steps that follow.
+    }
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
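For readers of the new `reorder_qw` kernel above: it rewrites a Q4_0 tensor in place from the array-of-structs layout (each `block_q4_0` holds a half-precision scale `d` plus `QK4_0 / 2 = 16` bytes of packed 4-bit weights) into a struct-of-arrays layout, with all `qs` bytes packed at the front (`ncols * nrows / 2` bytes) followed by all scales. A minimal host-side sketch of the same transformation, using a stand-in block definition rather than the real ggml/SYCL types:

```cpp
// Host-side reference of the layout change performed by reorder_qw, assuming
// ggml's Q4_0 block shape (QK4_0 == 32, one 2-byte scale + 16 packed nibbles).
// The GPU kernel does this in place on data_device via a staging copy.
#include <cstdint>
#include <cstring>
#include <cstddef>

constexpr int QK4_0_SKETCH = 32;
struct block_q4_0_sketch {              // mirrors block_q4_0 (half stored as uint16_t here)
    uint16_t d;                         // per-block scale
    uint8_t  qs[QK4_0_SKETCH / 2];      // 32 4-bit weights packed into 16 bytes
};

// in : nblocks interleaved blocks  [d0|qs0][d1|qs1]...
// out: all qs first, then all d    [qs0 qs1 ...][d0 d1 ...]
static void reorder_q4_0_host(const block_q4_0_sketch * in, uint8_t * out, size_t nblocks) {
    uint8_t  * qs_out = out;                                              // nblocks * 16 bytes
    uint16_t * d_out  = (uint16_t *)(out + nblocks * QK4_0_SKETCH / 2);   // scales follow the qs region
    for (size_t ib = 0; ib < nblocks; ++ib) {
        std::memcpy(qs_out + ib * QK4_0_SKETCH / 2, in[ib].qs, QK4_0_SKETCH / 2);
        d_out[ib] = in[ib].d;
    }
}
```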
@@ -2914,13 +2972,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder.
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
     } else if (use_mul_mat_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
+        opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder.
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
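Once `extra->optimized_feature.reorder` is set, the kernels invoked from the two branches above must address the weights in the split layout rather than through `block_q4_0`. Those dequantize/mul_mat kernels are outside this diff, so the following is only an illustrative sketch of the addressing, mirroring the `qs_ptr`/`d_ptr` arithmetic in `reorder_qw`:

```cpp
// Hypothetical helper showing how a consumer would locate block ib in the
// reordered Q4_0 buffer; not the actual kernel code used by the backend.
#include <cstdint>
#include <cstddef>

struct q4_0_view_sketch {
    const uint8_t  * qs;   // the block's 16 packed nibble bytes
    const uint16_t * d;    // the block's half-precision scale (raw 16-bit storage)
};

static q4_0_view_sketch get_reordered_block(const uint8_t * base, size_t ncols, size_t nrows, size_t ib) {
    const size_t qk_half = 32 / 2;                                             // QK4_0 / 2
    const uint8_t  * qs_base = base;                                           // qs region: ncols * nrows / 2 bytes
    const uint16_t * d_base  = (const uint16_t *)(base + ncols * nrows / 2);   // scales follow the qs region
    return { qs_base + ib * qk_half, d_base + ib };
}
```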
@@ -3545,71 +3605,8 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
-    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
-    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
-    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t *)data_device + offset_blks * QK4_0 / 2;;
-    auto d_ptr = (sycl::half *)(qs_ptr + ncols * nrows / 2) + offset_blks;
-
-    stream->parallel_for(
-        size / sizeof(block_q4_0),
-            [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-            const block_q4_0 * x = (const block_q4_0 *)tmp_buf;
-            const int ib = i;
-
-            for (int j = 0; j < QK4_0 / 2; j++)
-            {
-                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
-            }
-            *(d_ptr + ib) = x[ib].d;
-        });
-
-    sycl::free(tmp_buf, *stream);
-}
-
-static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
-    char * data_device = (char *)src0->data;
-    size_t ncols = src0->ne[0];
-    size_t nrows = src0->ne[1];
-    size_t size = ggml_nbytes(src0);
-
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
-}
-
-static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
-    ggml_tensor * src0 = dst->src[0];
-    ggml_tensor * src1 = dst->src[1];
-
-    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2] == 1 && src1->ne[3] == 1) {
-        reorder_qw(src0, stream);
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)src0->extra;
-        GGML_ASSERT(extra);
-        extra->optimized_feature.reorder = true; // used to decode/dequan in next steps.
-    }
-}
-
-static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
-    dpct::queue_ptr stream = ctx->stream();
-    if (ctx->optimized_graph) {
-        return;
-    }
-    ctx->optimized_graph = true;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
-    }
-}
-
 static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
     ggml_sycl_set_main_device(sycl_ctx->device);
-    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];