using strided oneDNN API

lslusarczyk · lslusarczyk · commit e711ab9fa11f · 2025-05-02T18:15:16.000+02:00
diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp
@@ -44,13 +44,17 @@ class DnnlGemmWrapper {
     static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
         const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a,
         const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b,
-        void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches) {
+        void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) {
 
         auto stream = ctx.stream_dnnl(q);
         auto eng = ctx.engine_dnnl(q);
-        dnnl::memory::dims a_dims = { batches, m, k };
-        dnnl::memory::dims b_dims = { batches, k, n };
-        dnnl::memory::dims c_dims = { batches, m, n };
+
+        // { # batches, # rows, # columns }
+        dnnl::memory::dims a_dims = { batches_a, m, k };
+        dnnl::memory::dims b_dims = { batches_b, k, n };
+        dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n };
+
+        // { # elements to skip to next batch, # elements to skip to next row, # elements to skip to next column }
         dnnl::memory::dims a_strides = { stride_a, nra, nca };
         dnnl::memory::dims b_strides = { stride_b, nrb, ncb };
 
@@ -85,7 +89,7 @@ class DnnlGemmWrapper {
     static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
         const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
 
-        gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1);
+        gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
     }
 };
 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2770,22 +2770,42 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
     // broadcast factors
     const auto r2 = ne12/ne02;
     const auto r3 = ne13/ne03;
-    const auto ne23 = ne12*ne13;
 
-    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
-        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
 #if GGML_SYCL_DNNL
-        DnnlGemmWrapper::gemm(ctx, ne11, ne01, ne10,
+    if (r2 == 1 && r3 == 1) {
+        DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
             src1_f16, DnnlGemmWrapper::to_dt<sycl::half>(), nb11/nb10, 1, nb12/nb10,
             src0_as_f16, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
-            dst_t, DnnlGemmWrapper::to_dt<float>(), main_stream, ne23);
+            dst_t, DnnlGemmWrapper::to_dt<float>(), main_stream, ne12*ne13, ne02 * ne03);
+    } else {
+        // nb1X_scaled: src1 byte strides expressed as if its element type were sycl::half (src1 is either already f16, or 4-byte elements whose stride is halved)
+        const auto nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
+        const auto nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
+
+        // iterate over the batches of the smaller set of matrices (src0)
+        for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
+            for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
+                const sycl::half* src0_f16_shifted = src0_as_f16 + ((ie02*nb02 + ie03*nb03)/2); // div2 cuz nb is in bytes and pointer is in f16 (2 bytes)
+                const sycl::half* src1_f16_shifted = src1_f16 + ((ie02*nb12_scaled*r2 + ie03*nb13_scaled*r3)/2);
+                float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/4); // div4 cuz nb is in bytes and pointer is float (4 bytes)
+
+                DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
+                src1_f16_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), nb11/nb10, 1, nb12/nb10,
+                src0_f16_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
+                dst_shifted, DnnlGemmWrapper::to_dt<float>(), main_stream, r2 * r3, 1);
+            }
+        }
+    }
 #else
+    const auto ne23 = ne12*ne13;
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
             *main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const char *) src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
             (const char *) src1_f16, dpct::library_data_t::real_half, nb11 / nb10, nb12 / nb10, beta, (char *) dst_t,
             cu_data_type, ne01, nb2 / nb0, ne23, cu_compute_type)));
-#endif
     } else {
 
         ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
@@ -2824,6 +2844,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
             (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
             (void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
     }
+#endif
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__