applied Romain's comments

lslusarczyk · lslusarczyk · commit d262ec689ad0 · 2025-05-15T14:37:05.000+02:00
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2824,21 +2824,20 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
                 dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
             }
             else {
-
                 for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
-                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/2); // div2 cuz nb is in bytes and pointer is in f16 (2 bytes)
+                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
                     const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
-                    float* dst_shifted = dst_ddf + ((ie03*nb3)/4); // div4 cuz nb is in bytes and pointer is float (4 bytes)
+                    float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
                     dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
                 }
             }
         } else {
             // iterate over batches from smaller set of matrices (matrix 0)
             for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
                 for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
-                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/2); // div2 cuz nb is in bytes and pointer is in f16 (2 bytes)
+                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
                     const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
-                    float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/4); // div4 cuz nb is in bytes and pointer is float (4 bytes)
+                    float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
                     dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
                 }
             }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -3919,7 +3919,7 @@ static const ggml_type other_types[] = {
 // Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
 static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     std::vector<std::unique_ptr<test_case>> test_cases;
-    [[maybe_unused]] std::default_random_engine rng(0);
+    std::default_random_engine rng(0);
 
     // unary ops
     for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
@@ -4242,8 +4242,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
 
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  16, 256, {2, 1}, {1, 1}, {0, 2, 1, 3}));
-
             // test cases with large ne00/ne10 to cover stream-k fixup
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 1024, {3, 2}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 1024, {3, 2}, {1, 1}));

Original file line number	Diff line number	Diff line change
`@@ -2824,21 +2824,20 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons`
`2824`	`2824`	`dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);`
`2825`	`2825`	`}`
`2826`	`2826`	`else {`
`2827`		`-`
`2828`	`2827`	`for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {`
`2829`		`- const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/2); // div2 cuz nb is in bytes and pointer is in f16 (2 bytes)`
	`2828`	`+ const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes`
`2830`	`2829`	`const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;`
`2831`		`- float* dst_shifted = dst_ddf + ((ie03*nb3)/4); // div4 cuz nb is in bytes and pointer is float (4 bytes)`
	`2830`	`+ float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));`
`2832`	`2831`	`dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);`
`2833`	`2832`	`}`
`2834`	`2833`	`}`
`2835`	`2834`	`} else {`
`2836`	`2835`	`// iterate over batches from smaller set of matrices (matrix 0)`
`2837`	`2836`	`for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {`
`2838`	`2837`	`for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {`
`2839`		`- const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/2); // div2 cuz nb is in bytes and pointer is in f16 (2 bytes)`
	`2838`	`+ const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));`
`2840`	`2839`	`const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;`
`2841`		`- float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/4); // div4 cuz nb is in bytes and pointer is float (4 bytes)`
	`2840`	`+ float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));`
`2842`	`2841`	`dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);`
`2843`	`2842`	`}`
`2844`	`2843`	`}`