diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index b84bfff755abb..2fced6abeffd0 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -849,14 +849,96 @@ struct get_device_info_impl< matrix_type::sint32, matrix_type::sint32}, {8, 0, 0, 0, 16, 16, matrix_type::fp16, matrix_type::fp16, matrix_type::fp32, matrix_type::fp32}, + {8, 0, 0, 0, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {8, 0, 0, 0, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {8, 0, 0, 0, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 1, 64, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 1, 64, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {0, 0, 0, 1, 64, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 1, 64, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 32, 64, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 64, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {0, 0, 0, 32, 64, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::bf16}, + {0, 0, 0, 32, 64, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 1, 64, 32, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 1, 64, 32, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {0, 0, 0, 1, 64, 32, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 1, 64, 32, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 32, 64, 32, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 64, 32, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {0, 0, 0, 32, 64, 32, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 32, 64, 32, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {8, 0, 0, 0, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::bf16}, + {8, 0, 0, 0, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::bf16}, + {8, 0, 0, 0, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::fp32}, {8, 0, 0, 0, 16, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::bf16}, + {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::bf16}, {0, 0, 0, 1, 64, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 1, 64, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::fp32}, + {0, 0, 0, 1, 64, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::bf16}, + {0, 0, 0, 1, 64, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::bf16}, {0, 0, 0, 32, 64, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 64, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::fp32}, + {0, 0, 0, 32, 64, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::bf16}, + {0, 0, 0, 32, 64, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::bf16}, + {0, 0, 0, 1, 64, 32, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 1, 64, 32, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::fp32}, + {0, 0, 0, 1, 64, 32, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::bf16}, + {0, 0, 0, 1, 64, 32, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::bf16}, + {0, 0, 0, 32, 64, 32, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 64, 32, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::fp32}, + {0, 0, 0, 32, 64, 32, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::bf16}, + {0, 0, 0, 32, 64, 32, matrix_type::bf16, matrix_type::bf16, + matrix_type::bf16, matrix_type::bf16}, {8, 0, 0, 0, 16, 8, matrix_type::tf32, matrix_type::tf32, matrix_type::fp32, matrix_type::fp32}, }; diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp index 2b134f8144667..8d2e460116353 100644 --- a/sycl/test-e2e/Matrix/common.hpp +++ b/sycl/test-e2e/Matrix/common.hpp @@ -63,15 +63,17 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K, if constexpr (std::is_same_v && std::is_same_v) acc += make_fp32(va[i]) * make_fp32(vb[i]); + else if constexpr (std::is_same_v && + std::is_same_v) + acc += (float)va[i] * (float)vb[i]; else if constexpr (std::is_same_v && std::is_same_v || std::is_integral_v && std::is_integral_v || + (std::is_same_v || + std::is_same_v) || (std::is_same_v && std::is_same_v)) acc += va[i] * vb[i]; - else if constexpr (std::is_same_v && - std::is_same_v) - acc += (float)va[i] * (float)vb[i]; else assert(false && "Unsupported type in matrix_multiply_ref."); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp index fc1fc1963e047..ede4e795d0d69 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp @@ -14,16 +14,16 @@ void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((T1 *)C.get_data(), range<2>(M, N)); queue q; size_t sg_size = get_sg_size>(q); q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); + accessor accA{bufA, cgh}; + accessor accB{bufB, cgh}; + accessor accC{bufC, cgh}; cgh.parallel_for>( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), @@ -41,13 +41,11 @@ void matrix_multiply(big_matrix &C, big_matrix &A, const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; + joint_matrix sub_a; // For B, we assume B has been already VNNIed. - joint_matrix + joint_matrix sub_b; - joint_matrix sub_c; + joint_matrix sub_c; joint_matrix_load( sg, sub_c, @@ -122,13 +120,21 @@ int main() { if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc test(); + // test(); // This combination is not currently supported for sub group size = 32 in // IGC #if (!defined(SG_SZ) || SG_SZ != 32) test(); + // test(); test(); + // test(); test(); + // test(); + // test(); + // test(); + // test(); + // test(); #endif break; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp index 0ff483b051067..69ee6d4da5464 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp @@ -18,13 +18,13 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; - size_t sg_size = get_sg_size>(q); + size_t sg_size = get_sg_size>(q); q.submit([&](handler &cgh) { accessor accA{bufA, cgh}; accessor accB{bufB, cgh}; accessor accC{bufC, cgh}; - cgh.parallel_for>( + cgh.parallel_for>( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}), [=](nd_item<2> spmd_item) #ifdef SG_SZ @@ -122,6 +122,22 @@ int main() { if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc test(); + // test(); + + // This combination is not currently supported for sub group size = 32 in + // IGC +#if (!defined(SG_SZ) || SG_SZ != 32) + // test(); + // test(); + // test(); + // test(); + // test(); + // test(); + // test(); + // test(); + // test(); + // test(); +#endif break; }