diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc index a7db3f3d55f10..598f338af33ed 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc @@ -113,7 +113,7 @@ an application can use to indicate that the matrix data is loaded or stored in VNNI "packed" format. ```c++ -namespace sycl::ext::oneapi::experimental::matrix::layout { +namespace sycl::ext::oneapi::experimental::matrix { enum class layout { ext_intel_packed diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index fd3ae8527815a..0cd5370493db4 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -1017,52 +1017,103 @@ is shown in a single column in the table below. ==== Intel XMX Supported Combinations This is currently available in devices with the architecture -`architecture::intel_gpu_pvc`, `architecture::intel_gpu_dg2_g10`, -`architecture::intel_gpu_dg2_g11`, and -`architecture::intel_gpu_dg2_g12`. -In these architectures' -implementation, the type of the C matrix must be the same as the type -of the D matrix. Therefore, that common type is shown in a single -column in the table below. +`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_dg2_g10`, +`architecture::intel_gpu_dg2_g11`, `architecture::intel_gpu_dg2_g12`, +and `architecture::intel_gpu_arl_h`. 
[frame="none",options="header"] |====================== -| A type | B type | C and D type | M | N | K | device +| A type | B type | C type | D type | M | N | K | device .2+| `matrix_type::uint8` .2+| `matrix_type::uint8` .2+| -`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 -|`architecture::intel_gpu_pvc` +`matrix_type::sint32` .2+| `matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 +|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` |8|`architecture::intel_gpu_dg2_g10, -architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`, +`architecture::intel_gpu_arl_h` .2+| `matrix_type::uint8` .2+| `matrix_type::sint8` .2+| -`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | -`architecture::intel_gpu_pvc` +`matrix_type::sint32` .2+|`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | +`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` |8|`architecture::intel_gpu_dg2_g10, -architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`, +`architecture::intel_gpu_arl_h` .2+| `matrix_type::sint8` .2+| `matrix_type::uint8` .2+| -`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | -`architecture::intel_gpu_pvc` +`matrix_type::sint32` .2+|`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | +`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` |8|`architecture::intel_gpu_dg2_g10, -architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`, +`architecture::intel_gpu_arl_h` .2+| `matrix_type::sint8` .2+| `matrix_type::sint8` .2+| -`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | -`architecture::intel_gpu_pvc` +`matrix_type::sint32` .2+| `matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | +`architecture::intel_gpu_pvc`, 
`architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` |8|`architecture::intel_gpu_dg2_g10, -architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -.2+|`matrix_type::fp16` .2+| `matrix_type::fp16` .2+| -`matrix_type::fp32` .2+| +<=+ 8 | 16 .2+| 16 | -`architecture::intel_gpu_pvc` -|8| `architecture::intel_gpu_dg2_g10, -architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -.6+| `matrix_type::bf16` .6+| `matrix_type::bf16` .6+| -`matrix_type::fp32` | 16 | 16 | 16 .4+|`architecture::intel_gpu_pvc` -| 1 | 64 | 16 | 32 | 64 | 16 +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`, +`architecture::intel_gpu_arl_h` +.8+|`matrix_type::fp16` .8+| `matrix_type::fp16` .8+| +`matrix_type::fp32` .8+|`matrix_type::fp32` .1+| 16 .1+| 16 | 16 +.6+|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` +.2+| 1 .2+| 64 | 16 |32 +.2+| 32 .2+| 64 | 16 |32 +.2+| +<=+ 8 | 16 .2+| 16 +|8 .2+| `architecture::intel_gpu_dg2_g10, +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`, +`architecture::intel_gpu_arl_h` +.1+| 32 .1+| 32 .1+| 16 +.6+|`matrix_type::fp16` .6+| `matrix_type::fp16` .6+| +`matrix_type::fp16` .6+|`matrix_type::fp32` .1+| +<=+ 8 | 16 .1+| 16 +.6+| `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` +| 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32 +.2+| 32 .2+| 64 | 16 | 32 +.6+|`matrix_type::fp16` .6+| `matrix_type::fp16` .6+| +`matrix_type::fp32` .6+|`matrix_type::fp16` .1+| +<=+ 8 | 16 .1+| 16 +.6+|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` +| 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32 +.2+| 32 .2+| 64 |16 | 32 +.6+|`matrix_type::fp16` .6+| `matrix_type::fp16` .6+| +`matrix_type::fp16` .6+|`matrix_type::fp16` .1+| +<=+ 8 | 16 .1+| 16 +.6+|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` +| 16 | 16 | 16 .2+| 1 .2+| 64 | 16 
|32 .2+| 32 .2+| 64 | 16 | 32 +.8+| `matrix_type::bf16` .8+| `matrix_type::bf16` .8+| +`matrix_type::fp32` .8+| `matrix_type::fp32` | 16 | 16 | 16 +.6+|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` +.2+| 1 .2+| 64 | 16 | 32 +.2+| 32 .2+| 64 | 16 |32 .2+| +<=+ 8 | 16 .2+| 16 |8 .2+| `architecture::intel_gpu_dg2_g10, -architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`, +`architecture::intel_gpu_arl_h` .1+| 32 .1+| 32 .1+| 16 +.6+|`matrix_type::bf16` .6+| `matrix_type::bf16` .6+| +`matrix_type::bf16` .6+|`matrix_type::fp32` .1+| +<=+ 8 | 16 .1+| 16 .6+| +`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` +| 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32 +.2+| 32 .2+| 64 |16 | 32 +.6+|`matrix_type::bf16` .6+| `matrix_type::bf16` .6+| +`matrix_type::fp32` .6+|`matrix_type::bf16` .1+| +<=+ 8 | 16 .1+| 16 .6+| +`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` +| 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32 +.2+| 32 .2+| 64 |16 | 32 +.6+|`matrix_type::bf16` .6+| `matrix_type::bf16` .6+| +`matrix_type::bf16` .6+|`matrix_type::bf16` .1+| +<=+ 8 | 16 .1+| 16 .6+| +`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` +| 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32 +.2+| 32 .2+| 64 |16 | 32 | `matrix_type::tf32` | `matrix_type::tf32` | -`matrix_type::fp32` | +<=+ 8 | 16 | 8 | -`architecture::intel_gpu_pvc` +`matrix_type::fp32` | `matrix_type::fp32` | +<=+ 8 | 16 | 8 | +`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`, +`architecture::intel_gpu_lnl_m` |====================== ==== Nvidia Tensor Cores Supported Combinations