SYCL Joint Matrix bfloat16 B row major load SG32

YuriPlyakhin · igcbot · commit 9239d02cd673 · 2025-01-17T16:56:30.000+01:00
Enables the SYCL Joint Matrix bfloat16 B row-major load
for the 16x16 shape with sub-group size 32.
diff --git a/IGC/BiFModule/Languages/OpenCL/PreRelease/IBiF_matrix.cl b/IGC/BiFModule/Languages/OpenCL/PreRelease/IBiF_matrix.cl
@@ -421,7 +421,7 @@ Each subgroup stores 16 of 8x16 slices. Hence, row_stride = R / 4 = 32 / 4 = 8 a
 
 #define DEFINE_BLOCK2D_RW_NAME(rw, tx, contrib_bitwidth, WI_rows, tile_height, tile_width) __builtin_IB_subgroup_block_##rw##_flat_cacheopts##tx##_u##contrib_bitwidth##_wi##WI_rows##_m##tile_height##k##tile_width##v1
 #define DEFINE_BLOCK2D_TRANSPOSE_NAME(contrib_bitwidth, WI_rows, tile_height, tile_width) __builtin_IB_subgroup_block_read_flat_cacheopts_transpose_u##contrib_bitwidth##_wi##WI_rows##_m##tile_height##_k##tile_width
-#define DEFINE_BLOCK2D_VNNI_NAME(contrib_bitwidth, tile_height) __builtin_IB_subgroup_block_read_flat_cacheopts_transform_u##contrib_bitwidth##_k##tile_height // tile_width = sub group size (16)
+#define DEFINE_BLOCK2D_VNNI_NAME(contrib_bitwidth, WI_rows, tile_height, tile_width) __builtin_IB_subgroup_block_read_flat_cacheopts_transform_u##contrib_bitwidth##_wi##WI_rows##_##k##tile_height##n##tile_width
 
 /* For platforms without SG16 JointMatrix support block2d is not available. The
  * implementation remains empty, will fallthrough to vector implementation. */
@@ -539,8 +539,8 @@ Each subgroup stores 16 of 8x16 slices. Hence, row_stride = R / 4 = 32 / 4 = 8 a
     int height = orig_M - 1; /* row count */ \
     long x = (offset - baseoffset) / (sizeof (element_type)); /* in elements */ \
     int2 coords = (int2)(x, 0); \
-    OUT_VEC##WI_rows(u##contrib_type) DEFINE_BLOCK2D_VNNI_NAME(elem_bitwidth, orig_M)(long, int, int, int, int2, int); \
-    OUT_VEC##WI_rows(u##contrib_type) res = DEFINE_BLOCK2D_VNNI_NAME(elem_bitwidth, orig_M)(baseoffset, width, height, pitch, coords, cacheOpt); \
+    OUT_VEC##WI_rows(u##contrib_type) DEFINE_BLOCK2D_VNNI_NAME(elem_bitwidth, WI_rows, orig_M, orig_K)(long, int, int, int, int2, int); \
+    OUT_VEC##WI_rows(u##contrib_type) res = DEFINE_BLOCK2D_VNNI_NAME(elem_bitwidth, WI_rows, orig_M, orig_K)(baseoffset, width, height, pitch, coords, cacheOpt); \
     *(__private OUT_VEC##WI_rows(u##contrib_type) *)dst = res; \
     return; \
   }
@@ -591,8 +591,8 @@ Each subgroup stores 16 of 8x16 slices. Hence, row_stride = R / 4 = 32 / 4 = 8 a
   int pitch = sizeof (element_type) * stride - 1; /* in bytes */ \
   int height_size = height - 1; \
   int2 coords = (int2)(x, y); \
-  OUT_VEC##WI_rows(u##contrib_type) DEFINE_BLOCK2D_VNNI_NAME(elem_bitwidth, orig_M)(long, int, int, int, int2, int); \
-  OUT_VEC##WI_rows(u##contrib_type) res = DEFINE_BLOCK2D_VNNI_NAME(elem_bitwidth, orig_M)(offset, width_size, height_size, pitch, coords, cacheOpt); \
+  OUT_VEC##WI_rows(u##contrib_type) DEFINE_BLOCK2D_VNNI_NAME(elem_bitwidth, WI_rows, orig_M, orig_K)(long, int, int, int, int2, int); \
+  OUT_VEC##WI_rows(u##contrib_type) res = DEFINE_BLOCK2D_VNNI_NAME(elem_bitwidth, WI_rows, orig_M, orig_K)(offset, width_size, height_size, pitch, coords, cacheOpt); \
   *(__private OUT_VEC##WI_rows(u##contrib_type) *)dst = res; \
   return;
 
@@ -902,6 +902,7 @@ DEFINE_LOAD_AND_CHECKED(PackedB_RowMajor,    _SG16, short, int, 16, 32, VNNI_TX,
 /* PackedB load i16 SG16 for sub group size = 32*/
 DEFINE_LOAD(PackedB_ColumnMajor, _SG16, short, int, 8, 32, COL_MAJOR, , 4)
 DEFINE_LOAD(PackedB_PackedB,     _SG16, short, int, 8, 32, ROW_MAJOR, , 4)
+DEFINE_LOAD(PackedB_RowMajor,    _SG16, short, int, 8, 32, VNNI_TX,   , 4)
 
 /* PackedB load i8 SG16*/
 DEFINE_LOAD_AND_CHECKED(PackedB_ColumnMajor, _SG16, char, int, 8, 64, COL_MAJOR, , 8)
diff --git a/IGC/Compiler/Optimizer/OpenCLPasses/LSCFuncs/LSCFuncsResolution.cpp b/IGC/Compiler/Optimizer/OpenCLPasses/LSCFuncs/LSCFuncsResolution.cpp
@@ -1032,12 +1032,20 @@ Instruction* LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst&
     else if (isVnniTransform && !isTranspose)
     {
         numBlocksV = 1;
+        tileWidth = subGrpSize;
 
         if (elemSize == 8)
         {
             bool is32Height = funcName.consume_front("_k32");
             IGC_ASSERT_MESSAGE(is32Height, "Only k32 is supported for 8 bit element size, at the moment.");
 
+            // If sub-group size is 32, we still may want to use width = 16
+            // __builtin_IB_subgroup_block_read_flat_cacheopts_transform_u8_wi8_k32n16
+            if (funcName.consume_front("n16"))
+            {
+                tileWidth = 16;
+            }
+
             // __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2
             if (funcName.consume_front("v2"))
             {
@@ -1070,15 +1078,20 @@ Instruction* LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst&
                 return nullptr;
             }
 
+            // If sub-group size is 32, we still may want to use width = 16
+            // __builtin_IB_subgroup_block_read_flat_transform_u16_k16n16
+            if (funcName.consume_front("n16"))
+            {
+                tileWidth = 16;
+            }
+
             // __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2
             // __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2
             if (funcName.consume_front("v2"))
             {
                 numBlocksV = 2;
             }
         }
-
-        tileWidth = subGrpSize;
     }
     else
     {

Original file line number	Diff line number	Diff line change
`@@ -1032,12 +1032,20 @@ Instruction* LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst&`
`1032`	`1032`	`else if (isVnniTransform && !isTranspose)`
`1033`	`1033`	`{`
`1034`	`1034`	`numBlocksV = 1;`
	`1035`	`+ tileWidth = subGrpSize;`
`1035`	`1036`
`1036`	`1037`	`if (elemSize == 8)`
`1037`	`1038`	`{`
`1038`	`1039`	`bool is32Height = funcName.consume_front("_k32");`
`1039`	`1040`	`IGC_ASSERT_MESSAGE(is32Height, "Only k32 is supported for 8 bit element size, at the moment.");`
`1040`	`1041`
	`1042`	`+ // If sub-group size is 32, we still may want to use width = 16`
	`1043`	`+ // __builtin_IB_subgroup_block_read_flat_cacheopts_transform_u8_wi8_k32n16`
	`1044`	`+ if (funcName.consume_front("n16"))`
	`1045`	`+ {`
	`1046`	`+ tileWidth = 16;`
	`1047`	`+ }`
	`1048`	`+`
`1041`	`1049`	`// __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2`
`1042`	`1050`	`if (funcName.consume_front("v2"))`
`1043`	`1051`	`{`
`@@ -1070,15 +1078,20 @@ Instruction* LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst&`
`1070`	`1078`	`return nullptr;`
`1071`	`1079`	`}`
`1072`	`1080`
	`1081`	`+ // If sub-group size is 32, we still may want to use width = 16`
	`1082`	`+ // __builtin_IB_subgroup_block_read_flat_transform_u16_k16n16`
	`1083`	`+ if (funcName.consume_front("n16"))`
	`1084`	`+ {`
	`1085`	`+ tileWidth = 16;`
	`1086`	`+ }`
	`1087`	`+`
`1073`	`1088`	`// __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2`
`1074`	`1089`	`// __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2`
`1075`	`1090`	`if (funcName.consume_front("v2"))`
`1076`	`1091`	`{`
`1077`	`1092`	`numBlocksV = 2;`
`1078`	`1093`	`}`
`1079`	`1094`	`}`
`1080`		`-`
`1081`		`- tileWidth = subGrpSize;`
`1082`	`1095`	`}`
`1083`	`1096`	`else`
`1084`	`1097`	`{`