@@ -253,6 +253,22 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
253253	    It issues an instruction to prefetch a block of data from contiguous
254254 memory regions to each level of the cache based on their cache policy.
255255
256+ Arguments:
257+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of
258+	      memory and the tensor tile to be prefetched.
259+
260+	    - `offsets`: Index values representing per-dimension offsets from the
261+	      base position encoded in `TensorDesc`. They are encoded via `offsets`
262+	      and `const_offsets`.
263+
264+	    - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes
265+	      indicating the desired behavior at the L1, L2, and L3 cache levels.
266+
267+ - `anchor_layout`: [optional] An attribute that identifies the operation
268+ as an anchor, enabling users to assign a layout that governs distribution
269+ at the subgroup and/or work-item level. Only valid at workgroup and subgroup
270+	      levels.
271+
256272 Example:
257273 ```mlir
258274 xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
@@ -326,16 +342,37 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
326342 a block of data from memory to register. It takes a set of optional cache
327343 hints for each level of cache, L1, L2 and L3. If hardware does not have a
328344	    corresponding cache, the corresponding cache hint attribute will be masked.
329- VNNI transformation is an hardware feature for Intel GPU, which is used to
330- do data packing during the load for B operand of matrix operation, if
331- the bit width of the data type is less then 32 bits, e.g., fp16. And
332- transpose is another Intel hardware feature, which will do transpose
333- operation when loading the data if the bit width of the data type is
334- fp32 or fp64. It implies that vnni and transpose cannot exit at the
335- same time. It is only available to 1D or 2D blocked tensor_desc.
345+
346+ On Intel GPUs, hardware-supported packing rearranges data elements during
347+ the load of the B operand when the element bit-width is less than 32 bits
348+ (for example, fp16). The transpose feature reorders data during the load
349+ when the element type is fp32 or fp64. These two features are mutually
350+ exclusive and shall not be enabled simultaneously. Both features support only
351+ 2D blocked tensor_desc.
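
For illustration, a hedged sketch of a packed load of a `16x16` f16 tile; with a VNNI factor of `32/16 = 2`, pairs of rows fold into the innermost dimension (SSA names are illustrative):

```mlir
// Packed load: the 16x16 f16 tile is returned as 8x16x2, with the
// innermost dimension holding the VNNI factor of 2.
%2 = xegpu.load_nd %1 {packed,
                       l1_hint = #xegpu.cache_hint<cached>}
        : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
```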
336352
337353 In SIMT mode, result vector represents the data to be loaded by each work-item.
338354
355+ Arguments:
356+
357+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory
358+ and the tensor tile to be loaded.
359+
360+ - `offsets`: Index values representing per-dimension offsets from the base position
361+ encoded in `TensorDesc`. They are encoded via `offsets` and `const_offsets`.
362+
363+ - `packed`: [optional] A unit attribute indicating that packing is applied
364+ during the load when supported by the hardware. Only valid at lane level.
365+
366+ - `transpose`: [optional] An attribute describing a hardware-supported transpose
367+	      to be applied during the load. Only valid at lane level.
368+
369+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
370+ desired behavior at the L1, L2, and L3 cache levels.
371+
372+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
373+ enabling users to assign a layout that governs distribution at the subgroup and/or
374+ work-item level. Only valid at workgroup and subgroup levels.
375+
339376 Example 1:
340377 ```mlir
341378 xegpu.load_nd %1 {transpose = [1, 0],
@@ -391,7 +428,6 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
391428 return getTensorDescType().getShape();
392429 }
393430
394-
395431 }];
396432
397433 let assemblyFormat = [{
@@ -432,6 +468,23 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
432468
433469 In SIMT mode, the input vector represents the data to be stored by each work-item.
434470
471+ Arguments:
472+
473+ - `value`: A vector value representing the tensor tile to be stored.
474+
475+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory and
476+ the tensor tile to be stored.
477+
478+ - `offsets`: Index values representing per-dimension offsets from the base position
479+ encoded in `TensorDesc`. They are encoded via `offsets` and `const_offsets`.
480+
481+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
482+ desired behavior at the L1, L2, and L3 cache levels.
483+
484+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
485+ enabling users to assign a layout that governs distribution at the subgroup and/or
486+ work-item level. Only valid at workgroup and subgroup levels.
487+
435488 Example 1:
436489 ```mlir
437490 xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
@@ -568,8 +621,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
568621 It accepts the following parameters:
569622
570623 Arguments:
624+
571625	    - `source`: a 1D memref or pointer (i64, i32, ui64, ui32) representing the flattened
572626 memory object.
627+
573628 - `offsets`: a vector containing offsets of each access point. Its size
574629	      is fixed to the hardware-supported subgroup size, e.g., 16 on PVC,
575630 implying each element in the vector corresponds to a work-item (SIMT lane)
@@ -668,17 +723,25 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
668723 it works on scattered TensorDesc instead.
669724
670725 Arguments:
726+
671727 - `source`: represents the memory region to be loaded from, which can be either a
672728 tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
673729 In case of tensor_desc, offsets come from the producer create_tdesc op.
674730 tensor_desc cannot be used in SIMT mode.
731+
675732	    - `offsets`: represents offsets from source. Required if `source` is not a TensorDescType.
676733 offsets is a vector of `index` type and vector length is either the subgroup size
677734	      or 1 in SIMT mode. A scalar offset is also valid in SIMT mode.
678- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
679- - `offset_align_byte`: required if `source` is a pointer. If `source` is not a pointer,
735+
736+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
737+
738+	    - `offset_align_byte`: [optional] Required if `source` is a pointer; if `source` is not a pointer,
680739 it is not allowed. Represents the alignment in bytes of each offset in offsets.
681740
741+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
742+ enabling users to assign a layout that governs distribution at the subgroup and/or
743+ work-item level. Only valid at workgroup and subgroup levels.
744+
682745 Example 1:
683746 ```mlir
684747 xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
@@ -727,7 +790,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
727790 OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
728791 OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
729792 OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
730- OptionalAttr<I64Attr>:$offset_align_byte);
793+ OptionalAttr<I64Attr>:$offset_align_byte,
794+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
731795
732796 let extraClassDeclaration = extraBaseClassDeclaration # [{
733797 Type getSourceType() {
@@ -779,18 +843,27 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
780844	    each work-item. If size is not 1, size should be equal to the chunk size.
780844
781845 Arguments:
846+
782847 - `source`: represents the memory region to be loaded from, which can be either a
783848 tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
784849 In case of tensor_desc, offsets come from the producer create_tdesc op.
785850 tensor_desc cannot be used in SIMT mode.
851+
786852	    - `offsets`: represents offsets from source. Required if `source` is not a TensorDescType.
787853 offsets is a vector of `index` type and vector length is either the subgroup size
788854	      or 1 in SIMT mode. A scalar offset is also valid in SIMT mode.
855+
789856	    - `mask`: a vector of `i1` type, used to mask out the memory access.
790857 mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
791858	      A scalar mask is also valid in SIMT mode.
792- - `chunk_size`: (optional) represents contiguous number of elements to load from per work item.
793- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
859+
860+	    - `chunk_size`: [optional] represents the number of contiguous elements to load per work-item.
861+
862+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
863+
864+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
865+ enabling users to assign a layout that governs distribution at the subgroup and/or
866+ work-item level. Only valid at workgroup and subgroup levels.
794867
795868 Results:
796869 - `res`: represents loaded data
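
As a hedged sketch of the offset-based form with `chunk_size` (SSA names and types are illustrative): each of the 16 lanes loads 8 contiguous f32 elements starting at its per-lane offset, subject to `%mask`.

```mlir
// Each lane reads a chunk of 8 contiguous f32 values at its offset;
// lanes with a false mask bit do not access memory.
%res = xegpu.load %src[%offsets], %mask <{chunk_size = 8}>
     : ui64, vector<16xindex>, vector<16xi1> -> vector<16x8xf32>
```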
@@ -926,19 +999,30 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
926999 each work-item. If size is not 1, size should be equal to the chunk size.
9271000
9281001 Arguments:
1002+
9291003 - `value`: represents the data to be stored.
1004+
9301005 - `dest`: represents the memory region to be stored to, which can be either a
9311006 tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
9321007 In case of tensor_desc, offsets come from the producer create_tdesc op.
9331008 tensor_desc cannot be used in SIMT mode.
1009+
9341010	    - `offsets`: represents offsets from dest. Required if `dest` is not a TensorDescType.
9351011 offsets is a vector of `index` type and vector length is either the subgroup size
9361012	      or 1 in SIMT mode. A scalar offset is also valid in SIMT mode.
1013+
9371014	    - `mask`: a vector of `i1` type, used to mask out the memory access.
9381015 mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
9391016	      A scalar mask is also valid in SIMT mode.
940- - `chunk_size`: (optional) represents contiguous number of elements to store to per work item.
941- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
1017+
1018+	    - `chunk_size`: [optional] represents the number of contiguous elements to store per work-item.
1019+
1020+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
1021+
1022+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
1023+ enabling users to assign a layout that governs distribution at the subgroup and/or
1024+ work-item level. Only valid at workgroup and subgroup levels.
9421026
9431027 Example 1:
9441028 ```mlir
@@ -1115,22 +1199,28 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
11151199	    size, B of `kxn` size, and accumulate on matrix C of `mxn` size into a result
11161200	    matrix of the same size, where `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for fp16
11171201 data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
1118- and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
1119- also requires A and B to be loaded with the required data layout. Specially,
1120- VNNI layout is required for B operand. It is achieved via adding `packed`
1121- attribute to the `load_nd` operator. Due to the VNNI transformation, B operands
1122- can be represented as a 3D vector, with the last dimension representing the VNNI
1123- factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
1124- can be represented as `B: vector<8x16x2xf16>`.
1202+ and `C/D: vector<8x16xf32>`.
11251203
11261204 In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
11271205	    which are represented as 1D vectors. Please refer to [OpenCL Intel extensions]
11281206 (https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html)
11291207 for more details about the fragment distribution.
11301208
1131- Note: on PVC, the hardware can perform load with VNNI transformation when data
1132- element type is 16-bit or lower precision, taking 2 or 4 elements from
1133- the first dimension and inserted into the newly added innermost dimension.
1209+ Arguments:
1210+
1211+ - `lhs`: A vector value representing the left-hand-side matrix tile (A) participating in the
1212+ matrix multiply.
1213+
1214+ - `rhs`: A vector value representing the right-hand-side matrix tile (B).
1215+
1216+ - `acc`: [optional] A vector value representing the accumulator matrix tile (C). When present, the
1217+ result is computed as `lhs * rhs + acc`; otherwise, the accumulator is implicitly assumed to be zero.
1218+
1219+	    - `anchor_layout_a`, `anchor_layout_b`, `anchor_layout_cd`: [optional] Attributes that identify this
1220+	      operation as an anchor for operands A, B, and the accumulator/result, enabling users to assign
1221+	      layouts that govern distribution at the subgroup and/or work-item level. Only valid at workgroup
1222+	      and subgroup levels.
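
For reference, a minimal sketch of the fp16 shape configuration described above (operand names are illustrative):

```mlir
// A: 8x16 f16, B: 16x16 f16, accumulator and result: 8x16 f32.
%d = xegpu.dpas %a, %b, %c
     : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
```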
1223+
11341224 }];
11351225
11361226 let arguments = (ins
@@ -1187,13 +1277,31 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
11871277 has the same shape with `TensorDesc`, and is used to enable or disable specific
11881278 data points of the `TensorDesc`. The `value` operand represents the new value to
11891279 be applied during the modification.
1280+ Arguments:
1281+ - `kind`: An attribute that specifies the atomic operation to be performed
1282+	      (e.g., add, min, max, exchange).
1283+
1284+ - `tensorDesc`: A `TensorDesc` describing the memory region on which the atomic
1285+ read-modify-write is performed.
1286+
1287+ - `mask`: A predicate mask with the same shape as `tensorDesc`. Only elements
1288+ with a true (non-zero) mask value participate in the atomic operation;
1289+ masked-out elements are not modified.
1290+
1291+ - `value`: The input values used by the atomic operation. It must have the same
1292+ shape and element type as `tensorDesc` and `result`.
1293+
1294+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
1295+ enabling users to assign a layout that governs distribution at the subgroup
1296+ and/or work-item level. Only valid at workgroup and subgroup levels.
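
A minimal sketch, assuming an `addf` kind and illustrative shapes and names:

```mlir
// Atomically adds %value to the 8x16 f32 region described by %tdesc,
// element-by-element, wherever %mask is true; returns the old values.
%ret = xegpu.atomic_rmw addf %tdesc, %mask, %value
     : !xegpu.tensor_desc<8x16xf32>, vector<8x16xi1>, vector<8x16xf32> -> vector<8x16xf32>
```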
11901297 }];
11911298
11921299 let arguments = (ins
11931300 AtomicRMWKindAttr:$kind,
11941301 XeGPU_TensorDesc:$tensorDesc,
11951302 XeGPU_MaskType:$mask,
1196- XeGPU_ValueType:$value);
1303+ XeGPU_ValueType:$value,
1304+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
11971305
11981306 let results = (outs XeGPU_ValueType:$result);
11991307
@@ -1275,6 +1383,13 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
12751383 the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
12761384 scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
12771385 the IR is lowered to WI level because that is the end result of all distributions.
1386+ Arguments:
1387+ - `source`: The input vector whose data is to be redistributed. The source and
1388+ result types must match.
1389+ - `input_layout`: The layout attribute describing the current distribution of `source`
1390+ across subgroups and/or work-items.
1391+ - `target_layout`: The layout attribute describing the desired distribution of the result
1392+ across subgroups and/or work-items.
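
A sketch of a workgroup-level conversion, assuming illustrative layouts for a `32x32` vector:

```mlir
// Redistributes a 32x32 tile from a 2x2 subgroup grid holding 16x16
// fragments to a 4x1 subgroup grid holding 8x32 fragments.
%1 = xegpu.convert_layout %0 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16]>,
                               target_layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [8, 32]>}>
     : vector<32x32xf32>
```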
12781393 }];
12791394 let arguments = (ins XeGPU_VectorType: $source,
12801395 DistributeLayoutAttr: $input_layout,
@@ -1342,12 +1457,13 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
13421457 Arguments:
13431458 - `mem_desc`: the memory descriptor identifying the SLM region.
13441459 - `offsets`: the coordinates within the matrix to read from.
1345- - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
1346- lowered to a subgroup block load. When this attribute is present,
1347- the offsets are subgroup-uniform across all lanes.
1348- - `anchor_layout`: [optional] An attribute for guiding distributions among
1349- subgroups and/or work-items. It currently can accept either
1350- LayoutAttr or SliceAttr.
1460+ - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
1461+ to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform
1462+	      across all lanes. Only used at subgroup and lane levels.
1463+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling
1464+ users to assign a layout that governs distribution at the subgroup and/or work-item level.
1465+ Only valid at workgroup and subgroup levels.
1466+
13511467 Results:
13521468 - `res`: the matrix elements loaded from SLM.
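
A hedged sketch, with an illustrative `mem_desc` type and tile shape:

```mlir
// Loads a 16x16 f32 tile from the SLM matrix at coordinates (8, 8).
%res = xegpu.load_matrix %mem_desc[8, 8] : !xegpu.mem_desc<32x32xf32> -> vector<16x16xf32>
```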
13531469 }];
@@ -1393,12 +1509,12 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
13931509 - `mem_desc`: the memory descriptor specifying the SLM region.
13941510 - `offsets`: the coordinates within the matrix where the data will be written.
13951511 - `data`: the values to be stored in the matrix.
1396- - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
1397- lowered to a subgroup block store . When this attribute is present,
1398- the offsets are subgroup-uniform across all lanes.
1399- - `anchor_layout`: [optional] An attribute for guiding distributions among
1400- subgroups and/or work-items. It currently can accept either
1401- LayoutAttr or SliceAttr .
1512+ - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
1513+	      to a subgroup block store. When this attribute is present, the offsets are subgroup-uniform
1514+	      across all lanes. Only used at subgroup and lane levels.
1515+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling
1516+ users to assign a layout that governs distribution at the subgroup and/or work-item level.
1517+	      Only valid at workgroup and subgroup levels.
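
A hedged sketch mirroring the load example (types are illustrative):

```mlir
// Stores a 16x16 f32 tile into the SLM matrix at coordinates (8, 8).
xegpu.store_matrix %data, %mem_desc[8, 8] : vector<16x16xf32>, !xegpu.mem_desc<32x32xf32>
```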
14021518 }];
14031519 let builders = [
14041520 OpBuilder<(ins "Value" : $data, "TypedValue<MemDescType>": $mem_desc,