@@ -80,7 +80,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
information e.g., memref<?x?xf16>, the strides information has to be explicitly
passed via the "strides" and "const_strides" argument.

- In SIMT mode, tensor descriptor is augmented with `SGMapAttr` which describes the
+ In SIMT mode, tensor descriptor is augmented with `LayoutAttr` which describes the
mapping of the tensor descriptor to the work items.

Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
@@ -113,7 +113,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
%c0 = arith.constant 0 : index
%c1 = arith.constant 8 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
- -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
```
}];
@@ -306,7 +306,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
fp32 or fp64. It implies that vnni and transpose cannot exist at the
same time.

- In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, result
vector represents the data to be loaded by each work-item.
@@ -323,7 +323,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32,
- #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
```
@@ -364,7 +364,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
the corresponding cache hint attribute will be masked.

- In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, input
vector represents the data to be stored by each work-item.
@@ -381,7 +381,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
- #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
```
@@ -422,7 +422,7 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
Example 2 (SIMT mode):
```
%2 = xegpu.update_nd_offset %1, [0, 16]:
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
```
}];
@@ -482,7 +482,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
the chunk_size if the chunk size is larger than 1.

In SIMT mode, similar to `create_nd_tdesc`, the resulting tensor descriptor is augmented
- with `SGMapAttr` which describes the mapping of the tensor descriptor to the work items.
+ with `LayoutAttr` which describes the mapping of the tensor descriptor to the work items.
In this case, the first dimension of the tensor descriptor represents the work-items, and
the second dimension represents the chunk size.
@@ -517,7 +517,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
%off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
- #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
```
}];
@@ -623,7 +623,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
The mask operand masks out memory access so that it is safe to pass out-of-boundary
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.

- In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, result vector
represents the data to be loaded by each work-item. Each work-item receives a `chunk_size`
number of elements.
@@ -653,7 +653,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>}
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
- !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
+ !xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
vector<16xi1> -> vector<8x1xf32>
```
@@ -704,7 +704,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
introduced on purpose, making sure users are aware of this implicit transformation.

- In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, input vector
represents the data to be stored by each work-item. Each work-item receives a `chunk_size`
number of elements.
@@ -732,7 +732,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
- !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
+ !xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> vector<16xi1>
```

}];
@@ -790,7 +790,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
%2 = xegpu.update_offset %1, %off :
!xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
- #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
```
}];
@@ -840,9 +840,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
can be represented as `B: vector<8x16x2xf16>`.

- In SIMT mode, DpasOp expects attributes `sg_map_a`, `sg_map_b`, and `sg_map_c`
- which descibes the data fragment owned by each work-item w.r.t. the tensor
- descriptor these data are loaded from.
+ In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
+ which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
+ these data are loaded from.

Note: on PVC, the hardware can perform load with VNNI transformation when data
element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -853,9 +853,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
XeGPU_DpasOpType : $lhs,
XeGPU_DpasOpType : $rhs,
Optional<XeGPU_Vector2DType>: $acc,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_a,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_b,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_c);
+ OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
+ OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
+ OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
let results = (outs XeGPU_Vector2DType: $result);

let extraClassDeclaration = [{
@@ -876,6 +876,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
VectorType getResultType() {
return getResult().getType();
}
+
+ bool hasAcc() {
+ return getAcc() != nullptr;
+ }
}];

let assemblyFormat = [{
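
For reference, a minimal sketch of how the renamed DPAS attributes might look in SIMT-mode IR after this change. The per-lane fragment shapes and the layout values below are illustrative assumptions, not taken from the patch:

```mlir
// Hypothetical SIMT-mode dpas after the rename: sg_map_a/sg_map_b become a_layout/b_layout.
// Fragment shapes and layout values are assumed for illustration only.
%d = xegpu.dpas %a, %b {
  a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
  b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>
} : vector<8x1xf16>, vector<16x1xf16> -> vector<8x1xf32>
// c_layout would only be supplied when the optional accumulator ($acc) is passed,
// matching the "(only if acc is used)" note in the op description.
```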
@@ -979,4 +983,24 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
let extraClassDeclaration = extraBaseClassDeclaration;
}

+ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
+ let summary = "Convert the layout of the input operand";
+ let description = [{
+ `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
+ the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
+ as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
+ lowered to WI level because that is the end result of all distributions.
+ }];
+ let arguments = (ins XeGPU_Vector2DType: $source,
+ XeGPU_LayoutAttr: $srcMap,
+ XeGPU_LayoutAttr: $resMap
+ );
+ let results = (outs XeGPU_Vector2DType: $result);
+ let assemblyFormat = [{
+ $source attr-dict `:` type($source)
+ }];
+
+ let hasVerifier = 1;
+ }
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
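
As a usage note, here is a minimal sketch of the new `convert_layout` op, following the assembly format declared above. The concrete layout values and vector shape are assumptions for illustration only:

```mlir
// Hypothetical subgroup-level layout conversion: redistribute a 16x16 tile across
// the 16 lanes, changing from a row-distributed to a column-distributed layout.
// Attribute names follow the op's arguments ($srcMap/$resMap); values are assumed.
%1 = xegpu.convert_layout %0 {
  srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
  resMap = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
} : vector<16x16xf32>
```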