Skip to content

Commit 4838b52

Browse files
committed
extend sg_map from subgroup to workgroup
1 parent f13d583 commit 4838b52

File tree

7 files changed

+457
-321
lines changed

7 files changed

+457
-321
lines changed

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -154,33 +154,81 @@ def XeGPU_FenceScopeAttr:
154154
let assemblyFormat = "$value";
155155
}
156156

157-
def XeGPU_SGMapAttr : XeGPUAttr<"SGMap", "sg_map"> {
157+
def XeGPU_ScopeWG: I32EnumAttrCase<"WG", 0, "wg">; // workgroup level code
158+
def XeGPU_ScopeSG: I32EnumAttrCase<"SG", 1, "sg">; // subgroup level code
159+
def XeGPU_ScopeWI: I32EnumAttrCase<"WI", 2, "wi">; // simt level code
160+
161+
def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
162+
[XeGPU_ScopeWG,XeGPU_ScopeSG,XeGPU_ScopeWI]> {
163+
let genSpecializedAttr = 0;
164+
let cppNamespace = "::mlir::xegpu";
165+
}
166+
167+
def XeGPU_ScopeAttr
168+
: EnumAttr<XeGPU_Dialect,XeGPU_ScopeEnums, "Stage"> {
169+
let summary = [{Describe the stage of lowering progress}];
170+
let assemblyFormat = "``$value";
171+
}
172+
173+
def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
158174
let summary = [{
159175
Describes the mapping between work item (WI) and the 2D tensor specified by the tensor descriptor.
160176
}];
161177
let description = [{
162-
To distribute the XeGPU operation to work items, the tensor_desc must be specified with the sg_map
163-
attribute at the tensor description creation time.
164-
Within the `sg_map`, `wi_layout` specifies the layout of work items,
165-
describing the mapping of work items to the tensor.
166-
wi_layout[0] x wi_layout[1] must be equal to the total number of work items within a subgroup.
167-
`wi_data` specifies the minimum number of data elements assigned to each work item for a single distribution.
168-
169-
E.g., #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
170-
In this example, the subgroup has 16 work items in wi_layout=[1, 16],
171-
each accessing 1 element as specified by wi_data=[1, 1].
178+
XeGPU operations leverage LayoutAttr to distribute data across work-items. It is specified in tensor_descs
179+
upon the tensor description creation. LayoutAttr contains the following parameters.
180+
181+
* scope: specifies the scope of current code. It can be either wg (workgroup), sg (subgroup) or wi (workitem).
182+
It is strictly required for subgroup code, but optional for workgroup and wi. By default, if a LayoutAttr
183+
contains sg_layout and sg_data, it will be treated as workgroup code; and if it only contains
184+
wi_layout and wi_data, it will be considered as workitem level.
185+
* sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
186+
* sg_data: [optional] specifies the data size accessed per subgroup.
187+
* order: [optional] specifies the dimension order used to linearize n-d subgroup ids to 1-d.
188+
The first dimension in the order list is the fastest-changing dimension.
189+
* wi_layout: [required] specifies the total number of work-items and their layout in a subgroup
190+
* wi_data: [required] specifies the data size accessed per work-item for a single distribution.
172191

173192
`wi_data[0] * wi_data[1]` can be greater than 1, meaning that each work item operates on multiple elements,
174193
which is eventually lowered to "SIMT-flavor" vector, like SPIR-V vector or llvm vector, or packed to a storage data type.
175194
The multiple elements indicated by `wi_data` can only be from one dimension and must be contiguous in the memory along either dimension.
195+
196+
E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
197+
In this example, the subgroup has 16 work items in wi_layout=[1, 16], each accessing 1 element as specified by wi_data=[1, 1].
198+
199+
E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
200+
In this example, the layout represents a workgroup-level work distribution. A workgroup has 8 subgroups organized as a 2x4 layout,
201+
and each subgroup accesses a 16x16 block per instruction, which is further distributed to 16 work items as described above.
202+
176203
}];
177204
let parameters = (ins
178-
ArrayRefParameter<"uint32_t">:$wi_layout,
179-
ArrayRefParameter<"uint32_t">:$wi_data
205+
OptionalParameter<"ScopeAttr">: $scope,
206+
OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
207+
OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
208+
OptionalParameter<"DenseI32ArrayAttr">: $order,
209+
"DenseI32ArrayAttr": $wi_layout,
210+
"DenseI32ArrayAttr": $wi_data
180211
);
181212

213+
let extraClassDeclaration = [{
214+
bool isForWorkgroupLevel() {
215+
if (!getScope())
216+
return getSgLayout() && getSgData();
217+
return getScope() == ScopeAttr::get(getContext(), Scope::WG);
218+
}
219+
220+
bool isForSubgroupLevel() {
221+
return getScope() == ScopeAttr::get(getContext(), Scope::SG);
222+
}
223+
224+
bool isForWorkItemLevel() {
225+
if (!getScope())
226+
return !getSgLayout() && !getSgData() && !getOrder();
227+
return getScope() == ScopeAttr::get(getContext(), Scope::WI);
228+
}
229+
}];
182230

183-
let hasCustomAssemblyFormat = 1;
231+
let assemblyFormat = "`<` struct(params) `>`";
184232
let genVerifyDecl = 1;
185233
}
186234

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
8080
information e.g., memref<?x?xf16>, the strides information has to be explicitly
8181
passed via the "strides" and "const_strides" argument.
8282

83-
In SIMT mode, tensor descriptor is augmented with `SGMapAttr` which describes the
83+
In SIMT mode, tensor descriptor is augmented with `LayoutAttr` which describes the
8484
mapping of the tensor descriptor to the work items.
8585

8686
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
@@ -113,7 +113,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
113113
%c0 = arith.constant 0 : index
114114
%c1 = arith.constant 8 : index
115115
%1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
116-
-> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
116+
-> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
117117
```
118118
}];
119119

@@ -306,7 +306,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
306306
fp32 or fp64. It implies that vnni and transpose cannot exit at the
307307
same time.
308308

309-
In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
309+
In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
310310
which describes the mapping of the tensor to the work items. In this case, result
311311
vector represents the data to be loaded by each work-item.
312312

@@ -323,7 +323,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
323323
xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
324324
l2_hint = #xegpu.cache_hint<uncached>}>
325325
: !xegpu.tensor_desc<8x16xf32,
326-
#xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
326+
#xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
327327
```
328328

329329

@@ -364,7 +364,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
364364
of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
365365
Corresponding cache hint attribute will be masked.
366366

367-
In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
367+
In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
368368
which describes the mapping of the tensor to the work items. In this case, input
369369
vector represents the data to be stored by each work-item.
370370

@@ -381,7 +381,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
381381
l2_hint = #xegpu.cache_hint<write_back>,
382382
l3_hint = #xegpu.cache_hint<write_through>}
383383
: vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
384-
#xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
384+
#xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
385385
```
386386

387387

@@ -422,7 +422,7 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
422422
Example 2 (SIMT mode):
423423
```
424424
%2 = xegpu.update_nd_offset %1, [0, 16]:
425-
!xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
425+
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
426426
```
427427
}];
428428

@@ -482,7 +482,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
482482
the chunk_size if the chunk size is larger than 1.
483483

484484
In SIMT mode, similar to `create_nd_tdesc` the resulting tensor descriptor is augmented
485-
with `SGMapAttr` which describes the mapping of the tensor descriptor to the work items.
485+
with `LayoutAttr` which describes the mapping of the tensor descriptor to the work items.
486486
In this case, the first dimension of the tensor descriptor represents the work-items, and
487487
the second dimension represents the chunk size.
488488

@@ -517,7 +517,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
517517
%off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
518518
%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
519519
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
520-
#xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
520+
#xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>
521521
```
522522
}];
523523

@@ -571,7 +571,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
571571
let hasVerifier = 1;
572572
}
573573

574-
def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
574+
def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [MemoryEffects<[MemRead]>]> {
575575
let summary = "prefetches a set of scattered data points to cache";
576576

577577
let description = [{
@@ -623,7 +623,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
623623
The mask operand masks out memory access so that it is safe to pass out-of-boundary
624624
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
625625

626-
In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `SGMapAttr`
626+
In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `LayoutAttr`
627627
which describes the mapping of the tensor to the work items. In this case, result vector
628628
represents the data to be loaded by each work-item. Each work-item receives a `chunk_size`
629629
number of elements.
@@ -653,7 +653,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
653653
l2_hint = #xegpu.cache_hint<uncached>,
654654
l3_hint = #xegpu.cache_hint<uncached>}
655655
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
656-
!xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
656+
!xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>>
657657
vector<16xi1> -> vector<8x1xf32>
658658
```
659659

@@ -704,7 +704,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
704704
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
705705
introduced on purpose, making sure users are aware of this implicit transformation.
706706

707-
In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `SGMapAttr`
707+
In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `LayoutAttr`
708708
which describes the mapping of the tensor to the work items. In this case, input vector
709709
represents the data to be stored by each work-item. Each work-item receives a `chunk_size`
710710
number of elements.
@@ -732,7 +732,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
732732
l2_hint = #xegpu.cache_hint<write_back>,
733733
l3_hint = #xegpu.cache_hint<write_through>}
734734
: vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
735-
!xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
735+
!xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
736736
```
737737

738738
}];
@@ -790,7 +790,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
790790
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
791791
%2 = xegpu.update_offset %1, %off :
792792
!xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
793-
#xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
793+
#xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
794794
```
795795
}];
796796

@@ -840,9 +840,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
840840
factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
841841
can be represented as `B: vector<8x16x2xf16>`.
842842

843-
In SIMT mode, DpasOp expects attributes `sg_map_a`, `sg_map_b`, and `sg_map_c`
844-
which descibes the data fragment owned by each work-item w.r.t. the tensor
845-
descriptor these data are loaded from.
843+
In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
844+
which descibe the data fragment owned by each work-item w.r.t. the tensor descriptor
845+
these data are loaded from.
846846

847847
Note: on PVC, the hardware can perform load with VNNI transformation when data
848848
element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -853,9 +853,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
853853
XeGPU_DpasOpType : $lhs,
854854
XeGPU_DpasOpType : $rhs,
855855
Optional<XeGPU_Vector2DType>: $acc,
856-
OptionalAttr<XeGPU_SGMapAttr>:$sg_map_a,
857-
OptionalAttr<XeGPU_SGMapAttr>:$sg_map_b,
858-
OptionalAttr<XeGPU_SGMapAttr>:$sg_map_c);
856+
OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
857+
OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
858+
OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
859859
let results = (outs XeGPU_Vector2DType: $result);
860860

861861
let extraClassDeclaration = [{
@@ -876,6 +876,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
876876
VectorType getResultType() {
877877
return getResult().getType();
878878
}
879+
880+
bool hasAcc() {
881+
return getAcc() != nullptr;
882+
}
879883
}];
880884

881885
let assemblyFormat = [{
@@ -979,4 +983,21 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
979983
let extraClassDeclaration = extraBaseClassDeclaration;
980984
}
981985

986+
def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
987+
let summary = "Convert the sg layout of the input operand";
988+
let description = [{
989+
convert_layout remaps the distribution of data across workitems by updating the LayoutAttr.
990+
}];
991+
let arguments = (ins XeGPU_Vector2DType: $source,
992+
XeGPU_LayoutAttr: $srcMap,
993+
XeGPU_LayoutAttr: $resMap
994+
);
995+
let results = (outs XeGPU_Vector2DType: $result);
996+
let assemblyFormat = [{
997+
$source attr-dict `:` type($source)
998+
}];
999+
1000+
let hasVerifier = 1;
1001+
}
1002+
9821003
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
6363
element-type ::= float-type | integer-type | index-type
6464
dim-list := (static-dim-list `x`)?
6565
static-dim-list ::= decimal-literal `x` decimal-literal
66-
attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, sg_map `<` wi_layout = value, wi_data = value `>`)?
66+
attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, layout `<` wi_layout = value, wi_data = value `>`)?
6767
```
6868

6969
Examples:
@@ -78,15 +78,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
7878
// A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
7979
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
8080

81-
// A TensorDesc with a sg_map
82-
xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
81+
// A TensorDesc with a layout
82+
xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
8383
```
8484
}];
8585

8686
let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
8787
"mlir::Type": $elementType,
8888
OptionalParameter<"mlir::Attribute">: $encoding,
89-
OptionalParameter<"mlir::Attribute">: $sg_map);
89+
OptionalParameter<"mlir::Attribute">: $layout);
9090

9191
let builders = [
9292
TypeBuilderWithInferredContext<(ins
@@ -95,13 +95,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
9595
CArg<"int", "1">: $array_length,
9696
CArg<"bool", "true">: $boundary_check,
9797
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
98-
CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>,
98+
CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>,
9999
TypeBuilderWithInferredContext<(ins
100100
"llvm::ArrayRef<int64_t>": $shape,
101101
"mlir::Type": $elementType,
102102
CArg<"int", "1">: $chunk_size,
103103
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
104-
CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>
104+
CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>
105105
];
106106

107107
let extraClassDeclaration = [{
@@ -127,8 +127,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
127127
return llvm::dyn_cast_if_present<ScatterTensorDescAttr>(getEncoding());
128128
}
129129

130-
SGMapAttr getSGMapAttr() const {
131-
return llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
130+
LayoutAttr getLayoutAttr() const {
131+
return llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
132132
}
133133

134134
xegpu::MemorySpace getMemorySpace() const {

0 commit comments

Comments
 (0)