@@ -54,8 +54,9 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
5454 Pure,
5555 DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
5656 DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>])>,
57- Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> {
58- let assemblyFormat = "$dimension attr-dict";
57+ Arguments<(ins GPU_DimensionAttr:$dimension,
58+ OptionalAttr<IndexAttr>:$upper_bound)>, Results<(outs Index)> {
59+ let assemblyFormat = "$dimension (`upper_bound` $upper_bound^)? attr-dict";
5960 let extraClassDefinition = [{
6061 void $cppClass::getAsmResultNames(
6162 llvm::function_ref<void(mlir::Value, mlir::StringRef)> setNameFn) {
@@ -66,6 +67,14 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
6667 setNameFn(getResult(),resultName);
6768 }
6869 }];
70+ let builders = [
71+ OpBuilder<(ins "::mlir::gpu::Dimension":$dimension), [{
72+ build($_builder, $_state, dimension, /*upperBound=*/nullptr);
73+ }]>,
74+ OpBuilder<(ins "::mlir::Type":$resultType, "::mlir::gpu::Dimension":$dimension), [{
75+ build($_builder, $_state, resultType, dimension, /*upperBound=*/nullptr);
76+ }]>
77+ ];
6978}
7079
7180def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
@@ -78,6 +87,12 @@ def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
7887 ```mlir
7988 %cDimX = gpu.cluster_dim x
8089 ```
90+
91+ If `upper_bound` is set, then executing (a lowering of) this operation in an
92+ environment where the number of clusters per grid is greater than `upper_bound` causes
93+ undefined behavior.
94+
95+ There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
8196 }];
8297}
8398
@@ -91,6 +106,12 @@ def GPU_ClusterDimBlocksOp : GPU_IndexOp<"cluster_dim_blocks"> {
91106 ```mlir
92107 %cDimBlocksX = gpu.cluster_dim_blocks x
93108 ```
109+
110+ If `upper_bound` is set, then executing (a lowering of) this operation in an
111+ environment where the number of thread blocks per cluster is greater than `upper_bound`
112+ causes undefined behavior.
113+
114+ There is an implicit upper bound of `kMaxClusterDim` (currently 8).
94115 }];
95116}
96117
@@ -104,6 +125,12 @@ def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
104125 ```mlir
105126 %cIdY = gpu.cluster_id y
106127 ```
128+
129+ If `upper_bound` is set, then executing (a lowering of) this operation in an
130+ environment where the number of clusters in the grid along `dimension` is
131+ greater than `upper_bound` causes undefined behavior.
132+
133+ There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
107134 }];
108135}
109136
@@ -116,6 +143,12 @@ def GPU_ClusterBlockIdOp : GPU_IndexOp<"cluster_block_id"> {
116143 ```mlir
117144 %cBlockIdY = gpu.cluster_block_id y
118145 ```
146+
147+ If `upper_bound` is set, then executing (a lowering of) this operation in an
148+ environment where the number of thread blocks per cluster along `dimension`
149+ is greater than `upper_bound` causes undefined behavior.
150+
151+ There is an implicit upper bound of `kMaxClusterDim` (currently 8).
119152 }];
120153}
121154
@@ -129,6 +162,19 @@ def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
129162 ```mlir
130163 %bDimX = gpu.block_dim x
131164 ```
165+
166+ If `known_block_size` is set on this operation's enclosing `gpu.func`,
167+ or `gpu.known_block_size` is set on an enclosing `FunctionOpInterface`
168+ implementor, or if the enclosing `gpu.launch` specifies a constant size for
169+ `dimension`'s blocks, these contextual facts may be used to infer that this
170+ operation has a constant value, though such a transformation will not be
171+ performed by canonicalization or the default constant folder. Executions which
172+ cause that constant-value assumption to be false incur undefined behavior.
173+
174+ If `upper_bound` is set, executions where the block size along `dimension`
175+ exceeds `upper_bound` cause undefined behavior.
176+
177+ There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
132178 }];
133179}
134180def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
@@ -141,6 +187,13 @@ def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
141187 ```mlir
142188 %bIdY = gpu.block_id y
143189 ```
190+
191+ If `upper_bound` is set, or if one can be inferred from `known_grid_size`-type
192+ annotations in context, executions where the block index in `dimension` would
193+ be greater than or equal to that bound cause undefined behavior. `upper_bound`
194+ takes priority over bounds inferrable from context.
195+
196+ There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
144197 }];
145198}
146199def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
@@ -153,6 +206,20 @@ def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
153206 ```mlir
154207 %gDimZ = gpu.grid_dim z
155208 ```
209+
210+
211+ If `known_grid_size` is set on this operation's enclosing `gpu.func`,
212+ or `gpu.known_grid_size` is set on an enclosing `FunctionOpInterface`
213+ implementor, or if the enclosing `gpu.launch` specifies a constant size for
214+ `dimension`'s grid length, these contextual facts may be used to infer that this
215+ operation has a constant value, though such a transformation will not be
216+ performed by canonicalization or the default constant folder. Executions which
217+ cause that constant-value assumption to be false incur undefined behavior.
218+
219+ If `upper_bound` is set, executions where the grid size in `dimension` would
220+ exceed `upper_bound` cause undefined behavior.
221+
222+ There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
156223 }];
157224}
158225def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
@@ -165,6 +232,12 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
165232 ```mlir
166233 %tIdX = gpu.thread_id x
167234 ```
235+
236+ If `upper_bound` is set, or if one can be inferred from `known_block_size`-type
237+ annotations in context, executions where the thread index would be greater
238+ than or equal to that bound cause undefined behavior.
239+
240+ There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
168241 }];
169242}
170243
@@ -177,14 +250,21 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [
177250 ```mlir
178251 %laneId = gpu.lane_id
179252 ```
253+
254+ If `upper_bound` is set, executions with more than `upper_bound` lanes per
255+ subgroup cause undefined behavior. In the absence of `upper_bound`,
256+ the lane id is still assumed to be non-negative and less than the
257+ target-independent `kMaxSubgroupSize` (currently 128).
180258 }];
259+ let arguments = (ins OptionalAttr<IndexAttr>:$upper_bound);
181260 let results = (outs Index:$result);
182- let assemblyFormat = "attr-dict";
261+ let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict";
183262}
184263
185264def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
186265 Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
187- Arguments<(ins)>, Results<(outs Index:$result)> {
266+ Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
267+ Results<(outs Index:$result)> {
188268 let description = [{
189269 Returns the subgroup id, i.e., the index of the current subgroup within the
190270 workgroup.
@@ -194,9 +274,13 @@ def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
194274 ```mlir
195275 %sgId = gpu.subgroup_id : index
196276 ```
277+
278+ Executions where there are more than `upper_bound` subgroups per workgroup
279+ cause undefined behavior. There is an implicit upper bound of `kMaxDim`
280+ (currently uint32_t::max).
197281 }];
198282
199- let assemblyFormat = "attr-dict `:` type($result)";
283+ let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
200284}
201285
202286def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
@@ -209,14 +293,20 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
209293
210294 ```mlir
211295 %gidX = gpu.global_id x
296+ %gidX = gpu.global_id x upper_bound 65536
212297 ```
298+
299+ The `upper_bound` attribute defines an upper bound analogously to the ones on
300+ `thread_id` and `block_id`. If one is not set, the bound may be inferred from
301+ a combination of `known_block_size` and `known_grid_size`-type annotations.
213302 }];
214303}
215304
216305
217306def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
218307 Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
219- Arguments<(ins)>, Results<(outs Index:$result)> {
308+ Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
309+ Results<(outs Index:$result)> {
220310 let description = [{
221311 Returns the number of subgroups within a workgroup.
222312
@@ -225,14 +315,19 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
225315 ```mlir
226316 %numSg = gpu.num_subgroups : index
227317 ```
318+
319+ If `upper_bound` is set, executions with more than `upper_bound` subgroups
320+ per workgroup cause undefined behavior. There is a default upper bound of
321+ `kMaxDim` (currently uint32_t::max).
228322 }];
229323
230- let assemblyFormat = "attr-dict `:` type($result)";
324+ let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
231325}
232326
233327def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
234328 Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
235- Arguments<(ins)>, Results<(outs Index:$result)> {
329+ Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
330+ Results<(outs Index:$result)> {
236331 let description = [{
237332 Returns the number of threads within a subgroup.
238333
@@ -241,11 +336,20 @@ def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
241336 ```mlir
242337 %sgSz = gpu.subgroup_size : index
243338 ```
339+
340+ Executions where the number of threads per subgroup exceeds `upper_bound` cause
341+ undefined behavior. When no `upper_bound` is specified, range analyses and
342+ similar machinery assume the default bound of `kMaxSubgroupSize`, currently
343+ 128.
244344 }];
245345
246- let assemblyFormat = "attr-dict `:` type($result)";
346+ let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
247347}
248348
349+ def GPU_OptionalDimSizeHintAttr : ConfinedAttr<OptionalAttr<DenseI32ArrayAttr>,
350+ [AttrConstraint<Or<[IsNullAttr.predicate, DenseArrayCount<3>.predicate]>,
351+ "with 3 elements (if present)">]>;
352+
249353def GPU_GPUFuncOp : GPU_Op<"func", [
250354 HasParent<"GPUModuleOp">, AutomaticAllocationScope, FunctionOpInterface,
251355 IsolatedFromAbove
@@ -274,12 +378,14 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
274378 body region, are not supported.
275379
276380 A function may optionally be annotated with the block and/or grid sizes
277- that will be used when it is launched using the `gpu. known_block_size` and
278- `gpu. known_grid_size` attributes, respectively. If set, these attributes must
381+ that will be used when it is launched using the `known_block_size` and
382+ `known_grid_size` attributes, respectively. If set, these attributes must
279383 be arrays of three 32-bit integers giving the x, y, and z launch dimensions.
280384 Launching a kernel that has these annotations, or that calls a function with
281385 these annotations, using a block size or grid size other than what is specified
282- is undefined behavior.
386+ is undefined behavior. These attributes may be set on non-`gpu.func` functions
387+ by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
388+ the risk that they will be discarded.
283389
284390 Syntax:
285391
@@ -322,7 +428,9 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
322428 OptionalAttr<DictArrayAttr>:$arg_attrs,
323429 OptionalAttr<DictArrayAttr>:$res_attrs,
324430 OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
325- OptionalAttr<DictArrayAttr>:$private_attrib_attrs);
431+ OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
432+ GPU_OptionalDimSizeHintAttr:$known_block_size,
433+ GPU_OptionalDimSizeHintAttr:$known_grid_size);
326434 let regions = (region AnyRegion:$body);
327435
328436 let skipDefaultBuilders = 1;
@@ -445,36 +553,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
445553 return "workgroup_attributions";
446554 }
447555
448- static constexpr StringLiteral getKnownBlockSizeAttrName() {
449- return StringLiteral("gpu.known_block_size");
450- }
451-
452- static constexpr StringLiteral getKnownGridSizeAttrName() {
453- return StringLiteral("gpu.known_grid_size");
454- }
455-
456- /// Returns the block size this kernel will be launched with along
457- /// dimension `dim` if known. The value of gpu.thread_id dim will be strictly
458- /// less than this size.
459- std::optional<uint32_t> getKnownBlockSize(gpu::Dimension dim) {
460- if (auto array =
461- (*this)->getAttrOfType<DenseI32ArrayAttr>(getKnownBlockSizeAttrName())) {
462- return array[static_cast<uint32_t>(dim)];
463- }
464- return std::nullopt;
465- }
466-
467- /// Returns the grid size this kernel will be launched with along
468- /// dimension `dim` if known. The value of gpu.block_id dim will be strictly
469- /// less than this size.
470- std::optional<uint32_t> getKnownGridSize(gpu::Dimension dim) {
471- if (auto array =
472- (*this)->getAttrOfType<DenseI32ArrayAttr>(getKnownGridSizeAttrName())) {
473- return array[static_cast<uint32_t>(dim)];
474- }
475- return std::nullopt;
476- }
477-
478556 /// Returns the argument types of this function.
479557 ArrayRef<Type> getArgumentTypes() { return getFunctionType().getInputs(); }
480558
@@ -495,8 +573,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
495573 LogicalResult verifyBody();
496574 }];
497575 let hasCustomAssemblyFormat = 1;
498-
499- let hasVerifier = 1;
500576}
501577
502578def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
@@ -723,8 +799,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
723799 Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
724800 Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
725801 Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
726- Optional<Index>:$clusterSizeX,
727- Optional<Index>:$clusterSizeY,
802+ Optional<Index>:$clusterSizeX,
803+ Optional<Index>:$clusterSizeY,
728804 Optional<Index>:$clusterSizeZ,
729805 Optional<I32>:$dynamicSharedMemorySize)>,
730806 Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
@@ -748,7 +824,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
748824 to the amount of dynamic shared memory a kernel's workgroup should be
749825 allocated; when this operand is not present, a zero size is assumed.
750826
751- The body region has at least _twelve_ arguments, or _eighteen_ if cluster
827+ The body region has at least _twelve_ arguments, or _eighteen_ if cluster
752828 dimensions are present, grouped as follows:
753829
754830 - three optional arguments that contain cluster identifiers along x,y,z
@@ -821,7 +897,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
821897 blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
822898 threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
823899 {
824- // Cluster, block and thread identifiers, as well as cluster/block/grid
900+ // Cluster, block and thread identifiers, as well as cluster/block/grid
825901 // sizes are immediately usable inside body region.
826902 "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
827903 }
@@ -898,7 +974,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
898974 unsigned getNumConfigOperands() {
899975 return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
900976 }
901- /// Returns the number of region attributes including cluster size
977+ /// Returns the number of region attributes including cluster size
902978 unsigned getNumConfigRegionAttributes() {
903979 return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
904980 }
0 commit comments