Skip to content

Commit e468ea3

Browse files
authored
[mlir][amdgpu] Fix documentation and verifiers (llvm#167369)
1 parent ae2fec0 commit e468ea3

File tree

3 files changed

+57
-21
lines changed

3 files changed

+57
-21
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPacked816Op
127127
FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale,
128128
ConfinedAttr<I32Attr, [IsValidBlockSize]>:$blockSize,
129129
ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>:$firstScaleLane,
130-
ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<2>]>:$firstScaleByte)>,
130+
ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<3>]>:$firstScaleByte)>,
131131
Results<(
132132
outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>,
133133
FixedVectorOfShapeAndType<[8], F16>,
@@ -139,17 +139,21 @@ def AMDGPU_ScaledExtPacked816Op
139139
let summary = "Extend a vector of packed floating point values";
140140

141141
let description = [{
142-
The scales applied to the input microfloats are stored in two bytes which
142+
The scales applied to the input microfloats are stored in bytes which
143143
come from the `scale` input provided in a *half* of the wave identified
144-
by `firstScaleLane`. The pair of bytes used is selected by
145-
`firstScaleByte`. The 16 vectors in consecutive lanes starting from
144+
by `firstScaleLane`. The bytes used are selected by `firstScaleByte` and depend
145+
on the type of `source`. The 16 vectors in consecutive lanes starting from
146146
`firstScaleLane` (which we'll call the scale vectors) will be used by both
147-
halves of the wave (with lane L reading from L % 16'th scale vector), but
148-
each half will use a different byte.
147+
halves of the wave (with lane L reading from L % 16'th scale vector).
148+
149+
When `source` is either F4E2M1FN, F6E2M3FN, or F6E3M2FN each half of the
150+
wave will use a different byte, the first one being `firstScaleByte` and
151+
the second one being `firstScaleByte` + 1. When the block size is 32,
152+
`firstScaleByte` can be either 0 or 2, selecting halves of the scale vectors.
153+
Lanes 0-15 will read from `firstScaleByte` and lanes 16-31 will read
154+
from `firstScaleByte` + 1.
155+
149156

150-
When the block size is 32, `firstScaleByte` can be either 0 or 2,
151-
selecting halves of the scale vectors. Lanes 0-15 will read from
152-
`firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1.
153157
For example:
154158
```mlir
155159
// Input: 8-element vector of F8E4M3FN, converting to F32
@@ -165,7 +169,8 @@ def AMDGPU_ScaledExtPacked816Op
165169
: vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
166170
```
167171

168-
However, when the block size is 16, `firstScaleByte` can be 0 or 1.
172+
When `source` is either F4E2M1FN, F6E2M3FN, or F6E3M2FN and
173+
the block size is 16, `firstScaleByte` can be 0 or 1.
169174
Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors,
170175
while lanes 16-31 read from `firstScaleByte` + 2.
171176
For example:
@@ -187,6 +192,16 @@ def AMDGPU_ScaledExtPacked816Op
187192
instructions use for matrix scales. These selection operands allow
188193
one to choose portions of the matrix to convert.
189194

195+
When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 32,
196+
then the same byte will be used by both halves of the wave.
197+
In this case, `firstScaleByte` can be any value from 0 to 3.
198+
199+
When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 16,
200+
the following combinations are allowed:
201+
* `firstScaleLane(0), firstScaleByte(0)`
202+
* `firstScaleLane(1), firstScaleByte(2)`
203+
All other combinations are reserved.
204+
190205
Available on gfx1250+.
191206
}];
192207

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -344,14 +344,27 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
344344
LogicalResult ScaledExtPacked816Op::verify() {
345345
int blockSize = getBlockSize();
346346
assert((blockSize == 16 || blockSize == 32) && "invalid block size");
347+
347348
int firstScaleByte = getFirstScaleByte();
348-
if (blockSize == 16 && !llvm::is_contained({0, 1}, firstScaleByte)) {
349-
return emitOpError(
350-
"blockSize of 16 can only have firstScaleByte be 0 or 1.");
349+
auto sourceType = cast<VectorType>(getSource().getType());
350+
Type elementType = sourceType.getElementType();
351+
auto floatType = cast<FloatType>(elementType);
352+
int bitWidth = floatType.getWidth();
353+
354+
if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 16 &&
355+
!llvm::is_contained({0, 1}, firstScaleByte)) {
356+
return emitOpError("blockSize of 16 can only have firstScaleByte be 0 or 1 "
357+
"for f4 and f6.");
358+
}
359+
if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 32 &&
360+
!llvm::is_contained({0, 2}, firstScaleByte)) {
361+
return emitOpError("blockSize of 32 can only have firstScaleByte be 0 or 2 "
362+
"for f4 and f6.");
351363
}
352-
if (blockSize == 32 && !llvm::is_contained({0, 2}, firstScaleByte)) {
364+
if (bitWidth == 8 && blockSize == 16 &&
365+
!llvm::is_contained({0, 2}, firstScaleByte)) {
353366
return emitOpError(
354-
"blockSize of 32 can only have firstScaleByte be 0 or 2.");
367+
"blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.");
355368
}
356369

357370
return success();

mlir/test/Dialect/AMDGPU/invalid.mlir

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -333,17 +333,25 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 :
333333

334334
// -----
335335

336-
func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
337-
// expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1.}}
338-
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
336+
func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
337+
// expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}}
338+
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
339339
func.return
340340
}
341341

342342
// -----
343343

344-
func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
345-
// expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2.}}
346-
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
344+
func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
345+
// expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}}
346+
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
347+
func.return
348+
}
349+
350+
// -----
351+
352+
func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
353+
// expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.}}
354+
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
347355
func.return
348356
}
349357

0 commit comments

Comments
 (0)