@@ -112,6 +112,97 @@ def AMDGPU_ExtPackedFp8Op :
112112 }];
113113}
114114
// Attribute constraint that accepts only integer attributes whose value is
// exactly 16 or 32.
//
// The predicate casts `$_self` to `IntegerAttr` unconditionally, so this
// constraint must be layered on top of an integer attribute kind, e.g.
// `ConfinedAttr<I32Attr, [IsValidBlockSize]>`.
def IsValidBlockSize
    : AttrConstraint<
          CPred<"::llvm::is_contained({16, 32}, ::llvm::cast<::mlir::IntegerAttr>($_self).getInt())">,
          "whose value is 16 or 32">;
118+
// Scaled extension of 8 or 16 packed low-precision floats to f32/f16/bf16.
// `source` and `res` must have the same shape (AllShapesMatch); the element
// type pairing and the attribute combinations are otherwise unconstrained at
// the ODS level.
def AMDGPU_ScaledExtPacked816Op
    : AMDGPU_Op<"scaled_ext_packed816", [Pure, AllShapesMatch<["source", "res"]>]>,
      Arguments<(
          ins AnyTypeOf<[FixedVectorOfShapeAndType<[8], F4E2M1FN>,
                         FixedVectorOfShapeAndType<[8], F8E4M3FN>,
                         FixedVectorOfShapeAndType<[8], F8E5M2>,
                         FixedVectorOfShapeAndType<[16], F6E2M3FN>,
                         FixedVectorOfShapeAndType<[16], F6E3M2FN>]>:$source,
          FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale,
          ConfinedAttr<I32Attr, [IsValidBlockSize]>:$blockSize,
          ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>:$firstScaleLane,
          ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<2>]>:$firstScaleByte)>,
      Results<(
          outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>,
                          FixedVectorOfShapeAndType<[8], F16>,
                          FixedVectorOfShapeAndType<[8], BF16>,
                          FixedVectorOfShapeAndType<[16], F32>,
                          FixedVectorOfShapeAndType<[16], F16>,
                          FixedVectorOfShapeAndType<[16], BF16>]>:$res)> {

  let summary = "Extend a vector of packed floating point values";

  let description = [{
    Extend a vector of 8 or 16 packed low-precision floating point values
    (`source`) to `f32`, `f16`, or `bf16`, applying scales taken from the
    `scale` input. The result has the same shape as `source`.

    The scales applied to the input microfloats are stored in two bytes which
    come from the `scale` input provided in a *half* of the wave identified
    by `firstScaleLane`. The pair of bytes used is selected by
    `firstScaleByte`. The 16 vectors in consecutive lanes starting from
    `firstScaleLane` (which we'll call the scale vectors) will be used by both
    halves of the wave (with lane L reading from the (L % 16)'th scale
    vector), but each half will use a different byte.

    When the block size is 32, `firstScaleByte` can be either 0 or 2,
    selecting halves of the scale vectors. Lanes 0-15 will read from
    `firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1.
    For example:
    ```mlir
    // Input: 8-element vector of F8E4M3FN, converting to F32
    // Lanes 0-15 read from byte 0, lanes 16-31 read from byte 1
    %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
      blockSize(32) firstScaleLane(0) firstScaleByte(0)
      : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32>

    // Input: 16-element vector of F6E2M3FN, converting to F16
    // Lanes 0-15 read from byte 2, lanes 16-31 read from byte 3
    %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
      blockSize(32) firstScaleLane(1) firstScaleByte(2)
      : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
    ```

    However, when the block size is 16, `firstScaleByte` can be 0 or 1.
    Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors,
    while lanes 16-31 read from `firstScaleByte` + 2.
    For example:
    ```mlir
    // Input: 8-element vector of F8E5M2, converting to BF16
    // Lanes 0-15 read from byte 0, lanes 16-31 read from byte 2 (0+2)
    %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
      blockSize(16) firstScaleLane(0) firstScaleByte(0)
      : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16>

    // Input: 16-element vector of F6E3M2FN, converting to F32
    // Lanes 0-15 read from byte 1, lanes 16-31 read from byte 3 (1+2)
    %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
      blockSize(16) firstScaleLane(1) firstScaleByte(1)
      : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32>
    ```

    Note: the layout for the scales generally mirrors the layout that the
    WMMA instructions use for matrix scales. These selection attributes allow
    one to choose portions of the matrix to convert.

    Available on gfx1250+.
  }];

  let assemblyFormat = [{
    attr-dict $source
    `scale` `(` $scale `)`
    `blockSize` `(` $blockSize `)`
    `firstScaleLane` `(` $firstScaleLane `)`
    `firstScaleByte` `(` $firstScaleByte `)`
    `:` type($source) `,` type($scale) `->` type($res)
  }];

  // The attribute constraints above only bound `firstScaleByte` to 0..2; the
  // description restricts it further depending on `blockSize` (0 or 2 for 32,
  // 0 or 1 for 16).
  // NOTE(review): presumably the C++ verifier enforces that coupling; the
  // verifier body is not visible here, so confirm.
  let hasVerifier = 1;
}
205+
115206def AMDGPU_ScaledExtPackedOp
116207 : AMDGPU_Op<"scaled_ext_packed", [Pure]>,
117208 Arguments<(
@@ -860,7 +951,7 @@ def AMDGPU_MFMAOp :
860951 based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the
861952 types of the source and destination arguments.
862953
863- For information on the layouts of the input and output matrces (which are stored
954+ For information on the layouts of the input and output matrices (which are stored
864955 in `sourceA`, `sourceB`, `destC`, and `destD`), see the CDNA ISA documentation.
865956
866957 The `cbsz`, `abid`, and `blgp` parameters control how the lanes of the wave
0 commit comments