Skip to content

Commit 03bfe08

Browse files
committed
Merge branch 'users/chencha3/xegpu/xegpu_simt_2d_to_1d' into xegpu_simt_dist
2 parents c81b2e0 + 2159119 commit 03bfe08

File tree

6 files changed

+270
-303
lines changed

6 files changed

+270
-303
lines changed

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -833,30 +833,27 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
833833
data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
834834
and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
835835
also requires A and B to be loaded with the required data layout. Specifically,
836-
837836
VNNI layout is required for B operand. It is achieved via adding `packed`
838837
attribute to the `load_nd` operator. Due to the VNNI transformation, B operands
839838
can be represented as a 3D vector, with the last dimension representing the VNNI
840839
factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
841840
can be represented as `B: vector<8x16x2xf16>`.
842841

843-
In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
844-
which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
845-
these data are loaded from.
842+
In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
843+
which are represented as 1D vectors. Please refer to [OpenCL Intel extensions]
844+
(https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html)
845+
for more details about the fragment distribution.
846846

847847
Note: on PVC, the hardware can perform load with VNNI transformation when data
848848
element type is 16-bit or lower precision, taking 2 or 4 elements from
849849
the first dimension and inserted into the newly added innermost dimension.
850850
}];
851851

852852
let arguments = (ins
853-
XeGPU_DpasOpType : $lhs,
854-
XeGPU_DpasOpType : $rhs,
855-
Optional<XeGPU_Vector2DType>: $acc,
856-
OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
857-
OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
858-
OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
859-
let results = (outs XeGPU_Vector2DType: $result);
853+
XeGPU_DpasOprType : $lhs,
854+
XeGPU_DpasOprType : $rhs,
855+
Optional<XeGPU_DpasResType>: $acc);
856+
let results = (outs XeGPU_DpasResType: $result);
860857

861858
let extraClassDeclaration = [{
862859
VectorType getLhsType() {

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64,
1717
def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
1818
def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>;
1919
def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64, UI32, I64, I32]>;
20-
def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>;
20+
def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
21+
def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
2122
def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
2223
def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>;
2324
def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;

mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "mlir/IR/DialectImplementation.h"
1313
#include "llvm/ADT/SmallVector.h"
1414
#include "llvm/ADT/TypeSwitch.h"
15+
#include <numeric>
1516

1617
namespace mlir {
1718
namespace xegpu {
@@ -338,32 +339,30 @@ LogicalResult TensorDescType::verify(
338339
// [n_distribution_units, lane_data_size]
339340
FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
340341
auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
341-
// If no layout is provided, tensor desc is not used in SIMT mode.
342-
if (!layout)
342+
// It only works for subgroup level layout, which only has lane_layout
343+
// and lane_data, and is to distribute a SIMD code into SIMT code.
344+
if (!layout || !layout.isSgLayout())
343345
return failure();
344346

345347
SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
346348
SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
347349
auto tdescShape = getShape();
348350

349-
auto laneDataSize = 1, sgSize = 1;
350-
for (auto [laneDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
351-
laneDataSize *= laneDataDim;
352-
sgSize *= laneDim;
353-
}
351+
// compute sgSize by multiplying the elements of laneLayout
352+
// e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
353+
// e.g. for 1D layout, sgSize = laneLayout[0]
354+
auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
355+
std::multiplies<int64_t>());
354356

355357
// Case 1: regular loads/stores
356358
auto scatterAttr = getEncodingAsScatterTensorDescAttr();
357359
if (scatterAttr) {
358360
auto chunkSize = scatterAttr.getChunkSize().getInt();
359361
// Verify if the first dimension of the tensor descriptor shape is
360362
// distributable.
361-
assert(tdescShape[0] % (laneLayout[0]) == 0 &&
363+
assert(tdescShape[0] == laneLayout[0] &&
362364
"tensor descriptor shape is not distributable");
363-
if (chunkSize > 1)
364-
return VectorType::get({chunkSize / laneDataSize, laneDataSize},
365-
getElementType());
366-
return VectorType::get({laneDataSize}, getElementType());
365+
return VectorType::get({chunkSize}, getElementType());
367366
}
368367

369368
// Case 2: block loads/stores
@@ -378,12 +377,7 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
378377
// tensorSize must be adjusted for array_length.
379378
tensorSize *= getArrayLength();
380379

381-
if (layout.getRank() == 1) {
382-
return VectorType::get({tensorSize / sgSize}, getElementType());
383-
}
384-
385-
return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
386-
getElementType());
380+
return VectorType::get({tensorSize / sgSize}, getElementType());
387381
}
388382

389383
} // namespace xegpu

0 commit comments

Comments
 (0)