diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 16a7f63d60c82..5fa18754305ca 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -833,16 +833,16 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`, and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS also requires A and B to be loaded with the required data layout. Specially, - VNNI layout is required for B operand. It is achieved via adding `packed` attribute to the `load_nd` operator. Due to the VNNI transformation, B operands can be represented as a 3D vector, with the last dimension representing the VNNI factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>` can be represented as `B: vector<8x16x2xf16>`. - In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used) - which describe the data fragment owned by each work-item w.r.t. the tensor descriptor - these data are loaded from. + In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result, + which are represented as 1D vectors. Please refer to [OpenCL Intel extensions] + (https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html) + for more details about the fragment distribution. Note: on PVC, the hardware can perform load with VNNI transformation when data element type is 16-bit or lower precision, taking 2 or 4 elements from @@ -850,13 +850,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] }]; let arguments = (ins - XeGPU_DpasOpType : $lhs, - XeGPU_DpasOpType : $rhs, - Optional: $acc, - OptionalAttr:$a_layout, - OptionalAttr:$b_layout, - OptionalAttr:$c_layout); - let results = (outs XeGPU_Vector2DType: $result); + XeGPU_DpasOprType : $lhs, + XeGPU_DpasOprType : $rhs, + Optional: $acc); + let results = (outs XeGPU_DpasResType: $result); let extraClassDeclaration = [{ VectorType getLhsType() { diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 173f1462fdd73..3cb71788a15ef 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -17,7 +17,8 @@ def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>; def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>; def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64, UI32, I64, I32]>; -def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>; +def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>; +def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>; def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>; def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>; def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 171a15ce27b59..b865b80f0075e 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -10,6 +10,7 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" #include
"llvm/ADT/TypeSwitch.h" +#include namespace mlir { namespace xegpu { @@ -319,36 +320,38 @@ LogicalResult TensorDescType::verify( // --------------------------------------------------------------------- // Case 1: Regular loads/stores. // --------------------------------------------------------------------- -// Distributed vector shape must be: -// [chunk_size / lane_data_size, lane_data_size] -// If the tensor descriptor shape is 1D, first dimension is ignored (set to 1). -// [lane_data_size] +// The following conditions must be met: +// * tensor_desc[0] == lane_layout[0] +// Distributed vector is a 1D vector with shape: +// [chunk_size] // --------------------------------------------------------------------- // Case 2: Block loads/stores // --------------------------------------------------------------------- // Additional definitions: // tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length // n_distribution_units = tensor_size / distribution_unit_size +// fragment_size = n_distribution_units * lane_data_size // Given above definitions, the following conditions must be met: // * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0 // * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0 -// Distributed vector shape must be: -// [n_distribution_units, lane_data_size] +// Distributed vector is a 1D vector with shape: +// [fragment_size] FailureOr TensorDescType::getDistributedVectorType() { auto layout = llvm::dyn_cast_if_present(getLayout()); - // If no layout is provided, tensor desc is not used in SIMT mode. - if (!layout) + // It only works for subgroup level layout, which only has lane_layout + // and lane_data, and is to distribute a SIMD code into SIMT code. + if (!layout || !layout.isSgLayout()) return failure(); SmallVector laneData(layout.getLaneData().asArrayRef()); SmallVector laneLayout(layout.getLaneLayout().asArrayRef()); auto tdescShape = getShape(); - auto laneDataSize = 1, sgSize = 1; - for (auto [laneDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) { - laneDataSize *= laneDataDim; - sgSize *= laneDim; - } + // compute sgSize by multiply elements of laneLayout + // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1] + // e.g. for 1D layout, sgSize = laneLayout[0] + auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1, + std::multiplies()); // Case 1: regular loads/stores auto scatterAttr = getEncodingAsScatterTensorDescAttr(); @@ -356,12 +359,9 @@ FailureOr TensorDescType::getDistributedVectorType() { auto chunkSize = scatterAttr.getChunkSize().getInt(); // Verify if the first dimension of the tensor descriptor shape is // distributable. - assert(tdescShape[0] % (laneLayout[0]) == 0 && + assert(tdescShape[0] == laneLayout[0] && "tensor descriptor shape is not distributable"); - if (chunkSize > 1) - return VectorType::get({chunkSize / laneDataSize, laneDataSize}, - getElementType()); - return VectorType::get({laneDataSize}, getElementType()); + return VectorType::get({chunkSize}, getElementType()); } // Case 2: block loads/stores @@ -376,8 +376,7 @@ FailureOr TensorDescType::getDistributedVectorType() { // tensorSize must be adjusted for array_length. 
tensorSize *= getArrayLength(); - return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize}, - getElementType()); + return VectorType::get({tensorSize / sgSize}, getElementType()); } } // namespace xegpu diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 0d67e3d70f945..e0e25365220b5 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -73,38 +73,6 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) { kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH; } -// Helper to validate value shape of LoadNd and StoreNd ops. -static LogicalResult -isArgShapesValid(TensorDescType tdescTy, VectorType valueTy, - ArrayRef adjustedTdescShape, - function_ref emitError) { - auto layout = tdescTy.getLayoutAttr(); - auto valueShape = valueTy.getShape(); - // layout not present means IR is in SIMD mode. In this case value shape must - // match adjusted tensor descriptor shape. - if (!layout) - return valueShape == adjustedTdescShape - ? success() - : emitError() - << "Value shape " << makeString(valueShape) - << " is not consistent with tensor descriptor " << tdescTy; - - // layout present means IR is in SIMT mode. In this case layout determines the - // value shape. - auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType(); - assert(succeeded(expectedValueShapeOrFailure) && - "Failed to compute distributed vector shape for " - "tensor descriptor "); - - return valueTy == expectedValueShapeOrFailure.value() - ? success() - : emitError() - << "Result shape " << makeString(valueShape) - << " is not consistent with distributed vector shape " - << makeString(expectedValueShapeOrFailure.value().getShape()) - << " for tensor descriptor " << tdescTy; -} - // Checks if the given shape is evenly distributed based on the layout // and data factors provided by the LayoutAttr. 
The function ensures that // each dimension of the shape can be evenly divided by the corresponding @@ -133,6 +101,53 @@ static bool isEvenDistributed(llvm::ArrayRef shape, return true; } +static LogicalResult +isValidGatherScatterParams(Type maskTy, VectorType valueTy, + TensorDescType tdescTy, UnitAttr transposeAttr, + function_ref emitError) { + + if (!tdescTy.isScattered()) + return emitError() << "Expects a scattered TensorDesc."; + + if (!valueTy) + return emitError() << "Expecting a vector type result."; + + auto maskShape = getShapeOf(maskTy); + auto valueShape = getShapeOf(valueTy); + auto tdescShape = getShapeOf(tdescTy); + auto chunkSize = tdescTy.getChunkSize(); + + if (valueTy.getElementType() != tdescTy.getElementType()) + return emitError() + << "Value should have the same element type as TensorDesc."; + + if (tdescShape[0] != maskShape[0]) + return emitError() + << "dim-0 of the Mask and TensorDesc should be the same."; + + // a valid shape for SIMT case + if (valueTy.getRank() == 1 && valueTy.getNumElements() == chunkSize) { + if (tdescTy.getLayoutAttr()) + return emitError() << "TensorDesc doesn't need LayoutAttr for SIMT code"; + if (transposeAttr) + return emitError() << "doesn't need TransposeAttr for SIMT code"; + return success(); + } + + if (tdescTy.getRank() == 2 && valueTy.getRank() == 2) { + if (!transposeAttr) + return emitError() << "rank-2 tensor has to be transposed."; + transpose({1, 0}, tdescShape); + } + + if (tdescShape != valueShape) + return emitError() << "Value shape " << makeString(valueShape) + << " is neither a valid distribution for SIMT nor " + "consistent with the tensor descriptor for SIMD " + << tdescTy; + return success(); +} + //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// @@ -302,9 +317,32 @@ LogicalResult LoadNdOp::verify() { if (!isReadHintOrNone(getL3HintAttr())) return emitOpError("invalid l3_hint: ") << getL3HintAttr(); - auto array_len = tdescTy.getArrayLength(); - // adjusted tensor descriptor shape tracks the expected shape of the result. - auto adjustedTdescShape = getShapeOf(tdescTy); + int tdescElems = tdescTy.getNumElements() * tdescTy.getArrayLength(); + int valueElems = valueTy.getNumElements(); + + // If the result vector is 1D and has less elements than the tensor + // descriptor, it is supposed to be a SIMT op. The layout attribute in + // tensor_desc is not needed. + if (valueElems < tdescElems && valueTy.getRank() == 1) { + // SIMT mode doesn't need LayoutAttr. + if (tdescTy.getLayoutAttr()) + return emitOpError() + << "TensorDesc doesn't need LayoutAttr for SIMT code"; + + // For SIMT code, the load is evenly distributed across all lanes in a + // subgroup. Since subgroup size is arch dependent, we only check even + // distribution here. + if (tdescElems % valueElems) + return emitOpError() + << "Result shape " << makeString(getShapeOf(valueTy)) + << " is not a valid distribution for tensor descriptor " + << tdescTy; + + return success(); + } + + // Check SIMD mode. + auto tdescShape = getShapeOf(tdescTy); auto valueShape = getShapeOf(valueTy); if (getTranspose()) { @@ -316,7 +354,7 @@ LogicalResult LoadNdOp::verify() { }); if (valid) - transpose(trans, adjustedTdescShape); + transpose(trans, tdescShape); else mlir::emitWarning(getLoc()) << "Invalid transpose attr. 
It is ignored."; } @@ -325,8 +363,8 @@ LogicalResult LoadNdOp::verify() { if (tdescTy.getRank() == 2) { const int axis = 0; auto vnni_factor = valueShape.back(); - adjustedTdescShape[axis] /= vnni_factor; - adjustedTdescShape.push_back(vnni_factor); + tdescShape[axis] /= vnni_factor; + tdescShape.push_back(vnni_factor); } else { mlir::emitWarning(getLoc()) << "Invalid Packed Attr. It is ignored (available for 2D " @@ -334,13 +372,18 @@ LogicalResult LoadNdOp::verify() { } } + auto array_len = tdescTy.getArrayLength(); if (array_len > 1) { - auto it = adjustedTdescShape.begin(); - adjustedTdescShape.insert(it, array_len); + tdescShape.insert(tdescShape.begin(), array_len); } - return isArgShapesValid(tdescTy, valueTy, adjustedTdescShape, - [&]() { return emitOpError(); }); + if (tdescShape != valueShape) { + return emitOpError() << "Result shape " << makeString(valueShape) + << " is not consistent with tensor descriptor " + << tdescTy; + } + + return success(); } //===----------------------------------------------------------------------===// @@ -368,11 +411,40 @@ LogicalResult StoreNdOp::verify() { if (!isWriteHintOrNone(getL3HintAttr())) return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + auto array_len = dstTy.getArrayLength(); + if (array_len > 1) + return emitOpError("array length is not supported by store_nd.\n"); + + auto tdescElems = dstTy.getNumElements(); + auto valueElems = valTy.getNumElements(); + + // Similar to LoadNdOp, if the value vector is 1D and has less elements than + // the tensor descriptor, it is supposed to be a SIMT op. The layout attribute + // in tensor_desc is not needed. + if (valTy.getRank() == 1 && valueElems < tdescElems) { + // SIMT mode doesn't need LayoutAttr. + if (dstTy.getLayoutAttr()) + return emitOpError() + << "TensorDesc doesn't need LayoutAttr for SIMT code"; + + if (tdescElems % valueElems) { + return emitOpError() + << "Value shape " << makeString(getShapeOf(valTy)) + << " is not a valid distribution for tensor descriptor " << dstTy; + } + return success(); + } + + // SIMD code should have the same shape as the tensor descriptor. 
auto tdescShape = getShapeOf(dstTy); auto valueShape = getShapeOf(valTy); + if (tdescShape != valueShape) { + return emitOpError() << "Value shape " << makeString(valueShape) + << " is not consistent with tensor descriptor " + << dstTy; + } - return isArgShapesValid(dstTy, valTy, tdescShape, - [&]() { return emitOpError(); }); + return success(); } //===----------------------------------------------------------------------===// @@ -492,12 +564,6 @@ LogicalResult LoadGatherOp::verify() { auto maskTy = getMaskType(); auto valueTy = getValueType(); - if (!valueTy) - return emitOpError("Expecting a vector type result.\n"); - - if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - if (!isReadHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -507,27 +573,9 @@ LogicalResult LoadGatherOp::verify() { if (!isReadHintOrNone(getL3HintAttr())) return emitOpError("invalid l3_hint: ") << getL3HintAttr(); - auto tdescElemTy = tdescTy.getElementType(); - auto valueElemTy = getElementType(); - if (tdescElemTy != valueElemTy) - return emitOpError( - "Value should have the same element type as TensorDesc."); - - auto maskShape = getShapeOf(maskTy); - auto valueShape = getShapeOf(valueTy); - auto tdescShape = getShapeOf(tdescTy); - - if (tdescShape[0] != maskShape[0]) - return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - - if (tdescTy.getRank() == 2) { - if (!getTransposeAttr()) - return emitOpError("load of rank-2 tensor has to be transposed."); - transpose({1, 0}, tdescShape); - } - - return isArgShapesValid(tdescTy, valueTy, tdescShape, - [&]() { return emitOpError(); }); + return isValidGatherScatterParams(maskTy, valueTy, tdescTy, + getTransposeAttr(), + [&]() { return emitOpError(); }); } //===----------------------------------------------------------------------===// @@ -535,8 +583,8 @@ LogicalResult LoadGatherOp::verify() { //===----------------------------------------------------------------------===// LogicalResult StoreScatterOp::verify() { auto tdescTy = getTensorDescType(); - if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); + auto maskTy = getMaskType(); + auto valueTy = getValueType(); if (!isWriteHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -547,26 +595,9 @@ LogicalResult StoreScatterOp::verify() { if (!isWriteHintOrNone(getL3HintAttr())) return emitOpError("invalid l3_hint: ") << getL3HintAttr(); - auto maskTy = getMaskType(); - auto valueTy = getValueType(); - - if (!valueTy) - return emitOpError("Expecting a vector type for the value.\n"); - - auto maskShape = getShapeOf(maskTy); - auto tdescShape = getShapeOf(tdescTy); - auto valueShape = getShapeOf(valueTy); - if (tdescShape[0] != maskShape[0]) - return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - - if (tdescTy.getRank() == 2) { - if (!getTransposeAttr()) - return emitOpError("Store of a rank-2 tensor has to be transposed."); - transpose({1, 0}, tdescShape); - } - - return isArgShapesValid(tdescTy, valueTy, tdescShape, - [&]() { return emitOpError(); }); + return isValidGatherScatterParams(maskTy, valueTy, tdescTy, + getTransposeAttr(), + [&]() { return emitOpError(); }); } //===----------------------------------------------------------------------===// @@ -602,63 +633,34 @@ LogicalResult DpasOp::verify() { auto rhsShape = getRhsType().getShape(); auto resShape = getResultType().getShape(); - auto aLayout = getALayoutAttr(); - 
auto bLayout = getBLayoutAttr(); - auto cLayout = getCLayoutAttr(); - - // make sure the layout attribute is either set for every available - // operand or simply not set at all. C is special, since ACC is optional. - auto hasValidLayoutAttrs = [&]() { - bool result = (aLayout != nullptr) ^ (bLayout != nullptr); - if (hasAcc()) { - result |= (aLayout != nullptr) ^ (cLayout != nullptr); - } - return !result; - }; + if (getAcc() && getAcc().getType() != getResultType()) + return emitOpError("Expecting the acc type to be the same as result."); + + // SIMT code: the size of the B operand has to be a multiple of 32 bits. + // The semantic check is skipped due to the lack of architecture information. + // Users need to ensure correctness. + if (lhsRank == 1 && rhsRank == 1 && resRank == 1) { + auto numElems = getRhsType().getNumElements(); + auto elemTy = getRhsType().getElementType(); + auto factor = 32 / elemTy.getIntOrFloatBitWidth(); + if (numElems % factor != 0) + return emitOpError("Expecting B operand to be a multiple of 32 bits."); + return success(); + } - if (!hasValidLayoutAttrs()) + // SIMD code + if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2) return emitOpError( - "layout attributes should be either set for all operands (for SIMT " - "code) or not set at all (for SIMD code)."); - - // query the scope from aLayout (a valid setting). - if (aLayout) { - // In SIMT mode, All data fragments must be 2D - if (lhsRank != 2 || rhsRank != 2 || resRank != 2) - return emitOpError("expecting lhs, rhs, and result to be a 2D vector."); - - auto laneLayoutA = aLayout.getLaneLayout(); - auto laneLayoutB = bLayout.getLaneLayout(); - auto laneLayoutC = cLayout.getLaneLayout(); - // Obtain the expanded shapes of the operands and result using lane_layout. - // NOTE: For B, get rid of the packed dimension for the expanded shape. - SmallVector expandedShapeA = {lhsShape[0] * laneLayoutA[0], - lhsShape[1] * laneLayoutA[1]}; - SmallVector expandedShapeB = { - rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]}; - SmallVector expandedShapeC = {resShape[0] * laneLayoutC[0], - resShape[1] * laneLayoutC[1]}; - auto bK = expandedShapeB[0]; - if (bK != expandedShapeA[1]) - return emitOpError("K-dimension mismatch."); - if (expandedShapeA[0] != expandedShapeC[0]) - return emitOpError("M-dimension mismatch."); - if (expandedShapeB[1] != expandedShapeC[1]) - return emitOpError("N-dimension mismatch."); - } else { // For other scopes, operands' shape should match the mxkxn - // semantics. - if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2) - return emitOpError( - "expecting lhs and result to be a 2D vector, and rhs to be either " - "2D or 3D (packed) vector."); - auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0]; - if (bK != lhsShape[1]) - return emitOpError("K-dimension mismatch."); - if (lhsShape[0] != resShape[0]) - return emitOpError("M-dimension mismatch."); - if (rhsShape[1] != resShape[1]) - return emitOpError("N-dimension mismatch."); - } + "expecting lhs and result to be a 2D vector, and rhs to be either " + "2D or 3D (packed) vector."); + auto bK = rhsRank == 3 ?
rhsShape[0] * rhsShape[2] : rhsShape[0]; + if (bK != lhsShape[1]) + return emitOpError("K-dimension mismatch."); + if (lhsShape[0] != resShape[0]) + return emitOpError("M-dimension mismatch."); + if (rhsShape[1] != resShape[1]) + return emitOpError("N-dimension mismatch."); + return success(); } diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 48df33a591908..67ed89e11b4c9 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -79,25 +79,10 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) { // ----- func.func @test_load_nd_layout(%src: memref<24x32xf32>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - // expected-error@+1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}} - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - -> vector<8x2xf32> - return -} - -// ----- -func.func @test_load_nd_layout(%src: memref<24x32xf32>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<16xf32, #xegpu.layout> - // expected-error@+1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}} + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> + // expected-error@+1 {{Result shape [3] is not a valid distribution for tensor descriptor}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<16xf32, #xegpu.layout> - -> vector<8xf32> + l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32> -> vector<3xf32> return } @@ -105,7 +90,7 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) { func.func @test_load_nd_vc_6(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // expected-error@+1 {{Value shape [8, 1] is not consistent with tensor descriptor}} + // expected-error@+1 {{Result shape [8, 1] is not consistent with tensor descriptor}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x1xf32> @@ -134,22 +119,19 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { } // ----- -func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) { - %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - // expected-error@+1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}} - xegpu.store_nd %data, %1 - : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) { + %1 = arith.constant dense<1.0>: vector<2x24x32xf16> + %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr> + // expected-error@+1 {{array length is not supported by store_nd}} + xegpu.store_nd %1, %2: vector<2x24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr> return } // ----- -func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) { - %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<16xf32, #xegpu.layout> - // expected-error@+1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}} - xegpu.store_nd %data, %1 - : vector<2xf32>, 
!xegpu.tensor_desc<16xf32, #xegpu.layout> +func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) { + %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> + // expected-error@+1 {{Value shape [3] is not a valid distribution for tensor descriptor}} + xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32> return } @@ -269,45 +251,23 @@ func.func @test_create_tdesc_layout_3(%src: ui64) { } // ----- -func.func @test_load_gather_layout_1(%src: ui64) { +func.func @test_load_gather_simt_1(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - // expected-error@+1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}} - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> -> vector<1x2xf32> + %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + // expected-error@+1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}} + %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<6xf32> return } // ----- -func.func @test_load_gather_layout_2(%src: ui64) { +func.func @test_store_scatter_simt_1(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - // expected-error@+1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}} - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> -> vector<2xf32> - return -} - - -// ----- -func.func @test_store_scatter_layout_1(%src: ui64) { - %0 = arith.constant dense<1>: vector<4xi1> - %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %val = arith.constant dense<2.9>: vector<1x2xf32> - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - // expected-error@+1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}} - xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> - return -} - -// ----- -func.func @test_store_scatter_layout_2(%src: ui64) { - %0 = arith.constant dense<1>: vector<4xi1> - %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %val = arith.constant dense<2.9>: vector<2xf32> - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - // expected-error@+1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}} - xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> + %val = arith.constant dense<2.9>: vector<6xf32> + %1 = 
xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + // expected-error@+1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}} + xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint}> : vector<6xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> return } @@ -387,26 +347,16 @@ func.func @test_dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) { } // ----- -func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) { +func.func @test_dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) { // expected-error@+1 {{N-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<8x16xf16>, vector<8x8x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) { - // expected-error@+1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}} - %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32> - return -} - -// ----- -func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) { - // expected-error@+1 {{K-dimension mismatch}} - %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout, - b_layout = #xegpu.layout, - c_layout = #xegpu.layout} - : vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32> +func.func @test_dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) { + // expected-error@+1 {{Expecting B operand to be a multiple of 32 bits}} + %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<15xf16> -> vector<8xf32> return } diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index e9895e0d0a71d..71e7e9bdda07d 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) { // CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) { gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<4x2xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<4x2xf16> + : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> gpu.return } @@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) { // CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) { gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, 
l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<1x1xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<1x1xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16> gpu.return } @@ -162,11 +162,10 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) { // CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x1xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x1xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> gpu.return } @@ -181,11 +180,10 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) { // CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> - !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<8x2xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<8x2xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> gpu.return } @@ -200,11 +198,10 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) { // CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : 
memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<32xf32, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32, #xegpu.layout> -> vector<2x1xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32, #xegpu.layout> -> vector<2x1xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32> gpu.return } @@ -219,11 +216,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) { // CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<32x1xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : - !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<32x1xf16> + !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> gpu.return } @@ -238,11 +235,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) { // CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<16x2xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> %2 = xegpu.load_nd %1 <{l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : - !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<16x2xf16> + !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> gpu.return } @@ -257,10 +254,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) { // CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout> -> vector<8x1xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout> -> vector<8x1xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> gpu.return } @@ -277,13 +274,12 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { // CHECK: func @test_store_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) { - // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16> - %1 = arith.constant dense<1.0>: vector<48x1xf16> - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout> - %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> - !xegpu.tensor_desc<24x32xf16, #xegpu.layout> - // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout> - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout> + // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16> + %1 = arith.constant dense<1.0>: vector<48xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<48xf16>, !xegpu.tensor_desc<24x32xf16> + xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<48xf16>, !xegpu.tensor_desc<24x32xf16> gpu.return } @@ -303,13 +299,12 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) { // CHECK: func @test_store_nd_simt_2(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) { - // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16> - %1 = arith.constant dense<1.0>: vector<2x1xf16> - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout> - %2 = 
xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> - !xegpu.tensor_desc<32xf16, #xegpu.layout> - // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout> - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout> + // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16> + %1 = arith.constant dense<1.0>: vector<2xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> + %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> + // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf16>, !xegpu.tensor_desc<32xf16> + xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<2xf16>, !xegpu.tensor_desc<32xf16> gpu.return } @@ -425,10 +420,10 @@ gpu.func @test_load_simt(%src: ui64) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> -> vector<2x1xf32> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> -> vector<2x1xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2xf32> + %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2xf32> gpu.return } @@ -451,10 +446,10 @@ gpu.func @test_load_simt_2(%src: ui64) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<4xi1> -> vector<1xf32> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<4xi1> -> vector<1xf32> + 
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> + %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> + //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32> + %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32> gpu.return } @@ -477,10 +472,10 @@ gpu.func @test_load_simt_3(%src: ui64) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> -> vector<4x2xf16> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> -> vector<4x2xf16> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr> + %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<8xf16> + %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<8xf16> gpu.return } @@ -507,12 +502,12 @@ gpu.func @test_store_simt(%src: ui64) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32> - %2 = arith.constant dense<2.9>: vector<2x1xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> + //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2xf32> + %2 = arith.constant dense<2.9>: vector<2xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> 
!xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> + xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> gpu.return } @@ -539,12 +534,12 @@ gpu.func @test_store_simt_2(%src: ui64) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16> - %2 = arith.constant dense<2.9>: vector<1x2xf16> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xi1> + //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2xf16> + %2 = arith.constant dense<2.9>: vector<2xf16> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> + %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> + //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> + xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> gpu.return } @@ -572,10 +567,10 @@ gpu.func @test_store_simt_3(%src: ui64) { %1 = arith.constant dense<1>: vector<4xi1> //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32> %2 = arith.constant dense<2.9>: vector<1xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<4xi1> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> + %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, 
#xegpu.scatter_tdesc_attr<>> + //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> + xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> gpu.return } @@ -635,15 +630,10 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) { gpu.return } -// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>) -gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) { - // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout, - // CHECK: b_layout = #xegpu.layout, - // CHECK: c_layout = #xegpu.layout} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32> - %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout, - b_layout = #xegpu.layout, - c_layout = #xegpu.layout} - : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32> +// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>) +gpu.func @test_dpas_simt(%a : vector<8xf16>, %b: vector<16xf16>) { + // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> + %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<16xf16> -> vector<8xf32> gpu.return }