diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 3cb71788a15ef..ecab280b76f55 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -189,11 +189,6 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", return scatter_attr.getChunkSize().getInt(); return 1; } - - // This returns a vector type that represents the fragment of data owned by - // a work item in SIMT mode if this tensor descriptor is used in a XeGPU - // load/store operation. - FailureOr getDistributedVectorType(); }]; let hasCustomAssemblyFormat = true; diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h index 63ea26df06937..3e94021c7a1ea 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h @@ -16,6 +16,8 @@ namespace xegpu { /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`. void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns); +/// Appends patterns for XeGPU SIMT distribution into `patterns`. +void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns); } // namespace xegpu } // namespace mlir diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h new file mode 100644 index 0000000000000..3616fa614e7f9 --- /dev/null +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -0,0 +1,57 @@ +//===- XeGPUUtils.h - Vector Utilities --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_ +#define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_ + +#include "mlir/IR/BuiltinTypes.h" +namespace mlir { + +class VectorType; +namespace xegpu { +class LayoutAttr; +class TensorDescType; +} // namespace xegpu + +namespace xegpu { + +/// If tensor descriptor has a layout attribute it is used in SIMT mode. +/// In this mode, the distributed vector shape is determined as follows: +/// Definitions: +/// lane_data_size = lane_data[0] × lane_data[1] +/// subgroup_size = lane_layout[0] × lane_layout[1] +/// distribution_unit_size = subgroup_size × lane_data_size +/// +/// Case 1: Regular loads/stores. +/// The following conditions must be met: +/// * tensor_desc[0] == lane_layout[0] +/// Distributed vector is a 1D vector with shape: +/// [chunk_size] +/// +/// Case 2: Block loads/stores +/// Additional definitions: +/// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length +/// n_distribution_units = tensor_size / distribution_unit_size +/// fragment_size = n_distribution_units * lane_data_size +/// Given above definitions, the following conditions must be met: +/// * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0 +/// * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0 +/// Distributed vector is a 1D vector with shape: +/// [fragment_size] +FailureOr getDistributedVectorType(xegpu::TensorDescType tdescTy); + +/// Helper to get the distributed vector type for a given vector type according +/// to a given LayoutAttr. 
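+/// For illustration only (hypothetical shapes, assuming the same distribution
+/// rules as for tensor descriptors above): a vector<8x16xf32> with
+/// lane_layout = [1, 16] and lane_data = [1, 1] gives
+/// distribution_unit_size = 16 * 1 = 16, n_distribution_units = 128 / 16 = 8,
+/// and therefore a per-lane vector<8xf32>.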
+FailureOr getDistributedVectorType(VectorType originalType, + LayoutAttr layout); + +} // namespace xegpu + +} // namespace mlir + +#endif // MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_ diff --git a/mlir/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/CMakeLists.txt index 9f57627c321fb..31167e6af908b 100644 --- a/mlir/lib/Dialect/XeGPU/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) add_subdirectory(Transforms) +add_subdirectory(Utils) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index b2d217d192934..6790c5e3af2c0 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -376,74 +376,6 @@ LogicalResult TensorDescType::verify( return success(); } -// If tensor descriptor has a layout attribute it is used in SIMT mode. -// In this mode, the distributed vector shape is determined as follows: -// Definitions: -// lane_data_size = lane_data[0] × lane_data[1] -// subgroup_size = lane_layout[0] × lane_layout[1] -// distribution_unit_size = subgroup_size × lane_data_size -// --------------------------------------------------------------------- -// Case 1: Regular loads/stores. -// --------------------------------------------------------------------- -// The following conditions must be met: -// * tensor_desc[0] == lane_layout[0] -// Distributed vector is a 1D vector with shape: -// [chunk_size] -// --------------------------------------------------------------------- -// Case 2: Block loads/stores -// --------------------------------------------------------------------- -// Additional definitions: -// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length -// n_distribution_units = tensor_size / distribution_unit_size -// fragment_size = n_distribution_units * lane_data_size -// Given above definitions, the following conditions must be met: -// * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0 -// * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0 -// Distributed vector is a 1D vector with shape: -// [fragment_size] -FailureOr TensorDescType::getDistributedVectorType() { - auto layout = llvm::dyn_cast_if_present(getLayout()); - // It only works for subgroup level layout, which only has lane_layout - // and lane_data, and is to distribute a SIMD code into SIMT code. - if (!layout || !layout.isSgLayout()) - return failure(); - - SmallVector laneData(layout.getLaneData().asArrayRef()); - SmallVector laneLayout(layout.getLaneLayout().asArrayRef()); - auto tdescShape = getShape(); - - // compute sgSize by multiply elements of laneLayout - // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1] - // e.g. for 1D layout, sgSize = laneLayout[0] - auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1, - std::multiplies()); - - // Case 1: regular loads/stores - auto scatterAttr = getEncodingAsScatterTensorDescAttr(); - if (scatterAttr) { - auto chunkSize = scatterAttr.getChunkSize().getInt(); - // Verify if the first dimension of the tensor descriptor shape is - // distributable. - assert(tdescShape[0] == laneLayout[0] && - "tensor descriptor shape is not distributable"); - return VectorType::get({chunkSize}, getElementType()); - } - - // Case 2: block loads/stores - // Check if the tensor descriptor shape is distributable. 
- int64_t tensorSize = 1; - for (auto [tdescDim, laneDim, laneDataDim] : - llvm::zip_equal(tdescShape, laneLayout, laneData)) { - assert((tdescDim % (laneDim * laneDataDim) == 0) && - "tensor descriptor shape is not distributable"); - tensorSize *= tdescDim; - } - // tensorSize must be adjusted for array_length. - tensorSize *= getArrayLength(); - - return VectorType::get({tensorSize / sgSize}, getElementType()); -} - } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt index 9f041aae511df..901e02d3c9cf5 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -16,4 +16,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms MLIRPass MLIRTransforms MLIRGPUDialect + MLIRXeGPUUtils + MLIRGPUUtils + MLIRVectorTransforms ) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 721a815cf76b9..019032f7743bf 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -10,14 +10,33 @@ #include "mlir/Analysis/DataFlow/SparseAnalysis.h" #include "mlir/Analysis/DataFlowFramework.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Utils/DistributionUtils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/Value.h" +#include "mlir/IR/Visitors.h" #include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/InterleavedRange.h" #include "llvm/Support/raw_ostream.h" @@ -28,27 +47,32 @@ namespace xegpu { } // namespace xegpu } // namespace mlir +#define DEBUG_TYPE "xegpu-subgroup-distribute" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") + using namespace mlir; using namespace mlir::dataflow; /// HW dependent constants. /// TODO: These constants should be queried from the target information. -constexpr unsigned subgroupSize = 16; // How many work items in a subgroup. +constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup. /// If DPAS A or B operands have low precision element types they must be packed /// according to the following sizes. constexpr unsigned packedSizeInBitsForDefault = 16; // Minimum packing size per register for DPAS A. constexpr unsigned packedSizeInBitsForDpasB = 32; // Minimum packing size per register for DPAS B. 
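+// For example (illustrative): with 16-bit DPAS B elements (e.g. f16), data is
+// packed 32 / 16 = 2 elements per lane along the row dimension, i.e.
+// lane_data = [2, 1] (VNNI format).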
+static const char *const operandLayoutNamePrefix = "layout_operand_"; +static const char *const resultLayoutNamePrefix = "layout_result_"; namespace { -///===----------------------------------------------------------------------===/// -/// Layout -///===----------------------------------------------------------------------===/// +//===----------------------------------------------------------------------===// +// Layout +//===----------------------------------------------------------------------===// -/// Helper class to store the ND layout of work items within a subgroup and data -/// owned by each work item. +/// Helper class to store the ND layout of lanes within a subgroup and data +/// owned by each lane. struct Layout { SmallVector layout; Layout() = default; @@ -67,235 +91,248 @@ int64_t Layout::operator[](size_t idx) const { return layout[idx]; } -/// WiLayout represents the layout of work items within a subgroup when it -/// accesses some value. WiData represents the layout of data owned by each work -/// item. -using WiLayout = Layout; -using WiData = Layout; +/// LaneLayout represents the logical layout of lanes within a subgroup when it +/// accesses some value. LaneData represents the logical layout of data owned by +/// each work item. +using LaneLayout = Layout; +using LaneData = Layout; -///===----------------------------------------------------------------------===/// -/// SGMap -///===----------------------------------------------------------------------===/// +//===----------------------------------------------------------------------===// +// LayoutInfo +//===----------------------------------------------------------------------===// -/// Helper class for tracking the analysis state of a value. For SGPropagation, -/// the analysis state is simply the wi_layout and wi_data of each value. -/// Purpose of this analysis to propagate some unique layout for each value in -/// the program starting from some known values (like DPAS, StoreNd, etc.). +/// Helper class for tracking the analysis state of an mlir value. For layout +/// propagation, the analysis state is simply the lane_layout and lane_data of +/// each value. Purpose of this analysis to propagate some unique layout for +/// each value in the program starting from a set of anchor operations (like +/// DPAS, StoreNd, etc.). /// -/// Given this, SGMap satisifies the following properties: -/// 1) SGMap is a lattice with two states - assigned and not assigned. -/// 2) Two SGMap values are equal if they are both assigned or both not -/// assigned. The concrete value of assigned state does not matter. +/// Given this, LayoutInfo satisifies the following properties: +/// 1) A LayoutInfo value can be in one of two states - `assigned` or `not +/// assigned`. +/// 2) Two LayoutInfo values are equal if they are both assigned or +/// both not assigned. The concrete value of assigned state does not matter. /// 3) The meet operator works as follows: /// - If current state is assigned, return the current state. (already /// a unique layout is assigned. don't change it) /// - Otherwise, return the other state. 
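+/// For example (illustrative): meet(not-assigned, lane_layout [1, 16] /
+/// lane_data [1, 1]) returns the assigned layout, while the meet of two
+/// assigned layouts keeps the current (lhs) one unchanged.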
-struct SGMap { +struct LayoutInfo { private: - WiLayout wiLayout; - WiData wiData; + LaneLayout laneLayout; + LaneData laneData; public: - SGMap() = default; - SGMap(const WiLayout &layout, const WiData &data) - : wiLayout(layout), wiData(data) {} + LayoutInfo() = default; + LayoutInfo(const LaneLayout &layout, const LaneData &data) + : laneLayout(layout), laneData(data) {} - /// Two lattice values are equal if they have `some` layout. The actual - /// content of the layout does not matter. - bool operator==(const SGMap &other) const { + // Two lattice values are equal if they have `some` layout. The actual + // content of the layout does not matter. + bool operator==(const LayoutInfo &other) const { return this->isAssigned() == other.isAssigned(); } - static SGMap meet(const SGMap &lhs, const SGMap &rhs); + static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs); - static SGMap join(const SGMap &lhs, const SGMap &rhs); + static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs); void print(raw_ostream &os) const; - bool isAssigned() const { return wiLayout.size() > 0 && wiData.size() > 0; } + bool isAssigned() const { + return laneLayout.size() > 0 && laneData.size() > 0; + } - SGMap getTransposedLayout(ArrayRef permutation) const; + LayoutInfo getTransposedLayout(ArrayRef permutation) const; - const WiLayout &getLayout() const { return wiLayout; } - const WiData &getData() const { return wiData; } + const LaneLayout &getLayout() const { return laneLayout; } + const LaneData &getData() const { return laneData; } + ArrayRef getLayoutAsArrayRef() const { return laneLayout.layout; } + ArrayRef getDataAsArrayRef() const { return laneData.layout; } }; -void SGMap::print(raw_ostream &os) const { +void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { - os << "wi_layout: "; - wiLayout.print(os); - os << ", wi_data: "; - wiData.print(os); + os << "lane_layout: "; + laneLayout.print(os); + os << ", lane_data: "; + laneData.print(os); } else os << "Not assigned."; } -SGMap SGMap::meet(const SGMap &lhs, const SGMap &rhs) { +LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) { if (!lhs.isAssigned()) return rhs; return lhs; } /// Since this is a backward analysis, join method is not used. -SGMap SGMap::join(const SGMap &lhs, const SGMap &rhs) { - llvm_unreachable("Join should not be triggered by SGMapPropagation."); +LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) { + llvm_unreachable("Join should not be triggered by layout propagation."); } /// Get the transposed layout according to the given permutation. 
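+/// E.g. (illustrative), permutation [1, 0] turns lane_layout [1, 16] /
+/// lane_data [1, 2] into lane_layout [16, 1] / lane_data [2, 1].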
-SGMap SGMap::getTransposedLayout(ArrayRef permutation) const { +LayoutInfo +LayoutInfo::getTransposedLayout(ArrayRef permutation) const { if (!isAssigned()) return {}; - WiLayout newLayout; - WiData newData; - for (auto idx : permutation) { - newLayout.layout.push_back(wiLayout.layout[idx]); - newData.layout.push_back(wiData.layout[idx]); + LaneLayout newLayout; + LaneData newData; + for (int64_t idx : permutation) { + newLayout.layout.push_back(laneLayout.layout[idx]); + newData.layout.push_back(laneData.layout[idx]); } - return SGMap(newLayout, newData); + return LayoutInfo(newLayout, newData); } -///===----------------------------------------------------------------------===/// -/// SGMapLattice -///===----------------------------------------------------------------------===/// +//===----------------------------------------------------------------------===// +// LayoutInfoLattice +//===----------------------------------------------------------------------===// -/// Lattice holding the SGMap for each value. -struct SGMapLattice : public Lattice { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SGMapLattice) +/// Lattice holding the LayoutInfo for each value. +struct LayoutInfoLattice : public Lattice { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice) using Lattice::Lattice; }; /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation -/// (like DPAS). This is the natural layout work items are arranged in a -/// subgroup. +/// (like DPAS). /// Helper Function to get the default layout for uniform values like constants. -/// For 1D vector, wi_layout is [subgroupSize] and wi_data is [1]. -/// For 2D vector, wi_layout is [1, subgroupSize] and wi_data is [1, 1]. -static SGMap getDefaultSgMap(unsigned rank) { +/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. +/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. +static LayoutInfo getDefaultLayoutInfo(unsigned rank) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); if (rank == 1) - return SGMap(WiLayout({subgroupSize}), WiData({1})); - return SGMap(WiLayout({1, subgroupSize}), WiData({1, 1})); + return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1})); + return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1})); } /// Helper to get the default layout for a vector type. -static SGMap getDefaultSgMap(VectorType vectorTy) { - /// Expecting a 1D or 2D vector. +static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) { + // Expecting a 1D or 2D vector. assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) && "Expected 1D or 2D vector."); - /// Expecting int or float element type. + // Expecting int or float element type. assert(vectorTy.getElementType().isIntOrFloat() && "Expected int or float element type."); - /// If the rank is 1, then return default layout for 1D vector. + // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultSgMap(1); - /// Packing factor is determined by the element type bitwidth. + return getDefaultLayoutInfo(1); + // Packing factor is determined by the element type bitwidth. 
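+  // (Illustrative: 32-bit and 16-bit elements give a packing factor of 1,
+  // while i8 gives 16 / 8 = 2, i.e. lane_data = [1, 2].)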
int packingFactor = 1; - auto bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); + unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); if (bitwidth < packedSizeInBitsForDefault) packingFactor = packedSizeInBitsForDefault / bitwidth; - return SGMap(WiLayout({1, subgroupSize}), WiData({1, packingFactor})); + return LayoutInfo(LaneLayout({1, subgroupSize}), + LaneData({1, packingFactor})); } -/// Helper Function to get the expected layouts for DPAS operands. `wi_data` is -/// set according to the following criteria: +/// Helper Function to get the expected layouts for DPAS operands. `lane_data` +/// is set according to the following criteria: /// * For A operand, the data must be packed in minimum /// `packedSizeInBitsForDefault` /// * For B operand, the data must be packed in minimum /// `packedSizeInBitsForDpasB` -static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) { - auto elementTy = vectorTy.getElementType(); +static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy, + unsigned operandNum) { + Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); - WiLayout layout({1, subgroupSize}); - /// For B operand, data must be packed in minimum `packedDpasBSizeInBits` and - /// must have the VNNI format. + LaneLayout layout({1, subgroupSize}); + // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and + // must have the VNNI format. if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) { - WiData data( + LaneData data( {packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1}); - return SGMap(layout, data); + return LayoutInfo(layout, data); } - /// Otherwise, return the default layout for the vector type. - return getDefaultSgMap(vectorTy); + // Otherwise, return the default layout for the vector type. + return getDefaultLayoutInfo(vectorTy); } -///===----------------------------------------------------------------------===/// -/// SGMapPropagation -///===----------------------------------------------------------------------===/// +//===----------------------------------------------------------------------===// +// LayoutInfoPropagation +//===----------------------------------------------------------------------===// -/// Backward data flow analysis to propagate the wi_layout and wi_data of each -/// value in the program. Currently, the layouts for operands DPAS, StoreNd, and -/// StoreScatter are fixed (known before propagation). Purpose of this analysis -/// is to propagate those known layouts to all their producers and (other) -/// consumers. -class SGMapPropagation : public SparseBackwardDataFlowAnalysis { +/// Backward data flow analysis to propagate the lane_layout and lane_data of +/// each value in the program. Currently, the layouts for operands DPAS, +/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of +/// this analysis is to propagate those known layouts to all their producers and +/// (other) consumers. 
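+/// For example (illustrative), the layout fixed for a DPAS A operand is
+/// propagated backwards through the producing LoadNdOp to its tensor
+/// descriptor operand.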
+class LayoutInfoPropagation + : public SparseBackwardDataFlowAnalysis { private: - void visitDpasOp(xegpu::DpasOp dpas, ArrayRef operands, - ArrayRef results); + void visitDpasOp(xegpu::DpasOp dpas, ArrayRef operands, + ArrayRef results); - void visitStoreNdOp(xegpu::StoreNdOp store, ArrayRef operands, - ArrayRef results); + void visitStoreNdOp(xegpu::StoreNdOp store, + ArrayRef operands, + ArrayRef results); void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter, - ArrayRef operands, - ArrayRef results); + ArrayRef operands, + ArrayRef results); - void visitLoadNdOp(xegpu::LoadNdOp load, ArrayRef operands, - ArrayRef results); + void visitLoadNdOp(xegpu::LoadNdOp load, + ArrayRef operands, + ArrayRef results); void visitLoadGatherOp(xegpu::LoadGatherOp load, - ArrayRef operands, - ArrayRef results); + ArrayRef operands, + ArrayRef results); void visitTransposeOp(vector::TransposeOp transpose, - ArrayRef operands, - ArrayRef results); + ArrayRef operands, + ArrayRef results); void visitVectorBitcastOp(vector::BitCastOp bitcast, - ArrayRef operands, - ArrayRef results); + ArrayRef operands, + ArrayRef results); void visitCreateDescOp(xegpu::CreateDescOp createDesc, - ArrayRef operands, - ArrayRef results); + ArrayRef operands, + ArrayRef results); void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset, - ArrayRef operands, - ArrayRef results); + ArrayRef operands, + ArrayRef results); void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction, - ArrayRef operands, - ArrayRef results); + ArrayRef operands, + ArrayRef results); public: - SGMapPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable) + LayoutInfoPropagation(DataFlowSolver &solver, + SymbolTableCollection &symbolTable) : SparseBackwardDataFlowAnalysis(solver, symbolTable) {} using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis; - LogicalResult visitOperation(Operation *op, ArrayRef operands, - ArrayRef results) override; + LogicalResult + visitOperation(Operation *op, ArrayRef operands, + ArrayRef results) override; void visitBranchOperand(OpOperand &operand) override {}; void visitCallOperand(OpOperand &operand) override {}; void visitExternalCall(CallOpInterface call, - ArrayRef operands, - ArrayRef results) override {}; + ArrayRef operands, + ArrayRef results) override { + }; - void setToExitState(SGMapLattice *lattice) override { - (void)lattice->meet(SGMap()); + void setToExitState(LayoutInfoLattice *lattice) override { + (void)lattice->meet(LayoutInfo()); } }; } // namespace -LogicalResult -SGMapPropagation::visitOperation(Operation *op, - ArrayRef operands, - ArrayRef results) { +LogicalResult LayoutInfoPropagation::visitOperation( + Operation *op, ArrayRef operands, + ArrayRef results) { TypeSwitch(op) .Case( [&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); }) @@ -315,8 +352,8 @@ SGMapPropagation::visitOperation(Operation *op, .Case([&](auto updateNdOffsetOp) { visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results); }) - /// No need to propagate the layout to operands in CreateNdDescOp because - /// they are scalars (offsets, sizes, etc.). + // No need to propagate the layout to operands in CreateNdDescOp because + // they are scalars (offsets, sizes, etc.). 
.Case([&](auto createNdDescOp) {}) .Case([&](auto transposeOp) { visitTransposeOp(transposeOp, operands, results); @@ -327,245 +364,251 @@ SGMapPropagation::visitOperation(Operation *op, .Case([&](auto reductionOp) { visitVectorMultiReductionOp(reductionOp, operands, results); }) - /// All other ops. + // All other ops. .Default([&](Operation *op) { - for (const SGMapLattice *r : results) { - for (SGMapLattice *operand : operands) { - /// Propagate the layout of the result to the operand. + for (const LayoutInfoLattice *r : results) { + for (LayoutInfoLattice *operand : operands) { + // Propagate the layout of the result to the operand. if (r->getValue().isAssigned()) meet(operand, *r); } } }); - /// Add a dependency from each result to program point after the operation. - for (const SGMapLattice *r : results) { - addDependency(const_cast(r), getProgramPointAfter(op)); + // Add a dependency from each result to program point after the operation. + for (const LayoutInfoLattice *r : results) { + addDependency(const_cast(r), getProgramPointAfter(op)); } return success(); } -void SGMapPropagation::visitVectorMultiReductionOp( - vector::MultiDimReductionOp reduction, ArrayRef operands, - ArrayRef results) { - /// The layout of the result must be present. - auto resultLayout = results[0]->getValue(); +void LayoutInfoPropagation::visitVectorMultiReductionOp( + vector::MultiDimReductionOp reduction, + ArrayRef operands, + ArrayRef results) { + // The layout of the result must be present. + LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - /// We only consider 2D -> 1D reductions at this point. + // We only consider 2D -> 1D reductions at this point. assert(resultLayout.getLayout().size() == 1 && "Expected 1D layout for reduction result."); - /// Given that the result is 1D, the layout of the operand should be 2D with - /// default layout. - auto operandLayout = getDefaultSgMap(2); + // Given that the result is 1D, the layout of the operand should be 2D with + // default layout. + LayoutInfo operandLayout = getDefaultLayoutInfo(2); propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); - /// Accumulator should have the same layout as the result. + // Accumulator should have the same layout as the result. propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); } /// Propagate the layout of the result tensor to the source tensor descriptor in /// UpdateNdOffsetOp. -void SGMapPropagation::visitUpdateNdOffsetOp( - xegpu::UpdateNdOffsetOp updateNdOffset, ArrayRef operands, - ArrayRef results) { - /// The layout of the result must be present. - auto resultLayout = results[0]->getValue(); +void LayoutInfoPropagation::visitUpdateNdOffsetOp( + xegpu::UpdateNdOffsetOp updateNdOffset, + ArrayRef operands, + ArrayRef results) { + // The layout of the result must be present. + LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - /// Propagate the layout to the source operand. + // Propagate the layout to the source operand. propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); } /// Set the layouts for DPAS A, B, and C operands. 
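+/// For example (illustrative, 16-bit operands such as f16): A gets
+/// lane_layout [1, 16] / lane_data [1, 1], while B gets lane_layout [1, 16] /
+/// lane_data [2, 1].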
-void SGMapPropagation::visitDpasOp(xegpu::DpasOp dpas, - ArrayRef operands, - ArrayRef results) { - auto aTy = dpas.getLhsType(); - auto bTy = dpas.getRhsType(); +void LayoutInfoPropagation::visitDpasOp( + xegpu::DpasOp dpas, ArrayRef operands, + ArrayRef results) { + VectorType aTy = dpas.getLhsType(); + VectorType bTy = dpas.getRhsType(); propagateIfChanged(operands[0], - operands[0]->meet(getSGMapForDPASOperand(aTy, 0))); + operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0))); propagateIfChanged(operands[1], - operands[1]->meet(getSGMapForDPASOperand(bTy, 1))); + operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1))); if (operands.size() > 2) { - auto cTy = dpas.getAccType(); + VectorType cTy = dpas.getAccType(); propagateIfChanged(operands[2], - operands[2]->meet(getSGMapForDPASOperand(cTy, 2))); + operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2))); } } /// Set the layout for the value and tensor descriptor operands in StoreNdOp. -void SGMapPropagation::visitStoreNdOp(xegpu::StoreNdOp store, - ArrayRef operands, - ArrayRef results) { - auto storeLayout = getDefaultSgMap(store.getValueType()); - /// Both operands should have the same layout - for (SGMapLattice *operand : operands) { +void LayoutInfoPropagation::visitStoreNdOp( + xegpu::StoreNdOp store, ArrayRef operands, + ArrayRef results) { + LayoutInfo storeLayout = getDefaultLayoutInfo(store.getValueType()); + // Both operands should have the same layout + for (LayoutInfoLattice *operand : operands) { propagateIfChanged(operand, operand->meet(storeLayout)); } } /// Propagate the layout of the value to the tensor descriptor operand in /// LoadNdOp. -void SGMapPropagation::visitLoadNdOp(xegpu::LoadNdOp load, - ArrayRef operands, - ArrayRef results) { - auto valueLayout = results[0]->getValue(); - /// Need the layout of the value to propagate to the tensor descriptor. +void LayoutInfoPropagation::visitLoadNdOp( + xegpu::LoadNdOp load, ArrayRef operands, + ArrayRef results) { + LayoutInfo valueLayout = results[0]->getValue(); + // Need the layout of the value to propagate to the tensor descriptor. if (!valueLayout.isAssigned()) return; - SGMap tensorDescLayout = valueLayout; - /// LoadNdOp has the transpose effect. However, at the stage of this analysis - /// this effect is not expected and should be abstracted away. Emit a warning. + LayoutInfo tensorDescLayout = valueLayout; + // LoadNdOp has the transpose effect. However, at the stage of this analysis + // this effect is not expected and should be abstracted away. Emit a warning. if (auto transpose = load.getTranspose()) { load.emitWarning("Transpose effect is not expected for LoadNdOp at " - "SGMapPropagation stage."); + "LayoutInfoPropagation stage."); tensorDescLayout = valueLayout.getTransposedLayout(transpose.value()); } - /// Propagate the new layout to the tensor descriptor operand. + // Propagate the new layout to the tensor descriptor operand. propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); } /// For vector::TransposeOp, the layout of the result is transposed and /// propagated to the operand. -void SGMapPropagation::visitTransposeOp( - vector::TransposeOp transpose, ArrayRef operands, - ArrayRef results) { - /// Need the layout of transpose result to propagate to the operands. - auto resultLayout = results[0]->getValue(); +void LayoutInfoPropagation::visitTransposeOp( + vector::TransposeOp transpose, ArrayRef operands, + ArrayRef results) { + // Need the layout of transpose result to propagate to the operands. 
+ LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - auto newLayout = resultLayout.getTransposedLayout(transpose.getPermutation()); - /// Propagate the new layout to the vector operand. + LayoutInfo newLayout = + resultLayout.getTransposedLayout(transpose.getPermutation()); + // Propagate the new layout to the vector operand. propagateIfChanged(operands[0], operands[0]->meet(newLayout)); } -/// For vector::BitCastOp, the wi_data of the source layout is changed based on -/// the bit width of the source and result types. -void SGMapPropagation::visitVectorBitcastOp( - vector::BitCastOp bitcast, ArrayRef operands, - ArrayRef results) { - /// Need the layout of bitcast result to propagate to the operands. - auto resultLayout = results[0]->getValue(); +/// For vector::BitCastOp, the lane_data of the source layout is changed based +/// on the bit width of the source and result types. +void LayoutInfoPropagation::visitVectorBitcastOp( + vector::BitCastOp bitcast, ArrayRef operands, + ArrayRef results) { + // Need the layout of bitcast result to propagate to the operands. + LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - auto inElemTyBitWidth = + int inElemTyBitWidth = bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth(); - auto outElemTyBitWidth = + int outElemTyBitWidth = bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth(); - /// WiLayout does not change. - const WiLayout &newWiLayout = resultLayout.getLayout(); - const WiData &currData = resultLayout.getData(); - WiData newWiData; - /// It's a widening bitcast + // LaneLayout does not change. + const LaneLayout &newLaneLayout = resultLayout.getLayout(); + const LaneData &currData = resultLayout.getData(); + LaneData newLaneData; + // It's a widening bitcast if (inElemTyBitWidth < outElemTyBitWidth) { - auto ratio = outElemTyBitWidth / inElemTyBitWidth; - newWiData = resultLayout.getData()[0] == 1 - ? WiData({1, currData[1] * ratio}) - : WiData({currData[0] * ratio, 1}); + int ratio = outElemTyBitWidth / inElemTyBitWidth; + newLaneData = resultLayout.getData()[0] == 1 + ? LaneData({1, currData[1] * ratio}) + : LaneData({currData[0] * ratio, 1}); } else { - /// It's a narrowing bitcast - auto ratio = inElemTyBitWidth / outElemTyBitWidth; - newWiData = resultLayout.getData()[0] == 1 - ? WiData({1, currData[1] / ratio}) - : WiData({currData[0] / ratio, 1}); + // It's a narrowing bitcast + int ratio = inElemTyBitWidth / outElemTyBitWidth; + newLaneData = resultLayout.getData()[0] == 1 + ? LaneData({1, currData[1] / ratio}) + : LaneData({currData[0] / ratio, 1}); } propagateIfChanged(operands[0], - operands[0]->meet(SGMap(newWiLayout, newWiData))); + operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData))); } /// Propagate the layout of the result to the tensor descriptor and mask /// operands in LoadGatherOp. -void SGMapPropagation::visitLoadGatherOp( - xegpu::LoadGatherOp load, ArrayRef operands, - ArrayRef results) { - auto valueLayout = results[0]->getValue(); - /// Need the layout of the value to propagate to the tensor descriptor. +void LayoutInfoPropagation::visitLoadGatherOp( + xegpu::LoadGatherOp load, ArrayRef operands, + ArrayRef results) { + LayoutInfo valueLayout = results[0]->getValue(); + // Need the layout of the value to propagate to the tensor descriptor. 
if (!valueLayout.isAssigned()) return; - SGMap tensorDescLayout = valueLayout; + LayoutInfo tensorDescLayout = valueLayout; if (load.getTranspose()) { - /// LoadGatherOp has the transpose effect. However, at the stage of this - /// analyis this effect is not expected and should be abstracted away. Emit - /// a warning. + // LoadGatherOp has the transpose effect. However, at the stage of this + // analyis this effect is not expected and should be abstracted away. Emit + // a warning. load.emitWarning("Transpose effect is not expected for LoadGatherOp at " - "SGMapPropagation stage."); + "LayoutInfoPropagation stage."); tensorDescLayout = valueLayout.getTransposedLayout({1, 0}); } - /// Mask operand should have 1D default layout. - auto maskLayout = getDefaultSgMap(1); - /// Propagate the new layout to the tensor descriptor operand. + // Mask operand should have 1D default layout. + LayoutInfo maskLayout = getDefaultLayoutInfo(1); + // Propagate the new layout to the tensor descriptor operand. propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); - /// Propagate the new layout to the mask operand. + // Propagate the new layout to the mask operand. propagateIfChanged(operands[1], operands[1]->meet(maskLayout)); } /// Propagate the layout of the descriptor to the vector offset operand in /// CreateDescOp. -void SGMapPropagation::visitCreateDescOp( - xegpu::CreateDescOp createDesc, ArrayRef operands, - ArrayRef results) { - auto descLayout = results[0]->getValue(); - /// Need the layout of the descriptor to propagate to the operands. +void LayoutInfoPropagation::visitCreateDescOp( + xegpu::CreateDescOp createDesc, ArrayRef operands, + ArrayRef results) { + LayoutInfo descLayout = results[0]->getValue(); + // Need the layout of the descriptor to propagate to the operands. if (!descLayout.isAssigned()) return; - /// For offset operand propagate 1D default layout. - SGMap layout = getDefaultSgMap(1); + // For offset operand propagate 1D default layout. + LayoutInfo layout = getDefaultLayoutInfo(1); propagateIfChanged(operands[1], operands[1]->meet(layout)); } /// Set the layout for the value, tensor descriptor, and mask operands in the /// StoreScatterOp. -void SGMapPropagation::visitStoreScatterOp( - xegpu::StoreScatterOp storeScatter, ArrayRef operands, - ArrayRef results) { - /// Currently, for 2D StoreScatterOp we expect that the height dimension of - /// the tensor descriptor is evenly divisible by the subgroup size. - /// TODO: Add support for other 2D shapes. - auto tdescShape = storeScatter.getTensorDescType().getShape(); - if (tdescShape.size() > 1 && tdescShape[0] % subgroupSize != 0) { - storeScatter.emitError("Height dimension of the tensor descriptor should " - "be evenly divisible by the subgroup size."); - return; - } - auto valueLayout = getDefaultSgMap(storeScatter.getValueType()); - SGMap storeScatterLayout = valueLayout; +void LayoutInfoPropagation::visitStoreScatterOp( + xegpu::StoreScatterOp storeScatter, ArrayRef operands, + ArrayRef results) { + // Currently, for 2D StoreScatterOp we expect that the height dimension of + // the tensor descriptor is equal to the subgroup size. This is ensured by + // the op verifier. 
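+  // (Illustrative: with subgroupSize = 16, a tensor_desc<16x8xf32> is
+  // accepted; any other height would trip the assert below.)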
+ ArrayRef tdescShape = storeScatter.getTensorDescType().getShape(); + if (tdescShape.size() > 1) + assert( + tdescShape[0] == subgroupSize && + "Expected the first dimension of 2D tensor descriptor to be equal to " + "subgroup size."); + + LayoutInfo valueLayout = getDefaultLayoutInfo(storeScatter.getValueType()); + LayoutInfo storeScatterLayout = valueLayout; if (storeScatter.getTranspose()) { - /// StoreScatteOp allows transpose effect. However, at the stage of this - /// analyis this effect is not expected and should be abstracted away. Emit - /// a warning. + // StoreScatteOp allows transpose effect. However, at the stage of this + // analyis this effect is not expected and should be abstracted away. Emit + // a warning. storeScatter.emitWarning("Transpose effect is not expected for " - "StoreScatterOp at SGMapPropagation stage."); + "StoreScatterOp at LayoutInfoPropagation stage."); storeScatterLayout = valueLayout.getTransposedLayout({1, 0}); } - /// Propagate the value layout. + // Propagate the value layout. propagateIfChanged(operands[0], operands[0]->meet(valueLayout)); - /// Propagate the tensor descriptor layout. + // Propagate the tensor descriptor layout. propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout)); - /// Use default 1D layout for mask operand. - auto maskLayout = getDefaultSgMap(1); + // Use default 1D layout for mask operand. + LayoutInfo maskLayout = getDefaultLayoutInfo(1); propagateIfChanged(operands[2], operands[2]->meet(maskLayout)); } namespace { -///===----------------------------------------------------------------------===/// -/// RunSGMapPropagation -///===----------------------------------------------------------------------===/// +//===----------------------------------------------------------------------===// +// RunLayoutInfoPropagation +//===----------------------------------------------------------------------===// -/// Driver class for running the SGMapPropagation analysis. -class RunSGMapPropagation { +/// Driver class for running the LayoutInfoPropagation analysis. 
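+/// Illustrative usage: construct it with the top-level operation, then query
+/// the layout of any value via getLayoutInfo(value) or dump all results with
+/// printAnalysisResult(os).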
+class RunLayoutInfoPropagation { public: - RunSGMapPropagation(Operation *op) : target(op) { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation) + + RunLayoutInfoPropagation(Operation *op) : target(op) { SymbolTableCollection symbolTable; solver.load(); solver.load(); - solver.load(symbolTable); + solver.load(symbolTable); (void)solver.initializeAndRun(op); } - SGMap getSGMap(Value val); + LayoutInfo getLayoutInfo(Value val); void printAnalysisResult(llvm::raw_ostream &os); @@ -575,21 +618,21 @@ class RunSGMapPropagation { }; } // namespace -SGMap RunSGMapPropagation::getSGMap(Value val) { - auto *state = solver.lookupState(val); +LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) { + auto *state = solver.lookupState(val); if (!state) return {}; return state->getValue(); } -void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) { +void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { auto printFunctionResult = [&](FunctionOpInterface funcOp) { os << "function: " << funcOp.getName() << ":\n"; // Function arguments - for (auto arg : funcOp.getArguments()) { - auto layout = getSGMap(arg); + for (BlockArgument arg : funcOp.getArguments()) { + LayoutInfo layout = getLayoutInfo(arg); os << "argument: " << arg << "\n"; - os << "sg_map : "; + os << "layout : "; layout.print(os); os << "\n"; } @@ -599,16 +642,16 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) { if (op->getResults().empty()) return; os << "op : "; - /// For control-flow ops, print the op name only. + // For control-flow ops, print the op name only. if (isa(op) || isa(op)) os << op->getName(); else op->print(os); os << "\n"; - /// Print the sg_map for each result. + // Print the layout for each result. for (auto [i, r] : llvm::enumerate(op->getResults())) { - auto layout = getSGMap(r); - os << "sg_map for result #" << i << ": "; + LayoutInfo layout = getLayoutInfo(r); + os << "layout for result #" << i << ": "; layout.print(os); os << "\n"; } @@ -620,19 +663,757 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) { for (auto funcOp : modOp.getOps()) { funcOps.push_back(funcOp); } - /// Collect all GpuFuncOps in the module. + // Collect all GpuFuncOps in the module. for (auto gpuModOp : modOp.getOps()) { for (auto gpuFuncOp : gpuModOp.getOps()) { funcOps.push_back(gpuFuncOp); } } } - /// Print the analysis result for each function. - for (auto funcOp : funcOps) { + // Print the analysis result for each function. + for (FunctionOpInterface funcOp : funcOps) { printFunctionResult(funcOp); } } +namespace { + +//===----------------------------------------------------------------------===// +// LayoutAttrAssignment +//===----------------------------------------------------------------------===// + +/// This class is responsible for assigning the layout attributes to the ops and +/// their users based on the layout propagation analysis result. +class LayoutAttrAssignment { +public: + LayoutAttrAssignment(Operation *top, + function_ref getLayout) + : getAnalysisResult(getLayout), top(top) {} + + LogicalResult run(); + +private: + LogicalResult assign(Operation *op); + void assignToUsers(Value v, xegpu::LayoutAttr layout); + xegpu::LayoutAttr getLayoutAttrForValue(Value v); + LogicalResult resolveConflicts(); + // Callable to get the layout of a value based on the layout propagation + // analysis. 
+ function_ref getAnalysisResult; + Operation *top; +}; + +} // namespace + +/// Helper to assign the layout attribute to the users of the value. +void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) { + for (OpOperand &user : v.getUses()) { + Operation *owner = user.getOwner(); + unsigned operandNumber = user.getOperandNumber(); + // Use a generic name for ease of querying the layout attribute later. + std::string attrName = + operandLayoutNamePrefix + std::to_string(operandNumber); + owner->setAttr(attrName, layout); + } +} + +/// Convert the layout assigned to a value to xegpu::LayoutAttr. +xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) { + LayoutInfo layout = getAnalysisResult(v); + if (!layout.isAssigned()) + return {}; + SmallVector laneLayout, laneData; + for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), + layout.getDataAsArrayRef())) { + laneLayout.push_back(static_cast(layout)); + laneData.push_back(static_cast(data)); + } + return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData); +} + +/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned +/// based on the layout propagation analysis result. +LogicalResult LayoutAttrAssignment::assign(Operation *op) { + // For function ops, propagate the function argument layout to the users. + if (auto func = dyn_cast(op)) { + for (BlockArgument arg : func.getArguments()) { + xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg); + if (layoutInfo) { + assignToUsers(arg, layoutInfo); + } + } + return success(); + } + // If no results, move on. + if (op->getNumResults() == 0) + return success(); + // If all the results are scalars, move on. + if (llvm::all_of(op->getResultTypes(), + [](Type t) { return t.isIntOrIndexOrFloat(); })) + return success(); + // If the op has more than one result and at least one result is a tensor + // descriptor, exit. This case is not supported yet. + // TODO: Support this case. + if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type t) { + return isa(t); + })) { + LLVM_DEBUG( + DBGS() << op->getName() + << " op has more than one result and at least one is a tensor " + "descriptor. This case is not handled.\n"); + return failure(); + } + // If the result is a tensor descriptor, attach the layout to the tensor + // descriptor itself. + if (auto tensorDescTy = + dyn_cast(op->getResultTypes()[0])) { + xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0)); + if (!layoutInfo) { + LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n"); + return failure(); + } + + // Clone the op, attach the layout to the result tensor descriptor, and + // remove the original op. + OpBuilder builder(op); + Operation *newOp = builder.clone(*op); + auto newTensorDescTy = xegpu::TensorDescType::get( + tensorDescTy.getContext(), tensorDescTy.getShape(), + tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo); + newOp->getResult(0).setType(newTensorDescTy); + op->replaceAllUsesWith(newOp->getResults()); + op->erase(); + return success(); + } + // Otherwise simply attach the layout to the op itself. + for (auto [i, r] : llvm::enumerate(op->getResults())) { + xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r); + if (layoutInfo) { + std::string attrName = resultLayoutNamePrefix + std::to_string(i); + op->setAttr(attrName, layoutInfo); + // Attach the layout attribute to the users of the result. 
+ assignToUsers(r, layoutInfo); + } + } + return success(); +} + +/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users. +LogicalResult LayoutAttrAssignment::run() { + auto walkResult = top->walk([&](Operation *op) { + if (failed(assign(op))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + + if (walkResult.wasInterrupted()) + return failure(); + + return resolveConflicts(); +} + +/// TODO: Implement the layout conflict resolution. This must ensure mainly two +/// things: +/// 1) Is a given layout supported by the op? (need to query the target +/// HW info). Otherwise can we achive this layout using a layout conversion? +/// 2) Do all the operands have the required layout? If not, can it +/// be resolved using a layout conversion? +LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); } + +namespace { + +//===----------------------------------------------------------------------===// +// SIMT Distribution Patterns +//===----------------------------------------------------------------------===// + +/// Helper function to get distributed vector type for a source vector type +/// according to the lane_layout. We simply divide each dimension of tensor +/// descriptor shape by corresponding lane_layout dimension. If +/// array_length > 1, that is appended to the front of the ditributed shape. +/// NOTE: This is the vector type that will be returned by the +/// gpu.warp_execute_on_lane0 op. +/// +/// Examples: +/// | original vector shape | lane_layout | distributed vector shape | +/// |-----------------------|-------------|--------------------------| +/// | 32x16 | [1, 16] | 32x1 | +/// | 32x16 | [2, 8] | 16x2 | +/// | 2x32x16 | [1, 16] | 2x32x1 | +static FailureOr +getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout, + VectorType originalType) { + if (!layout) + return failure(); + + auto laneLayout = layout.getLaneLayout().asArrayRef(); + assert(originalType.getShape().size() >= laneLayout.size() && + "Rank of the original vector type should be greater or equal to the " + "size of the lane layout to distribute the vector type."); + SmallVector distributedShape(originalType.getShape()); + // Only distribute the last `laneLayout.size()` dimensions. The remaining + // dimensions are not distributed. + unsigned distributionStart = originalType.getRank() - laneLayout.size(); + for (auto [i, dim] : llvm::enumerate(originalType.getShape())) { + if (i < distributionStart) { + continue; + } + // Check if the dimension can be distributed evenly. + if (dim % laneLayout[i - distributionStart] != 0) + return failure(); + distributedShape[i] = dim / laneLayout[i - distributionStart]; + } + return VectorType::get(distributedShape, originalType.getElementType()); +} + +// Drop the layout attribute from the tensor descriptor type if layout is +// present. +static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) { + if (tensorDesc.getLayoutAttr() == xegpu::LayoutAttr()) + return tensorDesc; + + return xegpu::TensorDescType::get( + tensorDesc.getContext(), tensorDesc.getShape(), + tensorDesc.getElementType(), tensorDesc.getEncoding(), + xegpu::LayoutAttr()); +} + +/// Helper function to resolve types if the distributed type out of +/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type. 
+/// Example 1: +/// distributed type: vector<8x1xf32> +/// expected type: vector<8xf32> +/// resolved using, +/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32> +/// Example 2: +/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>> +/// expected type: xegpu.tensor_desc<8x16xf32> +/// resolved using, +/// %0 = unrealized_conversion_cast %1 : +/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> -> +/// xegpu.tensor_desc<8x16xf32> +template +static Value resolveDistributedTy(Value orig, T expected, + PatternRewriter &rewriter) { + // If orig and expected types are the same, return orig. + if (orig.getType() == expected) + return orig; + // If orig is a vector type, create a shape cast op to reconcile the types. + if (auto origVecType = isa(orig.getType())) { + auto castOp = + rewriter.create(orig.getLoc(), expected, orig); + return castOp.getResult(); + } + // If orig is a tensor descriptor type, create an unrealized conversion cast + // op to reconcile the types. + if (auto origTensorDescTy = isa(orig.getType())) { + auto castOp = rewriter.create(orig.getLoc(), + expected, orig); + return castOp.getResult(0); + } + llvm_unreachable("Unsupported type for reconciliation"); + return orig; +} + +/// Helper function to filter out the temporary layout attributes attached +/// during the layout assignment process. These are not needed after going to +/// SIMT. +static SmallVector +removeTemporaryLayoutAttributes(ArrayRef attrs) { + SmallVector newAttrs; + for (NamedAttribute attr : attrs) { + if (attr.getName().strref().contains(operandLayoutNamePrefix) || + attr.getName().strref().contains(resultLayoutNamePrefix)) { + continue; + } + newAttrs.push_back(attr); + } + return newAttrs; +} + +/// Helper function to check if the layout is packed. Layout is packed if it is +/// 2D and lane_data[0] != 1 (data packed from col dimension). +static bool hasPackedLayout(xegpu::LayoutAttr layout) { + if (layout == xegpu::LayoutAttr()) + return false; + DenseI32ArrayAttr laneData = layout.getLaneData(); + if (!laneData || laneData.size() != 2) + return false; + return laneData.asArrayRef()[0] != 1; +} + +/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body +/// of the original GPUFuncOp to the new GPUFuncOp such that entire body is +/// contained within a WarpExecuteOnLane0Op. +/// Example: +/// +/// ``` +/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> { +/// ... +/// ... +/// gpu.return %result: vector<8x16xf32> +/// } +/// ``` +/// To +/// ``` +/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> { +/// %laneid = gpu.lane_id : index +/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> { +/// ... +/// ... +/// gpu.yield %result: vector<8x16xf32> +/// } +/// return %0 +/// } +struct MoveFuncBodyToWarpExecuteOnLane0 + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, + PatternRewriter &rewriter) const override { + // If the function only contains a single void return, skip. + if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) { + return isa(op) && !op.getNumOperands(); + })) + return failure(); + // If the function already moved inside a warp_execute_on_lane0, skip. + if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) { + return isa(op); + })) + return failure(); + // Create a new function with the same signature. 
+ auto newGpuFunc = rewriter.create( + gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType()); + // Create a WarpExecuteOnLane0Op with same arguments and results as the + // original gpuFuncOp. + rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front()); + auto laneId = rewriter.create( + newGpuFunc.getLoc(), rewriter.getIndexType(), + /** upperBound = **/ mlir::IntegerAttr()); + ArrayRef gpuFuncResultType = gpuFuncOp.getFunctionType().getResults(); + auto warpOp = rewriter.create( + laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize, + newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes()); + Block &warpBodyBlock = warpOp.getBodyRegion().front(); + // Replace the ReturnOp of the original gpu function with a YieldOp. + auto origRetunOp = + cast(gpuFuncOp.getBlocks().back().getTerminator()); + rewriter.setInsertionPointAfter(origRetunOp); + rewriter.create(origRetunOp.getLoc(), + origRetunOp.getOperands()); + rewriter.eraseOp(origRetunOp); + // Move the original function body to the WarpExecuteOnLane0Op body. + rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(), + warpOp.getBodyRegion().begin()); + rewriter.eraseBlock(&warpBodyBlock); + // Insert a new ReturnOp after the WarpExecuteOnLane0Op. + rewriter.setInsertionPointAfter(warpOp); + rewriter.create(newGpuFunc.getLoc(), warpOp.getResults()); + rewriter.replaceOp(gpuFuncOp, newGpuFunc); + return success(); + } +}; + +/// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing +/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will +/// still contain the original op that will not be used by the yield op (and +/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's +/// arguments. Tensor descriptor shape is not distributed because it is a +/// uniform value across all work items within the subgroup. However, the +/// layout information is dropped in the new tensor descriptor type. +/// +/// Example: +/// +/// ``` +/// #lo0 = #xegpu.layout +/// %r = gpu.warp_execute_on_lane_0(%laneid) -> +/// (!xegpu.tensor_desc<4x8xf32, #lo0>) { +/// ... +/// %td = xegpu.create_nd_tdesc %arg0[0, 0] +/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0> +/// vector.yield %td +/// } +/// ``` +/// To +/// ``` +/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) { +/// ... 
+///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+///     vector.yield %arg0, %dead
+///   }
+///   %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+///                                 -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+    auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+    unsigned operandIdx = operand->getOperandNumber();
+
+    xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          descOp, "the tensor descriptor lacks layout attribute");
+
+    SmallVector<size_t> newRetIndices;
+    SmallVector<Value> newYieldValues;
+    SmallVector<Type> newYieldTypes;
+
+    for (Value operand : descOp->getOperands()) {
+      newYieldValues.push_back(operand);
+      newYieldTypes.push_back(operand.getType());
+    }
+    rewriter.setInsertionPoint(subgroupOp);
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
+        /* new yielded types = */ newYieldTypes, newRetIndices);
+
+    SmallVector<Value> newDescOperands;
+    for (size_t i : newRetIndices) {
+      newDescOperands.push_back(newWarpOp.getResult(i));
+    }
+    rewriter.setInsertionPointAfter(newWarpOp);
+    xegpu::TensorDescType distributedTensorDescTy =
+        dropLayouts(descOp.getType()); // Distributed tensor descriptor type
+                                       // does not contain layout info.
+    auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+        newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
+        descOp->getAttrs());
+
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+    return success();
+  }
+};
+
+/// Distribute a store_nd op at the end of the enclosing
+/// `gpu.warp_execute_on_lane_0` region. The store's arguments are passed
+/// through the warp op interface, i.e. they are propagated as additional
+/// yielded values of the warp op. The source vector is distributed based on
+/// the lane layout. Appropriate cast ops are inserted if the distributed
+/// types do not match the expected xegpu SIMT types.
+///
+/// Example:
+///
+/// ```
+///   #lo0 = #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 1]>
+///   gpu.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+///                                  !xegpu.tensor_desc<4x8xf32, #lo0>
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+///                                      !xegpu.tensor_desc<4x8xf32, #lo0>) {
+///     gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
+///                                              #lo0>
+///   }
+///   %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
+///   %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, #lo0>
+///                                      -> !xegpu.tensor_desc<4x8xf32>
+///   xegpu.store_nd %0, %1: vector<4xf32>,
+///                          !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    auto yield = cast<gpu::YieldOp>(
+        subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+    Operation *lastNode = yield->getPrevNode();
+    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+    if (!storeOp)
+      return failure();
+
+    xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
+    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          storeOp, "the source tensor descriptor lacks layout attribute");
+
+    FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
+    if (failed(distributedTypeByWarpOpOrFailure))
+      return rewriter.notifyMatchFailure(storeOp,
+                                         "Failed to distribute the type");
+    VectorType distributedTypeByWarpOp =
+        distributedTypeByWarpOpOrFailure.value();
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp,
+        /* new yielded values = */
+        ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
+        /* new yielded types = */
+        TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
+        newRetIndices);
+    // Create a new store op outside the warp op with the distributed vector
+    // type. The tensor descriptor is not distributed.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    SmallVector<Value> newStoreOperands;
+
+    // For the value operand, there can be a mismatch between the vector type
+    // distributed by the warp op and the (xegpu-specific) distributed type
+    // supported by the store op. Such a mismatch must be resolved using an
+    // appropriate cast op.
+    FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
+        xegpu::getDistributedVectorType(storeOp.getTensorDescType());
+    if (failed(storeNdDistributedValueTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          storeOp, "Failed to get distributed vector type for the store op");
+    newStoreOperands.push_back(resolveDistributedTy(
+        newWarpOp.getResult(newRetIndices[0]),
+        storeNdDistributedValueTyOrFailure.value(), rewriter));
+    // For the tensor descriptor operand, the layout attribute is dropped after
+    // distribution. Types also need to be resolved in this case.
+    xegpu::TensorDescType distributedTensorDescTy =
+        dropLayouts(storeOp.getTensorDescType());
+    newStoreOperands.push_back(
+        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
+                             distributedTensorDescTy, rewriter));
+
+    rewriter.create<xegpu::StoreNdOp>(
+        newWarpOp.getLoc(), TypeRange{}, newStoreOperands,
+        removeTemporaryLayoutAttributes(storeOp->getAttrs()));
+    rewriter.eraseOp(storeOp);
+    return success();
+  }
+};
+
+/// Distribute a load_nd op feeding into vector.yield op for the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later). The yield op will
+/// bypass the load's arguments. Only the loaded vector is distributed
+/// according to the lane layout; the tensor descriptor type is not
+/// distributed. Appropriate cast ops are inserted if the distributed types do
+/// not match the expected xegpu SIMT types.
+///
+/// Example:
+///
+/// ```
+///   #lo0 = #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 1]>
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (vector<4x1xf32>) {
+///     ...
+///     %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #lo0> ->
+///       vector<4x8xf32>
+///     gpu.yield %ld
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+///                                      !xegpu.tensor_desc<4x8xf32, #lo0>) {
+///     ...
+///     %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #lo0> ->
+///       vector<4x8xf32>
+///     gpu.yield %dead, %arg0
+///   }
+///   %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, #lo0>
+///                                      -> !xegpu.tensor_desc<4x8xf32>
+///   %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
+///   %2 = vector.shape_cast %r#0: vector<4xf32> to vector<4x1xf32>
+///
+/// ```
+struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          subgroupOp, "warp result is not a xegpu::LoadNd op");
+
+    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
+    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          loadOp, "the source tensor descriptor lacks layout attribute");
+
+    unsigned operandIdx = operand->getOperandNumber();
+    VectorType distributedTypeByWarpOp =
+        cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp,
+        /* new yielded values = */ loadOp.getTensorDesc(),
+        /* new yielded types = */ tensorDescTy, newRetIndices);
+
+    // Create a new load op outside the warp op with the distributed vector
+    // type.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    FailureOr<VectorType> loadNdDistValueTyOrFailure =
+        xegpu::getDistributedVectorType(loadOp.getTensorDescType());
+    if (failed(loadNdDistValueTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          loadOp, "Failed to get distributed vector type for the load op");
+    xegpu::TensorDescType distributedTensorDescTy =
+        dropLayouts(loadOp.getTensorDescType()); // Distributed tensor
+                                                 // descriptor type does not
+                                                 // contain layout info.
+    auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+        newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
+        resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
+                             distributedTensorDescTy, rewriter),
+        removeTemporaryLayoutAttributes(loadOp->getAttrs()));
+    // Set the packed attribute if the layout requires it.
+    newLoadOp.setPacked(hasPackedLayout(layout));
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    // There can be a conflict between the vector type distributed by the
+    // warp op and the (xegpu-specific) distributed type supported by the load
+    // op. Resolve these mismatches by inserting a cast.
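+    // For example, with lane_layout = [1, 8] and lane_data = [1, 1] the warp
+    // op yields a vector<4x1xf32>, while the SIMT form of xegpu.load_nd
+    // produces a vector<4xf32> (see the example above); resolveDistributedTy
+    // reconciles the two with a vector.shape_cast.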
+ Value tyResolvedVal = resolveDistributedTy( + newLoadOp.getResult(), distributedTypeByWarpOp, rewriter); + rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal); + return success(); + } +}; + +/// Distribute a dpas op feeding into vector.yield op for the enclosing +/// `gpu.warp_execute_on_lane_0` and put it after the warp op. +/// The warp op will still contain the original op that will not be used by +/// the yield op (and should be cleaned up later). The yield op will +/// bypass the dpas's arguments. Appropriate cast ops are inserted if the +/// distributed types does not match expected xegpu SIMT types. +/// Example: +/// ``` +/// #lo_a = #xegpu.layout +/// #lo_b = #xegpu.layout +/// #lo_c = #xegpu.layout +/// %r = gpu.warp_execute_on_lane_0(%laneid) -> +/// (vector<8x1xf32>) { +/// ... +/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> -> +/// vector<8x16xf32> +/// gpu.yield %dpas +/// } +/// ``` +/// To +/// ``` +/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>, +/// vector<8x1xf16>, vector<16x1xf16>) { +/// ... +/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> +/// -> vector<8x16xf32> +/// gpu.yield %dead, %arg0, %arg1 +/// } +/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16> +/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16> +/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> -> +/// vector<8xf32> +/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32> +/// ``` +struct DpasDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = + getWarpResult(subgroupOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure(subgroupOp, + "warp result is not a xegpu::Dpas op"); + + auto dpasOp = operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + std::string layoutAName = + llvm::formatv("{0}{1}", operandLayoutNamePrefix, 0).str(); + std::string layoutBName = + llvm::formatv("{0}{1}", operandLayoutNamePrefix, 1).str(); + auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str(); + xegpu::LayoutAttr layoutA = + dpasOp->getAttrOfType(layoutAName); + xegpu::LayoutAttr layoutB = + dpasOp->getAttrOfType(layoutBName); + xegpu::LayoutAttr layoutOut = + dpasOp->getAttrOfType(layoutCName); + if (!layoutA || !layoutB || !layoutOut) + return rewriter.notifyMatchFailure( + dpasOp, + "the xegpu::Dpas op lacks layout attribute for A, B or output"); + + FailureOr distLhsTypeByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType()); + FailureOr distRhsTypeByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType()); + FailureOr distResultTypeByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType()); + if (failed(distLhsTypeByWarpOpOrFailure) || + failed(distRhsTypeByWarpOpOrFailure) || + failed(distResultTypeByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + dpasOp, + "Failed to distribute the A, B or output types in xegpu::Dpas op"); + + llvm::SmallVector newYieldValues{dpasOp.getLhs(), + dpasOp.getRhs()}; + llvm::SmallVector newYieldTypes{ + distLhsTypeByWarpOpOrFailure.value(), + distRhsTypeByWarpOpOrFailure.value()}; + // Dpas acc operand is optional. 
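+    // If present, the accumulator is distributed the same way as the result,
+    // e.g. vector<8x16xf32> becomes vector<8x1xf32> for lane_layout [1, 16]
+    // (see the example above).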
+ if (dpasOp.getAcc()) { + newYieldValues.push_back(dpasOp.getAcc()); + newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value()); + } + // Create a new warp op without the dpas. + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices); + + FailureOr expectedDistLhsTyOrFailure = + xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA); + FailureOr expectedDistRhsTyOrFailure = + xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB); + FailureOr expectedDistResultTyOrFailure = + xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut); + if (failed(expectedDistLhsTyOrFailure) || + failed(expectedDistRhsTyOrFailure) || + failed(expectedDistResultTyOrFailure)) + return rewriter.notifyMatchFailure( + dpasOp, + "Failed to get distributed vector type for the dpas operands."); + // Create a new dpas op outside the warp op. + rewriter.setInsertionPointAfter(newWarpOp); + SmallVector newDpasOperands; + SmallVector newDpasOperandExpectedTypes; + + // Resolve the distributed types with the original types. + newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value()); + newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value()); + VectorType distributedResultTy = expectedDistResultTyOrFailure.value(); + if (dpasOp.getAcc()) + newDpasOperandExpectedTypes.push_back(distributedResultTy); + + for (unsigned i = 0; i < newRetIndices.size(); i++) { + newDpasOperands.push_back( + resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]), + newDpasOperandExpectedTypes[i], rewriter)); + } + Value newDpasOp = rewriter.create( + newWarpOp->getLoc(), distributedResultTy, newDpasOperands, + removeTemporaryLayoutAttributes(dpasOp->getAttrs())); + Value distributedVal = newWarpOp.getResult(operandIdx); + // Resolve the output type. + newDpasOp = resolveDistributedTy( + newDpasOp, distResultTypeByWarpOpOrFailure.value(), rewriter); + rewriter.replaceAllUsesWith(distributedVal, newDpasOp); + return success(); + } +}; + +} // namespace + namespace { struct XeGPUSubgroupDistributePass final : public xegpu::impl::XeGPUSubgroupDistributeBase< @@ -646,14 +1427,61 @@ struct XeGPUSubgroupDistributePass final }; } // namespace -void XeGPUSubgroupDistributePass::runOnOperation() { - Operation *op = getOperation(); - RunSGMapPropagation solver(op); +void xegpu::populateXeGPUSubgroupDistributePatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} - // Print the analysis result and exit. +void XeGPUSubgroupDistributePass::runOnOperation() { + auto &analyis = getAnalysis(); + // Print the analysis result and exit. (for testing purposes) if (printOnly) { auto &os = llvm::outs(); - solver.printAnalysisResult(os); + analyis.printAnalysisResult(os); + return; + } + auto getPropagatedLayout = [&](Value val) { + return analyis.getLayoutInfo(val); + }; + + // Assign xegpu::LayoutAttr to all ops and their users based on the layout + // propagation analysis result. + LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout); + if (failed(layoutAssignment.run())) { + signalPassFailure(); + return; + } + + // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 + // operation. 
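+  // Note: this runs as a separate greedy rewrite so that the distribution
+  // patterns applied afterwards always operate on function bodies that are
+  // already wrapped in a gpu.warp_execute_on_lane_0 op.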
+ { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + signalPassFailure(); + return; + } + } + // Finally, do the SIMD to SIMT distribution. + RewritePatternSet patterns(&getContext()); + xegpu::populateXeGPUSubgroupDistributePatterns(patterns); + // TODO: distributionFn and shuffleFn are not used at this point. + auto distributionFn = [](Value val) { + VectorType vecType = dyn_cast(val.getType()); + int64_t vecRank = vecType ? vecType.getRank() : 0; + OpBuilder builder(val.getContext()); + if (vecRank == 0) + return AffineMap::get(val.getContext()); + return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); + }; + auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx, + int64_t warpSz) { return Value(); }; + vector::populatePropagateWarpVectorDistributionPatterns( + patterns, distributionFn, shuffleFn); + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + signalPassFailure(); return; } } diff --git a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt new file mode 100644 index 0000000000000..afd8e2d5c4df3 --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt @@ -0,0 +1,10 @@ +add_mlir_dialect_library(MLIRXeGPUUtils + XeGPUUtils.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU/Utils + + LINK_LIBS PUBLIC + MLIRIR + MLIRXeGPUDialect + ) diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp new file mode 100644 index 0000000000000..6b45ed0ae4ced --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -0,0 +1,85 @@ +//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utility methods for working with the XeGPU dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include +#include + +using namespace mlir; + +FailureOr +mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) { + auto layout = llvm::dyn_cast_if_present(tdescTy.getLayout()); + // It only works for subgroup level layout, which only has lane_layout + // and lane_data, and is to distribute a SIMD code into SIMT code. + if (!layout || !layout.isSgLayout()) + return failure(); + + SmallVector laneData(layout.getLaneData().asArrayRef()); + SmallVector laneLayout(layout.getLaneLayout().asArrayRef()); + auto tdescShape = tdescTy.getShape(); + auto elementType = tdescTy.getElementType(); + + // compute sgSize by multiply elements of laneLayout + // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1] + // e.g. for 1D layout, sgSize = laneLayout[0] + auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1, + std::multiplies()); + + // Case 1: regular loads/stores + auto scatterAttr = tdescTy.getEncodingAsScatterTensorDescAttr(); + if (scatterAttr) { + auto chunkSize = scatterAttr.getChunkSize().getInt(); + // Verify if the first dimension of the tensor descriptor shape is + // distributable. 
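+    // For example, a 16x8xf32 scattered tensor descriptor with chunk_size = 8
+    // and lane_layout = [16, 1] is distributed to a per-lane vector<8xf32>.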
+ assert(tdescShape[0] == laneLayout[0] && + "tensor descriptor shape is not distributable"); + return VectorType::get({chunkSize}, elementType); + } + + // Case 2: block loads/stores + // Check if the tensor descriptor shape is distributable. + int64_t tensorSize = 1; + for (auto [tdescDim, laneDim, laneDataDim] : + llvm::zip_equal(tdescShape, laneLayout, laneData)) { + assert((tdescDim % (laneDim * laneDataDim) == 0) && + "tensor descriptor shape is not distributable"); + tensorSize *= tdescDim; + } + // tensorSize must be adjusted for array_length. + tensorSize *= tdescTy.getArrayLength(); + + return VectorType::get({tensorSize / sgSize}, elementType); +} + +FailureOr +mlir::xegpu::getDistributedVectorType(VectorType originalType, + xegpu::LayoutAttr layout) { + int64_t rank = originalType.getRank(); + // Distributed vector type is only supported for 1D, 2D and 3D vectors. + if (rank < 1 || rank > 3) + return failure(); + ArrayRef shape = originalType.getShape(); + // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension + // of the 3D vector. + int arrayLength = 1; + if (rank == 3) { + arrayLength = shape[0]; + shape = shape.drop_front(); + } + auto helperTdescTy = xegpu::TensorDescType::get( + shape, originalType.getElementType(), arrayLength, + /*boundary_check=*/true, + /*memory_space=*/xegpu::MemorySpace::Global, layout); + return xegpu::getDistributedVectorType(helperTdescTy); +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir new file mode 100644 index 0000000000000..f8f2cd55c28d0 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir @@ -0,0 +1,162 @@ +// RUN: mlir-opt -xegpu-subgroup-distribute -split-input-file %s | FileCheck %s + +// CHECK-LABEL: gpu.func @store_nd_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { +// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK: gpu.return +gpu.module @test { +gpu.func @store_nd_1d(%arg0: memref<16xf32>){ + %c0 = arith.constant 0 : index + %1 = arith.constant dense<1.000000e+00> : vector<16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> + xegpu.store_nd %1, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @store_nd_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) { +// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @test { +gpu.func @store_nd_2d(%arg0: memref<16x16xf16>){ + %c0 = arith.constant 0 : index + %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + gpu.return +} +} + + + +// ----- +// CHECK-LABEL: gpu.func @load_nd_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK-DAG: %[[T1:.*]] 
= xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +gpu.module @test { +gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){ + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> + %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32> + %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> + xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @load_nd_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @test { +gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){ + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @load_nd_array_length +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16> +// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> +// CHECK: xegpu.store_nd %[[T5]], %[[T4]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @test { +gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){ + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> + %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16> + %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @dpas +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG3:[0-9a-zA-Z]+]]: 
memref<8x16xf32>) { +// CHECK: %[[T1:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]] +// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>, memref<8x16xf32>) -> (vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) { +// CHECK: ^bb0(%[[ARG4:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG5:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG6:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG7:[0-9a-zA-Z]+]]: memref<8x16xf32>): +// CHECK: gpu.yield %[[ARG4]], %[[ARG5]], %[[ARG6]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> +// CHECK: } +// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[T1]]#0 : vector<8x1xf16> to vector<8xf16> +// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[T1]]#1 : vector<16x1xf16> to vector<16xf16> +// CHECK-DAG: %[[T4:.*]] = vector.shape_cast %[[T1]]#2 : vector<8x1xf32> to vector<8xf32> +// CHECK: %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[T4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> +// CHECK: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG3]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T5]], %[[T6]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +gpu.module @test { +gpu.func @dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){ + %c0 = arith.constant 0 : index + %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @load_dpas_store +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +gpu.module @test { +gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){ + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return +} +} + +// ----- +gpu.module @test { +// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, 
%[[ARG2:[0-9a-zA-Z]+]]: index, +// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, +// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, + %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0 [%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16> + %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + gpu.return +} +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir index 1ae4348af33e6..a5468681e68dc 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir @@ -2,27 +2,27 @@ // CHECK: function: test_dpas_f16: // CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: sg_map for result #0: Not assigned. +// CHECK-NEXT: layout for result #0: Not assigned. 
// CHECK-NEXT: op : %{{.*}} = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -40,17 +40,17 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg // ----- // CHECK: function: test_dpas_i8: // CHECK-NEXT: argument: of type 'vector<8x32xi8>' at index: 0 -// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 2] +// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 2] // CHECK-NEXT: argument: of type 'vector<32x16xi8>' at index: 1 -// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [4, 1] +// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [4, 1] // CHECK-NEXT: argument: of type 'memref<8x16xi32>' at index: 2 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: sg_map for result #0: Not assigned. +// CHECK-NEXT: layout for result #0: Not assigned. 
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.dpas %{{.*}} : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> @@ -62,27 +62,27 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: // ----- // CHECK: function: test_load_with_transpose_effect: // CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: sg_map for result #0: Not assigned. +// CHECK-NEXT: layout for result #0: Not assigned. // CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2] +// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] // CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] <{transpose = array}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant 
dense<0.000000e+00> : vector<8x16xf32> @@ -99,29 +99,29 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre // ----- // CHECK: function: test_vector_transpose: // CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: sg_map for result #0: Not assigned. +// CHECK-NEXT: layout for result #0: Not assigned. // CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2] +// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] // CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2] +// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] // CHECK-NEXT: op : %[[T4:.*]] = vector.transpose %[[T3]], [1, 0] : vector<16x16xf16> to vector<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T4]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -139,19 +139,19 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1 // ----- // CHECK: function: test_extf_truncf: // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: sg_map : 
wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T2:.*]] = arith.extf %[[T1]] : vector<16x16xf16> to vector<16x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T3:.*]] = arith.truncf %[[T2]] : vector<16x16xf32> to vector<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: Not assigned. +// CHECK-NEXT: layout for result #0: Not assigned. func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -164,29 +164,29 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t // ----- // CHECK: function: test_load_gather_with_transpose_effect: // CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<256xf16>' at index: 1 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: sg_map for result #0: Not assigned. +// CHECK-NEXT: layout for result #0: Not assigned. 
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T2:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2] +// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] // CHECK-NEXT: op : %[[T3:.*]] = xegpu.load %[[T2]], %[[CST0]] <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x16xf16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -204,17 +204,17 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1 // ----- // CHECK: function: test_load_gather_1d: // CHECK: argument: of type 'memref<256xf32>' at index: 0 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. 
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 -// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T1]] = xegpu.load %[[T0]], %[[CST0]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> @@ -227,15 +227,15 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc // ----- // CHECK: function: test_store_scatter_with_transpose_effect: // CHECK-NEXT: argument: of type 'memref<128xf32>' at index: 0 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1] func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32> %cst_0 = arith.constant dense : vector<16xi1> @@ -248,15 +248,15 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { // ----- // CHECK: function: test_store_scatter_1d: // CHECK-NEXT: argument: of type 'vector<16xf32>' at index: 0 -// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] // CHECK-NEXT: argument: of type 'memref<256xf32>' at index: 1 -// CHECK-NEXT: sg_map : Not assigned. 
+// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> -// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1] +// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> @@ -268,27 +268,27 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) // ----- // CHECK: function: test_vector_bitcast_i16_to_i8: // CHECK-NEXT: argument: of type 'memref<8x16xi16>' at index: 0 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<32x16xi8>' at index: 1 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<8x16xi32>' at index: 2 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: sg_map for result #0: Not assigned. +// CHECK-NEXT: layout for result #0: Not assigned. 
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] // CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] // CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x16xi16> to vector<8x32xi8> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2] // CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T4]], %[[T3]] : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> -// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1] +// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> @@ -305,29 +305,29 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref< // ----- // CHECK: function: test_vector_bitcast_i8_to_f16: // CHECK-NEXT: argument: of type 'memref<8x32xi8>' at index: 0 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<16x32xi8>' at index: 1 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: sg_map : Not assigned. +// CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: sg_map for result #0: Not assigned. +// CHECK-NEXT: layout for result #0: Not assigned. 
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x32xi8> to vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = vector.bitcast %[[T3]] : vector<16x32xi8> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
  %c0 = arith.constant 0 : index
  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
@@ -345,21 +345,21 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
// -----
// CHECK: function: test_binary_op_one_use:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T3:.*]] = arith.addf %[[T1]], %[[T2]] : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
  %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
  %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -373,23 +373,23 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
// -----
// CHECK: function: test_binary_op_multiple_uses:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T2:.*]] = arith.addf %[[T1]], %[[CST]] : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
  %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
  %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -404,39 +404,39 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
// -----
// CHECK: function: test_for_op:
// CHECK-NEXT: argument: of type 'memref<8x128xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<128x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 128 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 16 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T7:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T8:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : scf.for
-// CHECK-NEXT: sg_map for result #0: Not assigned.
-// CHECK-NEXT: sg_map for result #1: Not assigned.
-// CHECK-NEXT: sg_map for result #2: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: Not assigned.
+// CHECK-NEXT: layout for result #1: Not assigned.
+// CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
  %c0 = arith.constant 0 : index
  %c128 = arith.constant 128 : index
@@ -460,23 +460,23 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
// -----
// CHECK: function: test_if_single_use:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: argument: of type 'i1' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : scf.if
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
  %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
  %1 = scf.if %arg2 -> (vector<16x16xf16>) {
@@ -494,25 +494,25 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
// -----
// CHECK: function: test_if_multiple_uses:
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type 'i1' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 4
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : scf.if
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
  %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
  %1 = scf.if %arg2 -> (vector<16x16xf16>) {
@@ -531,13 +531,13 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
// -----
// CHECK: function: test_vector_outer_reduction:
// CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
  %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
  %0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
@@ -548,13 +548,13 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
// -----
// CHECK: function: test_vector_inner_reduction:
// CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
  %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
  %0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>