From af334a20374cade4d16df5abdc4164a8ac6261a6 Mon Sep 17 00:00:00 2001 From: toudefu <2696810816@qq.com> Date: Thu, 25 Sep 2025 18:12:52 +0800 Subject: [PATCH 1/2] conversion tritonGPUToLLVM utility decoupling (has a bug) --- .../include/flagtree_spec.h | 1 + .../TritonGPUToLLVM/iluvatar_Utility.h | 14 ++ .../lib/CMakeLists.txt | 3 +- .../Conversion/TritonGPUToLLVM/CMakeLists.txt | 7 + .../Conversion/TritonGPUToLLVM/Utility.cpp | 186 ++++++++++++++++++ .../Conversion/TritonGPUToLLVM/Utility.h | 51 ++++- .../Conversion/TritonGPUToLLVM/Utility.cpp | 25 +-- 7 files changed, 259 insertions(+), 28 deletions(-) create mode 100644 third_party/iluvatar/backend/flagtree_backend_specialization/include/triton/Conversion/TritonGPUToLLVM/iluvatar_Utility.h create mode 100644 third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt create mode 100644 third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/Utility.cpp diff --git a/third_party/iluvatar/backend/flagtree_backend_specialization/include/flagtree_spec.h b/third_party/iluvatar/backend/flagtree_backend_specialization/include/flagtree_spec.h index 065fd04f6..c579f275b 100644 --- a/third_party/iluvatar/backend/flagtree_backend_specialization/include/flagtree_spec.h +++ b/third_party/iluvatar/backend/flagtree_backend_specialization/include/flagtree_spec.h @@ -3,3 +3,4 @@ #include "triton/Analysis/iluvatar_Utility.h" #include "triton/Conversion/TritonGPUToLLVM/iluvatar_ElementwiseOpToLLVMBase.h" #include "triton/Conversion/TritonGPUToLLVM/iluvatar_TargetInfoBase.h" +#include "triton/Conversion/TritonGPUToLLVM/iluvatar_Utility.h" diff --git a/third_party/iluvatar/backend/flagtree_backend_specialization/include/triton/Conversion/TritonGPUToLLVM/iluvatar_Utility.h b/third_party/iluvatar/backend/flagtree_backend_specialization/include/triton/Conversion/TritonGPUToLLVM/iluvatar_Utility.h new file mode 100644 index 000000000..8d5b230be --- /dev/null +++ b/third_party/iluvatar/backend/flagtree_backend_specialization/include/triton/Conversion/TritonGPUToLLVM/iluvatar_Utility.h @@ -0,0 +1,14 @@ +#ifndef ILUVATAR_TRITON_CONVERSION_TRITONGPU_TO_LLVM_UTILITY_H +#define ILUVATAR_TRITON_CONVERSION_TRITONGPU_TO_LLVM_UTILITY_H + +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_heads +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_functionPtr +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_createIndexConstant +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_getMultiDimOffset_ARG bool +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_IluvatarMmaEncodingAttr +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_emitBaseIndexForLayoutImpl +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_emitOffsetForLayout +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_getSwizzledSharedPtrs +#define FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_storeDistributedToShared + +#endif // ILUVATAR_TRITON_CONVERSION_TRITONGPU_TO_LLVM_UTILITY_H \ No newline at end of file diff --git a/third_party/iluvatar/backend/flagtree_backend_specialization/lib/CMakeLists.txt b/third_party/iluvatar/backend/flagtree_backend_specialization/lib/CMakeLists.txt index 5c6d3ffe1..f0011e83b 100644 --- a/third_party/iluvatar/backend/flagtree_backend_specialization/lib/CMakeLists.txt +++ b/third_party/iluvatar/backend/flagtree_backend_specialization/lib/CMakeLists.txt @@ -1 +1,2 @@ -add_subdirectory(Analysis) \ No newline at end of file +add_subdirectory(Analysis) +add_subdirectory(Conversion/TritonGPUToLLVM) \ No newline at end of file diff --git a/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt b/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt new file mode 100644 index 000000000..884ac2f0f --- /dev/null +++ b/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt @@ -0,0 +1,7 @@ +add_triton_library(FlagTree_iluvatar_TritonConversionTritonGPUToLLVM + Utility.cpp + + DEPENDS + TritonTableGen + TritonGPUAttrDefsIncGen +) \ No newline at end of file diff --git a/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/Utility.cpp new file mode 100644 index 000000000..caadb4056 --- /dev/null +++ b/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -0,0 +1,186 @@ +#include "triton/Conversion/TritonGPUToLLVM/Utility.h" +#include "triton/../../lib/Conversion/TritonGPUToLLVM/Utility.cpp" + +namespace mlir { +namespace LLVM { + +Value createIndexConstant(OpBuilder &builder, Location loc, + TypeConverter *converter, int64_t value) { + Type ty = converter->convertType(builder.getIndexType()); + return builder.create(loc, ty, + builder.getIntegerAttr(ty, value)); +} + +SmallVector getMultiDimOffset(Attribute layout, Location loc, + ConversionPatternRewriter &rewriter, + const TargetInfoBase &targetInfo, + unsigned elemId, RankedTensorType type, + ArrayRef multiDimCTAInRepId, + ArrayRef shapePerCTATile, + bool isTrans, bool stNotRd) { + auto shape = type.getShape(); + unsigned rank = shape.size(); + if (auto blockedLayout = dyn_cast(layout)) { + auto multiDimOffsetFirstElem = emitBaseIndexForLayout( + loc, rewriter, targetInfo, blockedLayout, type, false); + SmallVector multiDimOffset(rank); + SmallVector multiDimElemId = getMultiDimIndex( + elemId, getSizePerThread(layout), getOrder(layout)); + for (unsigned d = 0; d < rank; ++d) { + multiDimOffset[d] = + add(multiDimOffsetFirstElem[d], + i32_val(multiDimCTAInRepId[d] * shapePerCTATile[d] + + multiDimElemId[d])); + } + return multiDimOffset; + } + if (auto sliceLayout = mlir::dyn_cast(layout)) { + unsigned dim = sliceLayout.getDim(); + auto parentEncoding = sliceLayout.getParent(); + auto parentSizePerThread = getSizePerThread(parentEncoding); + auto parentShape = sliceLayout.paddedShape(shape); + auto parentTy = RankedTensorType::get(parentShape, type.getElementType(), + parentEncoding); + auto offsets = emitOffsetForLayout(layout, type); + auto parentOffset = emitOffsetForLayout(parentEncoding, parentTy); + SmallVector idxs; + for (SmallVector off : offsets) { + off.insert(off.begin() + dim, 0); + auto it = std::find(parentOffset.begin(), parentOffset.end(), off); + idxs.push_back(std::distance(parentOffset.begin(), it)); + } + auto multiDimOffsetParent = getMultiDimOffset( + parentEncoding, loc, rewriter, targetInfo, idxs[elemId], parentTy, + sliceLayout.paddedShape(multiDimCTAInRepId), + sliceLayout.paddedShape(shapePerCTATile)); + SmallVector multiDimOffset(rank); + for (unsigned d = 0; d < rank + 1; ++d) { + if (d == dim) + continue; + unsigned slicedD = d < dim ? d : (d - 1); + multiDimOffset[slicedD] = multiDimOffsetParent[d]; + } + return multiDimOffset; + } + if (auto mmaLayout = mlir::dyn_cast(layout)) { + assert(rank == 2 || + (rank == 3 && mmaLayout.isAmpere()) && "Unexpected rank"); + auto shapePerCTA = getShapePerCTA(mmaLayout, shape); + auto instrShape = mmaLayout.getInstrShape(); + SmallVector mmaColIdx(2); + SmallVector mmaRowIdx(2); + Value threadId = getThreadId(rewriter, loc); + Value warpSize = i32_val(32); + Value laneId = urem(threadId, warpSize); + Value warpId = udiv(threadId, warpSize); + // TODO: fix the bug in MMAEncodingAttr document + SmallVector multiDimWarpId(2); + auto warpsPerCTA = mmaLayout.getWarpsPerCTA(); + auto warpOrder = triton::gpu::getWarpOrder(mmaLayout); + multiDimWarpId = delinearize(rewriter, loc, warpId, warpsPerCTA, warpOrder); + Value _1 = i32_val(1); + Value _2 = i32_val(2); + Value _4 = i32_val(4); + Value _8 = i32_val(8); + Value _16 = i32_val(16); + if (mmaLayout.isAmpere() || mmaLayout.isHopper()) { + multiDimWarpId[rank - 1] = urem( + multiDimWarpId[rank - 1], + i32_val(ceil(shapePerCTA[rank - 1], instrShape[rank - 1]))); + multiDimWarpId[rank - 2] = urem( + multiDimWarpId[rank - 2], + i32_val(ceil(shapePerCTA[rank - 2], instrShape[rank - 2]))); + + Value mmaGrpId = udiv(laneId, _4); + Value mmaGrpIdP8 = add(mmaGrpId, _8); + Value mmaThreadIdInGrp = urem(laneId, _4); + Value mmaThreadIdInGrpM2 = mul(mmaThreadIdInGrp, _2); + Value mmaThreadIdInGrpM2P1 = add(mmaThreadIdInGrpM2, _1); + Value rowWarpOffset = + mul(multiDimWarpId[rank - 2], i32_val(instrShape[rank - 2])); + mmaRowIdx[0] = add(mmaGrpId, rowWarpOffset); + mmaRowIdx[1] = add(mmaGrpIdP8, rowWarpOffset); + Value colWarpOffset = + mul(multiDimWarpId[rank - 1], i32_val(instrShape[rank - 1])); + mmaColIdx[0] = add(mmaThreadIdInGrpM2, colWarpOffset); + mmaColIdx[1] = add(mmaThreadIdInGrpM2P1, colWarpOffset); + } else if (mmaLayout.isVolta()) { + // Volta doesn't follow the pattern here. + } else { + llvm_unreachable("Unexpected MMALayout version"); + } + + SmallVector multiDimOffset(rank); + if (mmaLayout.isHopper()) { + unsigned elemIdRem4 = elemId % 4; + unsigned nGrpId = elemId / 4; + multiDimOffset[0] = elemIdRem4 < 2 ? mmaRowIdx[0] : mmaRowIdx[1]; + multiDimOffset[1] = elemIdRem4 % 2 == 0 ? mmaColIdx[0] : mmaColIdx[1]; + multiDimOffset[1] = add(multiDimOffset[1], i32_val(8 * nGrpId)); + multiDimOffset[0] = add(multiDimOffset[0], i32_val(multiDimCTAInRepId[0] * + shapePerCTATile[0])); + multiDimOffset[1] = add(multiDimOffset[1], i32_val(multiDimCTAInRepId[1] * + shapePerCTATile[1])); + } else if (mmaLayout.isAmpere()) { + if (rank == 3) + multiDimOffset[0] = + add(multiDimWarpId[0], + i32_val(multiDimCTAInRepId[0] * shapePerCTATile[0])); + multiDimOffset[rank - 2] = elemId < 2 ? mmaRowIdx[0] : mmaRowIdx[1]; + multiDimOffset[rank - 1] = elemId % 2 == 0 ? mmaColIdx[0] : mmaColIdx[1]; + multiDimOffset[rank - 2] = + add(multiDimOffset[rank - 2], i32_val(multiDimCTAInRepId[rank - 2] * + shapePerCTATile[rank - 2])); + multiDimOffset[rank - 1] = + add(multiDimOffset[rank - 1], i32_val(multiDimCTAInRepId[rank - 1] * + shapePerCTATile[rank - 1])); + } else if (mmaLayout.isVolta()) { + auto [isARow, isBRow, isAVec4, isBVec4, _] = + mmaLayout.decodeVoltaLayoutStates(); + auto coords = SharedToDotOperandMMAv1::getMNCoords( + threadId, loc, rewriter, mmaLayout.getWarpsPerCTA(), mmaLayout, shape, + isARow, isBRow, isAVec4, isBVec4); + return coords[elemId]; + } else { + llvm_unreachable("Unexpected MMALayout version"); + } + return multiDimOffset; + } + if (auto mmaLayout = mlir::dyn_cast(layout)) { + assert(rank == 2 && "Unexpected rank"); + SmallVector multiDimOffset(rank); + Value threadId = getThreadId(rewriter, loc); + if (mmaLayout.isVolta()) { + int bitwidth = type.getElementType().getIntOrFloatBitWidth(); + int elemVecSize = stNotRd ? (32 / bitwidth) : 1; + static auto func = SharedToDotOperandMMAv1::load_getMNCoords_func( + "iluvatar", "getMNCoords"); + auto coords = func(threadId, loc, rewriter, mmaLayout.getWarpsPerCTA(), + mmaLayout, shape, bitwidth, elemVecSize, isTrans); + return coords[elemId]; + } else { + llvm_unreachable("Unexpected MMALayout version"); + } + } + if (isa(layout)) { + auto multiDimBase = + emitBaseIndexForLayout(loc, rewriter, targetInfo, layout, type, false); + SmallVector> offsets; + assert(rank == 2); + SmallVector multiDimOffset(rank); + if (auto mfmaLayout = dyn_cast(layout)) { + emitMfmaOffsetForCTA(mfmaLayout, offsets, 0, multiDimCTAInRepId[0], + multiDimCTAInRepId[1]); + } else if (auto wmmaLayout = dyn_cast(layout)) { + emitWmmaOffsetForCTA(wmmaLayout, offsets, 0, multiDimCTAInRepId[0], + multiDimCTAInRepId[1]); + } + multiDimOffset[0] = add(multiDimBase[0], i32_val(offsets[elemId][0])); + multiDimOffset[1] = add(multiDimBase[1], i32_val(offsets[elemId][1])); + return multiDimOffset; + } + llvm_unreachable("unexpected layout in getMultiDimOffset"); +} + +} // namespace LLVM +} // namespace mlir \ No newline at end of file diff --git a/third_party/iluvatar/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/third_party/iluvatar/include/triton/Conversion/TritonGPUToLLVM/Utility.h index 46d02d5bd..76cfe12da 100644 --- a/third_party/iluvatar/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/third_party/iluvatar/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -6,20 +6,28 @@ #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "python/src/plugin.h" #include "triton/Analysis/Utility.h" #include "triton/Conversion/MLIRTypes.h" #include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h" #include "triton/Dialect/Triton/IR/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" + +#ifndef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_heads +#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" +#else +#include "python/src/plugin.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" +#endif + #include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" #include "triton/Tools/Sys/GetEnv.hpp" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/ErrorHandling.h" +#include "triton/../../backend/flagtree_backend_specialization/include/flagtree_spec.h" + #define DEBUG_TYPE "ttgpu_to_llvm" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") @@ -27,6 +35,7 @@ using namespace mlir; using namespace mlir::triton; +#ifdef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_functionPtr using emitOffsetForTCULayoutFunc = SmallVector> (*)( const triton::gpu::IluvatarMmaEncodingAttr &, RankedTensorType); DEFINE_LOAD_FUNC(emitOffsetForTCULayout) @@ -39,6 +48,7 @@ DEFINE_LOAD_FUNC(emitBaseIndexForTCULayout) using remapOffsetFunc = Value (*)(Value, Value, RankedTensorType, bool, Location, RewriterBase &, int, bool); DEFINE_LOAD_FUNC(remapOffset) +#endif // Shortcuts for some commonly used LLVM ops to keep code simple and intuitive // Operators @@ -245,8 +255,13 @@ Value createConstantF64(Location loc, OpBuilder &rewriter, double v); Value createNaNConstant(Location loc, OpBuilder &rewriter, Type type); /// Create an index type constant. +#ifndef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_createIndexConstant +Value createIndexConstant(OpBuilder &builder, Location loc, + const TypeConverter *converter, int64_t value); +#else Value createIndexConstant(OpBuilder &builder, Location loc, TypeConverter *converter, int64_t value); +#endif /// Create an integer constant of \param width bits. Value createLLVMIntegerConstant(OpBuilder &builder, Location loc, short width, @@ -359,11 +374,23 @@ Value addStringToModule(Location loc, ConversionPatternRewriter &rewriter, // the smem buffer. Recall that the smem buffer will only store a single replica // when converting distributed to distributed layout. Also, a replica is the // smallest CTA tile that is common between input and output layouts. -SmallVector getMultiDimOffset( - Attribute layout, Location loc, ConversionPatternRewriter &rewriter, - const TargetInfoBase &targetInfo, unsigned elemId, RankedTensorType type, - ArrayRef multiDimCTAInRepId, ArrayRef shapePerCTATile, - bool isTrans = false, bool stNotRd = false); +#ifndef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_getMultiDimOffset_ARG +SmallVector getMultiDimOffset(Attribute layout, Location loc, + ConversionPatternRewriter &rewriter, + const TargetInfoBase &targetInfo, + unsigned elemId, RankedTensorType type, + ArrayRef multiDimCTAInRepId, + ArrayRef shapePerCTATile); +#else +SmallVector getMultiDimOffset(Attribute layout, Location loc, + ConversionPatternRewriter &rewriter, + const TargetInfoBase &targetInfo, + unsigned elemId, RankedTensorType type, + ArrayRef multiDimCTAInRepId, + ArrayRef shapePerCTATile, + FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_getMultiDimOffset_ARG spec_arg1 = false, + FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_getMultiDimOffset_ARG spec_arg2 = false); +#endif // Given a multiDimOffset, this function wraps around each dimension to be // within shape. @@ -434,7 +461,11 @@ using ::mlir::triton::gpu::AMDWmmaEncodingAttr; using ::mlir::triton::gpu::BlockedEncodingAttr; using ::mlir::triton::gpu::CTALayoutAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; + +#ifdef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_IluvatarMmaEncodingAttr using ::mlir::triton::gpu::IluvatarMmaEncodingAttr; +#endif + using ::mlir::triton::gpu::NvidiaMmaEncodingAttr; using ::mlir::triton::gpu::SliceEncodingAttr; @@ -1128,11 +1159,13 @@ emitBaseIndexForLayoutImpl(Location loc, RewriterBase &rewriter, if (mmaLayout.isAmpere() || mmaLayout.isHopper()) result = emitBaseIndexWithinCTAForMmaLayoutV2V3(loc, rewriter, mmaLayout, type); +#ifdef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_emitBaseIndexForLayoutImpl } else if (auto mmaLayout = mlir::dyn_cast(layout)) { if (mmaLayout.isVolta()) { DEFINE_CALL_LOAD_FUNC(iluvatar, emitBaseIndexForTCULayout) result = func(loc, rewriter, mmaLayout, type); } +#endif } else if (auto mfmaLayout = mlir::dyn_cast(layout)) { result = emitBaseIndexForMfmaLayout(loc, rewriter, mfmaLayout, type); } else if (auto wmmaLayout = mlir::dyn_cast(layout)) { @@ -1201,12 +1234,14 @@ emitOffsetForLayout(Attribute layout, RankedTensorType type) { if (mmaLayout.isHopper()) return emitOffsetForMmaLayoutV3(mmaLayout, type); } +#ifdef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_emitOffsetForLayout if (auto mmaLayout = dyn_cast(layout)) { if (mmaLayout.isVolta()) { DEFINE_CALL_LOAD_FUNC(iluvatar, emitOffsetForTCULayout) return func(mmaLayout, type); } } +#endif if (auto mfmaLayout = mlir::dyn_cast(layout)) { return emitOffsetForMfmaLayout(mfmaLayout, type); } @@ -1362,7 +1397,7 @@ inline DenseMap getSwizzledSharedPtrs( } // compute phase = (row // perPhase) % maxPhase Value phase = urem(udiv(idxRow, i32_val(perPhase)), i32_val(maxPhase)); -#if defined(__ILUVATAR__) +#ifdef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_getSwizzledSharedPtrs // corex swizzle bool isRow = outOrder[0] == 1; Value off = NULL; @@ -1524,7 +1559,7 @@ inline void storeDistributedToShared(Value src, ArrayRef inVals, // If the shmem layout is not swizzled, we can trivially vectorize stores // across the whole width of the most-minor dimension of the shape, because // Triton requires all the dims are powers of 2. -#ifdef __ILUVATAR__ +#ifdef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_storeDistributedToShared unsigned outVec = dstSharedLayout.getVec(); #else unsigned outVec = dstSharedLayout.getMaxPhase() == 1 diff --git a/third_party/iluvatar/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/third_party/iluvatar/lib/Conversion/TritonGPUToLLVM/Utility.cpp index b65259974..abdcfc952 100644 --- a/third_party/iluvatar/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/third_party/iluvatar/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -299,12 +299,14 @@ Value createNaNConstant(Location loc, OpBuilder &rewriter, Type type) { } // Create an index type constant. +#ifndef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_createIndexConstant Value createIndexConstant(OpBuilder &builder, Location loc, - TypeConverter *converter, int64_t value) { + const TypeConverter *converter, int64_t value) { Type ty = converter->convertType(builder.getIndexType()); return builder.create(loc, ty, builder.getIntegerAttr(ty, value)); } +#endif // Create an integer constant of \param width bits. Value createLLVMIntegerConstant(OpBuilder &builder, Location loc, short width, @@ -474,13 +476,13 @@ Value addStringToModule(Location loc, ConversionPatternRewriter &rewriter, return stringStart; } +#ifndef FLAGTREE_SPEC_Conversion_TritonGPUToLLVM_Utility_getMultiDimOffset_ARG SmallVector getMultiDimOffset(Attribute layout, Location loc, ConversionPatternRewriter &rewriter, const TargetInfoBase &targetInfo, unsigned elemId, RankedTensorType type, ArrayRef multiDimCTAInRepId, - ArrayRef shapePerCTATile, - bool isTrans, bool stNotRd) { + ArrayRef shapePerCTATile) { auto shape = type.getShape(); unsigned rank = shape.size(); if (auto blockedLayout = dyn_cast(layout)) { @@ -609,22 +611,6 @@ SmallVector getMultiDimOffset(Attribute layout, Location loc, } return multiDimOffset; } - if (auto mmaLayout = mlir::dyn_cast(layout)) { - assert(rank == 2 && "Unexpected rank"); - SmallVector multiDimOffset(rank); - Value threadId = getThreadId(rewriter, loc); - if (mmaLayout.isVolta()) { - int bitwidth = type.getElementType().getIntOrFloatBitWidth(); - int elemVecSize = stNotRd ? (32 / bitwidth) : 1; - static auto func = SharedToDotOperandMMAv1::load_getMNCoords_func( - "iluvatar", "getMNCoords"); - auto coords = func(threadId, loc, rewriter, mmaLayout.getWarpsPerCTA(), - mmaLayout, shape, bitwidth, elemVecSize, isTrans); - return coords[elemId]; - } else { - llvm_unreachable("Unexpected MMALayout version"); - } - } if (isa(layout)) { auto multiDimBase = emitBaseIndexForLayout(loc, rewriter, targetInfo, layout, type, false); @@ -644,6 +630,7 @@ SmallVector getMultiDimOffset(Attribute layout, Location loc, } llvm_unreachable("unexpected layout in getMultiDimOffset"); } +#endif SmallVector getWrappedMultiDimOffset( ConversionPatternRewriter &rewriter, Location loc, From 06132fd0521041cc3e66024e3859fbb0f26b1278 Mon Sep 17 00:00:00 2001 From: toudefu <2696810816@qq.com> Date: Sun, 28 Sep 2025 20:53:54 +0800 Subject: [PATCH 2/2] code bug check --- .../lib/Conversion/TritonGPUToLLVM/Utility.cpp | 4 +++- .../Conversion/TritonGPUToLLVM/Utility.h | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/Utility.cpp index caadb4056..593b41192 100644 --- a/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/third_party/iluvatar/backend/flagtree_backend_specialization/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -1,8 +1,10 @@ #include "triton/Conversion/TritonGPUToLLVM/Utility.h" -#include "triton/../../lib/Conversion/TritonGPUToLLVM/Utility.cpp" namespace mlir { namespace LLVM { +using namespace mlir::triton; +using mlir::triton::gpu::getOrder; +using mlir::triton::gpu::getSizePerThread; Value createIndexConstant(OpBuilder &builder, Location loc, TypeConverter *converter, int64_t value) { diff --git a/third_party/iluvatar/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/third_party/iluvatar/include/triton/Conversion/TritonGPUToLLVM/Utility.h index 76cfe12da..ac10953a8 100644 --- a/third_party/iluvatar/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/third_party/iluvatar/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -1677,4 +1677,22 @@ inline bool isLayoutMmaV1(Attribute layout) { } // namespace mlir +namespace SharedToDotOperandMMAv1 { + +using CoordTy = SmallVector; +using ValueTable = std::map, std::pair>; + +using getMNCoordsFunc = SmallVector (*)( + Value, Location, ConversionPatternRewriter &, ArrayRef, + const IluvatarMmaEncodingAttr &, ArrayRef, int, int, bool); + +getMNCoordsFunc load_getMNCoords_func(const char *target, const char *name); + +static SmallVector +getMNCoords(Value thread, Location loc, ConversionPatternRewriter &rewriter, + ArrayRef wpt, const NvidiaMmaEncodingAttr &mmaLayout, + ArrayRef shape, bool isARow, bool isBRow, bool isAVec4, + bool isBVec4); +} // namespace SharedToDotOperandMMAv1 + #endif