[NFI]: Prepare code for FMA loop generation feature (#5160)

etiotto · web-flow · commit daae5229f58b · 2025-09-22T15:36:22.000-04:00
This PR prepares the codebase for FMA loop generation by refactoring dot
operation conversion code. The main purpose is to extract and organize
FMA-related functionality into separate modules for better code
organization and reusability.

---------

Signed-off-by: Ettore Tiotto &lt;ettore.tiotto@intel.com&gt;
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt
@@ -5,6 +5,8 @@ add_triton_library(TritonIntelGPUToLLVM
     ControlFlowOpToLLVM.cpp
     ConvertLayoutOpToLLVM.cpp
     DotOpToLLVM/DPAS.cpp
+    DotOpToLLVM/FMA.cpp
+    DotOpToLLVM/FMADotUtility.cpp
     DotOpToLLVM.cpp
     ElementwiseOpToLLVM.cpp
     Fp4ToFpOpToLLVM.cpp
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM.cpp
@@ -6,11 +6,17 @@ using namespace mlir::triton;
 using ::mlir::triton::gpu::getShapePerCTA;
 using ::mlir::triton::gpu::intel::DpasEncodingAttr;
 
-namespace fma_details {
+namespace mlir::triton::gpu::intel {
+
+LogicalResult convertFMADot(DotOp op, DotOp::Adaptor adaptor,
+                            const LLVMTypeConverter *typeConverter,
+                            ConversionPatternRewriter &rewriter);
+
 LogicalResult convertDPAS(triton::DotOp op, triton::DotOp::Adaptor adaptor,
                           TritonIntelGPUToLLVMTypeConverter *typeConverter,
                           ConversionPatternRewriter &rewriter);
-} // namespace fma_details
+
+} // namespace mlir::triton::gpu::intel
 
 namespace {
 struct DotOpConversion : public ConvertTritonGPUOpToLLVMPattern<triton::DotOp> {
@@ -33,13 +39,14 @@ struct DotOpConversion : public ConvertTritonGPUOpToLLVMPattern<triton::DotOp> {
 
     if (!isOuter && isa<DpasEncodingAttr>(
                         cast<RankedTensorType>(D.getType()).getEncoding())) {
-      return fma_details::convertDPAS(op, adaptor, getTypeConverter(),
-                                      rewriter);
+      return triton::gpu::intel::convertDPAS(op, adaptor, getTypeConverter(),
+                                             rewriter);
     }
 
     if (isa<BlockedEncodingAttr>(
             cast<RankedTensorType>(D.getType()).getEncoding()))
-      return convertFMADot(op, adaptor, getTypeConverter(), rewriter);
+      return triton::gpu::intel::convertFMADot(op, adaptor, getTypeConverter(),
+                                               rewriter);
 
     llvm::report_fatal_error(
         "Unsupported DotOp found when converting TritonGPU to LLVM.");
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp
@@ -406,7 +406,8 @@ class DotOpDPASConversionHelper {
 
 } // namespace
 
-namespace fma_details {
+namespace mlir::triton::gpu::intel {
+
 LogicalResult convertDPAS(triton::DotOp op, triton::DotOp::Adaptor adaptor,
                           TritonIntelGPUToLLVMTypeConverter *typeConverter,
                           ConversionPatternRewriter &rewriter) {
@@ -441,4 +442,5 @@ LogicalResult convertDPAS(triton::DotOp op, triton::DotOp::Adaptor adaptor,
 
   return helper.convertDot(op, adaptor);
 }
-} // namespace fma_details
+
+} // namespace mlir::triton::gpu::intel
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/FMA.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/FMA.cpp
@@ -0,0 +1,66 @@
+#include "triton/Conversion/TritonGPUToLLVM/FMADotUtility.h"
+#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+using namespace mlir;
+using namespace mlir::triton;
+using namespace ::mlir::triton::gpu;
+
+namespace {
+class GenericFMAVectorMultiplier : public FMAVectorMultiplier {
+  OpBuilder &builder;
+  Location loc;
+
+public:
+  GenericFMAVectorMultiplier(OpBuilder &builder, Location loc)
+      : builder(builder), loc(loc) {}
+
+  Value multiplyVectors(ArrayRef<Value> a, ArrayRef<Value> b,
+                        Value c) override {
+    auto K = a.size();
+    assert(b.size() == K);
+    Value accum = c;
+    Type tgtTy = accum.getType();
+    for (auto it = llvm::zip(a, b).begin(); it != llvm::zip(a, b).end(); ++it) {
+      const auto &aElem = std::get<0>(*it);
+      const auto &bElem = std::get<1>(*it);
+
+      assert(aElem.getType() == tgtTy);
+      assert(bElem.getType() == tgtTy);
+
+      // to avoid: 'llvm.intr.fmuladd' op operand #0 must be floating point LLVM
+      // type or LLVM dialect-compatible vector of floating point LLVM type, but
+      // got 'i32'
+      llvm::TypeSwitch<Type>(tgtTy)
+          .Case<FloatType>([&](auto) {
+            accum = builder.create<LLVM::FMulAddOp>(loc, aElem, bElem, accum);
+          })
+          .Case<IntegerType>([&](auto) {
+            accum = builder.create<LLVM::AddOp>(
+                loc, builder.create<LLVM::MulOp>(loc, aElem, bElem), accum);
+          });
+    }
+    return accum;
+  }
+};
+
+} // namespace
+
+namespace mlir::triton::gpu::intel {
+
+LogicalResult parametricConvertFMADot(DotOp op, DotOp::Adaptor adaptor,
+                                      const LLVMTypeConverter *typeConverter,
+                                      ConversionPatternRewriter &rewriter,
+                                      FMAVectorMultiplier &multiplier);
+
+LogicalResult convertFMADot(DotOp op, DotOp::Adaptor adaptor,
+                            const LLVMTypeConverter *typeConverter,
+                            ConversionPatternRewriter &rewriter) {
+  auto *ctx = rewriter.getContext();
+  auto loc = op.getLoc();
+  GenericFMAVectorMultiplier multiplier(rewriter, loc);
+  return intel::parametricConvertFMADot(op, adaptor, typeConverter, rewriter,
+                                        multiplier);
+}
+
+} // namespace mlir::triton::gpu::intel
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/FMADotUtility.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/FMADotUtility.cpp
@@ -0,0 +1,170 @@
+#include "triton/Conversion/TritonGPUToLLVM/FMADotUtility.h"
+#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
+
+using namespace mlir;
+
+namespace {
+
+/// OperandValueKey structure represents compile time part
+/// of spatial coordinates of a value in a tensor.
+///
+/// Every Value spatial coordinates(i.e. [batch;nonK;k]) in tensor can be
+/// defined as:
+///
+/// batch = (bRepIdx * CTABSize + bIdx) + (laneBCoord + warpBCoord)
+/// nonK = (nonKRepIdx * CTANKSize + nonKIdx) + (laneNonKCoord + warpNonKCoord)
+/// k = kIdx
+///
+/// Where:
+/// CTABSize, CTANKSize: constants;
+/// laneBCoord, warpBCoord, laneNonKCoord, warpNonKCoord: runtime components;
+/// bRepIdx, nonKRepIdx, bIdx, nonKIdx, kIdx: compile time components.
+struct OperandValueKey {
+  unsigned bRepIdx, nonKRepIdx;
+  unsigned bIdx, nonKIdx, kIdx;
+
+  bool operator==(const OperandValueKey &other) const {
+    return (bRepIdx == other.bRepIdx && nonKRepIdx == other.nonKRepIdx &&
+            bIdx == other.bIdx && nonKIdx == other.nonKIdx &&
+            kIdx == other.kIdx);
+  }
+};
+
+} // namespace
+
+template <> struct std::hash<OperandValueKey> {
+  std::size_t operator()(const OperandValueKey &k) const {
+    return llvm::hash_combine(k.bRepIdx, k.nonKRepIdx, k.bIdx, k.nonKIdx,
+                              k.kIdx);
+  }
+};
+
+namespace {
+
+using ValueTableFMA = std::unordered_map<OperandValueKey, Value>;
+
+ValueTableFMA getValueTableFromStructFMA(
+    Value val, ArrayRef<unsigned> perRepShape, ArrayRef<unsigned> repetitions,
+    unsigned kDim, unsigned nonKDim, ConversionPatternRewriter &rewriter,
+    Location loc, ArrayRef<unsigned> inRepOrder, ArrayRef<unsigned> repOrder) {
+  ValueTableFMA res;
+  auto elems = unpackLLElements(loc, val, rewriter);
+  assert(perRepShape.size() == 3);
+  auto numElemsRep = product(perRepShape);
+  assert(elems.size() == numElemsRep * product(repetitions));
+  assert(kDim == 1 || kDim == 2);
+  assert(nonKDim == 1 || nonKDim == 2);
+  const unsigned bDim = 0;
+
+  for (unsigned idx = 0; idx < elems.size(); ++idx) {
+    auto inRepLinearIdx = idx % numElemsRep;
+    auto repLinearIdx = idx / numElemsRep;
+    auto inRepSpatialIdx =
+        mlir::LLVM::delinearize(inRepLinearIdx, perRepShape, inRepOrder);
+    auto repSpatialIdx =
+        mlir::LLVM::delinearize(repLinearIdx, repetitions, repOrder);
+    OperandValueKey key{repSpatialIdx[0], repSpatialIdx[nonKDim],
+                        inRepSpatialIdx[0], inRepSpatialIdx[nonKDim],
+                        inRepSpatialIdx[kDim]};
+    res[key] = elems[idx];
+  }
+  return res;
+}
+
+} // namespace
+
+namespace mlir::triton::gpu::intel {
+
+LogicalResult parametricConvertFMADot(DotOp op, DotOp::Adaptor adaptor,
+                                      const LLVMTypeConverter *typeConverter,
+                                      ConversionPatternRewriter &rewriter,
+                                      FMAVectorMultiplier &multiplier) {
+  auto *ctx = rewriter.getContext();
+  auto loc = op.getLoc();
+
+  auto A = op.getA();
+  auto D = op.getResult();
+
+  auto aTensorTy = cast<RankedTensorType>(A.getType());
+  auto dTensorTy = cast<RankedTensorType>(D.getType());
+
+  SmallVector<int64_t> aShapePerCTA =
+      expandMatrixShapeWithBatch(ArrayRef(getShapePerCTA(aTensorTy)));
+  auto dShapePerCTA =
+      expandMatrixShapeWithBatch(ArrayRef(getShapePerCTA(dTensorTy)));
+
+  BlockedEncodingAttr dLayout =
+      cast<BlockedEncodingAttr>(dTensorTy.getEncoding());
+  // TODO process A and B operand separately
+  auto inRepOrder = expandMatrixOrderWithBatch(dLayout.getOrder());
+  auto repOrder = expandMatrixOrderWithBatch(dLayout.getRepOrder());
+  auto cc = unpackLLElements(loc, adaptor.getC(), rewriter);
+
+  Value llA = adaptor.getA();
+  Value llB = adaptor.getB();
+
+  auto sizePerThread = getContigPerThread(dTensorTy);
+  auto numElemsPerThread = product(sizePerThread);
+  SmallVector<unsigned> shapePerCTATile;
+  for (auto [reg, thread, warp] :
+       llvm::zip(sizePerThread, dLayout.getThreadsPerWarp(),
+                 dLayout.getWarpsPerCTA())) {
+    shapePerCTATile.push_back(reg * thread * warp);
+  }
+  shapePerCTATile = expandMatrixShapeWithBatch(ArrayRef(shapePerCTATile));
+  sizePerThread = expandMatrixShapeWithBatch(ArrayRef(sizePerThread));
+
+  unsigned K = aShapePerCTA[2];
+
+  unsigned threadTileShape[3];
+  unsigned repetitions[3];
+  for (int i = 0; i < 3; ++i) {
+    repetitions[i] =
+        ceil(dShapePerCTA[i], static_cast<int64_t>(shapePerCTATile[i]));
+  }
+
+  auto has = getValueTableFromStructFMA(
+      llA, {sizePerThread[0], sizePerThread[1], K},
+      {repetitions[0], repetitions[1], 1},
+      /*kDim*/ 2, /*nonKDim*/ 1, rewriter, loc, inRepOrder, repOrder);
+  auto hbs = getValueTableFromStructFMA(
+      llB, {sizePerThread[0], K, sizePerThread[2]},
+      {repetitions[0], 1, repetitions[2]},
+      /*kDim*/ 1, /*nonKDim*/ 2, rewriter, loc, inRepOrder, repOrder);
+
+  SmallVector<Value> acc = cc;
+
+  for (unsigned bRep = 0; bRep < repetitions[0]; ++bRep)
+    for (unsigned mRep = 0; mRep < repetitions[1]; ++mRep)
+      for (unsigned nRep = 0; nRep < repetitions[2]; ++nRep)
+        for (unsigned b = 0; b < sizePerThread[0]; ++b)
+          for (unsigned m = 0; m < sizePerThread[1]; ++m)
+            for (unsigned n = 0; n < sizePerThread[2]; ++n) {
+              SmallVector<unsigned> multiDimAccumIdx = {b, m, n};
+              unsigned linearInRepIdx =
+                  LLVM::linearize(multiDimAccumIdx, sizePerThread, inRepOrder);
+              SmallVector<unsigned> multiDimRepIdx = {bRep, mRep, nRep};
+              unsigned linearRepIdx =
+                  LLVM::linearize(multiDimRepIdx, repetitions, repOrder);
+              unsigned linearAccumIdx =
+                  linearInRepIdx + linearRepIdx * numElemsPerThread;
+
+              SmallVector<Value> aOpVector;
+              SmallVector<Value> bOpVector;
+
+              for (unsigned k = 0; k < K; ++k) {
+                aOpVector.push_back(has.at({bRep, mRep, b, m, k}));
+                bOpVector.push_back(hbs.at({bRep, nRep, b, n, k}));
+              }
+
+              acc[linearAccumIdx] = multiplier.multiplyVectors(
+                  aOpVector, bOpVector, acc[linearAccumIdx]);
+            }
+
+  auto res = packLLElements(loc, typeConverter, acc, rewriter, dTensorTy);
+  rewriter.replaceOp(op, res);
+
+  return success();
+}
+
+} // namespace mlir::triton::gpu::intel