
Commit 673ca35

Fix default FMA implementation for tensors with integer elements (#7419)
Several failing examples from our repo:

```bash
FAILED language/test_core.py::test_dot[1-128-256-32-8-True-True-none-tf32-int8-int8-1-None0] - RuntimeError: PassManager::run failed
FAILED language/test_core.py::test_dot[1-128-256-32-8-True-True-none-tf32-int8-int8-1-None1] - RuntimeError: PassManager::run failed
FAILED language/test_core.py::test_dot[1-128-256-32-8-True-False-none-tf32-int8-int8-1-None0] - RuntimeError: PassManager::run failed
```

Most likely a different implementation is used in such cases. I could add a test for these cases, but I would need to somehow disable the more advanced implementations (I'm not sure what a good way to do that is).

---------

Signed-off-by: Anatoly Myachev <[email protected]>
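For context on what these cases exercise: `tl.dot` on int8 operands accumulates into int32, and on targets that fall back to the generic FMA lowering the old code emitted `llvm.intr.fmuladd` on `i32` values. Below is a minimal sketch of such a kernel; the shapes, names, and the assumption that this configuration actually reaches the FMA path (rather than an MMA path) are illustrative only, not taken from `language/test_core.py`.

```python
# Illustrative sketch, not the failing test: int8 x int8 -> int32 tl.dot.
import torch
import triton
import triton.language as tl


@triton.jit
def int8_dot_kernel(a_ptr, b_ptr, c_ptr,
                    M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
    offs_m = tl.arange(0, M)
    offs_n = tl.arange(0, N)
    offs_k = tl.arange(0, K)
    # Load one int8 tile of A (M x K) and one of B (K x N); row-major, single block.
    a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * N + offs_n[None, :])
    # Integer dot: accumulates in int32; this is the op whose generic FMA lowering
    # previously produced llvm.intr.fmuladd on i32 and failed the pass pipeline.
    c = tl.dot(a, b)
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], c)


# Hypothetical usage: the whole problem fits in a single block.
M, N, K = 32, 32, 32
a = torch.randint(-8, 8, (M, K), dtype=torch.int8, device="cuda")
b = torch.randint(-8, 8, (K, N), dtype=torch.int8, device="cuda")
c = torch.empty((M, N), dtype=torch.int32, device="cuda")
int8_dot_kernel[(1,)](a, b, c, M=M, N=N, K=K)
assert torch.equal(c.cpu(), a.cpu().int() @ b.cpu().int())
```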
1 parent 8e79a35 commit 673ca35

File tree

2 files changed: +47 −2 lines changed


lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/FMA.cpp

Lines changed: 21 additions & 2 deletions
```diff
@@ -1,5 +1,6 @@
 #include "triton/Conversion/TritonGPUToLLVM/FMADotUtility.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
 using namespace mlir::triton;
@@ -19,8 +20,26 @@ class GenericFMAVectorMultiplier : public FMAVectorMultiplier {
     auto K = a.size();
     assert(b.size() == K);
     Value accum = c;
-    for (auto [aElem, bElem] : llvm::zip(a, b))
-      accum = builder.create<LLVM::FMulAddOp>(loc, aElem, bElem, accum);
+    Type tgtTy = accum.getType();
+    for (auto it = llvm::zip(a, b).begin(); it != llvm::zip(a, b).end(); ++it) {
+      const auto &aElem = std::get<0>(*it);
+      const auto &bElem = std::get<1>(*it);
+
+      assert(aElem.getType() == tgtTy);
+      assert(bElem.getType() == tgtTy);
+
+      // to avoid: 'llvm.intr.fmuladd' op operand #0 must be floating point LLVM
+      // type or LLVM dialect-compatible vector of floating point LLVM type, but
+      // got 'i32'
+      llvm::TypeSwitch<Type>(tgtTy)
+          .Case<FloatType>([&](auto) {
+            accum = builder.create<LLVM::FMulAddOp>(loc, aElem, bElem, accum);
+          })
+          .Case<IntegerType>([&](auto) {
+            accum = builder.create<LLVM::AddOp>(
+                loc, builder.create<LLVM::MulOp>(loc, aElem, bElem), accum);
+          });
+    }
     return accum;
   }
 };
```
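In scalar terms, the lowering above keeps the single fused multiply-add only for floating-point element types and switches to a separate multiply and add for integer element types. A rough per-element Python analogue (not code from the repo; `math.fma` needs Python 3.13+):

```python
import math


def dot_accumulate(a_vec, b_vec, c, elem_is_float):
    """Rough analogue of GenericFMAVectorMultiplier's inner loop after this change."""
    accum = c
    for a_elem, b_elem in zip(a_vec, b_vec):
        if elem_is_float:
            # floating-point elements: one fused multiply-add (llvm.intr.fmuladd)
            accum = math.fma(a_elem, b_elem, accum)
        else:
            # integer elements: plain multiply then add (llvm.mul + llvm.add)
            accum = a_elem * b_elem + accum
    return accum
```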

test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 26 additions & 0 deletions
```diff
@@ -1347,6 +1347,32 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#blocked}>
+#dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#blocked}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.target" = "cuda:70", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // CHECK-LABEL: matmul_fmadot_integer
+  tt.func @matmul_fmadot_integer(%ptr:!tt.ptr<i32> {tt.divisibility = 16 : i32},
+                                 %a:!ttg.memdesc<32x16xi32, #shared, #smem>, %b:!ttg.memdesc<16x32xi32, #shared, #smem>) {
+    %cst = arith.constant dense<0> : tensor<32x32xi32, #blocked>
+    // CHECK-NOT: llvm.intr.fmuladd
+    // CHECK: llvm.mul
+    // CHECK: llvm.add
+    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xi32, #shared, #smem> -> tensor<32x16xi32, #dot_operand_a>
+    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xi32, #shared, #smem> -> tensor<16x32xi32, #dot_operand_b>
+
+    %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = ieee : tensor<32x16xi32, #dot_operand_a> * tensor<16x32xi32, #dot_operand_b> -> tensor<32x32xi32, #blocked>
+    %30 = tt.splat %ptr : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>, #blocked>
+    %36 = tt.broadcast %30 : tensor<32x1x!tt.ptr<i32>, #blocked> -> tensor<32x32x!tt.ptr<i32>, #blocked>
+    tt.store %36, %28 : tensor<32x32x!tt.ptr<i32>, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
 #mma = #ttg.nvidia_mma<{versionMajor=2, warpsPerCTA=[2, 2], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}>
 #shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 4, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
```
