
Conversation

@castigli
Contributor

This PR re-lands #165873.

This PR extends the gpu.subgroup_mma_* ops to support the fp64 type.
The extension requires special handling during the lowering to NVVM because the load ops for the a and b fragments return a scalar instead of a struct.

The original PR did not guard the new test behind the required architecture (sm80), which led to a failure on the CUDA runners with T4 GPUs.
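
For illustration, a minimal sketch of the lowering shape for an f64 A-fragment load, mirroring the new wmma-ops-to-nvvm.mlir test in the diff below (the %gep and %stride names are illustrative, not taken from the PR):

  // An f64 A-fragment load at the GPU dialect level...
  %frag = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index}
      : memref<32x32xf64, 3> -> !gpu.mma_matrix<8x4xf64, "AOp">

  // ...lowers to an nvvm.wmma.load whose result is a plain f64 scalar rather
  // than an !llvm.struct<(...)> as in the f16/f32/i8 cases, hence the special
  // handling in WmmaOpsToNvvm.cpp.
  %loaded = nvvm.wmma.load %gep, %stride
      {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>, k = 4 : i32,
       layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
      : (!llvm.ptr<3>) -> f64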

@llvmbot
Member

llvmbot commented Nov 21, 2025

@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-gpu

Author: Giacomo Castiglioni (castigli)

Changes

This PR re-lands #165873.

This PR extends the gpu.subgroup_mma_* ops to support the fp64 type.
The extension requires special handling during the lowering to NVVM because the load ops for the a and b fragments return a scalar instead of a struct.

The original PR did not guard the new test behind the required architecture (sm80), which led to a failure on the CUDA runners with T4 GPUs.


Full diff: https://github.com/llvm/llvm-project/pull/169061.diff

8 Files Affected:

  • (modified) mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h (+1-1)
  • (modified) mlir/include/mlir/Dialect/GPU/IR/GPUBase.td (+1-1)
  • (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+4-4)
  • (modified) mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp (+42-10)
  • (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+2-2)
  • (modified) mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir (+22)
  • (modified) mlir/test/Dialect/GPU/invalid.mlir (+2-2)
  • (added) mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir (+72)
diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
index 4c8abea680b66..48982ac6efe7c 100644
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -27,7 +27,7 @@ class MMAMatrixType;
 #define GEN_PASS_DECL_CONVERTGPUOPSTONVVMOPS
 #include "mlir/Conversion/Passes.h.inc"
 
-LLVM::LLVMStructType convertMMAToLLVMType(gpu::MMAMatrixType type);
+Type convertMMAToLLVMType(gpu::MMAMatrixType type);
 
 /// Configure target to convert from the GPU dialect to NVVM.
 void configureGpuToNVVMConversionLegality(ConversionTarget &target);
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 860f893367203..2c29bb8a01a41 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -114,7 +114,7 @@ def GPU_MMAMatrix : DialectType<
   GPU_Dialect, IsMMAMatrixTypePred, "MMAMatrix type">;
 
 // Memref type acceptable to gpu.subgroup_mma_{load|store}_matrix ops.
-def GPU_MMAMemRef : MemRefOf<[I8, I32, F16, F32, VectorOfRankAndType<[1], [I8, I32, F16, F32]>]>;
+def GPU_MMAMemRef : MemRefOf<[I8, I32, F16, F32, F64, VectorOfRankAndType<[1], [I8, I32, F16, F32, F64]>]>;
 
 class MMAMatrixOf<list<Type> allowedTypes> :
   ContainerType<AnyTypeOf<allowedTypes>, IsMMAMatrixTypePred,
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index a6c6038e1e224..5c7df25c58cde 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1872,7 +1872,7 @@ def GPU_SubgroupMmaStoreMatrixOp : GPU_Op<"subgroup_mma_store_matrix",
     ```
   }];
 
-  let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, I32, F16, F32]>>:$src,
+  let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, I32, F16, F32, F64]>>:$src,
                   Arg<GPU_MMAMemRef, "",[MemWriteAt<0, FullEffect>]>:$dstMemref,
                   Variadic<Index>:$indices,
                   IndexAttr:$leadDimension,
@@ -1919,9 +1919,9 @@ def GPU_SubgroupMmaComputeOp
     ```
   }];
 
-  let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, F16, F32]>>:$opA,
-                  Arg<MMAMatrixOf<[SI8, UI8, F16, F32]>>:$opB,
-                  Arg<MMAMatrixOf<[I32, F16, F32]>>:$opC,
+  let arguments = (ins Arg<MMAMatrixOf<[SI8, UI8, F16, F32, F64]>>:$opA,
+                  Arg<MMAMatrixOf<[SI8, UI8, F16, F32, F64]>>:$opB,
+                  Arg<MMAMatrixOf<[I32, F16, F32, F64]>>:$opC,
                   OptionalAttr<UnitAttr>:$a_transpose,
                   OptionalAttr<UnitAttr>:$b_transpose);
 
diff --git a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
index 99c059cb03299..6254de81780f5 100644
--- a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/Types.h"
 
 using namespace mlir;
 
@@ -57,7 +58,8 @@ static NVVM::MMATypes getElementType(gpu::MMAMatrixType type) {
   if (type.getElementType().isF32())
     return type.getOperand() == "COp" ? NVVM::MMATypes::f32
                                       : NVVM::MMATypes::tf32;
-
+  if (type.getElementType().isF64())
+    return NVVM::MMATypes::f64;
   if (type.getElementType().isSignedInteger(8))
     return NVVM::MMATypes::s8;
   if (type.getElementType().isUnsignedInteger(8))
@@ -212,8 +214,13 @@ struct WmmaMmaOpToNVVMLowering
     // then passed on to the intrinsic call. Emit llvm ops to extract individual
     // values form lowered memrefs.
     SmallVector<Value> unpackedOps;
-
     auto unpackOp = [&](Value operand) {
+      // f64 a and b fragments are not structs but scalars.
+      if (!isa<LLVM::LLVMStructType>(operand.getType())) {
+        unpackedOps.push_back(operand);
+        return;
+      }
+      // every other type is lowered to an LLVM struct, extract the values.
       auto structType = cast<LLVM::LLVMStructType>(operand.getType());
       for (size_t i = 0, e = structType.getBody().size(); i < e; ++i) {
         Value toUse = LLVM::ExtractValueOp::create(rewriter, loc, operand, i);
@@ -276,10 +283,16 @@ struct WmmaConstantOpToNVVMLowering
       return failure();
     Location loc = subgroupMmaConstantOp.getLoc();
     Value cst = adaptor.getOperands()[0];
-    LLVM::LLVMStructType type = convertMMAToLLVMType(
+    Type type = convertMMAToLLVMType(
         cast<gpu::MMAMatrixType>(subgroupMmaConstantOp.getType()));
+    // If the element is not a struct, it means it's a scalar f64.
+    auto structType = dyn_cast<LLVM::LLVMStructType>(type);
+    if (!structType) {
+      rewriter.replaceOp(subgroupMmaConstantOp, cst);
+      return success();
+    }
     // If the element type is a vector create a vector from the operand.
-    if (auto vecType = dyn_cast<VectorType>(type.getBody()[0])) {
+    if (auto vecType = dyn_cast<VectorType>(structType.getBody()[0])) {
       Value vecCst = LLVM::PoisonOp::create(rewriter, loc, vecType);
       for (int64_t vecEl = 0; vecEl < vecType.getNumElements(); vecEl++) {
         Value idx = LLVM::ConstantOp::create(rewriter, loc,
@@ -289,8 +302,8 @@ struct WmmaConstantOpToNVVMLowering
       }
       cst = vecCst;
     }
-    Value matrixStruct = LLVM::PoisonOp::create(rewriter, loc, type);
-    for (size_t i : llvm::seq(size_t(0), type.getBody().size())) {
+    Value matrixStruct = LLVM::PoisonOp::create(rewriter, loc, structType);
+    for (size_t i : llvm::seq(size_t(0), structType.getBody().size())) {
       matrixStruct =
           LLVM::InsertValueOp::create(rewriter, loc, matrixStruct, cst, i);
     }
@@ -354,10 +367,24 @@ struct WmmaElementwiseOpToNVVMLowering
       return failure();
     Location loc = subgroupMmaElementwiseOp.getLoc();
     size_t numOperands = adaptor.getOperands().size();
-    LLVM::LLVMStructType destType = convertMMAToLLVMType(
+    Type destType = convertMMAToLLVMType(
         cast<gpu::MMAMatrixType>(subgroupMmaElementwiseOp.getType()));
-    Value matrixStruct = LLVM::PoisonOp::create(rewriter, loc, destType);
-    for (size_t i = 0, e = destType.getBody().size(); i < e; ++i) {
+
+    // If the element is not a struct, it means it's a scalar f64.
+    LLVM::LLVMStructType structDestTy =
+        dyn_cast<LLVM::LLVMStructType>(destType);
+    if (!structDestTy) {
+      SmallVector<Value> operands;
+      for (auto operand : adaptor.getOperands()) {
+        operands.push_back(operand);
+      }
+      Value element = createScalarOp(
+          rewriter, loc, subgroupMmaElementwiseOp.getOpType(), operands);
+      rewriter.replaceOp(subgroupMmaElementwiseOp, element);
+      return success();
+    }
+    Value matrixStruct = LLVM::PoisonOp::create(rewriter, loc, structDestTy);
+    for (size_t i = 0, e = structDestTy.getBody().size(); i < e; ++i) {
       SmallVector<Value> extractedOperands;
       for (size_t opIdx = 0; opIdx < numOperands; opIdx++) {
         extractedOperands.push_back(LLVM::ExtractValueOp::create(
@@ -377,13 +404,18 @@ struct WmmaElementwiseOpToNVVMLowering
 } // namespace
 
 /// Return the LLVMStructureType corresponding to the MMAMatrixType `type`.
-LLVM::LLVMStructType mlir::convertMMAToLLVMType(gpu::MMAMatrixType type) {
+Type mlir::convertMMAToLLVMType(gpu::MMAMatrixType type) {
   NVVM::MMAFrag frag = convertOperand(type.getOperand());
   NVVM::MMATypes eltType = getElementType(type);
   auto nRow = type.getShape()[0];
   auto nCol = type.getShape()[1];
   std::pair<Type, unsigned> typeInfo =
       NVVM::inferMMAType(eltType, frag, nRow, nCol, type.getContext());
+  // Special handling for f64 a and b fragments
+  Type f64Ty = Float64Type::get(type.getContext());
+  if (typeInfo.first == f64Ty && typeInfo.second == 1) {
+    return f64Ty;
+  }
   return LLVM::LLVMStructType::getLiteral(
       type.getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));
 }
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 6c6d8d2bad55d..61a630aa88960 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -208,7 +208,7 @@ Type MMAMatrixType::getElementType() const { return getImpl()->elementType; }
 StringRef MMAMatrixType::getOperand() const { return getImpl()->getOperand(); }
 
 bool MMAMatrixType::isValidElementType(Type elementType) {
-  return elementType.isF16() || elementType.isF32() ||
+  return elementType.isF16() || elementType.isF32() || elementType.isF64() ||
          elementType.isUnsignedInteger(8) || elementType.isSignedInteger(8) ||
          elementType.isInteger(32);
 }
@@ -225,7 +225,7 @@ MMAMatrixType::verifyInvariants(function_ref<InFlightDiagnostic()> emitError,
 
   if (!MMAMatrixType::isValidElementType(elementType))
     return emitError()
-           << "MMAMatrixType elements must be SI8, UI8, I32, F16, or F32";
+           << "MMAMatrixType elements must be SI8, UI8, I32, F16, F32, or F64";
 
   return success();
 }
diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
index 82c02c1d6ee63..a0801443057ea 100644
--- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
@@ -80,6 +80,28 @@ gpu.module @test_module {
 
 // -----
 
+gpu.module @test_module {
+
+  // CHECK-LABEL: func @gpu_wmma_f64_load_op() ->
+  // CHECK-SAME: f64
+  // CHECK32-LABEL: func @gpu_wmma_f64_load_op() ->
+  func.func @gpu_wmma_f64_load_op() -> (!gpu.mma_matrix<8x4xf64, "AOp">) {
+    %wg = memref.alloca() {alignment = 32} : memref<32x32xf64, 3>
+    %i = arith.constant 16 : index
+    %j = arith.constant 16 : index
+    %0 = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index} : memref<32x32xf64, 3> -> !gpu.mma_matrix<8x4xf64, "AOp">
+    return %0 : !gpu.mma_matrix<8x4xf64, "AOp">
+    // CHECK: %[[MUL:.*]] = llvm.mul %{{.*}}, %{{.*}} : i64
+    // CHECK: %[[ADD:.*]] = llvm.add %[[MUL]], %{{.*}} : i64
+    // CHECK: %[[GEP:.*]] = llvm.getelementptr %{{.*}}[%[[ADD]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f64
+    // CHECK: %[[C32_I32:.*]] = llvm.mlir.constant(32 : index) : i32
+    // CHECK: %[[LOAD:.*]] = nvvm.wmma.load %[[GEP]], %[[C32_I32]] {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32} : (!llvm.ptr<3>) -> f64
+    // CHECK: llvm.return %[[LOAD]] : f64
+  }
+}
+
+// -----
+
 gpu.module @test_module {
 
   // CHECK-LABEL: func @gpu_wmma_store_op
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 35381dab7b200..26bcf948bc85d 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -688,7 +688,7 @@ func.func @mmamatrix_operand_type(){
 func.func @mmamatrix_invalid_element_type(){
     %wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
     %i = arith.constant 16 : index
-    // expected-error @+1 {{MMAMatrixType elements must be SI8, UI8, I32, F16, or F32}}
+    // expected-error @+1 {{MMAMatrixType elements must be SI8, UI8, I32, F16, F32, or F64}}
     %0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xbf16, "AOp">
     return
 }
@@ -708,7 +708,7 @@ func.func @mmaLoadOp_identity_layout(){
 // -----
 
 func.func @mma_invalid_memref_type(%src: memref<32x4xvector<4x8xf32>>, %i: index) {
-    // expected-error @+1 {{operand #0 must be memref of 8-bit signless integer or 32-bit signless integer or 16-bit float or 32-bit float or vector of 8-bit signless integer or 32-bit signless integer or 16-bit float or 32-bit float values of ranks 1 values}}
+    // expected-error @+1 {{operand #0 must be memref of 8-bit signless integer or 32-bit signless integer or 16-bit float or 32-bit float or 64-bit float or vector of 8-bit signless integer or 32-bit signless integer or 16-bit float or 32-bit float or 64-bit float values of ranks 1 values}}
     %0 = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 4 : index} : memref<32x4xvector<4x8xf32>> -> !gpu.mma_matrix<16x16xf16, "AOp">
     return
 }
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir
new file mode 100644
index 0000000000000..a016a60022699
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir
@@ -0,0 +1,72 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+#map0 = affine_map<(d0, d1) -> (d1, d0)>
+
+func.func @main() {
+  %a = memref.alloc() : memref<8x4xf64>
+  %b = memref.alloc() : memref<4x8xf64>
+  %c = memref.alloc() : memref<8x8xf64>
+  %d = memref.alloc() : memref<8x8xf64>
+
+  %f1 = arith.constant 1.0e+00 : f64
+  %fcst = arith.constant 3.14e+00 : f64
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c4 = arith.constant 4 : index
+  %c1 = arith.constant 1 : index
+  %c32 = arith.constant 32 : index
+
+  // Initialize the Input matrixes with ones.
+  scf.for %arg0 = %c0 to %c8 step %c1 {
+    scf.for %arg1 = %c0 to %c4 step %c1 {
+      memref.store %f1, %a[%arg0, %arg1] : memref<8x4xf64>
+      memref.store %f1, %b[%arg1, %arg0] : memref<4x8xf64>
+    }
+  }
+  // Initialize the accumulator matrix with a constant.
+  scf.for %arg0 = %c0 to %c8 step %c1 {
+    scf.for %arg1 = %c0 to %c8 step %c1 {
+      memref.store %fcst, %c[%arg0, %arg1] : memref<8x8xf64>
+    }
+  }
+
+  %2 = memref.cast %a : memref<8x4xf64> to memref<*xf64>
+  %20 = memref.cast %b : memref<4x8xf64> to memref<*xf64>
+  %33 = memref.cast %c : memref<8x8xf64> to memref<*xf64>
+  %34 = memref.cast %d : memref<8x8xf64> to memref<*xf64>
+
+  gpu.host_register %2 : memref<*xf64>
+  gpu.host_register %20 : memref<*xf64>
+  gpu.host_register %33 : memref<*xf64>
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c1, %block_z = %c1) {
+    %A = gpu.subgroup_mma_load_matrix %a[%c0, %c0] {leadDimension = 4 : index} : memref<8x4xf64> -> !gpu.mma_matrix<8x4xf64, "AOp">
+    %B = gpu.subgroup_mma_load_matrix %b[%c0, %c0] {leadDimension = 8 : index} : memref<4x8xf64> -> !gpu.mma_matrix<4x8xf64, "BOp">
+    %C = gpu.subgroup_mma_load_matrix %c[%c0, %c0] {leadDimension = 8 : index} : memref<8x8xf64> -> !gpu.mma_matrix<8x8xf64, "COp">
+
+    %R = gpu.subgroup_mma_compute %A, %B, %C : !gpu.mma_matrix<8x4xf64, "AOp">, !gpu.mma_matrix<4x8xf64, "BOp"> -> !gpu.mma_matrix<8x8xf64, "COp">
+
+    gpu.subgroup_mma_store_matrix %R, %d[%c0, %c0] {leadDimension = 8 : index}: !gpu.mma_matrix<8x8xf64, "COp">, memref<8x8xf64>
+    gpu.terminator
+  }
+  // Print the memref after computation.
+  call @printMemrefF64(%34) : (memref<*xf64>) -> ()
+  // CHECK: [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14],
+  // CHECK-NEXT: [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14],
+  // CHECK-NEXT: [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14],
+  // CHECK-NEXT: [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14],
+  // CHECK-NEXT: [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14],
+  // CHECK-NEXT: [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14],
+  // CHECK-NEXT: [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14],
+  // CHECK-NEXT: [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14]
+  return
+}
+
+func.func private @printMemrefF64(memref<*xf64>)

@github-actions

github-actions bot commented Nov 21, 2025

🐧 Linux x64 Test Results

  • 7156 tests passed
  • 595 tests skipped

@castigli
Contributor Author

castigli commented Dec 1, 2025

@fabianmcg thanks for approving; could you merge on my behalf?

@fabianmcg merged commit d3edc94 into llvm:main Dec 1, 2025
10 checks passed
@sohaibiftikhar
Member

This seems to break:
https://github.com/llvm/llvm-project/blob/main/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir

wmma-matmul-f64.mlir:61:12: error: CHECK: expected string not found in input
 // CHECK: [7.14, 7.14, 7.14, 7.14, 7.14, 7.14, 7.14, 7.14],
           ^
<stdin>:1:1: note: scanning from here
Unranked Memref [base@] = 0x53e1ffc5ae00 rank = 2 offset = 0 sizes = [8, 8] strides = [8, 1] data = 
^
<stdin>:2:10: note: possible intended match here
[[4.55677e-310, -nan, -nan, -nan, -nan, -nan, -nan, -nan], 
         ^

Input file: <stdin>
Check file: llvm-project/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir

-dump-input=help explains the following input dump.

Input was:
<<<<<<
            1: Unranked Memref [base@] = 0x53e1ffc5ae00 rank = 2 offset = 0 sizes = [8, 8] strides = [8, 1] data =  
check:61'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
            2: [[4.55677e-310, -nan, -nan, -nan, -nan, -nan, -nan, -nan],  
check:61'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
check:61'1              ?                                                   possible intended match
            3:  [-nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan],  
check:61'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            4:  [-nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan],  
check:61'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            5:  [-nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan],  
check:61'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            6:  [-nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan],  
check:61'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            7:  [-nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan],  
check:61'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            .
            .
            .
>>>>>>

@fabianmcg
Contributor

@sohaibiftikhar what is your setup? Currently the builders are passing.

aahrun pushed a commit to aahrun/llvm-project that referenced this pull request Dec 1, 2025
@sohaibiftikhar
Member

The test was running on an A100 machine.
Can this be flaky? I am seeing both success and failure on the same hardware.

@castigli
Contributor Author

castigli commented Dec 1, 2025

The test was running on an A100 machine. Can this be flaky? I am seeing both success and failure on the same hardware.

As far as I have seen, it should always pass; I have been testing on GH200.

@sohaibiftikhar
Member

I managed to reproduce this locally on an H100 machine. The test has a failure rate of about 30% for me.

Reproduction steps:

mkdir -p build && cd build
cmake -G Ninja ../llvm \
  -DLLVM_ENABLE_PROJECTS=mlir \
  -DLLVM_TARGETS_TO_BUILD="host;NVPTX" \
  -DMLIR_RUN_CUDA_SM80_TESTS=ON \
  -DMLIR_RUN_CUDA_SM80_LT_TESTS=ON \
  -DMLIR_ENABLE_CUDA_RUNNER=ON \
  -DMLIR_RUN_CUDA_TENSOR_CORE_TESTS=ON \
  -DMLIR_GPU_COMPILATION_TEST_FORMAT=fatbin \
  -DMLIR_INCLUDE_INTEGRATION_TESTS=ON \
  -DCMAKE_BUILD_TYPE=Release
  
ninja

Reproduction script:

#!/bin/bash
TEST_FILE="../mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir"
RUNS=20
FAILURES=0
echo "Running test '$TEST_FILE' $RUNS times..."
for i in $(seq 1 $RUNS); do
    echo "---------------------------------"
    echo "Run $i of $RUNS"
    echo "---------------------------------"
    ./bin/llvm-lit -sv "$TEST_FILE"
    if [ $? -ne 0 ]; then
        FAILURES=$((FAILURES + 1))
    fi
done

echo "================================="
echo "Total runs: $RUNS"
echo "Total failures: $FAILURES"
echo "================================="

if [ $FAILURES -ne 0 ]; then
    exit 1
fi

For 20 runs this gives me:

=================================
Total runs: 20
Total failures: 6
=================================

The error seems to be caused by an out-of-bounds access.

# .---command stderr------------
# | 'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
# | 'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
# | 'cuModuleUnload(module)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
# `-----------------------------

GPU information:

NVIDIA H100 80GB HBM3
NVIDIA-SMI 550.163.01
Driver Version: 550.163.01
CUDA Version: 12.4  

@castigli
Contributor Author

castigli commented Dec 2, 2025

Thank you for checking and for the reproducer.
Do you also see this failure on the other wmma tests (like mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir), or only on the new one (mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir)?

@sohaibiftikhar
Member

Only for f64. f32 and f16 seem to work.

augusto2112 pushed a commit to augusto2112/llvm-project that referenced this pull request Dec 3, 2025
@castigli
Contributor Author

castigli commented Dec 3, 2025

I am a bit stumped because I cannot reproduce the failure on GH200.

Total runs: 30
Total failures: 0

I will try on A100.

@sohaibiftikhar
Member

A naive tip, but just in case: make sure the test is not being skipped (there are a lot of config flags involved, as in the CMake invocation above).

@castigli
Contributor Author

castigli commented Dec 3, 2025

I am also checking the output "manually", but I cannot get it to fail on either GH200 or A100.

NVIDIA GH200 120GB
NVIDIA A100-SXM4-80GB
NVIDIA-SMI 550.54.15              
Driver Version: 550.54.15     
CUDA Version: 12.4

@sohaibiftikhar
Member

The only obvious difference to me is that the driver version you have listed is older (March 2024). Is there a possibility to bump that on your end?

@castigli
Contributor Author

castigli commented Dec 3, 2025

Unfortunately that is out of my control.

@sohaibiftikhar
Member

Not sure if this helps with reproduction, but I ran mlir-runner under compute-sanitizer and it fails reliably there.

https://gist.githubusercontent.com/sohaibiftikhar/d44d57a687c8f8d008bbd2edfd063206/raw/a949213ccb47d7cd3ce374f4126b480903e75228/csan.log

Command:

./bin/mlir-opt ../mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir \
| ./bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 cubin-format=fatbin" \
| /path/to/cuda/compute-sanitizer ./bin/mlir-runner \
   --shared-libs=$PWD/lib/libmlir_cuda_runtime.so \
   --shared-libs=$PWD/lib/libmlir_runner_utils.so \
   --entry-point-result=void

@castigli
Contributor Author

castigli commented Dec 3, 2025

Unfortunately this doesn't help on my side either:

========= COMPUTE-SANITIZER
Unranked Memref base@ = 0x5562b11643c0 rank = 2 offset = 0 sizes = [8, 8] strides = [8, 1] data = 
[[7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14], 
 [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14], 
 [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14], 
 [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14], 
 [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14], 
 [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14], 
 [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14], 
 [7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14,   7.14]]
========= ERROR SUMMARY: 0 errors

@Artem-B
Member

Artem-B commented Dec 3, 2025

@castigli would it be possible for you to post the LLVM IR and PTX generated in your case for the kernel in question?
We'll try to do the same on our end, and see if there are any obvious differences.

@castigli
Contributor Author

castigli commented Dec 4, 2025

@castigli would it be possible for you to post the LLVM IR and PTX generated in your case for the kernel in question? We'll try to do the same on our end, and see if there are any obvious differences.

Of course, here it is
https://gist.githubusercontent.com/castigli/af9edc203e434276610e9eb9c526bf2e/raw/94038df4f077bd8a2e1d47a4b33b8f6e62355f37/wmma-matmul-f64-llvm-ptx.mlir

@sohaibiftikhar
Member

sohaibiftikhar commented Dec 4, 2025

https://gist.githubusercontent.com/sohaibiftikhar/d44d57a687c8f8d008bbd2edfd063206/raw/c2638400f082c8994073c96bcf14f42f07793bf6/debug.ptx

The generated PTX looks identical. This might be triggering a bug in ptxas for a specific driver version.

I wondered whether alignment was the problem, because the sanitizer always shows a 16-byte illegal write, but manually changing the alignment to 16 bytes did not help in my debugging.

@sohaibiftikhar
Member

I think I found the issue: the host registration for the output memref is missing.

diff --git a/tmp/broken.llvmir b/tmp/fixed.llvmir
index a1fadc7..0b988c3 100644
--- a/tmp/broken.llvmir
+++ b/tmp/fixed.llvmir
@@ -98,6 +98,7 @@ module attributes {gpu.container_module} {
     llvm.call @mgpuMemHostRegisterMemRef(%0, %67, %72) : (i64, !llvm.ptr, i64) -> ()
     llvm.call @mgpuMemHostRegisterMemRef(%0, %68, %72) : (i64, !llvm.ptr, i64) -> ()
     llvm.call @mgpuMemHostRegisterMemRef(%0, %69, %72) : (i64, !llvm.ptr, i64) -> ()
+    llvm.call @mgpuMemHostRegisterMemRef(%0, %70, %72) : (i64, !llvm.ptr, i64) -> ()
     gpu.launch_func  @main_kernel::@main_kernel blocks in (%8, %8, %8) threads in (%9, %8, %8) : i64 args(%12 : !llvm.ptr, %12 : !llvm.ptr, %5 : i64, %6 : i64, %7 : i64, %7 : i64, %8 : i64, %5 : i64, %20 : !llvm.ptr, %20 : !llvm.ptr, %5 : i64, %7 : i64, %6 : i64, %6 : i64, %8 : i64, %30 : !llvm.ptr, %30 : !llvm.ptr, %5 : i64, %6 : i64, %6 : i64, %6 : i64, %8 : i64, %38 : !llvm.ptr, %38 : !llvm.ptr, %5 : i64, %6 : i64, %6 : i64, %6 : i64, %8 : i64)
     llvm.call @printMemrefF64(%0, %70) : (i64, !llvm.ptr) -> ()
     llvm.return

Adding this fixes the issue.
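
At the level of the integration test added in this PR, the equivalent change would presumably be to host-register the output buffer as well before the launch; a sketch under that assumption (%34 is the unranked cast of the output %d in the test):

  gpu.host_register %2 : memref<*xf64>
  gpu.host_register %20 : memref<*xf64>
  gpu.host_register %33 : memref<*xf64>
  // Presumed missing registration for the output %d:
  gpu.host_register %34 : memref<*xf64>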

@castigli
Contributor Author

castigli commented Dec 4, 2025

@sohaibiftikhar Good catch!
Thanks for the fix; I don't quite understand why I was not able to reproduce it.

kcloudy0717 pushed a commit to kcloudy0717/llvm-project that referenced this pull request Dec 4, 2025
kcloudy0717 pushed a commit to kcloudy0717/llvm-project that referenced this pull request Dec 4, 2025
