
Commit dca70ac

[BACKEND] Fix indexing into TMEM for lhs mma operands (#6888)
The indexing math was incorrect when accessing TMEM as the lhs operand. This fixes some random failures in the attention tutorial when BLOCK_M=256 is picked.
1 parent 6b06242 commit dca70ac

File tree

3 files changed: +11 -4 lines changed

python/test/unit/language/test_matmul.py

Lines changed: 2 additions & 1 deletion
@@ -537,7 +537,8 @@ def flatten_scale(scale):
         print(f"SWP failed for M = {M}, N = {N}")


-@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(128, 128, 64), (128, 64, 128), (64, 128, 32), (128, 256, 32)])
+@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(128, 128, 64), (128, 64, 128), (64, 128, 32), (128, 256, 32),
+                                                        (256, 64, 32)])
 @pytest.mark.parametrize("a_trans", [False, True])
 @pytest.mark.parametrize("dtype_src_str", ["float32", "float16", "float8e5"])
 @pytest.mark.skipif(is_hip() or torch.cuda.get_device_capability()[0] != 10, reason="Requires compute capability == 10")
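
The added (256, 64, 32) shape exercises the BLOCK_M=256 path that previously failed. One way to run just that case locally; the -k expression is illustrative (pytest derives test ids from the parametrize values) and not part of the commit:

pytest python/test/unit/language/test_matmul.py -k "256-64-32"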

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAHelpers.h

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ class DotOpMmaV5TmemLoader : public DotOpMmaMemLoader {
   SmallVector<unsigned int> instrShape;
   int numElementsPer32b;
   int numRepM;
+  int numSlicePerBlockN;
 };

 } // namespace NVIDIA

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv5.cpp

Lines changed: 8 additions & 3 deletions
@@ -26,6 +26,10 @@ mlir::triton::NVIDIA::DotOpMmaV5TmemLoader::DotOpMmaV5TmemLoader(
   auto ty = cast<MemDescType>(tensor.getType());
   auto tmemEncoding = cast<ttng::TensorMemoryEncodingAttr>(ty.getEncoding());
   unpacked = tmemEncoding.getUnpacked();
+  // When using TMEM to store MMA operands, the TMEM block size may be
+  // smaller than the MMA K block. Therefore we need to adjust the offset
+  // calculation.
+  numSlicePerBlockN = tmemEncoding.getBlockN() / instrShape[1];
   int elTyWidth = ty.getElementTypeBitWidth();
   numElementsPer32b = unpacked ? 1 : 32 / elTyWidth;
   auto shapePerCTA = triton::gpu::getShapePerCTA(ty);
@@ -38,8 +42,9 @@ MemDescOperand mlir::triton::NVIDIA::DotOpMmaV5TmemLoader::tmemLoad(
   if (interleaved || instrShape[0] >= 128)
     numRows = 128;
   int numColPerBlock =
-      ((instrShape[0] * instrShape[1]) / numRows) / numElementsPer32b;
-  int blockId = a + b * numRepM;
+      ((instrShape[0] * numSlicePerBlockN * instrShape[1]) / numRows) /
+      numElementsPer32b;
+  int blockId = a + (b / numSlicePerBlockN) * numRepM;
   int offset;
   if (!interleaved) {
     offset = numColPerBlock * blockId;
@@ -48,7 +53,7 @@ MemDescOperand mlir::triton::NVIDIA::DotOpMmaV5TmemLoader::tmemLoad(
     int blockIdPrevEven = blockId - blockIdIsOdd;
     offset = numColPerBlock * blockIdPrevEven + ((16 * blockIdIsOdd) << 16);
   }
-
+  offset += (b % numSlicePerBlockN) * (instrShape[1] / numElementsPer32b);
   auto tb = TritonLLVMOpBuilder(loc, rewriter);
   Value address = tb.ptrtoint(i32_ty, base);
   return {address, offset};
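
To make the corrected indexing concrete, here is a minimal standalone Python model of the non-interleaved offset computation above. It is a sketch, not Triton code: the function name tmem_offset, the argument names, and the defaults num_elements_per_32b=1 and num_rows=128 are assumptions; it only mirrors the integer arithmetic of tmemLoad.

# Hypothetical standalone model of the corrected non-interleaved
# TMEM offset math from tmemLoad above.
def tmem_offset(a, b, instr_m, instr_n, block_n, num_rep_m,
                num_elements_per_32b=1, num_rows=128):
    # One TMEM block can hold several MMA K-slices along N.
    num_slice_per_block_n = block_n // instr_n
    # Columns occupied by one full TMEM block.
    num_col_per_block = ((instr_m * num_slice_per_block_n * instr_n)
                         // num_rows) // num_elements_per_32b
    # b counts K-slices, so divide it down to the TMEM block index ...
    block_id = a + (b // num_slice_per_block_n) * num_rep_m
    offset = num_col_per_block * block_id
    # ... and use the remainder to step to the right slice inside the block.
    offset += (b % num_slice_per_block_n) * (instr_n // num_elements_per_32b)
    return offset

# When block_n == instr_n (num_slice_per_block_n == 1) this reduces to the
# old formula. With e.g. block_n = 64 and instr_n = 32, consecutive b values
# now step within one block instead of skipping a whole block per step:
print(tmem_offset(a=0, b=0, instr_m=128, instr_n=32, block_n=64, num_rep_m=2))  # 0
print(tmem_offset(a=0, b=1, instr_m=128, instr_n=32, block_n=64, num_rep_m=2))  # 32
print(tmem_offset(a=0, b=2, instr_m=128, instr_n=32, block_n=64, num_rep_m=2))  # 128

The old code effectively assumed num_slice_per_block_n == 1, so for larger TMEM blocks it advanced a full block per K-step and read the wrong columns, which is what surfaced as the BLOCK_M=256 failures mentioned in the commit message.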
