Commit 7f06338

[BACKEND] Fix a special case where elements along the k dimension are repeated within each thread (#5121)
This PR includes the following changes:

- Adds comprehensive tests for mixed-precision dot products, including configurations such as f8xf16, i8xf16, f8xf32, and i8xf32.
- Fixes mmav2 when the k dimension contains duplicated elements. For example, with a 16x16 fp16 triton tensor (opIdx=0, kWidth=4), a 16x32 tile is used, so the first 16 elements along the k dimension are repeated in the last 16 elements. During the mmav2 computation, only the first half is required.
1 parent bb71ced commit 7f06338

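As a rough illustration of the duplicated-k case described above (a hypothetical sketch written for this summary; the shapes are chosen for exposition, not taken from the patch): with only 16 real elements along k but a 32-wide tile, the loaded row simply repeats, and only the first copy is needed by the MMA.

# Hypothetical sketch only: a 16-wide k slice replicated to fill a 32-wide tile.
k_extent, tile_k = 16, 32
row = list(range(k_extent))            # the 16 real elements along k
tile_row = row * (tile_k // k_extent)  # [0..15, 0..15]: the second half repeats the first
assert tile_row[:k_extent] == tile_row[k_extent:]
# Only tile_row[:k_extent] needs to feed the mmav2 computation.
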
File tree

4 files changed, +132 -36 lines changed

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 9 additions & 4 deletions
@@ -138,12 +138,17 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
 
   // FIXME [Dot LL]
   // Do for all DotOperandEncodingAttr once we have LLs for all of them
-  static bool isSupportedDotOpLayout(Attribute layout) {
+  static bool isSupportedDotOpLayout(RankedTensorType type) {
+    auto layout = type.getEncoding();
+    auto bitwidth = type.getElementType().getIntOrFloatBitWidth();
     if (auto dot = dyn_cast<DotOperandEncodingAttr>(layout)) {
+      auto kWidth = dot.getKWidth();
       // Use when the SharedToDotOperandMMAv2OrV3 is known to be buggy:
       // - kWidth == 8
+      // - kWidth == 4, bitwidth = 32
       if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dot.getParent())) {
-        bool legacyLoweringIsBuggy = dot.getKWidth() >= 8;
+        bool legacyLoweringIsBuggy =
+            kWidth >= 8 || (kWidth == 4 && bitwidth == 32);
         return legacyLoweringIsBuggy && mma.isAmpere();
       }
       if (isa<AMDMfmaEncodingAttr>(dot.getParent()))
@@ -162,7 +167,7 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     if (isa<SharedEncodingAttr>(srcLayout) &&
         (isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
              dstLayout) ||
-         isSupportedDotOpLayout(dstLayout))) {
+         isSupportedDotOpLayout(dstTy))) {
       return lowerSharedToDistributed(op, adaptor, getTypeConverter(),
                                       rewriter);
     }
@@ -202,7 +207,7 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     auto dstShape = dstTy.getShape();
     auto srcSharedLayout = cast<SharedEncodingAttr>(srcTy.getEncoding());
     auto dstLayout = dstTy.getEncoding();
-    assert((dstShape.size() <= 2 || isSupportedDotOpLayout(dstLayout)) &&
+    assert((dstShape.size() <= 2 || isSupportedDotOpLayout(dstTy)) &&
            "Unexpected rank of ConvertLayout(shared->distributed)");
 
     auto smemObj = LLVM::getSharedMemoryObjectFromStruct(

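For readers skimming the diff, here is a small Python sketch (an illustration written for this summary, not code from the patch) of the condition the updated isSupportedDotOpLayout encodes for NvidiaMmaEncodingAttr parents: the legacy SharedToDotOperandMMAv2OrV3 lowering is treated as buggy when kWidth >= 8, or now also when kWidth == 4 with 32-bit elements, and only on Ampere, in which case the op is routed to lowerSharedToDistributed instead.

# Hypothetical mirror of the NvidiaMmaEncodingAttr branch; names are illustrative only.
def is_supported_dot_op_layout_for_mma(k_width: int, bitwidth: int, is_ampere: bool) -> bool:
    legacy_lowering_is_buggy = k_width >= 8 or (k_width == 4 and bitwidth == 32)
    return legacy_lowering_is_buggy and is_ampere

assert is_supported_dot_op_layout_for_mma(4, 32, True)        # new case: kWidth=4, 32-bit elements
assert not is_supported_dot_op_layout_for_mma(4, 16, True)    # fp16 operands keep the legacy path
assert not is_supported_dot_op_layout_for_mma(8, 16, False)   # non-Ampere parents are unaffected
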
python/test/regression/test_cast_matmul.py

Lines changed: 29 additions & 13 deletions
@@ -13,6 +13,11 @@
 import triton.language as tl
 
 input_dtypes = ["float16", "float32", "float64"]
+if triton.runtime.driver.active.get_current_target().backend == "cuda":
+    input_dtypes += ["int8", "float8_e5m2"]
+    cc = torch.cuda.get_device_capability(0)
+    if cc >= (8, 9):
+        input_dtypes += ["float8_e4m3fn"]
 out_dtypes = ["float16", "float32"]
 
 
@@ -63,37 +68,48 @@ def matmul_kernel(A, B, C, M, N, K, #
     tl.store(C, acc, mask=mask)
 
 
-@pytest.mark.parametrize("M, K, N, w_dtype, x_dtype, out_dtype",
-                         [(M, K, N, w, x, o)  #
-                          for (M, K, N) in [(128, 128, 128), (1280, 768, 1024)]  #
+@pytest.mark.parametrize("M, K, N, BLOCK_K, w_dtype, x_dtype, out_dtype",
+                         [(M, K, N, BLOCK_K, w, x, o)  #
+                          for BLOCK_K in [16, 32]  #
+                          for (M, K, N) in [(128, 128, 128), (768, 768, 1024)]  #
                           for w in input_dtypes
                           for x in input_dtypes  #
                           for o in out_dtypes])
-def test_cast_matmul(M, K, N, w_dtype, x_dtype, out_dtype):
+def test_cast_matmul(M, K, N, BLOCK_K, w_dtype, x_dtype, out_dtype):
     if x_dtype == w_dtype:
         pytest.skip("skip the same input dtype")
     device = torch.cuda.current_device()
-    x_dtype = getattr(torch, x_dtype)
-    w_dtype = getattr(torch, w_dtype)
-    a = torch.randn((M, K), device=device, dtype=x_dtype)
-    b = torch.randn((K, N), device=device, dtype=w_dtype)
+    x_dtype: torch.dtype = getattr(torch, x_dtype)
+    w_dtype: torch.dtype = getattr(torch, w_dtype)
+
+    def init_tensor(dtype, shape):
+        if dtype == torch.int8:
+            return torch.randint(0, 3, shape, device=device, dtype=dtype)
+        elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
+            return torch.randn(shape, device=device, dtype=torch.float16).to(dtype)
+        else:
+            return torch.randn(shape, device=device, dtype=dtype)
+
+    a = init_tensor(w_dtype, (M, K))
+    b = init_tensor(x_dtype, (K, N))
+
     torch_dtype = getattr(torch, out_dtype)
     triton_dtype = getattr(tl, out_dtype)  # <- here force dot_out_dtype
     out_torch = torch.matmul(a.to(torch_dtype), b.to(torch_dtype))
     out_triton = torch.empty((M, N), device=device, dtype=torch_dtype)
 
     # launch kernel
-    BLOCK_M, BLOCK_N, BLOCK_K = 16, 16, 32
-    grid = ((triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N)), 1)
+    block_m, block_n, block_k = 16, 16, BLOCK_K
+    grid = ((triton.cdiv(M, block_m) * triton.cdiv(N, block_n)), 1)
 
     matmul_kernel[grid](
         a, b, out_triton, M, N, K,  #
         a.stride(0), a.stride(1),  #
         b.stride(0), b.stride(1),  #
         out_triton.stride(0), out_triton.stride(1), dot_out_dtype=triton_dtype,  #
         GROUP_M=8,  #
-        BLOCK_M=BLOCK_M,  #
-        BLOCK_N=BLOCK_N,  #
-        BLOCK_K=BLOCK_K)
+        BLOCK_M=block_m,  #
+        BLOCK_N=block_n,  #
+        BLOCK_K=block_k)
 
     torch.testing.assert_close(out_torch, out_triton, atol=0.3, rtol=0.01)

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2OrV3.cpp

Lines changed: 6 additions & 9 deletions
@@ -226,11 +226,6 @@ SmallVector<Value> MMA16816SmemLoader::computeLdsMatOffs(Value lane,
                                                          Value cSwizzleOffset) {
   Value warpB = multiDimWarpId[0];
   Value warpOff = kOrder == 2 ? multiDimWarpId[1] : multiDimWarpId[2];
-  int cTileShape = tileShape[order[0]];
-  int sTileShape = tileShape[order[1]];
-  if (!needTrans) {
-    std::swap(cTileShape, sTileShape);
-  }
 
   SmallVector<Value> offs(numPtrs);
 
@@ -239,7 +234,6 @@ SmallVector<Value> MMA16816SmemLoader::computeLdsMatOffs(Value lane,
   int laneHeight = 8;
   int quadWidth = laneWidth * kWidth;
   int quadHeight = laneHeight;
-  int numQuadI = 2;
 
   // outer index base
   Value iBase = udiv(lane, i32_val(laneWidth));
@@ -544,12 +538,15 @@ Value composeValuesToDotOperandLayoutStruct(
   // unpacked into individual elements.
   // `kIters` specifies the number of contiguous int32 elements each thread
   // should load.
-  auto kIters = isHopper ? 1 : kWidth / (32 / bitwidth);
+  // `kSize` specifies the total number of int32 elements each thread should
+  // load.
+  int kIters = isHopper ? 1 : kWidth / (32 / bitwidth);
+  int kSize = repK >= kIters ? repK * 2 : kIters;
 
   std::vector<Value> elems;
   auto unpackVec = [&](int b, int m, int k) {
-    for (auto kIter = 0; kIter < kIters; ++kIter) {
-      auto val = vals.at({b, m, k + kIter});
+    for (int kIter = 0; kIter < kIters; ++kIter) {
+      auto val = vals.at({b, m, (k + kIter) % kSize});
       auto vec = bitcast(val, vecTy);
       for (auto i = 0; i < numElemsPerVec; ++i) {
         elems.push_back(extract_element(eltTy, vec, i32_val(i)));

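A small illustration of the new modular indexing in unpackVec (hypothetical values chosen for this summary, assuming kWidth=4 with 32-bit elements so kIters=4, and repK=1 so kSize falls back to kIters): when k plus the iteration offset would step past the kSize register slots a thread actually holds, the index wraps back to the beginning, which is safe because, per the commit description, those leading slots hold exactly the duplicated data.

# Hypothetical illustration of the (k + kIter) % kSize wrap; values are assumptions.
kWidth, bitwidth, repK = 4, 32, 1
kIters = kWidth // (32 // bitwidth)              # 4 contiguous int32 elements per unpack
kSize = repK * 2 if repK >= kIters else kIters   # 4 valid slots in this scenario

k = 4  # a base index that would address the repeated second half
old_indices = [k + kIter for kIter in range(kIters)]             # [4, 5, 6, 7]: past the valid slots
new_indices = [(k + kIter) % kSize for kIter in range(kIters)]   # [0, 1, 2, 3]: wraps to the duplicates
print(old_indices, new_indices)
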
third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv2.cpp

Lines changed: 88 additions & 10 deletions
@@ -90,6 +90,7 @@ ValueTableV2 getValuesFromDotOperandLayoutStruct(
   // we split the MMA into 4 sub-MMAs, each with a stride 4 x 32-bit along the
   // K dimension.
   llvm::SmallVector<unsigned> si;
+  auto kIters = kWidth / (32 / bitwidth);
 
   if (dot.getOpIdx() == 0) {
     // Original register layout:
@@ -106,11 +107,63 @@ ValueTableV2 getValuesFromDotOperandLayoutStruct(
     // 2nd MMA: [[2, 3], [10, 11], [18, 19], [26, 27]]
     // 3rd MMA: [[4, 5], [12, 13], [20, 21], [28, 29]]
     // 4th MMA: [[6, 7], [14, 15], [22, 23], [30, 31]]
-    for (size_t kRep = 0; kRep < kWidth / numElemsPerVec; ++kRep)
-      for (size_t tile = 0; tile < 4; ++tile)
-        for (size_t e = 0; e < numElemsPerVec; ++e) {
-          si.push_back(kRep * numElemsPerVec + tile * kWidth + e);
-        }
+    if (kIters <= repK) {
+      for (size_t kRep = 0; kRep < kWidth / numElemsPerVec; ++kRep)
+        for (size_t tile = 0; tile < 4; ++tile)
+          for (size_t e = 0; e < numElemsPerVec; ++e) {
+            si.push_back(kRep * numElemsPerVec + tile * kWidth + e);
+          }
+    } else {
+      // Suppose kWidth=4 and type=fp32, so numElemsPerVec=1.
+      // Each tile of the dot operand layout has a size of 16x32.
+      // However, if the triton tensor size is 16x16, elements along the k
+      // dimension are duplicated. Within each tile, each register
+      // contains 2x8 elements arranged as follows:
+      //
+      //   tile0/0              tile0/1
+      //   |<--kWidth=4-->|     |<--kWidth-->|
+      //   |<-mmaWidth=2->|
+      //   [0, 1, 2, 3]         [0, 1, 2, 3]
+      //   [4, 5, 6, 7]         [4, 5, 6, 7]
+      //
+      // tile0/1 replicates the elements in tile0/0 along the k dimension.
+      // For a tensor size of 32x32, the next tile on the m dimension is as
+      // follows:
+      //
+      //   tile1/0              tile1/1
+      //   |<--kWidth-->|       |<--kWidth-->|
+      //   [8, 9, 10, 11],      [8, 9, 10, 11]
+      //   [12, 13, 14, 15],    [12, 13, 14, 15]
+      //
+      // Within a single tile, we can perform two MMAs, and the
+      // resulting register layout for each MMA is as follows:
+      //
+      //   1st MMA: [0, 4, 1, 5]
+      //   2nd MMA: [2, 6, 3, 7]
+      //   3rd MMA: [8, 12, 9, 13]
+      //   4th MMA: [10, 14, 11, 15]
+      //
+      // Additionally, we should reorder the elements by moving the duplicated
+      // elements to the end. In the example above, we convert the order from
+      // tile0/0, tile0/1, tile1/0, tile1/1 to tile0/0, tile1/0, tile0/1,
+      // tile1/1, so that only the first two tiles will be used in the
+      // computation.
+      size_t elemsPerTile = 2 * 2 * kWidth;
+      size_t elemsPerMma = 2 * 2 * numElemsPerVec;
+      size_t mmaWidth = kWidth / numElemsPerVec / 2;
+      size_t repMma = elemsPerTile / (mmaWidth * elemsPerMma);
+      for (size_t rep = 0; rep < repMma; ++rep)
+        for (size_t tile = 0; tile < elems.size() / elemsPerTile; ++tile)
+          for (size_t mmaKWidth = 0; mmaKWidth < mmaWidth; ++mmaKWidth)
+            for (size_t kTile = 0; kTile < 2; ++kTile)
+              for (size_t mTile = 0; mTile < 2; ++mTile)
+                for (size_t e = 0; e < numElemsPerVec; ++e) {
+                  si.push_back(rep * mmaWidth * elemsPerMma +
+                               mmaKWidth * 2 * numElemsPerVec +
+                               tile * elemsPerTile + mTile * kWidth +
+                               kTile * numElemsPerVec + e);
+                }
+    }
   } else {
     // Original register layout:
     //
@@ -122,11 +175,36 @@ ValueTableV2 getValuesFromDotOperandLayoutStruct(
     // 2nd MMA: [[2, 3], [10, 11]]
     // 3rd MMA: [[4, 5], [12, 13]]
     // 4th MMA: [[6, 7], [14, 15]]
-    for (size_t kRep = 0; kRep < kWidth / numElemsPerVec; ++kRep)
-      for (size_t tile = 0; tile < 2; ++tile)
-        for (size_t e = 0; e < numElemsPerVec; ++e) {
-          si.push_back(kRep * numElemsPerVec + tile * kWidth + e);
-        }
+    if (kIters <= repK) {
+      for (size_t kRep = 0; kRep < kWidth / numElemsPerVec; ++kRep)
+        for (size_t tile = 0; tile < 2; ++tile)
+          for (size_t e = 0; e < numElemsPerVec; ++e) {
+            si.push_back(kRep * numElemsPerVec + tile * kWidth + e);
+          }
+    } else {
+      // Suppose kWidth=4 and type=fp32.
+      // Original register layout:
+      //
+      //   tile0/0           tile0/1
+      //   [0, 1, 2, 3]^T,   [0, 1, 2, 3]^T
+      //
+      // Similar to the opIdx=0 situation, we should reorder the elements by
+      // moving the duplicated elements to the end.
+      size_t elemsPerTile = 2 * kWidth;
+      size_t elemsPerMma = 2 * numElemsPerVec;
+      size_t mmaWidth = kWidth / numElemsPerVec / 2;
+      size_t repMma = elemsPerTile / (mmaWidth * elemsPerMma);
+      for (size_t rep = 0; rep < repMma; ++rep)
+        for (size_t tile = 0; tile < elems.size() / elemsPerTile; ++tile)
+          for (size_t mmaKWidth = 0; mmaKWidth < mmaWidth; ++mmaKWidth)
+            for (size_t kTile = 0; kTile < 2; ++kTile)
+              for (size_t e = 0; e < numElemsPerVec; ++e) {
+                si.push_back(rep * mmaWidth * elemsPerMma +
+                             mmaKWidth * 2 * numElemsPerVec +
+                             tile * elemsPerTile + kTile * numElemsPerVec +
+                             e);
+              }
+    }
   }
 
   auto step = si.size();

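To sanity-check the new opIdx=0 ordering, the following standalone Python sketch (written for this summary; the parameter values are assumptions matching the in-code comment: kWidth=4, fp32 so numElemsPerVec=1, and a single 16-register tile whose second k half duplicates the first) reproduces the index arithmetic of the new else branch. It prints exactly the MMA groupings listed in the comment, with the duplicated k half left in the trailing groups so that, as the commit message notes, only the first half needs to feed the actual MMAs.

# Hypothetical re-implementation of the opIdx=0 else-branch index permutation.
kWidth = 4
numElemsPerVec = 1   # fp32: one element per 32-bit register
num_elems = 16       # len(elems): one tile; slots 8..15 duplicate slots 0..7 along k

elemsPerTile = 2 * 2 * kWidth                      # 16
elemsPerMma = 2 * 2 * numElemsPerVec               # 4
mmaWidth = kWidth // numElemsPerVec // 2           # 2
repMma = elemsPerTile // (mmaWidth * elemsPerMma)  # 2

si = []
for rep in range(repMma):
    for tile in range(num_elems // elemsPerTile):
        for mmaKWidth in range(mmaWidth):
            for kTile in range(2):
                for mTile in range(2):
                    for e in range(numElemsPerVec):
                        si.append(rep * mmaWidth * elemsPerMma +
                                  mmaKWidth * 2 * numElemsPerVec +
                                  tile * elemsPerTile + mTile * kWidth +
                                  kTile * numElemsPerVec + e)

print(si)
# [0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15]
# 1st MMA: [0, 4, 1, 5]    2nd MMA: [2, 6, 3, 7]
# 3rd MMA: [8, 12, 9, 13]  4th MMA: [10, 14, 11, 15]  <- the duplicated k half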