[Backend] Remove convertMMAV3To8BitsDotOperand (#7574)

FrederickVu · apgoucher · web-flow · commit 838621321731 · 2025-07-23T00:20:36.000+01:00
We remove the custom layout conversion lowering for the MMA v3 8-bit case which used warp shuffles as it's now handled by the general pathway #7558.  # New contributor declaration - [x] I am not making a trivial change, such as fixing a typo in a comment. - [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - Select one of the following. - [x] I have added tests. - `/test` for `lit` tests - `/unittest` for C++ tests - `/python/test` for end-to-end tests - [ ] This PR does not need a test because `FILL THIS IN`. - Select one of the following. - [ ] I have not added any `lit` tests. - [x] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.) --------- Co-authored-by: apgoucher <apgoucher@openai.com>
diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h
@@ -254,10 +254,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
 
 bool atomicNeedsSharedMemory(Value result);
 
-// Return true if the src and dst layout match.
-bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
-                                   RankedTensorType dstTy);
-
 // Check if MFMA layout can be converted to the dot operand
 // layout using warp shuffle.
 bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
@@ -713,24 +713,6 @@ bool supportMMA(Value value, int version) {
          (elemTy.isInteger(8) && version >= 2);
 }
 
-// For MMAV3 dotOperand layout matches mma operand for f16 and bf16 cases.
-bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
-                                   RankedTensorType dstTy) {
-  auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(srcTy.getEncoding());
-  auto dotOperandLayout = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding());
-  if (!mmaLayout || !dotOperandLayout) {
-    return false;
-  }
-  int elementTypeSize = srcTy.getElementType().getIntOrFloatBitWidth();
-  auto parentTy = srcTy.cloneWithEncoding(dotOperandLayout.getParent());
-  auto ans = mmaLayout.getVersionMajor() == 3 &&
-             dotOperandLayout.getOpIdx() == 0 &&
-             mmaLayout.getWarpsPerCTA()[1] == 1 &&
-             !cvtNeedsSharedMemory(parentTy, srcTy) && elementTypeSize == 8 &&
-             dotOperandLayout.getKWidth() == 32 / elementTypeSize;
-  return ans;
-}
-
 bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
                                        RankedTensorType dstTy) {
   auto mfmaLayout = dyn_cast<AMDMfmaEncodingAttr>(srcTy.getEncoding());
@@ -810,7 +792,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
   // they're fully subsumed by the linear-layout checks.
   return !cvtReordersRegisters(srcTy, dstTy) &&
          !cvtNeedsWarpShuffle(srcTy, dstTy) &&
-         !matchMmaV3AndDotOperandLayout(srcTy, dstTy) &&
          // to be removed when generalized warp shuffle conversions
          // are ready:
          !matchMFMAAndDotOperandShuffleCase(srcTy, dstTy);
diff --git a/test/Conversion/tritongpu_to_llvm_hopper.mlir b/test/Conversion/tritongpu_to_llvm_hopper.mlir
@@ -216,12 +216,9 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 8}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: cvt_mma_to_dot_fp8
-// CHECK: nvvm.prmt
-// CHECK: nvvm.prmt
-// CHECK: nvvm.shfl.sync
-// CHECK: nvvm.shfl.sync
-// CHECK: nvvm.prmt
-// CHECK: nvvm.prmt
+// CHECK-COUNT-16: llvm.select
+// CHECK-COUNT-16: nvvm.shfl.sync
+// CHECK-COUNT-16: llvm.select
   tt.func @cvt_mma_to_dot_fp8(%a: tensor<128x64xf8E5M2, #mma>) {
     %opA = ttg.convert_layout %a : tensor<128x64xf8E5M2, #mma> -> tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
     tt.return
diff --git a/test/Conversion/tritongpu_to_ptx_mmav3.mlir b/test/Conversion/tritongpu_to_ptx_mmav3.mlir
@@ -0,0 +1,121 @@
+// RUN: triton-opt %s --allocate-shared-memory --convert-triton-gpu-to-llvm='compute-capability=90 ptx-version=83' --convert-nv-gpu-to-llvm | mlir-translate --mlir-to-llvmir | opt -O3 -S | llc -mtriple nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx83 | FileCheck --dump-input-context=20 %s
+
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 64, 16]}>
+#dot_op = #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth=4}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+// CHECK-LABEL: cvt_mma_to_dot_fp8
+  tt.func @cvt_mma_to_dot_fp8(%ptr : !llvm.ptr, %arg0: tensor<128x64xf8E5M2, #mma>) {
+
+    // As there are 64 elements per lane, we don't use variables to track them.
+
+    // CHECK-COUNT-64: ld.param.b8
+
+    // Intra-warp layout conversions can be viewed as a permutation of register
+    // and lane basis vectors. This can be read off from the linear layouts:
+    //
+    // #mma:     register: [[0,1], [8,0], [0,8], [0,16], [0,32], [64,0]]
+    //               lane: [[0,2], [0,4], [1,0], [2,0], [4,0]]
+    //               warp: [[16,0], [32,0]]
+    //
+    // #dot_op:  register: [[0,1], [0,2], [8,0], [0,16], [0,32], [64,0]]
+    //               lane: [[0,4], [0,8], [1,0], [2,0], [4,0]]
+    //               warp: [[16,0], [32,0]]
+    //
+    // The layout conversion is described by the permutation (r1 r2 l1 l0),
+    // which factors as (r1 l1)(l0 l1)(r1 r2).
+    //
+    // Register basis vectors correspond to the bits of the indices of the 64
+    // separate registers which hold the original elements. Since we end up
+    // packing 4 elements per register, we end up with only 16 registers in
+    // total before shuffling. The `transferWithinWarp` implementation handles
+    // register packing by ensuring that elements are packed together only if
+    // under the layout conversion, they end up in the same destination lane.
+    // To do this, it rearranges the 64 registers so that it can pack 4
+    // consecutive elements at a time according to their new register index.
+    //
+    // The transposition (r1 l1) above indicates that initially, elements with
+    // register indices whose r1 bit is on are to be moved to new lanes. We thus
+    // need to rearrange the registers. The algorithm chooses the next register
+    // bit > 1 which is not used in a mixed transposition. In this case,
+    // that bit is r2. Algebraically, this corresponds to conjugating the
+    // permutation with (r1 r2). This produces (r1 r2)(r2 l1)(l0 l1). The new
+    // (r1 r2) at the end rearranges elements after unpacking, and only
+    // (r2 l1)(l0 l1) matters for tracking the movement of the packed registers.
+    // From the point of view of the packed registers, the symbol `r2` now
+    // corresponds to the 0th bit of a (packed) register's index.
+    //
+    // The transposition (r2 l1) is a bit swap which is implemented in-place as:
+    //  1. r2 ^= l1
+    //  2. l1 ^= r2
+    //  3. r2 ^= l1.
+    // The algorithm conjugates (l0 l1) through the first two stages to produce:
+    //  1. r2 ^= l0
+    //  2a. l0 ^= r2
+    //  2b. (l0 l1)
+    //  3. r2 ^= l1.
+    // The first step is to get the value of l0.
+
+    // CHECK: mov.u32       [[TID:%.*]], %tid.x;
+    // CHECK: and.b32       [[L0_VAL:%.*]], [[TID]], 1;
+    // CHECK: setp.eq.s32   [[L0_OFF:%.*]], [[L0_VAL]], 0;
+
+    // This is used to perform 16 independent selects in stage 1.
+
+    // CHECK-COUNT-16: selp.b32     {{.*}}, {{.*}}, [[L0_OFF]];
+
+    // Next, we apply (l0 l1) to the lane id to get the base source lane for
+    // the index shuffles. This is step 2b above, but since we must specify
+    // the *source* lane for a warp-shuffle, it gets applied first in practice:
+    //
+    //       dstLane = ((l0 l1) \circ (l0 ^= r2))(srcLane)
+    //       srcLane = ((l0 ^= r2) \circ (l0 l1))(dstLane)
+    //
+    // To apply (l0 l1), we use a compile-time mask to collect the fixed bits,
+    // and then we OR it with the shifted l0 and l1 values.
+
+    // CHECK-DAG: and.b32 [[LANEID_FIXED_BITS:%.*]], [[TID]], 28;
+    // CHECK-DAG: shl.b32 [[L0_TEMP:%.*]], [[L0_VAL]], 1;
+    // CHECK-DAG: or.b32  [[LANEID_PART_PERM:%.*]], [[L0_TEMP]], [[LANEID_FIXED_BITS]];
+    // CHECK-DAG: bfe.u32 [[L1_TEMP:%.*]], [[TID]], 1, 1;
+    // CHECK-DAG: or.b32  [[LANEID_PERM:%.*]], [[LANEID_PART_PERM]], [[L1_TEMP]];
+
+    // The index shuffles have source lane dependent on the value of the r2 bit.
+    // Half of them use `LANEID_PERM` while the other half use `LANEID_PERM`
+    // with the l0 bit flipped (step 2a).
+
+    // CHECK-DAG: xor.b32     [[LANEID_PERM_F:%.*]], [[LANEID_PERM]], 1;
+
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM_F]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM_F]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM_F]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM_F]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM_F]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM_F]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM_F]], 31, -1;
+    // CHECK-DAG: shfl.sync.idx.b32     {{.*}}, [[LANEID_PERM_F]], 31, -1;
+
+    // Finally, the last set of selects are performed, using the value of l1 as
+    // the predicate (step 3).
+
+    // CHECK-DAG: and.b32           [[L1_VAL:%.*]], [[TID]], 2;
+    // CHECK-DAG: setp.eq.s32       [[L1_OFF:%.*]], [[L1_VAL]], 0;
+    // CHECK-COUNT-16: selp.b32     {{.*}}, {{.*}}, [[L1_OFF]];
+
+    // CHECK-COUNT-64: bfe.u32
+    // CHECK-COUNT-64: st.volatile.global.b8
+
+    %0 = ttg.convert_layout %arg0 : tensor<128x64xf8E5M2, #mma> -> tensor<128x64xf8E5M2, #dot_op>
+    %1 = builtin.unrealized_conversion_cast %0 : tensor<128x64xf8E5M2, #dot_op> to !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>
+    llvm.store volatile %1, %ptr : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>, !llvm.ptr
+
+    tt.return
+  }
+}
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -38,10 +38,6 @@ struct ConvertLayoutOpConversion
       if (shouldUseDistSmem(srcLayout, dstLayout))
         return lowerDistToDistWithDistSmem(op, adaptor, rewriter, targetInfo);
     }
-    if (isa<NvidiaMmaEncodingAttr>(srcLayout) &&
-        isa<DotOperandEncodingAttr>(dstLayout)) {
-      return lowerMmaToDotOperand(op, adaptor, rewriter);
-    }
 
     return failure();
   }
@@ -136,100 +132,6 @@ struct ConvertLayoutOpConversion
     return success();
   }
 
-  // Convert from accumulator MMA layout to 8bit dot operand layout.
-  // The conversion logic is taken from:
-  // https://github.com/ColfaxResearch/cutlass-kernels/blob/a9de6446c1c0415c926025cea284210c799b11f8/src/fmha-pipeline/reg2reg.h#L45
-  void
-  convertMMAV3To8BitsDotOperand(triton::gpu::ConvertLayoutOp op,
-                                OpAdaptor adaptor,
-                                ConversionPatternRewriter &rewriter) const {
-    auto loc = op.getLoc();
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
-    auto dstTy = op.getType();
-    auto vals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
-    SmallVector<Value> retVals;
-    for (int i = 0; i < vals.size(); i += 8) {
-      Value upper = b.undef(vec_ty(i8_ty, 4));
-      for (int j = 0; j < 4; j++) {
-        upper = b.insert_element(vec_ty(i8_ty, 4), upper, vals[i + j],
-                                 b.i32_val(j));
-      }
-      upper = b.bitcast(upper, i32_ty);
-      Value lower = b.undef(vec_ty(i8_ty, 4));
-      for (int j = 0; j < 4; j++) {
-        lower = b.insert_element(vec_ty(i8_ty, 4), lower, vals[i + 4 + j],
-                                 b.i32_val(j));
-      }
-      lower = b.bitcast(lower, i32_ty);
-
-      Value threadIdMod4 = b.urem(getThreadId(rewriter, loc), b.i32_val(4));
-      Value cnd = b.or_(b.icmp_eq(threadIdMod4, b.i32_val(0)),
-                        b.icmp_eq(threadIdMod4, b.i32_val(3)));
-      Value selectorEx0 = b.select(cnd, b.i32_val(0x3210), b.i32_val(0x7654));
-      Value selectorEx1 = b.select(cnd, b.i32_val(0x7654), b.i32_val(0x3210));
-      Value selectorEx4 = b.select(cnd, b.i32_val(0x5410), b.i32_val(0x1054));
-      Value selectorEx5 = b.select(cnd, b.i32_val(0x7632), b.i32_val(0x3276));
-
-      Value isOne = b.icmp_eq(threadIdMod4, b.i32_val(1));
-      Value isTwo = b.icmp_eq(threadIdMod4, b.i32_val(2));
-      Value isThree = b.icmp_eq(threadIdMod4, b.i32_val(3));
-      Value upperIdx = b.i32_val(0);
-      upperIdx = b.select(isOne, b.i32_val(3), upperIdx);
-      upperIdx = b.select(isTwo, b.i32_val(1), upperIdx);
-      upperIdx = b.select(isThree, b.i32_val(2), upperIdx);
-
-      Value lowerIdx = b.i32_val(1);
-      lowerIdx = b.select(isOne, b.i32_val(2), lowerIdx);
-      lowerIdx = b.select(isTwo, b.i32_val(0), lowerIdx);
-      lowerIdx = b.select(isThree, b.i32_val(3), lowerIdx);
-
-      Value upper0 =
-          LLVM::NVIDIA::permute(loc, rewriter, upper, lower, selectorEx0);
-      Value lower0 =
-          LLVM::NVIDIA::permute(loc, rewriter, upper, lower, selectorEx1);
-      Value mask = b.i32_val(0xFFFFFFFF);
-      // Set clamp tp shuffle only within 4 lanes.
-      Value clamp = b.i32_val(0x1C1F);
-      upper0 =
-          rewriter.create<NVVM::ShflOp>(loc, i32_ty, mask, upper0, upperIdx,
-                                        clamp, NVVM::ShflKind::idx, UnitAttr());
-      lower0 =
-          rewriter.create<NVVM::ShflOp>(loc, i32_ty, mask, lower0, lowerIdx,
-                                        clamp, NVVM::ShflKind::idx, UnitAttr());
-      Value upper1 =
-          LLVM::NVIDIA::permute(loc, rewriter, upper0, lower0, selectorEx4);
-      Value vecVal = b.bitcast(upper1, vec_ty(i8_ty, 4));
-      for (int i = 0; i < 4; i++) {
-        retVals.push_back(b.extract_element(i8_ty, vecVal, b.i32_val(i)));
-      }
-      Value lower1 =
-          LLVM::NVIDIA::permute(loc, rewriter, upper0, lower0, selectorEx5);
-      vecVal = b.bitcast(lower1, vec_ty(i8_ty, 4));
-      for (int i = 0; i < 4; i++) {
-        retVals.push_back(b.extract_element(i8_ty, vecVal, b.i32_val(i)));
-      }
-    }
-    Value result =
-        packLLElements(loc, getTypeConverter(), retVals, rewriter, dstTy);
-    rewriter.replaceOp(op, result);
-  }
-
-  // mma -> dot_operand
-  LogicalResult
-  lowerMmaToDotOperand(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor,
-                       ConversionPatternRewriter &rewriter) const {
-    auto loc = op.getLoc();
-    auto srcTy = op.getSrc().getType();
-    auto dstTy = op.getType();
-    if (matchMmaV3AndDotOperandLayout(srcTy, dstTy)) {
-      assert(srcTy.getElementType().getIntOrFloatBitWidth() == 8 &&
-             "Unsupported type size.");
-      convertMMAV3To8BitsDotOperand(op, adaptor, rewriter);
-      return success();
-    }
-    return failure();
-  }
-
 private:
   const NVIDIA::TargetInfo &targetInfo;
 };