
Commit c44a95b

Merge commit '9e900890fcda9c017cbc731768de4c9e1044f017'
2 parents: 3ffaa2f + 9e90089

37 files changed: 486 additions & 940 deletions


bin/RegisterTritonDialects.h

Lines changed: 0 additions & 2 deletions
@@ -88,8 +88,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUStreamPipeline();
   mlir::registerTritonAMDGPUStreamPipelineV2();
   mlir::registerTritonAMDGPUCanonicalizePointers();
-  mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
-  mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();
 
   // TODO: register Triton & TritonGPU passes
   registry.insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 4 additions & 22 deletions
@@ -27,33 +27,15 @@ constexpr int patternBenefitPrioritizeOverLLVMConversions = 10;
 constexpr int patternBenefitClampOptimizedPattern = 20;
 constexpr int patternBenefitConvertLayoutOptimizedPattern = 20;
 
-struct BackendCallbacks {
-  /**
-   * A backend-specific callback for appending auxiliary data during
-   * `LocalStoreOp` conversion.
-   *
-   * @param[in] op The reference to the re-written `LocalStoreOp`.
-   * @param[in] count The number of issued LLVM instructions.
-   * @param[in] type The input type of issued LLVM instructions.
-   */
-  std::function<void(triton::gpu::LocalStoreOp op, size_t llvmOpCount,
-                     Type llvmOpType)>
-      localStoreOpConversion = nullptr;
-};
-
 void populateElementwiseOpToLLVMPatterns(
     LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
     ModuleAxisInfoAnalysis &axisInfoAnalysis, const TargetInfoBase &targetInfo,
    PatternBenefit benefit);
 
-// The given callback is invoked at the end of a successful rewrite. The
-// callback receives 1) the current source op, 2) the number of issued LLVM
-// instructions and 3) their input types. Each MLIR backend can provide a
-// callback and, thus, handle backend-specific behaviors.
-void populateMemoryOpToLLVMPattern(
-    LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo,
-    RewritePatternSet &patterns, PatternBenefit benefit,
-    std::optional<BackendCallbacks> backendCallbacks = std::nullopt);
+void populateMemoryOpToLLVMPattern(LLVMTypeConverter &typeConverter,
+                                   const TargetInfoBase &targetInfo,
+                                   RewritePatternSet &patterns,
+                                   PatternBenefit benefit);
 
 void populateAssertOpToLLVMPattern(LLVMTypeConverter &typeConverter,
                                    RewritePatternSet &patterns,
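
Note: with BackendCallbacks gone, backends register the memory-op patterns without a callback argument. A minimal sketch of a call site against the new declaration; the wrapper name is hypothetical, and the converter, target info, and pattern set are assumed to exist in the caller:

#include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"

// Hypothetical helper: the std::optional<BackendCallbacks> parameter no
// longer exists, so the call takes only converter, target info, patterns,
// and benefit.
static void addMemoryOpLowerings(mlir::LLVMTypeConverter &typeConverter,
                                 const mlir::triton::TargetInfoBase &targetInfo,
                                 mlir::RewritePatternSet &patterns) {
  mlir::triton::populateMemoryOpToLLVMPattern(typeConverter, targetInfo,
                                              patterns,
                                              mlir::PatternBenefit(1));
}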

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 5 additions & 5 deletions
@@ -1366,11 +1366,11 @@ SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,
                                            Location loc, RewriterBase &rewriter,
                                            const TargetInfoBase &target);
 
-void storeDistributedToShared(
-    MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy,
-    ArrayRef<Value> srcVals, Value smemBase, ArrayRef<Value> dstStrides,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    std::pair<size_t, Type> *const llvmOpCount = nullptr);
+void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
+                              Type elemLlvmTy, ArrayRef<Value> srcVals,
+                              Value smemBase, ArrayRef<Value> dstStrides,
+                              Location loc, RewriterBase &rewriter,
+                              const TargetInfoBase &target);
 
 inline Value getStructFromSharedMemoryObject(Location loc,
                                              const SharedMemoryObject &smemObj,

lib/Analysis/Allocation.cpp

Lines changed: 6 additions & 1 deletion
@@ -115,7 +115,12 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
 
   assert(!isMfmaToDotShortcut(srcTy, dstTy));
 
-  auto [inOrd, outOrd] = getCvtOrder(srcLayout, dstLayout);
+  // FIXME This is NOT entirely correct
+  // This should be getElemOrder, but we don't have such a method
+  // TODO Implement getElemOrder and make sure it's consistent with
+  // getContigPerThread
+  auto inOrd = gpu::getThreadOrder(srcLayout);
+  auto outOrd = gpu::getThreadOrder(dstLayout);
   scratchConfig.order = outOrd;
 
   unsigned srcContigPerThread =

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 32 additions & 0 deletions
@@ -404,6 +404,22 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       }
       return true;
     }
+    if (auto dotOperand = dyn_cast<DotOperandEncodingAttr>(layout)) {
+      if (auto nvidiaMma =
+              dyn_cast<NvidiaMmaEncodingAttr>(dotOperand.getParent())) {
+        if (product(getCTAsPerCGA(nvidiaMma)) > 1) {
+          return false;
+        }
+        if (useLegacyMMAConversion) {
+          return false;
+        }
+        // FIXME [Dot LL]
+        // Enabling LL path for buggy kWidth path
+        bool largeKWidth =
+            dotOperand.getKWidth() * dstTy.getElementTypeBitWidth() > 64;
+        return largeKWidth && nvidiaMma.isAmpere();
+      }
+    }
     if (isa<BlockedEncodingAttr>(layout)) {
       return true;
     }
@@ -460,6 +476,22 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       }
     }
 
+    // FIXME [Dot LL]
+    // We know it's just for largeKWidth case in Ampere
+    // In this case, we need to pack the outputs into i32
+    if (isa<DotOperandEncodingAttr>(dstTy.getEncoding())) {
+      auto concat = [&](Value a, Value b) {
+        return or_(zext(i32_ty, bitcast(a, i16_ty)),
+                   shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16)));
+      };
+
+      SmallVector<Value> outVals32(outVals.size() / 2);
+      for (int i = 0; i < outVals32.size(); ++i) {
+        outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]);
+      }
+      outVals = outVals32;
+    }
+
     Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter,
                                   op.getType());
     rewriter.replaceOp(op, result);
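
Note: the concat lambda above packs adjacent pairs of 16-bit values into single i32 registers, with the even-indexed element in the low half-word. A self-contained sketch of the same bit arithmetic on plain integers (illustrative only, not the MLIR builder code):

#include <cstdint>
#include <vector>

// Mirror of the concat lambda: or_(zext(a), shl(zext(b), 16)).
// The first value lands in bits [0, 16), the second in bits [16, 32).
static uint32_t concat16(uint16_t lo, uint16_t hi) {
  return static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16);
}

// Pack vals[2*i] and vals[2*i + 1] into packed[i], halving the element count,
// just as the rewrite halves outVals before packLLElements.
static std::vector<uint32_t> packPairs(const std::vector<uint16_t> &vals) {
  std::vector<uint32_t> packed(vals.size() / 2);
  for (size_t i = 0; i < packed.size(); ++i)
    packed[i] = concat16(vals[2 * i], vals[2 * i + 1]);
  return packed;
}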

lib/Conversion/TritonGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 10 additions & 0 deletions
@@ -90,6 +90,16 @@ void decomposeBlockedToDotLayoutConversion(ModuleOp module) {
     auto dstDotOp =
         dyn_cast<triton::gpu::DotOperandEncodingAttr>(dstType.getEncoding());
     if (srcBlocked && dstDotOp) {
+      // FIXME [Dot LL]
+      // We support this one via LLs, as the LocalLoad path is buggy
+      if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dstDotOp.getParent())) {
+        bool largeKWidth =
+            dstDotOp.getKWidth() * dstType.getElementTypeBitWidth() > 64;
+        if (mma.isAmpere() && largeKWidth) {
+          return;
+        }
+      }
+
       Attribute sharedMemorySpace =
           triton::gpu::SharedMemorySpaceAttr::get(srcType.getContext());
       auto tmpType = MemDescType::get(
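
Note: this early return and the check added in ConvertLayoutOpToLLVM.cpp gate on the same condition, kWidth times the element bit width exceeding 64. A tiny worked version of that predicate (the helper name is ours, not from the diff):

// "Large kWidth": one thread's K-slice is wider than 64 bits.
static bool isLargeKWidth(unsigned kWidth, unsigned elemBitWidth) {
  return kWidth * elemBitWidth > 64;
}
// e.g. kWidth = 8 with f16 elements: 8 * 16 = 128 > 64 -> kept on the LL path
//      kWidth = 4 with f16 elements: 4 * 16 = 64       -> decomposed via shared memory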

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 11 additions & 25 deletions
@@ -15,11 +15,12 @@ using namespace mlir::triton::gpu;
 // blocked -> shared.
 // Swizzling in shared memory to avoid bank conflict. Normally used for
 // A/B operands of dots.
-void lowerDistributedToShared(
-    Location loc, Value src, Value dst, Value adaptorSrc,
-    const SharedMemoryObject &smemObj, const LLVMTypeConverter *typeConverter,
-    ConversionPatternRewriter &rewriter, const TargetInfoBase &targetInfo,
-    std::pair<size_t, Type> *const llvmOpCount = nullptr) {
+void lowerDistributedToShared(Location loc, Value src, Value dst,
+                              Value adaptorSrc,
+                              const SharedMemoryObject &smemObj,
+                              const LLVMTypeConverter *typeConverter,
+                              ConversionPatternRewriter &rewriter,
+                              const TargetInfoBase &targetInfo) {
   auto srcTy = cast<RankedTensorType>(src.getType());
   auto dstTy = cast<MemDescType>(dst.getType());
   auto outOrd = mlir::cast<SharedEncodingAttr>(dstTy.getEncoding()).getOrder();
@@ -32,7 +33,7 @@ void lowerDistributedToShared(
   auto dstStrides = smemObj.getStrides();
   auto inVals = unpackLLElements(loc, adaptorSrc, rewriter);
   storeDistributedToShared(dstTy, srcTy, elemTy, inVals, smemBase, dstStrides,
-                           loc, rewriter, targetInfo, llvmOpCount);
+                           loc, rewriter, targetInfo);
 }
 
 struct LocalAllocOpConversion
@@ -184,15 +185,12 @@ struct LocalStoreOpConversion
 public:
   using ConvertOpToLLVMPattern<
       triton::gpu::LocalStoreOp>::ConvertOpToLLVMPattern;
-  using BackendCallbackType =
-      decltype(BackendCallbacks::localStoreOpConversion);
 
   LocalStoreOpConversion(const LLVMTypeConverter &converter,
                          const TargetInfoBase &targetInfo,
-                         BackendCallbackType backendCallback,
                          PatternBenefit benefit = 1)
       : ConvertOpToLLVMPattern<triton::gpu::LocalStoreOp>(converter, benefit),
-        targetInfo(targetInfo), backendCallback(backendCallback) {}
+        targetInfo(targetInfo) {}
 
   LogicalResult
   matchAndRewrite(triton::gpu::LocalStoreOp op, OpAdaptor adaptor,
@@ -202,36 +200,24 @@ struct LocalStoreOpConversion
         getTypeConverter()->convertType(op.getDst().getType().getElementType());
     auto smemObj = LLVM::getSharedMemoryObjectFromStruct(
         op.getLoc(), adaptor.getDst(), llvmElemTy, rewriter);
-
-    std::pair<size_t, Type> llvmOpCount;
     lowerDistributedToShared(op.getLoc(), op.getSrc(), op.getDst(),
                              adaptor.getSrc(), smemObj, getTypeConverter(),
-                             rewriter, targetInfo, &llvmOpCount);
-
-    if (backendCallback)
-      (backendCallback)(op, llvmOpCount.first, llvmOpCount.second);
-
+                             rewriter, targetInfo);
     rewriter.eraseOp(op);
     return success();
   }
 
 private:
   const TargetInfoBase &targetInfo;
-  BackendCallbackType backendCallback;
 };
 
 } // namespace
 
 void mlir::triton::populateMemoryOpToLLVMPattern(
     LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo,
-    RewritePatternSet &patterns, PatternBenefit benefit,
-    std::optional<BackendCallbacks> backendCallbacks) {
+    RewritePatternSet &patterns, PatternBenefit benefit) {
   patterns.add<LocalAllocOpConversion>(typeConverter, targetInfo, benefit);
   patterns.add<LocalDeallocOpConversion>(typeConverter, benefit);
   patterns.add<LocalLoadOpConversion>(typeConverter, targetInfo, benefit);
-
-  auto backendCall =
-      backendCallbacks ? backendCallbacks->localStoreOpConversion : nullptr;
-  patterns.add<LocalStoreOpConversion>(typeConverter, targetInfo, backendCall,
-                                       benefit);
+  patterns.add<LocalStoreOpConversion>(typeConverter, targetInfo, benefit);
 }

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 1 addition & 7 deletions
@@ -404,8 +404,7 @@ void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
                               Type elemLlvmTy, ArrayRef<Value> srcVals,
                               Value smemBase, ArrayRef<Value> dstStrides,
                               Location loc, RewriterBase &rewriter,
-                              const TargetInfoBase &target,
-                              std::pair<size_t, Type> *const llvmOpCount) {
+                              const TargetInfoBase &target) {
   bool success = emitTransferBetweenRegistersAndShared(
       srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemBase,
       dstStrides, loc, rewriter, target, [&](VectorType vecTy, Value vecAddr) {
@@ -419,12 +418,7 @@ void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
         store(vec, vecAddr)
             .setAlignment(vecTy.getNumElements() *
                           elemLlvmTy.getIntOrFloatBitWidth() / 8);
-        if (llvmOpCount) {
-          ++(llvmOpCount->first);
-          llvmOpCount->second = vecTy;
-        }
       });
-
   if (!success)
     llvm::report_fatal_error("Failed to emit transfer from register to shared");
 }
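
Note: the alignment set on the vectorized store is simply the vector width in bytes, lanes times element bit width divided by 8. A small worked example of that arithmetic (values are illustrative):

// Mirrors vecTy.getNumElements() * elemLlvmTy.getIntOrFloatBitWidth() / 8.
static unsigned storeAlignmentBytes(unsigned numElements, unsigned elemBits) {
  return numElements * elemBits / 8;
}
// e.g. a 4 x f16 vector: 4 * 16 / 8 = 8-byte aligned store
//      an 8 x f32 vector: 8 * 32 / 8 = 32-byte aligned store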

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 34 additions & 30 deletions
@@ -11,6 +11,7 @@
 #include "mlir/Support/LLVM.h"
 #include "triton/Analysis/Utility.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
+#include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
@@ -237,8 +238,31 @@ static SmallVector<unsigned> eraseOrder(ArrayRef<unsigned> order,
   return resOrder;
 }
 
+SmallVector<unsigned> getOrderForDotOperand(unsigned opIdx, unsigned rank,
+                                            bool kMajor) {
+  // kMajor: if true, the matrix is fastest-running on k,
+  // otherwise it is on m (resp. n)
+  // opIdx=0: [batch, m, k] if rank == 3 else [m, k]
+  // opIdx=1: [batch, k, n] if rank == 3 else [k, n]
+  // batch (if rank == 3) is always the slowest running dimension
+  assert(rank == 2 || rank == 3);
+  assert(opIdx == 0 || opIdx == 1);
+  SmallVector<unsigned> order(rank);
+  std::iota(order.rbegin(), order.rend(), 0);
+  // If opIdx is 1 and kMajor is true, the order is [0, 1]
+  // (resp. [1, 2, 0] if rank == 3)
+  // Same if opIdx is 0 and kMajor is false
+  if (bool(opIdx) == kMajor) {
+    std::swap(order[0], order[1]);
+  }
+  return order;
+}
+
 SmallVector<unsigned> getWarpOrder(Attribute layout) {
   auto order = getOrder(layout);
+  // FIXME: This mmaLayout if should just return
+  // getOrderForDotOperand(0, order.size(), kMajor=false)
+  // as mma has the same order as DotOperand(opIdx=0)
   if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(layout)) {
     if (mmaLayout.isHopper()) {
       // Hopper MMA instructions force a warp order of [0, 1]. See docs:
@@ -247,30 +271,9 @@ SmallVector<unsigned> getWarpOrder(Attribute layout) {
       order.erase(it);
       order.insert(order.begin(), 0);
     }
-  }
-  return order;
-}
-
-SmallVector<unsigned> getOrderForDotOperand(unsigned opIdx, unsigned rank) {
-  SmallVector<unsigned> order(rank);
-  // The 'order' field typically represents a descending sorted array of
-  // dimensions based on contiguity. For instance, in axisInfo utilities that
-  // retrieve tensor contiguity, it's assumed that the dimension with the
-  // highest contiguity corresponds to order[0].
-  //
-  // The relation between contiguity and order is only relevant if the layout
-  // interfaces with HBM, as is the case when we load tensor from HBM to
-  // registers in the dot layout to bypass LDS. When bypassing LDS, we make the
-  // following assumptions about tensor layouts:
-  // - Tensor A (opIdx == 0) is considered to be row-major.
-  // - Tensor B (opIdx == 1) is considered to be column-major.
-  //
-  // Based on these assumptions, we define the following orders:
-  // - For opIdx == 0, we assume an order of [1, 0].
-  // - For opIdx == 1, we assume an order of [0, 1].
-  std::iota(order.rbegin(), order.rend(), 0);
-  if (opIdx == 1) {
-    std::swap(order[0], order[1]);
+  } else if (auto dotOpLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
+    order = getOrderForDotOperand(dotOpLayout.getOpIdx(), order.size(),
+                                  /*kMajor*/ false);
   }
   return order;
 }
@@ -287,13 +290,12 @@ SmallVector<unsigned> getOrder(Attribute layout) {
     return order;
   }
   if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
-    auto rank = getWarpsPerCTA(dotLayout.getParent()).size();
-    SmallVector<unsigned> order(rank);
+    auto rank = dotLayout.getWarpsPerCTA().size();
     if (isa<AMDMfmaEncodingAttr>(dotLayout.getParent())) {
-      return getOrderForDotOperand(dotLayout.getOpIdx(), rank);
-    } else {
-      std::iota(order.rbegin(), order.rend(), 0);
+      return getOrderForDotOperand(dotLayout.getOpIdx(), rank, /*kMajor*/ true);
     }
+    SmallVector<unsigned> order(rank);
+    std::iota(order.rbegin(), order.rend(), 0);
     return order;
   }
   if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout)) {
@@ -1059,7 +1061,8 @@ SmallVector<unsigned> DotOperandEncodingAttr::getWarpOrder() const {
   return ::getWarpOrder(*this);
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getThreadOrder() const {
-  return ::getOrder(*this);
+  return getOrderForDotOperand(getOpIdx(), getWarpsPerCTA().size(),
+                               /*kMajor*/ true);
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getShapePerCTATile(
     ArrayRef<int64_t> tensorShape) const {
@@ -2042,6 +2045,7 @@ SmallVector<int64_t> NvidiaMmaEncodingAttr::getMMAv2Rep(ArrayRef<int64_t> shape,
                                                         int opIdx) const {
   auto rank = shape.size();
   auto warpsPerCTA = getWarpsPerCTA();
+
   SmallVector<int> shapePerWarp = {1, 16, 8, 4 * 64 / bitwidth};
   int numRepBatch =
       rank == 3
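
Note: the new getOrderForDotOperand is small enough to check by hand. A standalone sketch with the cases spelled out (plain C++ outside MLIR, using std::vector in place of SmallVector):

#include <cassert>
#include <numeric>
#include <vector>

// Same logic as getOrderForDotOperand: start from the row-major order
// [rank-1, ..., 1, 0] and swap the two fastest dims when bool(opIdx) == kMajor.
std::vector<unsigned> orderForDotOperand(unsigned opIdx, unsigned rank,
                                         bool kMajor) {
  assert(rank == 2 || rank == 3);
  assert(opIdx == 0 || opIdx == 1);
  std::vector<unsigned> order(rank);
  std::iota(order.rbegin(), order.rend(), 0);
  if (bool(opIdx) == kMajor)
    std::swap(order[0], order[1]);
  return order;
}

// Spelled-out cases for rank == 2:
//   opIdx=0 ([m, k]), kMajor=true  -> {1, 0}  (k fastest)
//   opIdx=0,          kMajor=false -> {0, 1}  (m fastest)
//   opIdx=1 ([k, n]), kMajor=true  -> {0, 1}  (k fastest)
//   opIdx=1,          kMajor=false -> {1, 0}  (n fastest)
// For rank == 3 the batch dim stays slowest, e.g. opIdx=1, kMajor=true -> {1, 2, 0}.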

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 9 additions & 2 deletions
@@ -827,8 +827,15 @@ DotOperandEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
 
   if (auto mfmaLayout = llvm::dyn_cast<AMDMfmaEncodingAttr>(getParent())) {
     return dotOperandMfmaToLinearLayout(*this, shape);
-  }
-  if (auto dpasLayout = llvm::dyn_cast<intel::DpasEncodingAttr>(getParent())) {
+  } else if (auto mma = mlir::dyn_cast<NvidiaMmaEncodingAttr>(getParent())) {
+    // FIXME [Dot LL]
+    // Do this unconditionally
+    auto largeKWidth = getKWidth() == 8;
+    if (mma.isAmpere() && largeKWidth) {
+      return ampereDotToLinearLayout(shape, *this);
+    }
+  } else if (auto dpasLayout =
+                 llvm::dyn_cast<intel::DpasEncodingAttr>(getParent())) {
     return dotOperandDpasToLinearLayout(*this, shape);
   }
 