Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
fe57b25
[AxisInfo] Make unrealized_conversion_cast handling more robust (#8507)
matthias-springer Oct 22, 2025
56c6468
[GLUON] add device-side TMA (#8505)
hgl71964 Oct 22, 2025
bad2576
[AMD] Fix branch condition in BufferLoadToLocalOpConversion (#8501)
kelesvol Oct 23, 2025
ecd33fe
[BACKEND] Improve constant analysis in AxisInfo (#8502)
lezcano Oct 23, 2025
0257c4c
[Tests] Using device fixure instead of cuda in tensor descriptor test…
red1bluelost Oct 23, 2025
c07886c
[AMD] Update shared memory size for gfx1250 from TargetInfo (#8517)
AlexAUT Oct 23, 2025
4d6ce4e
[AMD] Fix wmma scaled with small k dim on gfx1250 (#8487)
borontion Oct 23, 2025
3a832d6
[BACKEND] Fix memdesc of pointers (#8515)
ThomasRaoux Oct 23, 2025
1c72fb6
[NFC] Remove legacy TODO (#8520)
Jokeren Oct 23, 2025
00cf53f
[BENCH] Incorporate EP sharding and deprecate the legacy communicatio…
Jokeren Oct 24, 2025
a2fdd73
[AMD][BACKEND] Support of ttg.async_wait on gfx1250 (#8510)
AlexAUT Oct 24, 2025
314a622
[AMD] Fix deduceTilesPerWarp boundary cases (#8467)
Dewei-Wang-sh Oct 24, 2025
39eec89
[AMD] Lower `ttg.async_copy_global_to_local` on gfx1250 (#8509)
AlexAUT Oct 24, 2025
3f4ac9f
[KERNELS][NFC] Remove the redundant `reduce.py` file (#8524)
Jokeren Oct 24, 2025
4734af3
Fix AxisInfo handling of PoisonOp producing MemDesc (#8489)
neildhar Oct 24, 2025
4d85824
[NVIDIA] Enable TMA gather4 on sm_120 and sm_121 (#8498)
ita9naiwa Oct 24, 2025
7c59c1d
[AMD][GLUON] Expose get wmma/mfma scale layout (#8496)
borontion Oct 24, 2025
7bdcc6b
[triton_kernels][opt_flags] Add function to reset opt_flags (#8453)
matkle Oct 24, 2025
cbab5f4
[Gluon] Change `gl.warp_specialize` API (#8527)
Mogball Oct 24, 2025
869733f
[AMD] NFC: rename Gluon example directory (#8530)
antiagainst Oct 24, 2025
d703656
[GLUON] Set proper location on restoring the insert point in gluon (#…
pawelszczerbuk Oct 24, 2025
a6e7434
[SWP] Dedup the code that checks if LoadOp can be converted to cpasyn…
masahi Oct 24, 2025
11af53c
[AMD][GLUON] Expose buffer ops to gfx1250 (#8532)
borontion Oct 25, 2025
7578e3e
[mxfp] support EXPT_IS_INNER for MX (#8385)
jongsoo-openai Oct 25, 2025
40dd0c4
[Frontend] Make sure aggregate members are added to the cache key (#8…
Mogball Oct 25, 2025
4caa032
[mxfp4] disable swapping block{k,n} for bf16 x mx4 (#8538)
jongsoo-openai Oct 25, 2025
b900855
Remove constexprs from params in `jit.py` (#8536)
ita9naiwa Oct 26, 2025
e8bc90c
[frontend] Disable cache when interpreter is enabled (#8499)
rpelke Oct 26, 2025
bf9fea9
[FRONTEND] support multidimensional batches in tl.trans and tl.dot (#…
apgoucher Oct 26, 2025
50d10bd
[triton_kernels] revert a100 default layout change (#8549)
ptillet Oct 27, 2025
0a32cff
[BACKEND] Reset alloc_shape when doing memdesc_index (#8537)
ThomasRaoux Oct 27, 2025
9f21c06
[Docs] Update gl.warp_specialize docs + use from_tensor in persistent…
peterbell10 Oct 27, 2025
bdb9a7a
Merge commit '9f21c06d55b5c2eccd872d92e9335c4eb13969c5'
whitneywhtsang Oct 29, 2025
31c76e4
[TEST] Skip test_tensor_descriptor_padding
whitneywhtsang Oct 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/triton/Dialect/Triton/IR/Utility.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef TRITON_IR_UTILITY_H_
#define TRITON_IR_UTILITY_H_

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include <algorithm>
#include <numeric>
Expand All @@ -10,6 +12,14 @@ namespace mlir {
// Bitwidth of pointers
constexpr int kPtrBitWidth = 64;

// Returns the bit width of a type. Pointer-like types (LLVM dialect pointers
// and Triton pointers) do not report an int/float width, so they are mapped
// to the canonical 64-bit pointer width (kPtrBitWidth).
inline int getIntOrFloatOrPtrBitWidth(Type type) {
  const bool isPointerLike =
      isa<LLVM::LLVMPointerType, triton::PointerType>(type);
  return isPointerLike ? kPtrBitWidth : type.getIntOrFloatBitWidth();
}

template <typename T, typename U> SmallVector<T> convertType(ArrayRef<U> in) {
SmallVector<T> out;
for (const auto &i : in)
Expand Down
7 changes: 3 additions & 4 deletions include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,9 @@ LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
ArrayRef<unsigned> tilesPerWarp,
ArrayRef<unsigned> warpsPerCTA);

LinearLayout chooseScaledWmmaScaleLayout(
MLIRContext *ctx, int dotOperandIdx,
const std::vector<std::vector<int32_t>> &dotOperandWarpBasis,
ArrayRef<int64_t> dotOperandShape);
LinearLayout chooseScaledWmmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
ArrayRef<unsigned> warpsPerCTA,
ArrayRef<int64_t> dotOperandShape);

LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx,
ArrayRef<int64_t> shape, int opIdx,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,13 @@ getLastUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,

// Clean up attributes passing over schedules across stages in pipelining
void removePipeliningAttributes(ModuleOp moduleOp);

// For LoadOp, DescriptorLoad, and DescriptorGather ops, determine if
// they should be pipelined.
bool isPipeliningBeneficial(Operation *op,
triton::ModuleAxisInfoAnalysis &axisInfoAnalysis,
bool filterSmall = true);

} // namespace triton
} // namespace mlir

Expand Down
3 changes: 2 additions & 1 deletion lib/Analysis/Allocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ class AllocationAnalysis {
auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType);
numElems = product<int64_t>(shapePerCTA);
}
int64_t bytes = numElems * allocType.getElementTypeBitWidth() / 8;
int64_t bytes =
numElems * getIntOrFloatOrPtrBitWidth(allocType.getElementType()) / 8;

auto alignment = alloc.getAlignmentOrDefault();
allocation->addBuffer<BufferT::BufferKind::Explicit>(alloc, bytes,
Expand Down
135 changes: 76 additions & 59 deletions lib/Analysis/AxisInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,23 +91,26 @@ class BinaryOpVisitorImpl : public AxisInfoVisitorImpl<OpTy> {
auto lhsInfo = operands[0]->getValue();
auto rhsInfo = operands[1]->getValue();
auto rank = lhsInfo.getRank();
assert(isa<RankedTensorType>(op.getType()) ||
rank == 1 && "Expected ranked tensor or scalar");
assert(operands.size() == 2 && "Expected two operands");
auto constantValue = getConstantValue(op, lhsInfo, rhsInfo);
if (constantValue.has_value()) {
auto resTy = dyn_cast<RankedTensorType>(op.getType());
AxisInfo::DimVectorT constancy =
resTy ? to_vector(resTy.getShape()) : AxisInfo::DimVectorT(rank, 1);
AxisInfo::DimVectorT contiguity(rank, 1);
AxisInfo::DimVectorT divisibility(
rank, highestPowOf2Divisor<int64_t>(constantValue.value()));
return AxisInfo(contiguity, divisibility, constancy, constantValue);
}
AxisInfo::DimVectorT contiguity;
AxisInfo::DimVectorT divisibility;
AxisInfo::DimVectorT constancy;
auto constantValue = getConstantValue(op, lhsInfo, rhsInfo);
for (auto d = 0; d < rank; ++d) {
if (constantValue.has_value()) {
contiguity.push_back(1);
constancy.push_back(
std::max(lhsInfo.getConstancy(d), rhsInfo.getConstancy(d)));
divisibility.push_back(
highestPowOf2Divisor<int64_t>(constantValue.value()));
} else {
contiguity.push_back(getContiguity(op, lhsInfo, rhsInfo, d));
constancy.push_back(getConstancy(op, lhsInfo, rhsInfo, d));
divisibility.push_back(getDivisibility(op, lhsInfo, rhsInfo, d));
}
contiguity.push_back(getContiguity(op, lhsInfo, rhsInfo, d));
constancy.push_back(getConstancy(op, lhsInfo, rhsInfo, d));
divisibility.push_back(getDivisibility(op, lhsInfo, rhsInfo, d));
}
return AxisInfo(contiguity, divisibility, constancy, constantValue);
}
Expand All @@ -125,9 +128,8 @@ class BinaryOpVisitorImpl : public AxisInfoVisitorImpl<OpTy> {

virtual int64_t getConstancy(OpTy op, const AxisInfo &lhs,
const AxisInfo &rhs, int dim) {
return 1;
return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim));
}

virtual std::optional<int64_t> getConstantValue(OpTy op, const AxisInfo &lhs,
const AxisInfo &rhs) {
return {};
Expand Down Expand Up @@ -192,6 +194,26 @@ class CastOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
}
};

class UnrealizedConversionCastOpAxisInfoVisitor final
    : public AxisInfoVisitorImpl<mlir::UnrealizedConversionCastOp> {
public:
  using AxisInfoVisitorImpl<
      mlir::UnrealizedConversionCastOp>::AxisInfoVisitorImpl;

  // Forwards the first operand's AxisInfo through the cast. When the first
  // result is a ranked tensor whose rank disagrees with the operand's
  // AxisInfo, forwarding would hand later visitors a mismatched rank (and
  // can crash them), so the pessimistic lattice state is returned instead.
  AxisInfo
  getAxisInfo(mlir::UnrealizedConversionCastOp op,
              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
    const AxisInfo &srcInfo = operands[0]->getValue();
    if (auto resultTy = dyn_cast<RankedTensorType>(op.getResultTypes()[0]))
      if (resultTy.getRank() != srcInfo.getRank())
        return AxisInfo::getPessimisticValueState(op->getResult(0));
    return srcInfo;
  }
};

class MakeRangeOpAxisInfoVisitor final
: public AxisInfoVisitorImpl<triton::MakeRangeOp> {
public:
Expand Down Expand Up @@ -254,7 +276,7 @@ class PoisonOpAxisInfoVisitor final : public AxisInfoVisitorImpl<ub::PoisonOp> {
getAxisInfo(ub::PoisonOp op,
ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
unsigned rank = 1;
if (auto shape = dyn_cast<mlir::ShapedType>(op.getType()))
if (auto shape = dyn_cast<RankedTensorType>(op.getType()))
rank = shape.getRank();

// Poison values are never accessed, thus assume optimistic values.
Expand Down Expand Up @@ -308,11 +330,6 @@ class AddSubOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
return gcd(lhs.getDivisibility(dim), rhsDivisibility);
}

int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
int dim) override {
return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim));
}

std::optional<int64_t> getConstantValue(OpTy op, const AxisInfo &lhs,
const AxisInfo &rhs) override {
if (lhs.getConstantValue().has_value() &&
Expand Down Expand Up @@ -355,11 +372,6 @@ class MulIOpAxisInfoVisitor final : public BinaryOpVisitorImpl<arith::MulIOp> {
return std::max(lhsContiguity, rhsContiguity);
}

int64_t getConstancy(arith::MulIOp op, const AxisInfo &lhs,
const AxisInfo &rhs, int dim) override {
return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim));
}

int64_t getDivisibility(arith::MulIOp op, const AxisInfo &lhs,
const AxisInfo &rhs, int dim) override {
auto lhsDivisibility = lhs.getDivisibility(dim);
Expand All @@ -379,9 +391,13 @@ class MulIOpAxisInfoVisitor final : public BinaryOpVisitorImpl<arith::MulIOp> {

std::optional<int64_t> getConstantValue(arith::MulIOp op, const AxisInfo &lhs,
const AxisInfo &rhs) override {
if (lhs.getConstantValue().has_value() &&
rhs.getConstantValue().has_value())
return {lhs.getConstantValue().value() * rhs.getConstantValue().value()};
auto lhsConst = lhs.getConstantValue();
auto rhsConst = rhs.getConstantValue();
if (lhsConst.has_value() && rhsConst.has_value())
return {lhsConst.value() * rhsConst.value()};
if ((lhsConst.has_value() && lhsConst.value() == 0) ||
(rhsConst.has_value() && rhsConst.value() == 0))
return 0;
return {};
}
};
Expand All @@ -404,12 +420,11 @@ class DivOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
int dim) override {
auto resTy = dyn_cast<RankedTensorType>(op.getType());
auto constancy = BinaryOpVisitorImpl<OpTy>::getConstancy(op, lhs, rhs, dim);
if (!resTy)
return BinaryOpVisitorImpl<OpTy>::getConstancy(op, lhs, rhs, dim);
return constancy;
auto shape = resTy.getShape();
// Case 1: both lhs and rhs are constants.
auto constancy = gcd(lhs.getConstancy(dim), rhs.getConstancy(dim));
// Case 2: lhs contiguous, rhs constant.
// Case: lhs contiguous, rhs constant.
// lhs: d_lhs * k, d_lhs * k + 1, ..., d_lhs * k + n
// rhs: d_rhs * p, d_rhs * p, ..., d_rhs * p
// lhs / rhs = d_lhs * k / (d_rhs * p), (d_lhs * k + 1) / (d_rhs * p),
Expand Down Expand Up @@ -506,15 +521,15 @@ class RemOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {

int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
int dim) override {
auto constancy = BinaryOpVisitorImpl<OpTy>::getConstancy(op, lhs, rhs, dim);
auto resTy = dyn_cast<RankedTensorType>(op.getType());
if (!resTy)
return BinaryOpVisitorImpl<OpTy>::getConstancy(op, lhs, rhs, dim);
auto shape = resTy.getShape();
// lhs % 1 = 0
return rhs.getConstantValue().has_value() &&
rhs.getConstantValue().value() == 1
? shape[dim]
: gcd(lhs.getConstancy(dim), rhs.getConstancy(dim));
return constancy;
// Case: lhs % 1 = 0
if (rhs.getConstantValue().has_value() &&
rhs.getConstantValue().value() == 1)
return resTy.getDimSize(dim);
return constancy;
}

std::optional<int64_t> getConstantValue(OpTy op, const AxisInfo &lhs,
Expand Down Expand Up @@ -669,7 +684,7 @@ class CmpOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
int64_t constHint = 1;
if (lhsInfo.getConstantValue().has_value() &&
rhsInfo.getConstantValue().has_value()) {
constHint = lhsInfo.getConstancy(d);
constHint = shape[d];
constantValue =
compare(getPredicate(op), lhsInfo.getConstantValue().value(),
rhsInfo.getConstantValue().value())
Expand Down Expand Up @@ -828,6 +843,13 @@ class SelectOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
rhsInfo.getConstantValue().has_value() &&
lhsInfo.getConstantValue() == rhsInfo.getConstantValue())
constantValue = lhsInfo.getConstantValue();

if (constantValue.has_value()) {
auto resTy = dyn_cast<RankedTensorType>(op.getType());
assert(resTy || rank == 1);
constancy =
resTy ? to_vector(resTy.getShape()) : AxisInfo::DimVectorT(rank, 1);
}
}

return AxisInfo(contiguity, divisibility, constancy, constantValue);
Expand All @@ -840,11 +862,6 @@ class LogicalOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
using BinaryOpVisitorImpl<OpTy>::BinaryOpVisitorImpl;

private:
int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
int dim) override {
return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim));
}

std::optional<int64_t> getConstantValue(OpTy op, const AxisInfo &lhs,
const AxisInfo &rhs) override {
if (lhs.getConstantValue().has_value() &&
Expand Down Expand Up @@ -890,11 +907,6 @@ class ShLIOpAxisInfoVisitor final : public BinaryOpVisitorImpl<arith::ShLIOp> {
return multiplyDivisor(lhsDivisibility, 1ll << shift);
}

int64_t getConstancy(arith::ShLIOp op, const AxisInfo &lhs,
const AxisInfo &rhs, int dim) override {
return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim));
}

std::optional<int64_t> getConstantValue(arith::ShLIOp op, const AxisInfo &lhs,
const AxisInfo &rhs) override {
if (lhs.getConstantValue().has_value() &&
Expand Down Expand Up @@ -932,11 +944,6 @@ class ShROpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
return std::max<int64_t>(1, lhsDivisibility / (int64_t(1) << shift));
}

int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
int dim) override {
return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim));
}

std::optional<int64_t> getConstantValue(OpTy op, const AxisInfo &lhs,
const AxisInfo &rhs) override {
if (lhs.getConstantValue().has_value() &&
Expand Down Expand Up @@ -969,9 +976,15 @@ class MaxMinOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
constantValue = {std::min(lhsInfo.getConstantValue().value(),
rhsInfo.getConstantValue().value())};
}
auto resTy = dyn_cast<RankedTensorType>(op.getType());
assert(resTy || rank == 1);
AxisInfo::DimVectorT constancy =
resTy ? to_vector(resTy.getShape()) : AxisInfo::DimVectorT(rank, 1);
AxisInfo::DimVectorT divisibility(
rank, highestPowOf2Divisor<int64_t>(constantValue.value()));
return AxisInfo(/*knownContiguity=*/AxisInfo::DimVectorT(rank, 1),
/*knownDivisibility=*/AxisInfo::DimVectorT(rank, 1),
/*knownConstancy=*/AxisInfo::DimVectorT(rank, 1),
/*knownDivisibility=*/divisibility,
/*knownConstancy=*/constancy,
/*constantValue=*/constantValue);
} else {
AxisInfo::DimVectorT contiguity, divisibility, constancy;
Expand Down Expand Up @@ -1029,11 +1042,11 @@ AxisInfoAnalysis::AxisInfoAnalysis(DataFlowSolver &solver,
// This is needed by TritonGPUToLLVM, to get AxisInfo when the graph is
// in the process of a PartialConversion, where UnrealizedConversionCast
// may exist
visitors.append<UnrealizedConversionCastOpAxisInfoVisitor>();
visitors.append<CastOpAxisInfoVisitor<arith::ExtSIOp>,
CastOpAxisInfoVisitor<arith::ExtUIOp>,
CastOpAxisInfoVisitor<arith::TruncIOp>,
CastOpAxisInfoVisitor<triton::gpu::ConvertLayoutOp>,
CastOpAxisInfoVisitor<mlir::UnrealizedConversionCastOp>,
CastOpAxisInfoVisitor<triton::BitcastOp>>();
visitors.append<MakeRangeOpAxisInfoVisitor>();
visitors.append<PoisonOpAxisInfoVisitor>();
Expand Down Expand Up @@ -1214,6 +1227,7 @@ void AxisInfo::initDimVectorFromHint(Attribute attr, DimVectorT *vec) {
return rhs;
if (rhs.getRank() == 0)
return lhs;
assert(lhs.getRank() == rhs.getRank() && "Mismatched ranks");
DimVectorT contiguity;
DimVectorT divisibility;
DimVectorT constancy;
Expand Down Expand Up @@ -1384,7 +1398,10 @@ void ModuleAxisInfoAnalysis::update(CallOpInterface callOp,
callee.setArgAttr(index, attrName, attr);
};
auto axisInfo = axisInfoMap->lookup(value);
assert(axisInfo.getRank() == 1 && "only scalar arguments are supported");
// Only scalar arguments are supported. Do not forward multi-dimensional
// AxisInfo to the callee.
if (axisInfo.getRank() != 1)
continue;
setAttrFn("tt.contiguity", axisInfo.getContiguity(0));
setAttrFn("tt.divisibility", axisInfo.getDivisibility(0));
setAttrFn("tt.constancy", axisInfo.getConstancy(0));
Expand Down
5 changes: 1 addition & 4 deletions lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ struct ConvertLayoutOpConversion
: public ConvertOpToLLVMPattern<ConvertLayoutOp> {
const TargetInfoBase &targetInfo;

// Set benefit to 2 so that this pattern applies before other convert-layout
// conversions. TODO(jlebar): Eventually we want this to be the only pattern.
explicit ConvertLayoutOpConversion(LLVMTypeConverter &typeConverter,
const TargetInfoBase &targetInfo,
PatternBenefit benefit = 1)
Expand Down Expand Up @@ -277,8 +275,7 @@ struct ConvertLayoutOpConversion
StringAttr kReg = str_attr("register");
StringAttr kLane = str_attr("lane");
auto elemTy = getTypeConverter()->convertType(srcTy.getElementType());
int bitwidth =
elemTy.isIntOrFloat() ? elemTy.getIntOrFloatBitWidth() : kPtrBitWidth;
int bitwidth = getIntOrFloatOrPtrBitWidth(elemTy);

auto factors = getWarpLayoutConvertDecomposition(srcTy, dstTy, bitwidth);
auto &[pReg, pLane, mixedTranspositions, nPack] = factors;
Expand Down
2 changes: 1 addition & 1 deletion lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ struct ElementwiseInlineAsmOpConversion
auto ty = getTypeConverter()->convertType(getElementType(result));

// Pack return elements into 32-bits.
unsigned bitWidth = ty.isIntOrFloat() ? ty.getIntOrFloatBitWidth() : 64;
unsigned bitWidth = getIntOrFloatOrPtrBitWidth(ty);
unsigned numElemsPerReg =
std::min(std::max(32 / bitWidth, 1u), op.getPackedElement());
assert(op.getPackedElement() % numElemsPerReg == 0);
Expand Down
4 changes: 2 additions & 2 deletions lib/Conversion/TritonGPUToLLVM/Utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ SmallVector<Value> lowerLdSt(
auto kLane = str_attr("lane");
auto kWarp = str_attr("warp");
auto kOffset = str_attr("offset");
auto bitwidth = llvmElemTy.getIntOrFloatBitWidth();
auto bitwidth = getIntOrFloatOrPtrBitWidth(llvmElemTy);

auto [elemsPerVec, permutation] =
largestVectorisation(ctx, cvt, bitwidth, maybeMaxVecElems);
Expand Down Expand Up @@ -625,7 +625,7 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
assert(*cvt.getOutDimNames().begin() == str_attr("offset"));
auto calcPaddedOffset = [&](Value smemOffset) {
TritonLLVMOpBuilder b(loc, rewriter);
auto bitwidth = llvmElemTy.getIntOrFloatBitWidth();
auto bitwidth = getIntOrFloatOrPtrBitWidth(llvmElemTy);
if (auto paddedEnc = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
srcTy.getEncoding())) {
// Apply the offset needed for padding.
Expand Down
Loading
Loading