
Commit 386840d

Merge OpenAI Triton commit 26b45d8 (#4280)

This PR changes the Triton base from 4b9efc5 to 26b45d8 (May 15). Pass rate: 96.85% -> 95.34% (#4281)
2 parents b929bc9 + 2cc9369 commit 386840d

File tree

33 files changed: +1005 -487 lines


README.md

Lines changed: 1 addition & 0 deletions
@@ -212,6 +212,7 @@ See [`python/triton/knobs.py`](python/triton/knobs.py) for the full list of conf
 - `TRITON_OVERRIDE_DIR` specifies the directory from which to load the IR/ptx/amdgcn files when `TRITON_KERNEL_OVERRIDE` is set to 1.
 - `TRITON_F32_DEFAULT` sets the default input precision of `tl.dot` when using 32-bit floats, which can be either `ieee`, `tf32`, or `tf32x3`.
 - `TRITON_FRONT_END_DEBUGGING=1` disables exception wrapping when an error occurs in the compiler frontend, allowing the full stack trace to be seen.
+- `TRITON_STRIP_DEBUG_INFO` removes all debug information from the module, including location information
 
 N.B. Some of these environment variables don't have a knob in `knobs.py`-- those are only relevant to the C++ layer(s), hence they don't exist in the python layer.
 

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 24 additions & 248 deletions
@@ -338,46 +338,21 @@ using namespace mlir::triton;
 
 class SharedMemoryObject {
 public:
-  SharedMemoryObject(Value base, Type baseElemType, ArrayRef<Value> offsets)
-      : base(base), baseElemType(baseElemType),
-        offsets(offsets.begin(), offsets.end()) {}
+  SharedMemoryObject(Value base, Type baseElemType, ArrayRef<Value> offsets);
 
   SharedMemoryObject(Value base, Type baseElemType, int64_t rank, Location loc,
-                     RewriterBase &rewriter)
-      : base(base), baseElemType(baseElemType) {
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
-    offsets.append(rank, b.i32_val(0));
-  }
+                     RewriterBase &rewriter);
 
   SmallVector<Value> getOffsets() const { return offsets; }
   Value getBase() const { return base; }
   Type getBaseElemType() const { return baseElemType; }
 
-  SmallVector<Value> getElems() const {
-    SmallVector<Value> elems;
-    elems.push_back(base);
-    elems.append(offsets.begin(), offsets.end());
-    return elems;
-  }
+  SmallVector<Value> getElems() const;
 
-  SmallVector<Type> getTypes() const {
-    SmallVector<Type> types;
-    types.push_back(base.getType());
-    types.append(offsets.size(), IntegerType::get(base.getContext(), 32));
-    return types;
-  }
+  SmallVector<Type> getTypes() const;
 
   SmallVector<Value> getStrides(triton::gpu::MemDescType memDesc, Location loc,
-                                RewriterBase &rewriter) const {
-    auto allocShape = memDesc.getAllocShape();
-    auto allocShapePerCTA = triton::gpu::getAllocationShapePerCTA(
-        memDesc.getEncoding(), allocShape);
-    auto layoutOrder = triton::gpu::getOrder(memDesc);
-    auto allocStrides = SharedMemoryObject::getStridesForShape(
-        allocShapePerCTA, layoutOrder, loc, rewriter);
-    return SmallVector<Value>(allocStrides.end() - offsets.size(),
-                              allocStrides.end());
-  }
+                                RewriterBase &rewriter) const;
 
   // TODO(Keren): deprecate the method once AMD backend has cleaned up
   Value getCSwizzleOffset(int dim) const {
@@ -386,50 +361,16 @@ class SharedMemoryObject {
   }
 
   // TODO(Keren): deprecate the method once AMD backend has cleaned up
-  Value getBaseBeforeSlice(int dim, Location loc,
-                           RewriterBase &rewriter) const {
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
-    Value cSwizzleOffset = getCSwizzleOffset(dim);
-    Value offset = b.sub(b.i32_val(0), cSwizzleOffset);
-    Type type = base.getType();
-    return b.gep(type, baseElemType, base, offset);
-  }
+  Value getBaseBeforeSlice(int dim, Location loc, RewriterBase &rewriter) const;
 
 private:
-  static SmallVector<unsigned>
-  getOrderForShape(ArrayRef<int64_t> shape, ArrayRef<unsigned> layoutOrder) {
-    SmallVector<unsigned> order(shape.size());
-    // Default minor-to-major order
-    std::iota(order.rbegin(), order.rend(), 0);
-    if (layoutOrder.size() > 0) {
-      // If a layout order is provided, we assume it specifies the order in
-      // which the dimensions are first accessed, and unspecified dimensions
-      // retain the minor-to-major order. For example, if order = [2, 1, 0] and
-      // layoutOrder = [0, 1], we need to shift `layoutOrder`
-      // by -1 (move them right). The resulting order will then be [1, 2, 0].
-      int rankDiff = layoutOrder.size() - shape.size();
-      auto minRank = std::min<size_t>(shape.size(), layoutOrder.size());
-      for (size_t i = 0; i < minRank; ++i)
-        order[i] = layoutOrder[i] - rankDiff;
-    }
-    assert(isPermutationOfIota(order) && "Invalid order");
-    return order;
-  }
+  static SmallVector<unsigned> getOrderForShape(ArrayRef<int64_t> shape,
+                                                ArrayRef<unsigned> layoutOrder);
 
   static SmallVector<Value> getStridesForShape(ArrayRef<int64_t> shape,
                                                ArrayRef<unsigned> layoutOrder,
                                                Location loc,
-                                               RewriterBase &rewriter) {
-    SmallVector<Value> strides(shape.size());
-    auto order = SharedMemoryObject::getOrderForShape(shape, layoutOrder);
-    int64_t stride = 1;
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
-    for (auto idx : order) {
-      strides[idx] = b.i32_val(stride);
-      stride *= shape[idx];
-    }
-    return strides;
-  }
+                                               RewriterBase &rewriter);
 
   Value base; // i32 ptr. The start address of the shared memory object.
   Type baseElemType;
@@ -486,97 +427,14 @@ inline bool isKernel(FunctionOpInterface funcOp) {
   return funcOp.getVisibility() == SymbolTable::Visibility::Public;
 }
 
-inline Value getStackPointer(RewriterBase &rewriter,
-                             FunctionOpInterface funcOp) {
-  // See NOTE: [Additional Function Arguments]
-  if (!isKernel(funcOp)) {
-    return funcOp.getArgument(funcOp.getNumArguments() - 2);
-  }
-
-  auto mod = funcOp->getParentOfType<ModuleOp>();
-  auto globalBase = dyn_cast<LLVM::GlobalOp>(mod.lookupSymbol("global_smem"));
-  assert(globalBase);
-  return rewriter.create<LLVM::AddressOfOp>(funcOp.getLoc(), globalBase);
-}
-
-inline Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,
-                                 const TargetInfoBase &targetInfo,
-                                 FunctionOpInterface funcOp,
-                                 Value allocOffset = {}) {
-  // See NOTE: [Additional Function Arguments]
-  if (!isKernel(funcOp)) {
-    // Base for this function
-    auto gmemBase = funcOp.getArgument(funcOp.getNumArguments() - 1);
-    if (!allocOffset) {
-      return gmemBase;
-    }
-
-    auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext(), 1);
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
-    return b.gep(ptrTy, i8_ty, gmemBase, allocOffset);
-  }
-
-  // Base for entire kernel
-  auto gmemBase = funcOp.getArgument(funcOp.getNumArguments() - 1);
-
-  ModuleOp mod = funcOp.getOperation()->getParentOfType<ModuleOp>();
-  auto allocSizeAttr = mod.getOperation()->getAttrOfType<mlir::IntegerAttr>(
-      "ttg.global_scratch_memory_size");
-  if (!allocSizeAttr) {
-    return gmemBase;
-  }
+Value getStackPointer(RewriterBase &rewriter, FunctionOpInterface funcOp);
 
-  Value gridIdx[3];
-  Value gridDim[2];
-  for (int k = 0; k < 3; ++k) {
-    gridIdx[k] = rewriter.create<GetProgramIdOp>(loc, k);
-  }
-  for (int k = 0; k < 2; ++k) {
-    gridDim[k] = rewriter.create<GetNumProgramsOp>(loc, k);
-  }
+Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,
+                          const TargetInfoBase &targetInfo,
+                          FunctionOpInterface funcOp, Value allocOffset);
 
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  Value linearId = gridIdx[2];
-  for (int k = 0; k < 2; ++k) {
-    linearId = b.add(gridIdx[1 - k], b.mul(linearId, gridDim[1 - k]));
-  }
-  auto numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(mod);
-  if (numCTAs > 1) {
-    linearId = b.mul(linearId, b.i32_val(numCTAs));
-    linearId = b.add(linearId, targetInfo.getClusterCTAId(rewriter, loc));
-  }
-
-  auto allocSize = allocSizeAttr.getValue().getZExtValue();
-
-  Value offset = b.mul(linearId, b.i32_val(allocSize));
-  if (allocOffset) {
-    offset = b.add(offset, allocOffset);
-  }
-
-  auto *ctx = rewriter.getContext();
-  auto res =
-      b.gep(mlir::LLVM::LLVMPointerType::get(ctx, 1), i8_ty, gmemBase, offset);
-  return res;
-}
-
-inline Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,
-                                 const TargetInfoBase &target, Operation *op) {
-  auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(),
-                                          target.getSharedAddressSpace());
-  auto func = op->template getParentOfType<FunctionOpInterface>();
-  if (!func)
-    func = cast<FunctionOpInterface>(op);
-
-  assert(op->hasAttr("allocation.offset"));
-  size_t offset = cast<IntegerAttr>(op->getAttr("allocation.offset"))
-                      .getValue()
-                      .getZExtValue();
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  Value offVal = b.i32_val(offset);
-  Value base =
-      b.gep(ptrTy, i8_ty, LLVM::getStackPointer(rewriter, func), offVal);
-  return base;
-}
+Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,
                           const TargetInfoBase &target, Operation *op);
 
 // -----------------------------------------------------------------------
 // MXFP utilities
@@ -619,16 +477,8 @@ using ::mlir::triton::gpu::DotOperandEncodingAttr;
 using ::mlir::triton::gpu::NvidiaMmaEncodingAttr;
 using ::mlir::triton::gpu::SliceEncodingAttr;
 
-inline Value dot(RewriterBase &rewriter, Location loc, ArrayRef<Value> offsets,
-                 ArrayRef<Value> strides) {
-  assert(offsets.size() == strides.size());
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  Value ret = b.i32_val(0);
-  for (auto [offset, stride] : llvm::zip(offsets, strides)) {
-    ret = b.add(ret, b.mul(offset, stride));
-  }
-  return ret;
-}
+Value dot(RewriterBase &rewriter, Location loc, ArrayRef<Value> offsets,
+          ArrayRef<Value> strides);
 
 /// Extend 2d shared object to 3d.
 ///
@@ -720,91 +570,17 @@ SmallVector<Value> unpackLLVector(Location loc, Value llvmVec,
 
 Value packLLVector(Location loc, ValueRange vals, RewriterBase &rewriter);
 
-inline std::optional<LLVM::AtomicBinOp> matchAtomicOp(RMWOp atomicOp) {
-  switch (atomicOp) {
-  case RMWOp::AND:
-    return LLVM::AtomicBinOp::_and;
-  case RMWOp::OR:
-    return LLVM::AtomicBinOp::_or;
-  case RMWOp::XOR:
-    return LLVM::AtomicBinOp::_xor;
-  case RMWOp::ADD:
-    return LLVM::AtomicBinOp::add;
-  case RMWOp::FADD:
-    return LLVM::AtomicBinOp::fadd;
-  case RMWOp::MAX:
-    return LLVM::AtomicBinOp::max;
-  case RMWOp::MIN:
-    return LLVM::AtomicBinOp::min;
-  case RMWOp::UMAX:
-    return LLVM::AtomicBinOp::umax;
-  case RMWOp::UMIN:
-    return LLVM::AtomicBinOp::umin;
-  case RMWOp::XCHG:
-    return LLVM::AtomicBinOp::xchg;
-  default:
-    return {};
-  }
-}
+std::optional<LLVM::AtomicBinOp> matchAtomicOp(RMWOp atomicOp);
 
-inline std::optional<LLVM::AtomicOrdering>
-getMemoryOrdering(MemSemantic memOrdering) {
-  switch (memOrdering) {
-  case MemSemantic::RELAXED:
-    return LLVM::AtomicOrdering::monotonic;
-  case MemSemantic::ACQUIRE:
-    return LLVM::AtomicOrdering::acquire;
-  case MemSemantic::RELEASE:
-    return LLVM::AtomicOrdering::release;
-  case MemSemantic::ACQUIRE_RELEASE:
-    return LLVM::AtomicOrdering::acq_rel;
-  default:
-    return {};
-  }
-}
+std::optional<LLVM::AtomicOrdering> getMemoryOrdering(MemSemantic memOrdering);
 
-inline bool
-isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
-                           ArrayRef<int64_t> allocShape,
-                           triton::gpu::SharedEncodingTrait sharedEnc) {
-  auto rank = shape.size();
-  auto swizzledLayout =
-      dyn_cast<triton::gpu::SwizzledSharedEncodingAttr>(sharedEnc);
-  auto nvmmaLayout = dyn_cast<triton::gpu::NVMMASharedEncodingAttr>(sharedEnc);
-  bool noSwizzling = (swizzledLayout && swizzledLayout.getMaxPhase() == 1) ||
-                     (nvmmaLayout && nvmmaLayout.getSwizzlingByteWidth() == 0);
-  return /*no swizzling*/ noSwizzling ||
-         /*swizzling but same shape*/ shape == allocShape ||
-         /*swizzling and rank-reduced and rank >= 2*/
-         (shape == allocShape.take_back(rank) && rank >= 2);
-}
+bool isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
+                                ArrayRef<int64_t> allocShape,
+                                triton::gpu::SharedEncodingTrait sharedEnc);
 
-inline llvm::MapVector<StringAttr, int32_t>
-getAllFreeVarMasks(MLIRContext *ctx) {
-  // Mask where all elements are redundant
-  auto kReg = str_attr("reg");
-  auto kLane = str_attr("lane");
-  auto kWarp = str_attr("warp");
-  auto kBlock = str_attr("block");
+llvm::MapVector<StringAttr, int32_t> getAllFreeVarMasks(MLIRContext *ctx);
 
-  int32_t fullMask = -1;
-  llvm::MapVector<StringAttr, int32_t> ret;
-  for (auto dimName : {kReg, kLane, kWarp, kBlock}) {
-    ret[dimName] = fullMask;
-  }
-  return ret;
-}
-
-inline llvm::MapVector<StringAttr, int32_t> getFreeVariableMasks(Type type) {
-  auto ctx = type.getContext();
-  auto tensorTy = dyn_cast<RankedTensorType>(type);
-  if (!tensorTy) {
-    return getAllFreeVarMasks(ctx);
-  }
-  auto ll =
-      triton::gpu::toLinearLayout(tensorTy.getShape(), tensorTy.getEncoding());
-  return ll.getFreeVariableMasks();
-}
+llvm::MapVector<StringAttr, int32_t> getFreeVariableMasks(Type type);
 
 inline bool isCanonicalIndex(unsigned index, unsigned freeVarMask) {
   return (index & freeVarMask) == 0;
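
The bodies above move out of the header unchanged; the diff keeps only the declarations. For orientation, here is a plain-C++ sketch -- an illustration, not part of the patch -- of the arithmetic that getStridesForShape and dot(...) materialize as IR: strides grow innermost-first along the layout order, and dot(...) linearizes an offset vector against them. The names stridesForShape and dotOffsets are hypothetical; the real functions build MLIR Values via TritonLLVMOpBuilder rather than computing integers.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical scalar model of getStridesForShape: order[0] is the
// fastest-varying dimension, so its stride is 1, and each later dimension's
// stride is the product of the sizes of the dimensions ordered before it.
std::vector<int64_t> stridesForShape(const std::vector<int64_t> &shape,
                                     const std::vector<unsigned> &order) {
  std::vector<int64_t> strides(shape.size());
  int64_t stride = 1;
  for (unsigned idx : order) {
    strides[idx] = stride;
    stride *= shape[idx];
  }
  return strides;
}

// Hypothetical scalar model of dot(...): linear offset = sum_i(offset_i * stride_i).
int64_t dotOffsets(const std::vector<int64_t> &offsets,
                   const std::vector<int64_t> &strides) {
  assert(offsets.size() == strides.size());
  int64_t ret = 0;
  for (size_t i = 0; i < offsets.size(); ++i)
    ret += offsets[i] * strides[i];
  return ret;
}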

include/triton/Dialect/Triton/Transforms/Passes.td

Lines changed: 11 additions & 0 deletions
@@ -79,4 +79,15 @@ def TritonLoopInvariantCodeMotion : Pass</*cli-arg*/"triton-licm", /*Op*/"mlir::
   let dependentDialects = ["mlir::triton::TritonDialect"];
 }
 
+def TritonLoopAwareCSE : Pass<"triton-loop-aware-cse", "mlir::ModuleOp"> {
+  let summary = "CSE within loop bodies";
+
+  let description = [{
+    The `triton-loop-aware-cse` pass performs recursive common subexpression
+    elimination within loop bodies. Unlike regular CSE, which is a single-pass
+    greedy algorithm, this pass can recursively eliminate loop iteration
+    arguments and subcomputations that always have the same value.
+  }];
+}
+
 #endif
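
To make the pass description concrete, here is a hypothetical scalar analogue (not from the patch) of the rewrite it enables: two loop-carried values that start equal and are updated by the same pure computation remain equal on every iteration, so one iteration argument and its update can be folded into the other. Single-pass greedy CSE misses this because each update reads a distinct, not-yet-merged carried value.

// Before: `a` and `b` are separate loop-carried values with equal initial
// values and identical pure updates, hence provably equal on every iteration.
int before(int x, int n) {
  int a = x, b = x;
  for (int i = 0; i < n; ++i) {
    a = a * 2 + 1;
    b = b * 2 + 1; // redundant loop-carried subcomputation
  }
  return a + b;
}

// After loop-aware CSE: `b` and its update are folded into `a`.
int after(int x, int n) {
  int a = x;
  for (int i = 0; i < n; ++i)
    a = a * 2 + 1;
  return a + a;
}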

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 12 additions & 0 deletions
@@ -313,6 +313,18 @@ def TTG_LocalStoreOp : TTG_Op<"local_store"> {
   }];
 }
 
+def TTG_PredicateStageOp: TTG_Op<"predicate_stage",
+    [Pure, AllTypesMatch<["iv", "ub", "step"]>]> {
+  let summary = "pipeliner stage predicate";
+  let arguments = (ins AnySignlessIntegerOrIndex:$iv,
+                       AnySignlessIntegerOrIndex:$ub,
+                       AnySignlessIntegerOrIndex:$step,
+                       I32Attr:$maxStage,
+                       I32Attr:$stage);
+  let results = (outs I1:$result);
+  let assemblyFormat = "$iv `,` $ub `,` $step `maxStage` $maxStage `stage` $stage attr-dict `:` type($iv) `->` type($result)";
+}
+
 def TTG_Fp4ToFpOp : TTG_Op<"fp4_to_fp", [Pure]> {
   let summary = "Upcast fp4 (e2m1) to fp";
 
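
Reading off the declared assemblyFormat, an instance of the new op would print roughly as follows (illustrative only -- the operand names, the `ttg.` prefix, and the elision of attribute type suffixes are assumptions, not taken from the patch):

%pred = ttg.predicate_stage %iv, %ub, %step maxStage 2 stage 0 : i32 -> i1

Given the loop's induction variable, upper bound, and step, the op yields an i1 saying whether the given pipeline stage should execute; the AllTypesMatch trait forces %iv, %ub, and %step to share one integer or index type.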

include/triton/Dialect/TritonGPU/Transforms/PipelineExpander.h

Lines changed: 10 additions & 0 deletions
@@ -57,6 +57,12 @@ struct PipeliningOption {
   /// pipeliner will have to predicate operations in the prologue/epilogue.
   bool supportDynamicLoops = false;
 
+  /// If set, use this function to emit the predicate stage ops instead of the
+  /// default one.
+  using EmitPredicateStageFnType = std::function<Value(
+      RewriterBase &, Value, Value, Value, uint64_t, uint64_t)>;
+  EmitPredicateStageFnType emitPredicateStageFn = nullptr;
+
   // Callback to predicate operations when the prologue or epilogue are not
   // peeled. This takes the original operation, an i1 predicate value and the
   // pattern rewriter. It is expected to replace the given operation with
@@ -95,6 +101,10 @@ FailureOr<scf::ForOp> pipelineForLoop(RewriterBase &rewriter, scf::ForOp forOp,
                                       const PipeliningOption &options,
                                       bool *modifiedIR = nullptr);
 
+Value emitPredicateForStage(RewriterBase &rewriter, Value inductionVar,
+                            Value upperBound, Value step, uint64_t maxStage,
+                            uint64_t stage);
+
 } // namespace triton
 } // namespace mlir
 
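
emitPredicateForStage is the default emitter that a client can now replace via emitPredicateStageFn. As a rough model of its semantics -- an assumption, since the real function builds the comparison as MLIR values and may handle cases such as negative steps differently -- a stage's predicate checks that the original iteration the stage is processing still lies below the upper bound:

#include <cstdint>

// Hypothetical scalar model of a pipeliner stage predicate, assuming a
// positive step: in the steady-state kernel at induction value iv, stage s
// works on the original iteration iv + (maxStage - s) * step, so it must be
// masked off once that iteration would run past the upper bound.
bool stageActive(int64_t iv, int64_t ub, int64_t step, uint64_t maxStage,
                 uint64_t s) {
  return iv + static_cast<int64_t>(maxStage - s) * step < ub;
}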
