intel
diff --git a/‎bin/RegisterTritonDialects.h‎
Lines changed: 0 additions & 1 deletion b/‎bin/RegisterTritonDialects.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h‎
Lines changed: 0 additions & 2 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 0 additions & 1 deletion b/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎lib/Dialect/Triton/IR/Ops.cpp‎
Lines changed: 1 addition & 1 deletion b/‎lib/Dialect/Triton/IR/Ops.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp‎
Lines changed: 25 additions & 12 deletions b/‎lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp‎
Lines changed: 25 additions & 12 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/Ops.cpp‎
Lines changed: 1 addition & 1 deletion b/‎lib/Dialect/TritonGPU/IR/Ops.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Coalesce.cpp‎
Lines changed: 50 additions & 1 deletion b/‎lib/Dialect/TritonGPU/Transforms/Coalesce.cpp‎
Lines changed: 50 additions & 1 deletion
diff --git a/‎lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp‎
Lines changed: 6 additions & 158 deletions b/‎lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp‎
Lines changed: 6 additions & 158 deletions
@@ -85,7 +85,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   // TritonAMDGPUTransforms passes
   mlir::registerTritonAMDGPUAccelerateMatmul();
   mlir::registerTritonAMDGPUOptimizeEpilogue();
-  mlir::registerTritonAMDGPUBypassLDSForDotOperand();
   mlir::registerTritonAMDGPUReorderInstructions();
   mlir::registerTritonAMDGPUBlockPingpong();
   mlir::registerTritonAMDGPUStreamPipeline();
 
@@ -205,8 +205,6 @@ enum class MMALoadType {
 };
 MMALoadType getMMALoadType(Operation *loadOp);
 
-// Convert \param op operands and results to layout \param encoding.
-void convertOpEncoding(Attribute encoding, Operation *op);
 } // namespace mlir
 
 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
@@ -31,7 +31,6 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_ENABLE_LLVM_DEBUG",
     "TRITON_HIP_STREAM_PREFETCH",
     "TRITON_HIP_USE_BLOCK_PINGPONG",
-    "TRITON_HIP_BYPASS_LDS_FOR_DOT",
     "TRITON_LLVM_DEBUG_ONLY",
     "TRITON_ENABLE_ASAN",
     "TRITON_OVERRIDE_ARCH",
 
@@ -700,7 +700,7 @@ LogicalResult ReshapeOp::canonicalize(ReshapeOp op, PatternRewriter &rewriter) {
 }
 
 OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) {
-  if (getType() == getSrc().getType() && !getAllowReorder()) {
+  if (getType() == getSrc().getType()) {
     // no-op
     return getSrc();
   }
 
@@ -1105,6 +1105,7 @@ LinearLayout chooseDotLdMatrixLayout(DotOperandEncodingAttr dot,
   auto rank = shape.size();
   auto opIdx = dot.getOpIdx();
   int kDim = (opIdx == 0) ? rank - 1 : rank - 2;
+  int nonKDim = (opIdx == 0) ? rank - 2 : rank - 1;
 
   StringAttr kReg = S("register");
   StringAttr kLane = S("lane");
@@ -1121,8 +1122,11 @@ LinearLayout chooseDotLdMatrixLayout(DotOperandEncodingAttr dot,
     auto reg = 1 << logReg;
     basesReg.push_back({0, reg});
   }
-  std::vector<std::vector<int>> basesLane = {{1, 0}, {2, 0}, {4, 0}};
-  int numTileCols;
+  std::vector<std::vector<int>> basesLane = {
+      {1, 0}, {2, 0}, {4, 0}, {0, 0}, {0, 0}};
+  bool kX2 = shape[kDim] > 8 * 16 / elemBitWidth;
+  bool kX4 = shape[kDim] > 16 * 16 / elemBitWidth;
+  bool nonKX2 = shape[nonKDim] > 8;
   // Construct a tile consisting of 4 8x8x16bits sub-tiles to use ldmatrix
   // efficiently. opIdx=0 and opIdx=1 are handled differently.
   if (opIdx == 0) {
@@ -1135,13 +1139,16 @@ LinearLayout chooseDotLdMatrixLayout(DotOperandEncodingAttr dot,
     if (needTrans) {
       assert(elemBitWidth <= 16 && "Only elements smaller than 16 bits are "
                                    "supported in the transposed mode");
-      basesLane.push_back({0, 8});
-      basesLane.push_back({8, 0});
+      if (nonKX2)
+        basesLane[3] = {0, 8};
+      if (kX2)
+        basesLane[4] = {8 * 16 / elemBitWidth, 0};
     } else {
-      basesLane.push_back({8, 0});
-      basesLane.push_back({0, 8 * 16 / elemBitWidth});
+      if (nonKX2)
+        basesLane[3] = {8, 0};
+      if (kX2)
+        basesLane[4] = {0, 8 * 16 / elemBitWidth};
     }
-    numTileCols = 16 * 16 / elemBitWidth;
   } else {
     // The matrix elements of thread 0 are distributed in the following pattern
     // (fp16):
@@ -1151,14 +1158,20 @@ LinearLayout chooseDotLdMatrixLayout(DotOperandEncodingAttr dot,
     if (needTrans) {
       assert(elemBitWidth <= 16 && "Only elements smaller than 16 bits are "
                                    "supported in the transposed mode");
-      basesLane.push_back({8, 0});
-      basesLane.push_back({16, 0});
+      if (kX2)
+        basesLane[3] = {8, 0};
+      if (kX4)
+        basesLane[4] = {16, 0};
     } else {
-      basesLane.push_back({0, 8 * 16 / elemBitWidth});
-      basesLane.push_back({0, 16 * 16 / elemBitWidth});
+      if (kX2)
+        basesLane[3] = {0, 8 * 16 / elemBitWidth};
+      if (kX4)
+        basesLane[4] = {0, 16 * 16 / elemBitWidth};
     }
-    numTileCols = 32 * 16 / elemBitWidth;
   }
+  int numTileCols =
+      (8 * 16 / elemBitWidth)
+      << (static_cast<int>(kX2) + static_cast<int>(kX4 && opIdx == 1));
   // Expand the `register` dimension so the size of columns matches `K`.
   auto layout =
       LinearLayout({{kReg, basesReg}, {kLane, basesLane}, {kWarp, {}}},
 
@@ -63,7 +63,7 @@ struct CanonicalizeConvertFromReshape
 
     if (isExpensiveView(convert.getSrc().getType(), op.getType()))
       return failure();
-    if (!op.getAllowReorder())
+    if (!op.getAllowReorder() || op.getEfficientLayout())
       return failure();
 
     rewriter.replaceOpWithNewOp<triton::ReshapeOp>(
 
@@ -104,6 +104,55 @@ struct CoalescePass : public impl::TritonGPUCoalesceBase<CoalescePass> {
         threadsPerWarp, CTALayout);
   }
 
+  static Type getNewType(Type type, Attribute encoding) { // swap encoding only
+    RankedTensorType tensorType = cast<RankedTensorType>(type);
+    return RankedTensorType::get(tensorType.getShape(),
+                                 tensorType.getElementType(), encoding);
+  }
+
+  void coalesceOp(Attribute encoding, Operation *op) {
+    OpBuilder builder(op);
+    // Convert operands: each ranked-tensor operand whose encoding is not a
+    // SharedEncodingAttr is converted to `encoding`; others pass through.
+    // For load/store with tensor pointers, we don't have to change the
+    // operands' type; that is done via `make_tensor_ptr`'s output type.
+    SmallVector<Value, 4> newArgs;
+    for (auto operand : op->getOperands()) {
+      auto tensorType = dyn_cast<RankedTensorType>(operand.getType());
+      if (tensorType &&
+          !isa<triton::gpu::SharedEncodingAttr>(tensorType.getEncoding())) {
+        Type newType = getNewType(tensorType, encoding);
+        newArgs.push_back(builder.create<triton::gpu::ConvertLayoutOp>(
+            op->getLoc(), newType, operand));
+      } else {
+        newArgs.push_back(operand);
+      }
+    }
+
+    // Convert output types; AsyncCopyGlobalToLocalOp results are left as-is
+    SmallVector<Type, 4> newTypes;
+    for (auto t : op->getResultTypes()) {
+      bool isAsync = isa<triton::gpu::AsyncCopyGlobalToLocalOp>(op);
+      newTypes.push_back(isAsync ? t : getNewType(t, encoding));
+    }
+
+    // Construct new op (same name and attributes) with the new encoding
+    Operation *newOp =
+        builder.create(op->getLoc(), op->getName().getIdentifier(), newArgs,
+                       newTypes, op->getAttrs());
+
+    // Cast changed results back to the original layout and rewire all uses
+    for (size_t i = 0; i < op->getNumResults(); i++) {
+      Value newResult = newOp->getResult(i);
+      if (newTypes[i] != op->getResultTypes()[i]) {
+        newResult = builder.create<triton::gpu::ConvertLayoutOp>(
+            op->getLoc(), op->getResult(i).getType(), newResult);
+      }
+      op->getResult(i).replaceAllUsesWith(newResult);
+    }
+    op->erase();
+  }
+
   void runOnOperation() override {
     // Run axis info analysis
     ModuleOp moduleOp = getOperation();
@@ -138,7 +187,7 @@ struct CoalescePass : public impl::TritonGPUCoalesceBase<CoalescePass> {
     // 4. Convert the output of this new memory op back to L1
     // 5. Replace all the uses of the original memory op by the new one
     for (auto &kv : layoutMap) {
-      convertOpEncoding(kv.second, kv.first);
+      coalesceOp(kv.second, kv.first);
     }
   }
 };
 
@@ -131,8 +131,6 @@ class LayoutRematerialization {
   void backwardRematerialization(ConvertLayoutOp convertOp);
   void hoistConvertOnTopOfExtOrBroadcast();
   void hoistConvertOnTopOfExtOrBroadcast(ConvertLayoutOp convertOp);
-  void hoistConvertIntoConditionals();
-  void hoistConvertIntoConditionals(ConvertLayoutOp convertOp);
   void rewriteSlice(SetVector<Value> &slice, DenseMap<Value, Attribute> &layout,
                     ConvertLayoutOp convertOp, IRMapping &mapping);
   void rewriteSlice(SetVector<Value> &slice, DenseMap<Value, Attribute> &layout,
@@ -1022,66 +1020,13 @@ void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast() {
   }
 }
 
-bool shouldPropagateConversion(ConvertLayoutOp convertOp) {
-  RankedTensorType targetType = convertOp.getType();
-  auto dotEnc = dyn_cast<DotOperandEncodingAttr>(targetType.getEncoding());
-  // If the target encoding is not DotOperandEncodingAttr, allow propagation.
-  if (!dotEnc) {
-    return true;
-  }
-  // Skip conversions to DotOperandEncodingAttr when the operand index is 0.
-  // This heuristic is applied to prevent moving the blocked->dot conversion of
-  // the Q tensor (a loop invariant in Flash Attention) outside the loop. Doing
-  // so can increase register pressure and cause spilling in some cases.
-  if (dotEnc.getOpIdx() == 0) {
-    return false;
-  }
-  // Skip conversions to DotOperandEncodingAttr when the operand index is 1 if
-  // it's not intentionally placed above a load as we have to be a bit more
-  // careful with the heuristics for both correctness and performance.
-  // TODO: Fix this logic to avoid propagating conversions backward unless
-  // it reduces the total number of conversions.
-  assert(dotEnc.getOpIdx() == 1);
-  SetVector<Operation *> slice;
-  BackwardSliceOptions opt;
-  opt.omitBlockArguments = true;
-  opt.filter = [&](Operation *op) {
-    return op->getParentRegion() == convertOp->getParentRegion();
-  };
-  getBackwardSlice(convertOp.getOperation(), &slice, opt);
-
-  for (Operation *currOp : slice) {
-    if (isa<LoadOp>(currOp)) {
-      return false;
-    }
-  }
-  // Allow propagation if no LoadOp is found.
-  return true;
-}
-
-void LayoutRematerialization::hoistConvertIntoConditionals() {
-  // Go through each ConvertLayoutOp.
-  SmallVector<ConvertLayoutOp> convertOps;
-  funcOp.walk(
-      [&](ConvertLayoutOp convertOp) { convertOps.push_back(convertOp); });
-  for (ConvertLayoutOp convertOp : convertOps) {
-    hoistConvertIntoConditionals(convertOp);
-    if (!opToDelete.contains(convertOp)) {
-      // If the conversion didn't get removed, consider it for reuse in future
-      // backward slices.
-      addRematValue(convertOp.getSrc(), convertOp.getType().getEncoding(),
-                    convertOp.getResult());
-    }
-  }
-}
-
 void LayoutRematerialization::backwardRematerialization(
     ConvertLayoutOp convertOp) {
+  // we don't handle conversions to DotOperandEncodingAttr
+  // this is a heuristic to accommodate fused attention
   RankedTensorType targetType = convertOp.getType();
-  if (!shouldPropagateConversion(convertOp)) {
+  if (isa<DotOperandEncodingAttr>(targetType.getEncoding()))
     return;
-  }
-
   Value oldV = convertOp.getSrc();
   LDBG("check backward remat with source " << oldV << " encoding "
                                            << targetType.getEncoding());
@@ -1120,10 +1065,11 @@ void LayoutRematerialization::backwardRematerialization(
 // of the convert.
 void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast(
     ConvertLayoutOp convertOp) {
+  // we don't handle conversions to DotOperandEncodingAttr
+  // this is a heuristic to accommodate fused attention
   RankedTensorType targetType = convertOp.getType();
-  if (!shouldPropagateConversion(convertOp)) {
+  if (isa<DotOperandEncodingAttr>(targetType.getEncoding()))
     return;
-  }
 
   auto isExtOrBroadcastOp = [](Operation *op) {
     if (isa<arith::ExtSIOp, arith::ExtUIOp, arith::ExtFOp, BroadcastOp,
@@ -1205,100 +1151,6 @@ void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast(
   rewriteSlice(slice, layout, convertOp, mapping);
 }
 
-void LayoutRematerialization::hoistConvertIntoConditionals(
-    ConvertLayoutOp convertOp) {
-  // Take the backward slice of tensor dependencies, stopping at conditionals.
-  SetVector<Value> slice;
-  DenseMap<Value, Attribute> layout;
-  auto isIfOp = [](Operation *op) { return isa<scf::IfOp>(op); };
-  if (failed(getRematerializableSlice(convertOp.getSrcMutable(),
-                                      convertOp.getType().getEncoding(), slice,
-                                      layout, isIfOp)))
-    return;
-
-  // Find conditional edges above which the conversion can be hoisted.
-  SmallVector<std::pair<Value, OpOperand *>> hoistAbove;
-  unsigned sliceSize = slice.size();
-  // The routine will recurse through backward slices, e.g. to handle loops and
-  // conditional chains. Thus, we re-query the size of `slice`.
-  for (unsigned i = 0; i < slice.size(); i++) {
-    Value v = slice[i];
-    auto ifOp = v.getDefiningOp<scf::IfOp>();
-    if (!ifOp)
-      continue;
-
-    Attribute rootLayout = layout.at(v);
-    unsigned resIdx = cast<OpResult>(v).getResultNumber();
-
-    // Take the backward slice along each branch.
-    auto thenYield =
-        cast<scf::YieldOp>(ifOp.getThenRegion().front().getTerminator());
-    auto elseYield =
-        cast<scf::YieldOp>(ifOp.getElseRegion().front().getTerminator());
-
-    OpOperand &thenRes = thenYield.getResultsMutable()[resIdx];
-    OpOperand &elseRes = elseYield.getResultsMutable()[resIdx];
-
-    SetVector<Value> thenSlice, elseSlice;
-    DenseMap<Value, Attribute> thenLayout, elseLayout;
-
-    LogicalResult thenResult = getRematerializableSlice(
-        thenRes, rootLayout, thenSlice, thenLayout, isIfOp);
-    LogicalResult elseResult = getRematerializableSlice(
-        elseRes, rootLayout, elseSlice, elseLayout, isIfOp);
-
-    // If propagation across both edges of this conditional succeeded, then we
-    // don't need to hoist across it.
-    if (succeeded(thenResult) && succeeded(elseResult)) {
-      slice.insert(thenSlice.begin(), thenSlice.end());
-      slice.insert(elseSlice.begin(), elseSlice.end());
-      layout.insert(thenLayout.begin(), thenLayout.end());
-      layout.insert(elseLayout.begin(), elseLayout.end());
-      continue;
-    }
-
-    // If propagation across both edges failed, then there is nothing to do
-    // for this one.
-    if (failed(thenResult) && failed(elseResult))
-      continue;
-
-    // The layout conversion can be rematerialized along one edge but not the
-    // other. We can hoist the conversion into the other branch.
-    if (succeeded(elseResult)) {
-      std::swap(thenSlice, elseSlice);
-      std::swap(thenLayout, elseLayout);
-      hoistAbove.push_back({v, &thenRes});
-    } else {
-      hoistAbove.push_back({v, &elseRes});
-    }
-    slice.insert(thenSlice.begin(), thenSlice.end());
-    layout.insert(thenLayout.begin(), thenLayout.end());
-  }
-
-  // It's hard to know if duplicating the conversion into separate branches is
-  // profitable without more analysis. For now, hoist at most one.
-  if (hoistAbove.size() != 1)
-    return;
-
-  IRMapping mapping;
-  for (auto [result, edge] : hoistAbove) {
-    // Hoist the convert into the conditional and rewrite the slice.
-    OpBuilder b(edge->getOwner());
-    Value v = edge->get();
-    Attribute encoding = layout.at(result);
-
-    auto tensorType = cast<RankedTensorType>(v.getType());
-    auto newType = RankedTensorType::get(tensorType.getShape(),
-                                         tensorType.getElementType(), encoding);
-
-    Value newCvt = b.create<ConvertLayoutOp>(convertOp.getLoc(), newType, v);
-
-    mapping.map(v, newCvt);
-    slice.remove(v);
-  }
-  rewriteSlice(slice, layout, convertOp, mapping);
-}
-
 void backwardRematerialization(ModuleOp module) {
   module.walk([](FuncOp funcOp) {
     LayoutRematerialization layoutRemat(funcOp);
@@ -1313,10 +1165,6 @@ void hoistConvert(ModuleOp module) {
     LayoutRematerialization layoutRemat(funcOp);
     layoutRemat.hoistConvertOnTopOfExtOrBroadcast();
     layoutRemat.cleanup();
-
-    layoutRemat = LayoutRematerialization(funcOp);
-    layoutRemat.hoistConvertIntoConditionals();
-    layoutRemat.cleanup();
   });
 }
 } // namespace
Original file line number	Diff line number	Diff line change
`@@ -700,7 +700,7 @@ LogicalResult ReshapeOp::canonicalize(ReshapeOp op, PatternRewriter &rewriter) {`
`700`	`700`	`}`
`701`	`701`
`702`	`702`	`OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) {`
`703`		`- if (getType() == getSrc().getType() && !getAllowReorder()) {`
	`703`	`+ if (getType() == getSrc().getType()) {`
`704`	`704`	`// no-op`
`705`	`705`	`return getSrc();`
`706`	`706`	`}`