
Commit cb3da9f

Merge branch 'main' into backwardslice-fix
2 parents: 3569317 + 82fec37

27 files changed: +973 −101 lines


bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUAccelerateMatmul();
   mlir::registerTritonAMDGPUOptimizeEpilogue();
   mlir::registerTritonAMDGPUReorderInstructions();
+  mlir::registerTritonAMDGPUBlockPingpong();
   mlir::registerTritonAMDGPUStreamPipeline();
   mlir::registerTritonAMDGPUCanonicalizePointers();
   mlir::registerTritonAMDGPUConvertToBufferOps();

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 2 additions & 1 deletion
@@ -163,7 +163,8 @@ Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
 LogicalResult getConvertBackwardSlice(
     Value root, SetVector<Value> &slice, Attribute rootEncoding,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation = nullptr);
+    std::function<bool(Operation *)> stopPropagation = nullptr,
+    std::function<Value(Value, Attribute)> getExistingConversion = nullptr);

 // Populate pattern to remove dead cycles in ForOp.
 void populateForOpDeadArgumentElimination(RewritePatternSet &patterns);

include/triton/Tools/LinearLayout.h

Lines changed: 4 additions & 0 deletions
@@ -679,6 +679,10 @@ class LinearLayout {

   // Get the layout that is the inverse of this layout.
   [[nodiscard]] LinearLayout invert() const;
+  // Compute and return a pseudoinverse of this layout. This is a layout such
+  // that `B = A.pseudoinvert()` implies that `A(B(x)) = x`. If `A` is
+  // invertible, then this returns `A^-1`.
+  [[nodiscard]] LinearLayout pseudoinvert() const;

   // For each in-dim, returns a bitmask of the "free variables" in the layout
   // function.
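For intuition, a minimal sketch of the contract documented above (illustrative only, not code from this commit; the helper function is a hypothetical wrapper):

// Sketch: invert() requires the layout to be square and injective and asserts
// otherwise, while pseudoinvert() only needs surjectivity. For every x in the
// image of A, A(pseudoinvert(A)(x)) == x, even when several inputs of A
// collapse onto the same output.
void demoPseudoinvert(const LinearLayout &A) {
  LinearLayout B = A.pseudoinvert();   // fine even if !A.isInvertible()
  // LinearLayout C = A.invert();      // would assert unless A.isInvertible()
  (void)B;
}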

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_DISABLE_RESHAPE_ENCODING_INFERENCE",
     "TRITON_ENABLE_LLVM_DEBUG",
     "TRITON_HIP_STREAM_PREFETCH",
+    "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_LLVM_DEBUG_ONLY",
     "USE_IR_LOC",
     "NVPTX_ENABLE_DUMP",

lib/Analysis/Utility.cpp

Lines changed: 2 additions & 7 deletions
@@ -491,13 +491,8 @@ bool GatherLoweringHelper::isWarpLocal() {
   // in the index and source tensors are the same. This means we don't need to
   // xor shuffle across threads before emitting index shuffles; we push warp
   // shuffling to layout conversions.
-  if (srcLayout->sublayout(kLane, otherDims) !=
-      idxLayout->sublayout(kLane, otherDims))
-    return false;
-
-  // Otherwise, the source layout has to be invertible. This primarily means
-  // the codegen path doesn't support broadcasted source layouts.
-  return srcLayout->isInvertible();
+  return srcLayout->sublayout(kLane, otherDims) ==
+         idxLayout->sublayout(kLane, otherDims);
 }

 unsigned getNumScratchElements(ArrayRef<unsigned> shape) {

lib/Conversion/TritonGPUToLLVM/GatherOpToLLVM.cpp

Lines changed: 4 additions & 3 deletions
@@ -240,9 +240,10 @@ void GatherOpConversion::emitWarpLocalGather(
   // `llvm.select` using `src_reg` to get the right one. `K` is the number of
   // elements per column owned by a thread.

-  // Fully invert the source layout. We know it is invertible because
-  // `isWarpLocal` checked this.
-  LinearLayout invSrcLayout = srcLayout.invert();
+  // Invert the source layout. It doesn't matter whether it is fully invertible
+  // with respect to anything except the register input dimension, since we know
+  // those don't vary in ways that matter for codegen.
+  LinearLayout invSrcLayout = srcLayout.pseudoinvert();

   // Sanity check: the warp must be invariant to the index because otherwise the
   // gather would need to read across warps!
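A hedged illustration of the reasoning in the new comment (the broadcasting scenario and the tiny wrapper are assumptions for exposition, not code from this commit):

// Sketch: if the source layout broadcasts one gathered element into several
// registers of the same thread, it is surjective but not injective, so
// invert() would assert. pseudoinvert() chooses a single preimage register
// per element; because every aliasing register holds the same value, reading
// through that one register still produces a correct gather.
LinearLayout invertSourceForGather(const LinearLayout &srcLayout) {
  return srcLayout.pseudoinvert();   // replaces the old srcLayout.invert()
}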

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 46 additions & 20 deletions
@@ -116,17 +116,13 @@ class LayoutPropagation {
 class LayoutRematerialization {
 public:
   LayoutRematerialization(FuncOp F) : funcOp(F) {}
+
   // Map the original value to the remat'ed one.
   void addRematValue(Value old, Attribute encoding, Value newV);
-  bool hasRematValue(Value value, Attribute encoding) {
-    return rematMapping.contains({value, encoding});
-  }
-  // Return the remat'ed value in the given encoding.
-  Value getRematValue(Value value, Attribute encoding) {
-    auto it = rematMapping.find({value, encoding});
-    assert(it != rematMapping.end());
-    return it->second;
-  }
+  // Get the remat'ed value in the given encoding, if one already exists and
+  // is different than the layout conversion root.
+  Value getRematValue(Value value, Attribute encoding, Value root) const;
+
   void cleanup();
   void backwardRematerialization();
   void backwardRematerialization(ConvertLayoutOp convertOp);
@@ -137,6 +133,11 @@ class LayoutRematerialization {
   void rewriteSlice(SetVector<Value> &slice, DenseMap<Value, Attribute> &layout,
                     ConvertLayoutOp convertOp);

+  LogicalResult getRematerializableSlice(
+      Value root, Attribute rootEncoding, SetVector<Value> &slice,
+      DenseMap<Value, Attribute> &layout,
+      std::function<bool(Operation *)> stopPropagation = nullptr);
+
 private:
   void updateRematMapping(SmallVector<std::tuple<Value, Value>> &values);
   // Existing tuples of (value, layout) that needs to be updated when recreating
@@ -157,6 +158,21 @@ void LayoutRematerialization::addRematValue(Value old, Attribute encoding,
   mappedValues[old] = encoding;
 }

+Value LayoutRematerialization::getRematValue(Value value, Attribute encoding,
+                                             Value root) const {
+  Value remat = rematMapping.lookup({value, encoding});
+  if (!remat)
+    return {};
+  // If the remat'ed value is a conversion result, make sure it is different
+  // than the root of the one we're looking at.
+  if (auto cvt = remat.getDefiningOp<ConvertLayoutOp>()) {
+    if (cvt.getSrc() == root)
+      return {};
+  }
+  // This remat'ed value can be reused.
+  return remat;
+}
+
 // Remove unneeded values now that we are done with the rematMapping.
 void LayoutRematerialization::cleanup() {
   for (Operation *op : llvm::reverse(opToDelete))
@@ -766,8 +782,8 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
     auto layoutIt = layout.find(v);
     assert(layoutIt != layout.end());
     // If we already have a remat value for this value, use it.
-    if (hasRematValue(v, layoutIt->second)) {
-      mapping.map(v, getRematValue(v, layoutIt->second));
+    if (Value remat = getRematValue(v, layoutIt->second, convertOp.getSrc())) {
+      mapping.map(v, remat);
       valuesWithExistingRemat.insert(v);
       continue;
     }
@@ -928,12 +944,17 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
   rewriteSlice(slice, layout, convertOp, mapping);
 }

-LogicalResult getRematerializableSlice(
+LogicalResult LayoutRematerialization::getRematerializableSlice(
     Value root, Attribute rootEncoding, SetVector<Value> &slice,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation = nullptr) {
-  LogicalResult result = getConvertBackwardSlice(root, slice, rootEncoding,
-                                                 layout, stopPropagation);
+    std::function<bool(Operation *)> stopPropagation) {
+  // Allow re-using existing conversions for a value.
+  auto getExistingConversion = [&](Value value, Attribute encoding) -> Value {
+    return getRematValue(value, encoding, root);
+  };
+  LogicalResult result =
+      getConvertBackwardSlice(root, slice, rootEncoding, layout,
+                              stopPropagation, getExistingConversion);
   if (result.failed() || slice.empty())
     return failure();

@@ -950,8 +971,14 @@ LogicalResult getRematerializableSlice(
 void LayoutRematerialization::backwardRematerialization() {
   // Go through each ConvertLayoutOp.
   SmallVector<ConvertLayoutOp> convertOps;
-  funcOp.walk(
-      [&](ConvertLayoutOp convertOp) { convertOps.push_back(convertOp); });
+  funcOp.walk([&](ConvertLayoutOp convertOp) {
+    convertOps.push_back(convertOp);
+    // Add existing layout conversions as rematerializations of themselves. This
+    // enables rematerialization of other conversions to re-use existing
+    // conversions. Importantly, don't add them to `mappedValues`.
+    rematMapping.insert(
+        {{convertOp.getSrc(), convertOp.getType().getEncoding()}, convertOp});
+  });
   for (ConvertLayoutOp convertOp : convertOps) {
     backwardRematerialization(convertOp);
   }
@@ -976,14 +1003,13 @@ void LayoutRematerialization::backwardRematerialization(
   // careful with the heuristics for both correctness and perf
   if (isa<DotOperandEncodingAttr, LinearEncodingAttr>(targetType.getEncoding()))
     return;
-  Value oldV = convertOp->getOperand(0);
+  Value oldV = convertOp.getSrc();
   LDBG("check backward remat with source " << oldV << " encoding "
                                             << targetType.getEncoding());
   // Check to see if there are existing remat'ed values for the pair of oldValue
   // and encoding.
-  if (hasRematValue(oldV, targetType.getEncoding())) {
+  if (Value newV = getRematValue(oldV, targetType.getEncoding(), oldV)) {
     // Replace it with the remat'ed value.
-    Value newV = getRematValue(oldV, targetType.getEncoding());
     convertOp.replaceAllUsesWith(newV);
     opToDelete.insert(convertOp);
     LDBG("found remat'ed value" << newV);

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 11 additions & 5 deletions
@@ -757,11 +757,11 @@ static bool isFreeConvert(Operation *op) {
                               convertOp.getType());
 }

-LogicalResult
-getConvertBackwardSlice(Value root, SetVector<Value> &slice,
-                        Attribute rootEncoding,
-                        DenseMap<Value, Attribute> &layout,
-                        std::function<bool(Operation *)> stopPropagation) {
+LogicalResult getConvertBackwardSlice(
+    Value root, SetVector<Value> &slice, Attribute rootEncoding,
+    DenseMap<Value, Attribute> &layout,
+    std::function<bool(Operation *)> stopPropagation,
+    std::function<Value(Value, Attribute)> getExistingConversion) {
   DenseSet<std::pair<Value, Attribute>> seen;
   SmallVector<std::pair<Value, Attribute>> queue;

@@ -802,6 +802,12 @@ getConvertBackwardSlice(Value root, SetVector<Value> &slice,

       continue;
     }
+    Value existing;
+    if (getExistingConversion &&
+        (existing = getExistingConversion(currentValue, encoding))) {
+      enqueue(existing, encoding);
+      continue;
+    }
     if (auto *definingOp = currentValue.getDefiningOp()) {
       // If the op has multiple results we need to update all results layout.
       for (Value result : definingOp->getResults()) {
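A hedged sketch of how a caller might use the new optional parameter (the wrapper function and the `existing` map are illustrative assumptions, not part of this commit; RemoveLayoutConversions.cpp instead wires the callback to its remat map, as shown above):

// Sketch: let the backward-slice walk reuse already-materialized conversions
// instead of always scheduling new ones.
LogicalResult buildSliceReusingConversions(
    Value root, Attribute rootEncoding, SetVector<Value> &slice,
    DenseMap<Value, Attribute> &layout,
    const DenseMap<std::pair<Value, Attribute>, Value> &existing) {
  auto getExistingConversion = [&](Value v, Attribute enc) -> Value {
    return existing.lookup({v, enc});  // null Value when nothing to reuse
  };
  return getConvertBackwardSlice(root, slice, rootEncoding, layout,
                                 /*stopPropagation=*/nullptr,
                                 getExistingConversion);
}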

lib/Tools/LinearLayout.cpp

Lines changed: 5 additions & 1 deletion
@@ -957,9 +957,13 @@ LinearLayout LinearLayout::invertAndCompose(const LinearLayout &outer) const {
 }

 LinearLayout LinearLayout::invert() const {
-  // A^-1(x) = A^-1(I(x)), thus A.invert() = I.invertAndCompose(A)
   assert(isInvertible() &&
          "A linear layout must be surjective and square to be invertible");
+  return pseudoinvert();
+}
+
+LinearLayout LinearLayout::pseudoinvert() const {
+  // A^-1(x) = A^-1(I(x)), thus A.invert() = I.invertAndCompose(A)
   LinearLayout identity = LinearLayout::empty();
   for (auto outDim : getOutDimNames()) {
     identity *= LinearLayout::identity1D(getOutDimSize(outDim), outDim, outDim);
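A brief hedged sanity check on the refactor (illustrative only; the dimension name "register" and the helper function are assumptions):

// Sketch: for an already-invertible layout, invert() and pseudoinvert() agree,
// so existing callers of invert() keep their behavior; only the assertion
// stays in invert().
void checkInvertMatchesPseudoinvert(MLIRContext *ctx) {
  auto dim = StringAttr::get(ctx, "register");   // assumed dimension name
  LinearLayout id = LinearLayout::identity1D(8, dim, dim);
  assert(id.isInvertible());
  assert(id.invert() == id.pseudoinvert());
}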

python/test/unit/runtime/test_cache.py

Lines changed: 20 additions & 20 deletions
@@ -199,7 +199,7 @@ def kernel(X, i: tl.int32):
     kernel[(1, )](x, 8)
     kernel[(1, )](x, 16)
     kernel[(1, )](x, 17)
-    assert len(kernel.cache[device]) == 3
+    assert len(kernel.device_caches[device][0]) == 3


 GLOBAL_DEFAULT_ARG = 1
@@ -223,7 +223,7 @@ def kernel(X, i: tl.constexpr = GLOBAL_DEFAULT_ARG):
     assert x == torch.ones_like(x)

     device = getattr(torch, device).current_device()
-    assert len(kernel.cache[device]) == 1
+    assert len(kernel.device_caches[device][0]) == 1


 GLOBAL_VAR: tl.constexpr = 1
@@ -416,13 +416,13 @@ def kernel_add(a, b, o, N: tl.constexpr):
         32,
     ]
     device = getattr(torch, device).current_device()
-    assert len(kernel_add.cache[device]) == 0
+    assert len(kernel_add.device_caches[device][0]) == 0
     kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1
     kernel_add.warmup(*args, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1
     kernel_add.warmup(*args, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1


 def test_jit_debug(device) -> None:
@@ -433,12 +433,12 @@ def kernel(tmp):

     device = getattr(torch, device).current_device()
     tmp = torch.tensor([1], dtype=torch.int32, device=device)
-    assert len(kernel.cache[device]) == 0
+    assert len(kernel.device_caches[device][0]) == 0
     kernel[(1, )](tmp, debug=False)
-    assert len(kernel.cache[device]) == 1
+    assert len(kernel.device_caches[device][0]) == 1
     kernel[(1, )](tmp, debug=True)
-    assert len(kernel.cache[device]) == 2
-    bins = list(kernel.cache[device].values())
+    assert len(kernel.device_caches[device][0]) == 2
+    bins = list(kernel.device_caches[device][0].values())
     assert bins[0].asm['ttir'] != bins[1].asm['ttir']


@@ -455,18 +455,18 @@ def kernel_add_device(a, b, o, N: tl.constexpr):
         add_fn(a, b, o, N)

     device = getattr(torch, device).current_device()
-    assert len(kernel_add_device.cache[device]) == 0
+    assert len(kernel_add_device.device_caches[device][0]) == 0
     kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add_device.cache[device]) == 1
-    bins = list(kernel_add_device.cache[device].values())
+    assert len(kernel_add_device.device_caches[device][0]) == 1
+    bins = list(kernel_add_device.device_caches[device][0].values())
     inline_ttir = bins[0].asm['ttir']
     add_fn.noinline = True
     add_fn.hash = None
     kernel_add_device.hash = None
-    kernel_add_device.cache[device].clear()
+    kernel_add_device.device_caches[device][0].clear()
     kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add_device.cache[device]) == 1
-    bins = list(kernel_add_device.cache[device].values())
+    assert len(kernel_add_device.device_caches[device][0]) == 1
+    bins = list(kernel_add_device.device_caches[device][0].values())
     noinline_ttir = bins[0].asm['ttir']
     assert inline_ttir != noinline_ttir

@@ -514,12 +514,12 @@ def cache_hook(*args, **kwargs):

     # clear the cache
     shutil.rmtree(fresh_triton_cache)
-    kernel_add.cache[device].clear()
+    kernel_add.device_caches[device][0].clear()

     # preload the kernel
     kernel_preload = kernel_add.preload(specialization_data)
     assert kernel_preload.hash == hash
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1

     # we should hit the cache and not compile anything
     counter = 0
@@ -532,7 +532,7 @@ def inc_counter(*args, **kwargs):
     final_kernel = kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, tl.float32, grid=(1, ))
     JITFunction.cache_hook = None
     assert counter == 0
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1
     assert final_kernel.hash == hash

     # test that we can't preload a mismatched kernel
@@ -572,7 +572,7 @@ def compiled_hook(*args, **kwargs):
     kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, tl.float32, grid=(1, ))
     assert specialization_data is not None and specialization_data_compiled == specialization_data
     assert is_warmup is True
-    assert key in kernel_add.cache[getattr(torch, device).current_device()]
+    assert key in kernel_add.device_caches[getattr(torch, device).current_device()][0]


 @pytest.mark.skipif(reason="within_2g is a HIP specific optimization", condition=not is_hip())
