
Commit 3339986

Merge commit '82fec379583e72bd78d40d7cf3e980808669a428'
2 parents: ea006f2 + 82fec37

19 files changed: +862 lines, -66 lines

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
@@ -87,6 +87,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUAccelerateMatmul();
   mlir::registerTritonAMDGPUOptimizeEpilogue();
   mlir::registerTritonAMDGPUReorderInstructions();
+  mlir::registerTritonAMDGPUBlockPingpong();
   mlir::registerTritonAMDGPUStreamPipeline();
   mlir::registerTritonAMDGPUCanonicalizePointers();
   mlir::registerTritonAMDGPUConvertToBufferOps();

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 2 additions & 1 deletion
@@ -163,7 +163,8 @@ Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
 LogicalResult getConvertBackwardSlice(
     Value root, SetVector<Value> &slice, Attribute rootEncoding,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation = nullptr);
+    std::function<bool(Operation *)> stopPropagation = nullptr,
+    std::function<Value(Value, Attribute)> getExistingConversion = nullptr);

 // Populate pattern to remove dead cycles in ForOp.
 void populateForOpDeadArgumentElimination(RewritePatternSet &patterns);

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_DISABLE_RESHAPE_ENCODING_INFERENCE",
     "TRITON_ENABLE_LLVM_DEBUG",
     "TRITON_HIP_STREAM_PREFETCH",
+    "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_LLVM_DEBUG_ONLY",
     "USE_IR_LOC",
     "NVPTX_ENABLE_DUMP",

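Together with the registerTritonAMDGPUBlockPingpong registration above, this entry makes TRITON_HIP_USE_BLOCK_PINGPONG one of the environment variables that invalidate Triton's compilation cache, so flipping the flag cannot silently reuse stale binaries. A minimal sketch of how such a knob is typically driven from user code; the toy kernel and the assumption that the variable is read as a boolean flag are illustrative and not shown in this diff:

    import os

    # Assumption: like other TRITON_HIP_* knobs, this is read as a boolean flag by
    # the AMD backend. Set it before kernels are compiled so the cache key (which
    # now includes this variable) reflects it.
    os.environ["TRITON_HIP_USE_BLOCK_PINGPONG"] = "1"

    import torch
    import triton
    import triton.language as tl


    @triton.jit
    def copy_kernel(src_ptr, dst_ptr, N: tl.constexpr):  # hypothetical toy kernel
        offs = tl.arange(0, N)
        tl.store(dst_ptr + offs, tl.load(src_ptr + offs))


    if torch.cuda.is_available():  # ROCm builds of torch also expose torch.cuda
        src = torch.arange(128, device="cuda", dtype=torch.float32)
        dst = torch.empty_like(src)
        copy_kernel[(1, )](src, dst, N=128)

Because the variable is listed in CACHE_INVALIDATING_ENV_VARS, runs with the flag set and unset are compiled and cached separately.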
lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 46 additions & 20 deletions
@@ -116,17 +116,13 @@ class LayoutPropagation {
 class LayoutRematerialization {
 public:
   LayoutRematerialization(FuncOp F) : funcOp(F) {}
+
   // Map the original value to the remat'ed one.
   void addRematValue(Value old, Attribute encoding, Value newV);
-  bool hasRematValue(Value value, Attribute encoding) {
-    return rematMapping.contains({value, encoding});
-  }
-  // Return the remat'ed value in the given encoding.
-  Value getRematValue(Value value, Attribute encoding) {
-    auto it = rematMapping.find({value, encoding});
-    assert(it != rematMapping.end());
-    return it->second;
-  }
+  // Get the remat'ed value in the given encoding, if one already exists and
+  // is different then the layout conversion root.
+  Value getRematValue(Value value, Attribute encoding, Value root) const;
+
   void cleanup();
   void backwardRematerialization();
   void backwardRematerialization(ConvertLayoutOp convertOp);

@@ -137,6 +133,11 @@ class LayoutRematerialization {
   void rewriteSlice(SetVector<Value> &slice, DenseMap<Value, Attribute> &layout,
                     ConvertLayoutOp convertOp);

+  LogicalResult getRematerializableSlice(
+      Value root, Attribute rootEncoding, SetVector<Value> &slice,
+      DenseMap<Value, Attribute> &layout,
+      std::function<bool(Operation *)> stopPropagation = nullptr);
+
 private:
   void updateRematMapping(SmallVector<std::tuple<Value, Value>> &values);
   // Existing tuples of (value, layout) that needs to be updated when recreating

@@ -157,6 +158,21 @@ void LayoutRematerialization::addRematValue(Value old, Attribute encoding,
   mappedValues[old] = encoding;
 }

+Value LayoutRematerialization::getRematValue(Value value, Attribute encoding,
+                                             Value root) const {
+  Value remat = rematMapping.lookup({value, encoding});
+  if (!remat)
+    return {};
+  // If the remat'ed value is a conversion result, make sure it is different
+  // than the root of the one we're looking at.
+  if (auto cvt = remat.getDefiningOp<ConvertLayoutOp>()) {
+    if (cvt.getSrc() == root)
+      return {};
+  }
+  // This remat'ed value can be reused.
+  return remat;
+}
+
 // Remove unneeded values now that we are done with the rematMapping.
 void LayoutRematerialization::cleanup() {
   for (Operation *op : llvm::reverse(opToDelete))

@@ -766,8 +782,8 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
     auto layoutIt = layout.find(v);
     assert(layoutIt != layout.end());
     // If we already have a remat value for this value, use it.
-    if (hasRematValue(v, layoutIt->second)) {
-      mapping.map(v, getRematValue(v, layoutIt->second));
+    if (Value remat = getRematValue(v, layoutIt->second, convertOp.getSrc())) {
+      mapping.map(v, remat);
       valuesWithExistingRemat.insert(v);
       continue;
     }

@@ -928,12 +944,17 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice,
   rewriteSlice(slice, layout, convertOp, mapping);
 }

-LogicalResult getRematerializableSlice(
+LogicalResult LayoutRematerialization::getRematerializableSlice(
     Value root, Attribute rootEncoding, SetVector<Value> &slice,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation = nullptr) {
-  LogicalResult result = getConvertBackwardSlice(root, slice, rootEncoding,
-                                                 layout, stopPropagation);
+    std::function<bool(Operation *)> stopPropagation) {
+  // Allow re-using existing conversions for a value.
+  auto getExistingConversion = [&](Value value, Attribute encoding) -> Value {
+    return getRematValue(value, encoding, root);
+  };
+  LogicalResult result =
+      getConvertBackwardSlice(root, slice, rootEncoding, layout,
+                              stopPropagation, getExistingConversion);
   if (result.failed() || slice.empty())
     return failure();

@@ -950,8 +971,14 @@ LogicalResult getRematerializableSlice(
 void LayoutRematerialization::backwardRematerialization() {
   // Go through each ConvertLayoutOp.
   SmallVector<ConvertLayoutOp> convertOps;
-  funcOp.walk(
-      [&](ConvertLayoutOp convertOp) { convertOps.push_back(convertOp); });
+  funcOp.walk([&](ConvertLayoutOp convertOp) {
+    convertOps.push_back(convertOp);
+    // Add existing layout conversions as rematerializations of themselves. This
+    // enables rematerialization of other conversions to re-use existing
+    // conversions. Importantly, don't add them to `mappedValues`.
+    rematMapping.insert(
+        {{convertOp.getSrc(), convertOp.getType().getEncoding()}, convertOp});
+  });
   for (ConvertLayoutOp convertOp : convertOps) {
     backwardRematerialization(convertOp);
   }

@@ -976,14 +1003,13 @@ void LayoutRematerialization::backwardRematerialization(
   // careful with the heuristics for both correctness and perf
   if (isa<DotOperandEncodingAttr, LinearEncodingAttr>(targetType.getEncoding()))
     return;
-  Value oldV = convertOp->getOperand(0);
+  Value oldV = convertOp.getSrc();
   LDBG("check backward remat with source " << oldV << " encoding "
                                            << targetType.getEncoding());
   // Check to see if there are existing remat'ed values for the pair of oldValue
   // and encoding.
-  if (hasRematValue(oldV, targetType.getEncoding())) {
+  if (Value newV = getRematValue(oldV, targetType.getEncoding(), oldV)) {
     // Replace it with the remat'ed value.
-    Value newV = getRematValue(oldV, targetType.getEncoding());
     convertOp.replaceAllUsesWith(newV);
     opToDelete.insert(convertOp);
     LDBG("found remat'ed value" << newV);

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 11 additions & 5 deletions
@@ -757,11 +757,11 @@ static bool isFreeConvert(Operation *op) {
                                convertOp.getType());
 }

-LogicalResult
-getConvertBackwardSlice(Value root, SetVector<Value> &slice,
-                        Attribute rootEncoding,
-                        DenseMap<Value, Attribute> &layout,
-                        std::function<bool(Operation *)> stopPropagation) {
+LogicalResult getConvertBackwardSlice(
+    Value root, SetVector<Value> &slice, Attribute rootEncoding,
+    DenseMap<Value, Attribute> &layout,
+    std::function<bool(Operation *)> stopPropagation,
+    std::function<Value(Value, Attribute)> getExistingConversion) {
   DenseSet<std::pair<Value, Attribute>> seen;
   SmallVector<std::pair<Value, Attribute>> queue;

@@ -802,6 +802,12 @@ getConvertBackwardSlice(Value root, SetVector<Value> &slice,

       continue;
     }
+    Value existing;
+    if (getExistingConversion &&
+        (existing = getExistingConversion(currentValue, encoding))) {
+      enqueue(existing, encoding);
+      continue;
+    }
     if (auto *definingOp = currentValue.getDefiningOp()) {
       // If the op has multiple results we need to update all results layout.
       for (Value result : definingOp->getResults()) {

python/test/unit/runtime/test_cache.py

Lines changed: 20 additions & 20 deletions
@@ -199,7 +199,7 @@ def kernel(X, i: tl.int32):
     kernel[(1, )](x, 8)
     kernel[(1, )](x, 16)
     kernel[(1, )](x, 17)
-    assert len(kernel.cache[device]) == 3
+    assert len(kernel.device_caches[device][0]) == 3


 GLOBAL_DEFAULT_ARG = 1

@@ -223,7 +223,7 @@ def kernel(X, i: tl.constexpr = GLOBAL_DEFAULT_ARG):
     assert x == torch.ones_like(x)

     device = getattr(torch, device).current_device()
-    assert len(kernel.cache[device]) == 1
+    assert len(kernel.device_caches[device][0]) == 1


 GLOBAL_VAR: tl.constexpr = 1

@@ -416,13 +416,13 @@ def kernel_add(a, b, o, N: tl.constexpr):
         32,
     ]
     device = getattr(torch, device).current_device()
-    assert len(kernel_add.cache[device]) == 0
+    assert len(kernel_add.device_caches[device][0]) == 0
     kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1
     kernel_add.warmup(*args, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1
     kernel_add.warmup(*args, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1


 def test_jit_debug(device) -> None:

@@ -433,12 +433,12 @@ def kernel(tmp):

     device = getattr(torch, device).current_device()
     tmp = torch.tensor([1], dtype=torch.int32, device=device)
-    assert len(kernel.cache[device]) == 0
+    assert len(kernel.device_caches[device][0]) == 0
     kernel[(1, )](tmp, debug=False)
-    assert len(kernel.cache[device]) == 1
+    assert len(kernel.device_caches[device][0]) == 1
     kernel[(1, )](tmp, debug=True)
-    assert len(kernel.cache[device]) == 2
-    bins = list(kernel.cache[device].values())
+    assert len(kernel.device_caches[device][0]) == 2
+    bins = list(kernel.device_caches[device][0].values())
     assert bins[0].asm['ttir'] != bins[1].asm['ttir']


@@ -455,18 +455,18 @@ def kernel_add_device(a, b, o, N: tl.constexpr):
         add_fn(a, b, o, N)

     device = getattr(torch, device).current_device()
-    assert len(kernel_add_device.cache[device]) == 0
+    assert len(kernel_add_device.device_caches[device][0]) == 0
     kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add_device.cache[device]) == 1
-    bins = list(kernel_add_device.cache[device].values())
+    assert len(kernel_add_device.device_caches[device][0]) == 1
+    bins = list(kernel_add_device.device_caches[device][0].values())
     inline_ttir = bins[0].asm['ttir']
     add_fn.noinline = True
     add_fn.hash = None
     kernel_add_device.hash = None
-    kernel_add_device.cache[device].clear()
+    kernel_add_device.device_caches[device][0].clear()
     kernel_add_device.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add_device.cache[device]) == 1
-    bins = list(kernel_add_device.cache[device].values())
+    assert len(kernel_add_device.device_caches[device][0]) == 1
+    bins = list(kernel_add_device.device_caches[device][0].values())
     noinline_ttir = bins[0].asm['ttir']
     assert inline_ttir != noinline_ttir

@@ -514,12 +514,12 @@ def cache_hook(*args, **kwargs):

     # clear the cache
     shutil.rmtree(fresh_triton_cache)
-    kernel_add.cache[device].clear()
+    kernel_add.device_caches[device][0].clear()

     # preload the kernel
     kernel_preload = kernel_add.preload(specialization_data)
     assert kernel_preload.hash == hash
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1

     # we should hit the cache and not compile anything
     counter = 0

@@ -532,7 +532,7 @@ def inc_counter(*args, **kwargs):
     final_kernel = kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, tl.float32, grid=(1, ))
     JITFunction.cache_hook = None
     assert counter == 0
-    assert len(kernel_add.cache[device]) == 1
+    assert len(kernel_add.device_caches[device][0]) == 1
     assert final_kernel.hash == hash

     # test that we can't preload a mismatched kernel

@@ -572,7 +572,7 @@ def compiled_hook(*args, **kwargs):
     kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, tl.float32, grid=(1, ))
     assert specialization_data is not None and specialization_data_compiled == specialization_data
     assert is_warmup is True
-    assert key in kernel_add.cache[getattr(torch, device).current_device()]
+    assert key in kernel_add.device_caches[getattr(torch, device).current_device()][0]


 @pytest.mark.skipif(reason="within_2g is a HIP specific optimization", condition=not is_hip())

python/triton/runtime/jit.py

Lines changed: 16 additions & 16 deletions
@@ -544,47 +544,49 @@ def add_pre_run_hook(self, hook):
         assert callable(hook)
         self.pre_run_hooks.append(hook)

-    def create_binder(self, backend):
+    def create_binder(self):
         """
         Precompute as much as possible.
         """
         from ..compiler import CompiledKernel, compile, ASTSource, make_backend
+        target = driver.active.get_current_target()
+        backend = make_backend(target)
         self.CompiledKernel = CompiledKernel
         self.compile = compile
         self.ASTSource = ASTSource
-        self.make_backend = make_backend
-        self.binder = create_function_from_signature(self.signature, self.params, backend)
+        binder = create_function_from_signature(self.signature, self.params, backend)
         self.constexpr_indices = [i for (i, p) in enumerate(self.params) if p.is_constexpr]
         self.non_constexpr_indices = [i for (i, p) in enumerate(self.params) if not p.is_constexpr]
         self.specialised_indices = [
             i for (i, p) in enumerate(self.params) if (not p.do_not_specialize) and (not p.is_constexpr)
         ]
+        return [target, backend, binder]

     def run(self, *args, grid, warmup, **kwargs):
         kwargs["debug"] = kwargs.get("debug", self.debug) or os.environ.get("TRITON_DEBUG", "0") == "1"

         # parse options
-        from ..compiler import make_backend
         device = driver.active.get_current_device()
         stream = driver.active.get_current_stream(device)
-        target = driver.active.get_current_target()
-        backend = make_backend(target)

         # Execute pre run hooks with args and kwargs
         for hook in self.pre_run_hooks:
             hook(*args, **kwargs)

-        if self.binder is None:
-            self.create_binder(backend)
-
-        bound_args, sig_and_spec, constexpr_vals, non_constexpr_vals, excess_kwargs = self.binder(*args, **kwargs)
+        # This is a length-4 list [kernel_cache, target, backend, binder]:
+        device_cache = self.device_caches[device]
+        if len(device_cache) == 1:
+            device_cache[1:] = self.create_binder()
+        bound_args, sig_and_spec, constexpr_vals, non_constexpr_vals, excess_kwargs = device_cache[3](*args, **kwargs)

         # compute cache key
         key = ''.join(sig_and_spec) + str((constexpr_vals, excess_kwargs))
-        kernel = self.cache[device].get(key, None)
+        kernel = device_cache[0].get(key, None)

         if kernel is None:
             # Kernel is not cached; we have to compile.
+            target = device_cache[1]
+            backend = device_cache[2]
             options = backend.parse_options(kwargs)

             # deprecated arguments

@@ -625,7 +627,7 @@ def run(self, *args, grid, warmup, **kwargs):
                 target=target,
                 options=options.__dict__,
             )
-            self.cache[device][key] = kernel
+            device_cache[0][key] = kernel
             self._call_hook(key, signature, device, constants, options, configs, warmup, before=False)

         # Check that used global values have not changed.

@@ -669,8 +671,6 @@ def __init__(self, fn, version=None, do_not_specialize=None, do_not_specialize_o
         self.repr = lambda _: fn.__name__ if repr is None else repr(_)
         self.launch_metadata = launch_metadata

-        self.binder = None
-
         self.params = []
         for i, param in enumerate(self.signature.parameters.values()):
             dns = i in do_not_specialize or param.name in do_not_specialize

@@ -681,7 +681,7 @@ def __init__(self, fn, version=None, do_not_specialize=None, do_not_specialize_o
         self.src = textwrap.dedent(inspect.getsource(fn))
         self.src = self.src[re.search(r"^def\s+\w+\s*\(", self.src, re.MULTILINE).start():]
         # cache of just-in-time compiled kernels
-        self.cache = defaultdict(dict)
+        self.device_caches = defaultdict(lambda: [{}])
         self.hash = None

         # Map of global variables used by the function and any functions it

@@ -750,7 +750,7 @@ def preload(self, specialization_data):
         }
         key = deserialized_obj['key']
         kernel = compile(src, None, options)
-        self.cache[device][key] = kernel
+        self.device_caches[device][0][key] = kernel
         return kernel

     # we do not parse `src` in the constructor because
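
In short, jit.py replaces the flat per-device dict self.cache[device] with self.device_caches[device], a list that starts as [kernel_cache] and is lazily extended to [kernel_cache, target, backend, binder] on the first run() for that device, so the target, backend, and argument binder are built once per device rather than on every launch. A standalone sketch of that lazy-initialization pattern; ToyJITFunction and the make_target/make_backend/make_binder helpers are placeholders, while the real code uses driver.active, make_backend, and create_function_from_signature as shown in the diff above:

    from collections import defaultdict

    # Placeholder stand-ins for the real driver/backend/binder machinery.
    def make_target(device):
        return f"target-for-{device}"

    def make_backend(target):
        return f"backend-for-{target}"

    def make_binder(backend):
        # The real binder parses arguments into a specialization key; here we
        # simply echo the arguments and use their repr as the key.
        return lambda *args: (args, str(args))

    class ToyJITFunction:
        def __init__(self):
            # One list per device: [kernel_cache] grows to
            # [kernel_cache, target, backend, binder] on first use.
            self.device_caches = defaultdict(lambda: [{}])

        def create_binder(self, device):
            target = make_target(device)
            backend = make_backend(target)
            binder = make_binder(backend)
            return [target, backend, binder]

        def run(self, device, *args):
            device_cache = self.device_caches[device]
            if len(device_cache) == 1:  # binder not yet created for this device
                device_cache[1:] = self.create_binder(device)
            bound_args, key = device_cache[3](*args)
            kernel = device_cache[0].get(key)
            if kernel is None:
                kernel = f"compiled({bound_args})"  # stand-in for backend compilation
                device_cache[0][key] = kernel
            return kernel

    fn = ToyJITFunction()
    print(fn.run(0, 1.0, 2.0))  # compiles
    print(fn.run(0, 1.0, 2.0))  # cache hit

The test_cache.py updates above follow directly from this layout: the kernel cache is element 0 of the per-device list, hence kernel.device_caches[device][0].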

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ set(TRITON_TEST_DEPENDS
   triton-opt
   triton-tensor-layout
   triton-translate
+  triton-llvm-opt
 )

 set(FILECHECK_PATH "${LLVM_LIBRARY_DIR}/../bin/FileCheck")
