Commit 184fb53

[AUTOTUNER] Fix issue in autotuner which may use the wrong value as the key of config cache. (#4808)
The autotuner uses the index of the arguments in the Triton kernel signature to look up the value to be used as the key for the config cache. There is an issue if the user passes the kernel arguments as keyword args in arbitrary order: the name of the argument should be used to look up the value of the args passed by the user instead of the `key_idx`. This prevents the autotuner from using a mismatched value as the key for caching when the arguments are passed as keyword args in arbitrary order.

- [X] I am not making a trivial change, such as fixing a typo in a comment.
- [X] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [X] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [X] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [X] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
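To illustrate the behaviour described above, here is a minimal sketch (not the autotuner's actual code; the argument names and values are made up) contrasting an index-based lookup with the name-based lookup this change switches to. An index into a filtered positional list is only meaningful when that list lines up with the kernel signature, whereas looking the value up by name works for any combination and order of keyword arguments.

```python
# Hypothetical example: `arg_names` mimics a kernel signature,
# `keys` mimics triton.autotune(key=['M']).
arg_names = ["dst", "src", "stride_m", "M", "BLOCK_SIZE_N", "BLOCK_SIZE_M"]
keys = ["M"]


def key_by_index(all_args):
    # Index-based approach: positions taken from the kernel signature.
    key_idx = [arg_names.index(k) for k in keys]
    _args = [all_args[name] for name in arg_names if name in all_args]
    # _args only lines up with key_idx when every signature argument is
    # present in the call; otherwise the index points at the wrong value.
    return tuple(_args[i] for i in key_idx)


def key_by_name(all_args):
    # Name-based approach: look the value up by argument name.
    _args = {k: v for k, v in all_args.items() if k in arg_names}
    return tuple(_args[k] for k in keys if k in _args)


# Keyword arguments in arbitrary order; stride_m and BLOCK_SIZE_M are not
# passed by the caller here (e.g. supplied by the config), so the filtered
# positional list shifts relative to the signature.
call = {"BLOCK_SIZE_N": 16, "M": 512, "src": "src_ptr", "dst": "dst_ptr"}
print(key_by_index(call))  # (16,)  -- BLOCK_SIZE_N's value, not M's
print(key_by_name(call))   # (512,) -- the value of M, as intended
```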
1 parent 1b0f9ea commit 184fb53

File tree

2 files changed, +20 -20 lines changed


python/test/unit/runtime/test_autotuner.py

Lines changed: 16 additions & 13 deletions
@@ -7,22 +7,25 @@

 @pytest.mark.parametrize('use_cuda_graph', [False, True])
 def test_kwargs(use_cuda_graph: bool, device: str):
-    N = 1024
-    src = torch.randn(N, device=device)
-    dst = torch.empty(N, device=device)
+    M, N = 1024, 16
+    src = torch.randn(M * N, device=device)
+    dst = torch.empty(M * N, device=device)

-    configs = [triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})]
+    configs = [triton.Config(kwargs={'BLOCK_SIZE_M': 32}), triton.Config(kwargs={'BLOCK_SIZE_M': 128})]

-    @triton.autotune(configs=configs, key=['N'], warmup=1, rep=1, use_cuda_graph=use_cuda_graph)
+    @triton.autotune(configs=configs, key=['M'], warmup=1, rep=1, use_cuda_graph=use_cuda_graph)
     @triton.jit
-    def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):
-        offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
-        x = tl.load(src + offsets, mask=offsets < N)
-        tl.store(dst + offsets, x, mask=offsets < N)
-
-    grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE']), )
-    _kernel[grid](dst, src, N)
-    _kernel[grid](dst=dst, src=src, N=N)
+    def _kernel(dst, src, stride_m: tl.constexpr, M, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_M: tl.constexpr):
+        offsets_m = tl.program_id(0) * stride_m + tl.arange(0, BLOCK_SIZE_M)
+        offsets_n = tl.arange(0, BLOCK_SIZE_N)
+        x = tl.load(src + offsets_m[:, None] * BLOCK_SIZE_N + offsets_n[None, :])
+        tl.store(dst + offsets_m[:, None] * BLOCK_SIZE_N + offsets_n[None, :], x)
+
+    grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE_M']), )
+    _kernel[grid](dst, src, N, M, N)
+    # the key word args could be in arbitrary order.
+    _kernel[grid](dst=dst, src=src, M=M // 2, stride_m=N, BLOCK_SIZE_N=N)
+    assert len(_kernel.cache) == 2


 def test_restore(device):

python/triton/runtime/autotuner.py

Lines changed: 4 additions & 7 deletions
@@ -38,7 +38,7 @@ def __init__(
             self.configs = [Config({}, num_warps=4, num_stages=2, num_ctas=1)]
         else:
             self.configs = configs
-        self.key_idx = [arg_names.index(k) for k in key]
+        self.keys = key
         self.cache = {}
         self.arg_names = arg_names

@@ -136,12 +136,9 @@ def run(self, *args, **kwargs):
         used_cached_result = True
         if len(self.configs) > 1:
             all_args = {**self.nargs, **kwargs}
-            _args = []
-            for name in self.arg_names:
-                if name in all_args:
-                    _args.append(all_args[name])
-            key = [_args[i] for i in self.key_idx]
-            for arg in _args:
+            _args = {k: v for (k, v) in all_args.items() if k in self.arg_names}
+            key = [_args[key] for key in self.keys if key in _args]
+            for _, arg in _args.items():
                 if hasattr(arg, "dtype"):
                     key.append(str(arg.dtype))
             key = tuple(key)
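For reference, a small sketch (assumed argument names and values, mirroring the lines added above) of how the new name-based key construction behaves: the same logical call produces the same config-cache key whether its arguments arrive positionally or as keyword arguments in a shuffled order.

```python
import torch

# Hypothetical signature and autotune key, mirroring the diff above.
arg_names = ["dst", "src", "stride_m", "M", "BLOCK_SIZE_N", "BLOCK_SIZE_M"]
keys = ["M"]


def cache_key(all_args):
    # Same shape as the new code in Autotuner.run: filter to known argument
    # names, pick key values by name, then append tensor dtypes.
    _args = {k: v for (k, v) in all_args.items() if k in arg_names}
    key = [_args[k] for k in keys if k in _args]
    for _, arg in _args.items():
        if hasattr(arg, "dtype"):
            key.append(str(arg.dtype))
    return tuple(key)


dst, src = torch.empty(4), torch.empty(4)
# Positional-style call (signature order) vs. keyword args in arbitrary order.
positional = dict(zip(arg_names, [dst, src, 16, 1024, 16]))
keyword = {"BLOCK_SIZE_N": 16, "src": src, "M": 1024, "dst": dst, "stride_m": 16}
assert cache_key(positional) == cache_key(keyword)  # same cache entry either way
```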
