
Commit 6a9a0a6

feat: add return all for do_bench (intel#4493)
Since H100s have power throttling that depends on the kernel, it is important to see how the TFLOPs change over time. I have had this patch in my internal codebase and found it useful for seeing the cyclic patterns of different kernels and how long it takes to reach a steady state.

![image](https://github.com/user-attachments/assets/ff77edea-8f61-446a-8afe-023c25933fe9)

Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them.

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [ ] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because `do_bench` does not have unit tests LOL.
- Select one of the following.
  - [ ] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
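A minimal usage sketch of the mode this commit adds, assuming the patch is applied; the matmul workload, the sizes, the `rep` value, and the FLOP formula below are illustrative assumptions, not part of the commit:

```python
import torch
import triton.testing as testing

# Hypothetical workload: a large fp16 matmul, run long enough to hit throttling.
M = N = K = 4096
a = torch.randn(M, K, device="cuda", dtype=torch.float16)
b = torch.randn(K, N, device="cuda", dtype=torch.float16)

# With this patch, return_mode="all" returns every per-iteration time in ms
# instead of a single reduced statistic.
times_ms = testing.do_bench(lambda: torch.matmul(a, b), rep=1000, return_mode="all")

# Convert each sample to TFLOP/s to visualize throttling over wall-clock time.
flops = 2 * M * N * K
tflops = [flops / (t * 1e-3) / 1e12 for t in times_ms]
```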
1 parent 23bf0e0 commit 6a9a0a6

File tree: 1 file changed (+6 −5 lines)


python/triton/testing.py

Lines changed: 6 additions & 5 deletions
```diff
@@ -23,6 +23,8 @@ def _summarize_statistics(times, quantiles, return_mode):
         if len(ret) == 1:
             ret = ret[0]
         return ret
+    if return_mode == "all":
+        return times.tolist()
     return getattr(torch, return_mode)(times).item()
 
 
@@ -36,11 +38,11 @@ def do_bench_cudagraph(fn, rep=20, grad_to_none=None, quantiles=None, return_mod
     :type rep: int
     :param grad_to_none: Reset the gradient of the provided tensor to None
     :type grad_to_none: torch.tensor, optional
-    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", or "median". Default is "mean".
+    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
     :type return_mode: str
     """
     import torch
-    assert return_mode in ["min", "max", "mean", "median"]
+    assert return_mode in ["min", "max", "mean", "median", "all"]
 
     with torch.cuda.stream(torch.cuda.Stream()):
         # warmup
@@ -107,10 +109,9 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flu
     :type quantiles: list[float], optional
     :param fast_flush: Use faster kernel to flush L2 cache between measurements
     :type fast_flush: bool, default is True
-    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", or "median". Default is "mean".
-    :type return_mode: str
+    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean". :type return_mode: str
     """
-    assert return_mode in ["min", "max", "mean", "median"]
+    assert return_mode in ["min", "max", "mean", "median", "all"]
     import torch
 
     di = torch._dynamo.device_interface.get_interface_for_device(device_type)
```
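To make the new branch in `_summarize_statistics` concrete, here is a standalone sketch of the dispatch logic it implements; the helper name `summarize` and the sample values are illustrative, not taken from the commit:

```python
import torch

times = torch.tensor([1.25, 1.0, 1.5, 1.25])  # per-iteration times in ms

def summarize(times, return_mode):
    # New in this commit: "all" bypasses the torch reduction and hands back
    # the raw per-iteration samples as a Python list.
    if return_mode == "all":
        return times.tolist()
    # Existing behavior: reduce via torch.min / torch.max / torch.mean / torch.median.
    return getattr(torch, return_mode)(times).item()

print(summarize(times, "mean"))  # 1.25
print(summarize(times, "all"))   # [1.25, 1.0, 1.5, 1.25]
```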
