Commit 8e43e71

[AUTOTUNER] Don't cache benchmarking stream (intel#3993)
This caching seems to be responsible for some CUDA OOMs we encountered in Meta-internal builds. I haven't got a reduced repro, but this change does seem to fix things. My hypothesis is that the cached stream is causing the memory allocated for the graph to be retained.
1 parent e9480e1 commit 8e43e71

File tree

1 file changed: +1 addition, -2 deletions
python/triton/runtime/autotuner.py

Lines changed: 1 addition & 2 deletions
@@ -92,7 +92,6 @@ def _post_hook(args, exception):
         self.num_reps = rep
         import torch
         self.use_cuda_graph = use_cuda_graph and torch.cuda.is_available()
-        self.benchmarkig_stream = torch.cuda.Stream() if self.use_cuda_graph else None
 
     def _bench(self, *args, config, **meta):
         from ..compiler.errors import CompileTimeAssertionFailure
@@ -128,7 +127,7 @@ def kernel_call():
         try:
             if self.use_cuda_graph:
                 import torch
-                with torch.cuda.stream(self.benchmarkig_stream):
+                with torch.cuda.stream(torch.cuda.Stream()):
                     bench_res = do_bench_cudagraph(kernel_call, rep=self.num_reps, return_mode="median")
                 return bench_res
             return do_bench(kernel_call, warmup=self.num_warmups, rep=self.num_reps, quantiles=(0.5, 0.2, 0.8))
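For context, a minimal sketch of the pattern this change adopts is below: a throwaway torch.cuda.Stream() is created for each benchmarking call instead of being cached as an attribute on the tuner, so nothing long-lived pins the captured CUDA graph's memory. The helper name benchmark_under_fresh_stream and the torch.add workload are illustrative stand-ins, not part of the commit.

# A minimal sketch (not part of the commit) of the pattern adopted here:
# build a throwaway CUDA stream per benchmarking call rather than caching
# one on the autotuner, so no long-lived attribute keeps the captured
# graph's memory alive.
import torch
from triton.testing import do_bench_cudagraph


def benchmark_under_fresh_stream(kernel_call, rep=100):
    # The stream is constructed here and dropped when the function returns;
    # nothing equivalent to the old self.benchmarkig_stream attribute
    # survives past the benchmark.
    with torch.cuda.stream(torch.cuda.Stream()):
        return do_bench_cudagraph(kernel_call, rep=rep, return_mode="median")


if torch.cuda.is_available():
    x = torch.randn(1 << 20, device="cuda")
    y = torch.randn_like(x)
    # Stand-in workload; in the autotuner this would be the compiled Triton kernel launch.
    print(benchmark_under_fresh_stream(lambda: torch.add(x, y), rep=50))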
