Rename "output_tensor" -> "out" in autotune_process.py (pytorch#153169)

masnesral · pytorchmergebot · commit f1de3f9f07f6 · 2025-05-13T14:18:29.000Z
Summary: This change is to support remote autotuning. I want to use all the same benchmarking utilities in select_algorithm.py. For remote autotuning, I'll reuse the TritonBenchmarkRequest class used for subprocess autotuning because it's already serializable. That class is also used in standard, in-process autotuning, but via TritonTemplateCaller.benchmark(), which sets the output_tensor param when calling the underlying TritonBenchmarkRequest. For remote, I'll be using the TritonBenchmarkRequest directly, so I want its parameter to be named 'out' (matching what the benchmarking utilities pass) to avoid "got an unexpected keyword argument 'out'". Test Plan: Existing unit tests. Pull Request resolved: pytorch#153169. Approved by: https://github.com/aorenste, https://github.com/eellison
diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py
@@ -424,7 +424,7 @@ def __init__(
         self.extra_args = extra_args
 
     def make_run_fn(
-        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+        self, *input_tensors: torch.Tensor, out: torch.Tensor
     ) -> Callable[[], None]:
         raise NotImplementedError
 
@@ -435,30 +435,30 @@ def do_bench(
         self,
         fn,
         *input_tensors: torch.Tensor,
-        output_tensor: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
     ) -> float:
         raise NotImplementedError
 
     def benchmark(
         self,
         *input_tensors: torch.Tensor,
-        output_tensor: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
     ) -> float:
         debug = autotuning_log.isEnabledFor(logging.DEBUG)
         if debug:
             start_ts = time.time()
 
         # create args and out tensor
-        if output_tensor is None:
+        if out is None:
             assert len(input_tensors) == 0
             input_tensors = tuple(x.to_tensor() for x in self.input_tensor_meta)
-            output_tensor = self.output_tensor_meta.to_tensor()
+            out = self.output_tensor_meta.to_tensor()
 
         if debug:
             create_tensor_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
             start_ts = time.time()
         try:
-            fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor)
+            fn = self.make_run_fn(*input_tensors, out=out)
         except NonzeroWorkspaceNotSupportedError:
             # Skipping all ops with nonzero workspace requirements
             autotuning_log.info("Skipping op due to nonzero workspace requirement")
@@ -468,7 +468,7 @@ def benchmark(
             load_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
             start_ts = time.time()
 
-        out = self.do_bench(fn, *input_tensors, output_tensor)
+        res = self.do_bench(fn, *input_tensors, out)
 
         if debug:
             bench_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
@@ -480,7 +480,7 @@ def benchmark(
                 bench_elapse,
             )
         self.cleanup_run_fn()
-        return out
+        return res
 
 
 class _TestBenchmarkRequest(BenchmarkRequest):
@@ -504,7 +504,7 @@ def __init__(
         self.crash = crash
 
     def benchmark(
-        self, *input_tensors: torch.Tensor, output_tensor: Optional[torch.Tensor] = None
+        self, *input_tensors: torch.Tensor, out: Optional[torch.Tensor] = None
     ) -> float:
         if self.device is not None:
             assert os.environ.get(CUDA_VISIBLE_DEVICES, None) == str(self.device)
@@ -522,11 +522,11 @@ def do_bench(
         self,
         fn,
         *input_tensors: torch.Tensor,
-        output_tensor: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
     ) -> float:
         device_idx_set = OrderedSet(
             tensor.device.index
-            for tensor in [*input_tensors, output_tensor]
+            for tensor in [*input_tensors, out]
             if isinstance(tensor, torch.Tensor)
             and is_gpu(tensor.device.type)
             and tensor.device.index is not None
@@ -546,18 +546,18 @@ def do_bench(
         else:
             device_idx = device_interface.current_device()
         with device_interface.device(device_idx):  # type: ignore[attr-defined]
-            out = benchmarker.benchmark_gpu(fn)
+            res = benchmarker.benchmark_gpu(fn)
             device_interface.synchronize()  # shake out any CUDA errors
 
-        return out
+        return res
 
 
 class CPUDeviceBenchmarkMixin:
     def do_bench(
         self,
         fn,
         *input_tensors: torch.Tensor,
-        output_tensor: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
     ) -> float:
         return benchmarker.benchmark_cpu(fn)
 
@@ -593,7 +593,7 @@ def __init__(
         self.kpack = kpack
 
     def make_run_fn(
-        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+        self, *input_tensors: torch.Tensor, out: torch.Tensor
     ) -> Callable[[], None]:
         mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
         autotuning_log.debug(
@@ -614,10 +614,10 @@ def make_run_fn(
         if "warmup" in inspect.signature(run_method).parameters:
             warmup_arg["warmup"] = False
 
-        if output_tensor.device.type == "cpu":
+        if out.device.type == "cpu":
             stream = 0
         else:
-            device_type = output_tensor.device.type
+            device_type = out.device.type
             device_interface = get_interface_for_device(device_type)
             stream = device_interface.get_raw_stream(
                 self.output_tensor_meta.device.index
@@ -630,7 +630,7 @@ def make_run_fn(
             return functools.partial(
                 run_method,
                 *input_tensors,
-                output_tensor,
+                out,
                 *extra_args,
                 **warmup_arg,
                 stream=stream,
@@ -639,7 +639,7 @@ def make_run_fn(
             return functools.partial(
                 run_method,
                 *input_tensors,
-                output_tensor,
+                out,
                 *extra_args,
                 **warmup_arg,
                 stream=stream,
@@ -692,14 +692,11 @@ def precompile(self):
         autotuning_log.debug("Done precompiling %s", self)
 
     def make_run_fn(
-        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+        self, *input_tensors: torch.Tensor, out: torch.Tensor
     ) -> Callable[[], None]:
         self.ensure_dll_loaded()
         self.update_workspace_size()
-        args = [
-            c_void_p(tensor.data_ptr())
-            for tensor in list(input_tensors) + [output_tensor]
-        ]
+        args = [c_void_p(tensor.data_ptr()) for tensor in list(input_tensors) + [out]]
         autotuning_log.debug(
             "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",
             self.kernel_name,
@@ -716,7 +713,7 @@ def make_run_fn(
             self.workspace = torch.zeros(
                 (self.workspace_size + 7) // 8,
                 dtype=torch.float64,
-                device=output_tensor.device,
+                device=out.device,
             )
             workspace_ptr = c_void_p(self.workspace.data_ptr())
 
@@ -806,11 +803,11 @@ def precompile(self):
         autotuning_log.debug("Done precompiling %s", self)
 
     def make_run_fn(
-        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+        self, *input_tensors: torch.Tensor, out: torch.Tensor
     ) -> Callable[[], None]:
         # TODO(jgong5): use CppPythonBindingsCodeCache for better binding perf
         self.DLL = CppCodeCache.load(self.source_code, device_type="cpu")
-        args = [tensor.data_ptr() for tensor in list(input_tensors) + [output_tensor]]
+        args = [tensor.data_ptr() for tensor in list(input_tensors) + [out]]
         autotuning_log.debug(
             "make_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%s",
             self.kernel_name,
diff --git a/torch/_inductor/codegen/cpp_template_kernel.py b/torch/_inductor/codegen/cpp_template_kernel.py
@@ -553,7 +553,7 @@ def precompile(self) -> None:
 
     def benchmark(self, *args, out) -> float:
         assert self.bmreq is not None
-        return self.bmreq.benchmark(*args, output_tensor=out)
+        return self.bmreq.benchmark(*args, out=out)
 
     def hash_key(self) -> str:
         return "-".join(
diff --git a/torch/_inductor/codegen/cuda/cuda_kernel.py b/torch/_inductor/codegen/cuda/cuda_kernel.py
@@ -590,7 +590,7 @@ def precompile(self) -> None:
     def benchmark(self, *args, out) -> float:
         assert self.bmreq is not None
         return self.bmreq.benchmark(
-            *args, output_tensor=out
+            *args, out=out
         )  # @TODO: Hack for ensuring that Cutlass Kernel is preferred
 
     def __str__(self) -> str:
diff --git a/torch/_inductor/codegen/rocm/rocm_benchmark_request.py b/torch/_inductor/codegen/rocm/rocm_benchmark_request.py
@@ -55,14 +55,11 @@ def precompile(self):
         log.debug("Done precompiling %s", self)
 
     def make_run_fn(
-        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+        self, *input_tensors: torch.Tensor, out: torch.Tensor
     ) -> Callable[[], None]:
         self.ensure_dll_loaded()
         self.update_workspace_size()
-        args = [
-            c_void_p(tensor.data_ptr())
-            for tensor in list(input_tensors) + [output_tensor]
-        ]
+        args = [c_void_p(tensor.data_ptr()) for tensor in list(input_tensors) + [out]]
         size_args = [c_int(arg) for arg in self.extra_args]
         log.debug(
             "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",
@@ -80,7 +77,7 @@ def make_run_fn(
             self.workspace = torch.zeros(
                 (self.workspace_size + 7) // 8,
                 dtype=torch.float64,
-                device=output_tensor.device,
+                device=out.device,
             )
             workspace_ptr = c_void_p(self.workspace.data_ptr())
 
diff --git a/torch/_inductor/codegen/rocm/rocm_kernel.py b/torch/_inductor/codegen/rocm/rocm_kernel.py
@@ -246,7 +246,7 @@ def precompile(self) -> None:
 
     def benchmark(self, *args, out) -> float:
         assert self.bmreq is not None
-        return self.bmreq.benchmark(*args, output_tensor=out)
+        return self.bmreq.benchmark(*args, out=out)
 
     def __str__(self) -> str:
         return f"ROCmTemplateCaller(source_file={self.bmreq.source_file}, {self.info_dict()})"
diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py
@@ -1500,7 +1500,7 @@ def __init__(
 
     def benchmark(self, *args, out):
         assert self.bmreq is not None
-        return self.bmreq.benchmark(*args, output_tensor=out)
+        return self.bmreq.benchmark(*args, out=out)
 
     def precompile(self):
         assert self.bmreq is not None