Skip to content

Commit bbf39ca

Browse files
nmacchioni authored
and pytorchmergebot committed
[inductor][fix] subproc autotuning respect cache dir changes (pytorch#167918)
Summary: Noticed this bug with subproc autotuning while working on async autotuning. The created subprocs don't respect changes to cache dirs — specifically the Triton cache dir — which causes subproc autotuning to cache-miss on otherwise cached Triton kernels. The net effect is that precompile in the gemm autotuning path became an expensive no-op. On the torchbench model I tested with, compile time with subproc autotuning went down from ~1k seconds to ~500 seconds, now matching in-process autotuning. Test Plan: CI. Differential Revision: D87170069. Pull Request resolved: pytorch#167918. Approved by: https://github.com/aorenste
1 parent 654f3f6 commit bbf39ca

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

torch/_inductor/autotune_process.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,13 @@ def process_main(read_pipe: IO[bytes], write_pipe: IO[bytes]) -> None:
7878

7979
def workloop():
8080
while True:
81-
job = TuningProcess.recv(read_pipe)
81+
job, extra_env = TuningProcess.recv(read_pipe)
8282
if job is None:
8383
# None is a sentinel for the child to shut down
8484
break
8585
try:
86+
if extra_env:
87+
os.environ.update(extra_env)
8688
result = job()
8789
except Exception as e:
8890
result = e
@@ -95,8 +97,10 @@ def workloop():
9597
pass
9698

9799
@staticmethod
98-
def send(obj: Any, write_pipe: IO[bytes]) -> None:
99-
pickle.dump(obj, write_pipe)
100+
def send(
101+
obj: Any, write_pipe: IO[bytes], extra_env: dict[str, str] | None = None
102+
) -> None:
103+
pickle.dump((obj, extra_env), write_pipe)
100104
write_pipe.flush()
101105

102106
@staticmethod
@@ -158,13 +162,13 @@ def alive(self) -> bool:
158162
"""
159163
return self.running and self.process.poll() is None
160164

161-
def put(self, req: Any) -> None:
165+
def put(self, req: Any, extra_env: dict[str, str] | None = None) -> None:
162166
"""
163167
Push a work item to the child process.
164168
"""
165169
if not self.alive():
166170
self.start()
167-
TuningProcess.send(req, self.write_pipe)
171+
TuningProcess.send(req, self.write_pipe, extra_env=extra_env)
168172

169173
def get(self, timeout: float = 120.0) -> Any:
170174
"""
@@ -174,7 +178,7 @@ def get(self, timeout: float = 120.0) -> Any:
174178
try:
175179
if not self.selector.select(timeout):
176180
raise TimeoutError(f"Timeout in autotune subprocess {self.process.pid}")
177-
result = TuningProcess.recv(self.read_pipe)
181+
result, _ = TuningProcess.recv(self.read_pipe)
178182
except TimeoutError:
179183
self.kill()
180184
raise
@@ -305,8 +309,10 @@ def target(self, choice: TritonTemplateCaller) -> float:
305309
"""
306310
assert choice.bmreq is not None
307311

312+
env_vars = ["TORCHINDUCTOR_CACHE_DIR", "TRITON_CACHE_DIR"]
313+
extra_env = {v: os.environ[v] for v in env_vars if v in os.environ}
308314
process = self.process_queue.get()
309-
process.put(choice.bmreq.benchmark)
315+
process.put(choice.bmreq.benchmark, extra_env=extra_env)
310316
try:
311317
return process.get(
312318
config.max_autotune_subproc_result_timeout_seconds,

0 commit comments

Comments
 (0)