diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 8d5df1061eda..4a55bbd2c815 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -100,23 +100,36 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): "transformed_code.py") if not os.path.exists(decompiled_file): try: - # usually the decompilation will succeed for most models, - # as we guarantee a full-graph compilation in Dynamo. - # but there's no 100% guarantee, since decompliation is - # not a reversible process. - import depyf - src = depyf.decompile(new_code) - - with open(decompiled_file, "w") as f: - f.write(src) - - logger.debug("Dynamo transformed code saved to %s", - decompiled_file) + # Check if we should perform actual decompilation or write placeholder + if envs.VLLM_COMPILE_DEPYF: + # Perform actual decompilation when VLLM_COMPILE_DEPYF=1 + # usually the decompilation will succeed for most models, + # as we guarantee a full-graph compilation in Dynamo. + # but there's no 100% guarantee, since decompilation is + # not a reversible process. + import depyf + src = depyf.decompile(new_code) + + with open(decompiled_file, "w") as f: + f.write(src) + + logger.debug("Dynamo transformed code saved to %s", + decompiled_file) + else: + # Write placeholder file with comment when VLLM_COMPILE_DEPYF=0 (default) + placeholder_content = "# Please set VLLM_COMPILE_DEPYF=1 to populate this file\n" + with open(decompiled_file, "w") as f: + f.write(placeholder_content) + logger.debug("Placeholder Dynamo transformed code saved to %s. 
" + "Set VLLM_COMPILE_DEPYF=1 to perform actual decompilation.", + decompiled_file) except Exception: pass if self.vllm_config.compilation_config.use_cudagraph and \ "update" in new_code.co_names: + # For cudagraph error checking, we always perform decompilation regardless of VLLM_COMPILE_DEPYF + # because this is a critical error checking mechanism import depyf src = depyf.decompile(new_code) msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src # noqa diff --git a/vllm/envs.py b/vllm/envs.py index 2d470c6dccbf..793c22476de3 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -794,6 +794,11 @@ def get_vllm_port() -> Optional[int]: lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))), + # Controls whether to perform depyf decompilation during compilation + # 0 (default): Write placeholder file with comment + # 1: Perform actual depyf decompilation + "VLLM_COMPILE_DEPYF": + lambda: bool(int(os.getenv("VLLM_COMPILE_DEPYF", "0"))), # If set, vllm will run in development mode, which will enable # some additional endpoints for developing and debugging,