Commit 19c7b3e
[v0.9.1][bugfix] fix torchair runtime error caused by configuration mismatches and missing .kv_cache_bytes file (#2312)
### What this PR does / why we need it?
The original torchair caching implementation forced users to have everything prepared: fix every configuration option and enable `use_cached_npu_graph`, which could lead to problems that are confusing to understand and hard to tackle. It is better to compile the graph twice instead of reusing the old kv_caches and the cached torchair graph, and the extra compilation time is acceptable.

### Does this PR introduce _any_ user-facing change?
If users want to enable torchair.cache_compile with fast compilation, it is recommended to enable both `use_cached_kv_cache_bytes` and `use_cached_graph` in `torchair_graph_config`. Without `use_cached_kv_cache_bytes`, the torchair computation graph is compiled twice to avoid runtime errors caused by configuration mismatches (the second compilation is much faster).

### How was this patch tested?
CI and e2e vllm serving passed.

Signed-off-by: linfeng-yuan <[email protected]>
1 parent 9dc23b6 commit 19c7b3e
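For reference, a minimal usage sketch of the recommended configuration; the `additional_config` plumbing through `LLM(...)` and the model name below are illustrative assumptions, not part of this commit:

```python
from vllm import LLM

# Illustrative sketch: enable both cached-graph options so torchair can reuse its
# compiled graph and the recorded kv_cache_bytes across runs. The model name and
# other engine arguments are placeholders.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    additional_config={
        "torchair_graph_config": {
            "enabled": True,
            "use_cached_graph": True,
            "use_cached_kv_cache_bytes": True,
        },
    },
)
```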

File tree: 4 files changed, +41 −10 lines changed

vllm_ascend/ascend_config.py

Lines changed: 12 additions & 1 deletion
@@ -75,6 +75,9 @@ def __init__(self, torchair_graph_config):
         )  # Whether to enable torchair graph mode. Currently only DeepSeek series models and PanguProMoE are supported to use torchair graph mode
         self.use_cached_graph = torchair_graph_config.get(
             "use_cached_graph", False)  # Whether to use cached graph
+        self.use_cached_kv_cache_bytes = torchair_graph_config.get(
+            "use_cached_kv_cache_bytes", False
+        )  # Whether to use cached kv_caches' memory, this option can only be enabled with use_cached_graph
         self.graph_batch_sizes = torchair_graph_config.get(
             "graph_batch_sizes", [])  # The batch size for torchair graph cache
         self.graph_batch_sizes_init = torchair_graph_config.get(
@@ -106,6 +109,10 @@ def __init__(self, torchair_graph_config):
                 raise RuntimeError(
                     "use_cached_graph is valid only when Torchair graph mode is enabled"
                 )
+            if self.use_cached_kv_cache_bytes:
+                raise RuntimeError(
+                    "use_cached_kv_cache_bytes is valid only when Torchair graph mode is enabled"
+                )
             if self.graph_batch_sizes:
                 raise RuntimeError(
                     "graph_batch_sizes is valid only when Torchair graph mode is enabled"
@@ -133,8 +140,12 @@ def __init__(self, torchair_graph_config):
         if not self.enable_multistream_moe:
             if self.enable_super_kernel:
                 raise RuntimeError(
-                    "enable_super_kernel is valid only when Torchair graph mode and enable_multistream_moe is enabled"
+                    "enable_super_kernel is valid only when Torchair graph mode and enable_multistream_moe are enabled"
                 )
+        if self.use_cached_kv_cache_bytes and not self.use_cached_graph:
+            raise RuntimeError(
+                "use_cached_kv_cache_bytes is valid only when Torchair graph mode and use_cached_graph are enabled"
+            )


 class AscendSchedulerConfig:
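To make the new dependency rule concrete, here is a minimal standalone sketch (not the actual `TorchairGraphConfig` class, just a restatement of the checks added above) showing which combinations are rejected:

```python
# Simplified restatement of the validation added in ascend_config.py above.
def validate_torchair_graph_config(cfg: dict) -> None:
    enabled = cfg.get("enabled", False)
    use_cached_graph = cfg.get("use_cached_graph", False)
    use_cached_kv_cache_bytes = cfg.get("use_cached_kv_cache_bytes", False)

    if not enabled and use_cached_kv_cache_bytes:
        raise RuntimeError(
            "use_cached_kv_cache_bytes is valid only when Torchair graph mode is enabled")
    if use_cached_kv_cache_bytes and not use_cached_graph:
        raise RuntimeError(
            "use_cached_kv_cache_bytes is valid only when Torchair graph mode "
            "and use_cached_graph are enabled")


# Example: kv_cache_bytes caching without graph caching is rejected.
try:
    validate_torchair_graph_config({
        "enabled": True,
        "use_cached_graph": False,
        "use_cached_kv_cache_bytes": True,
    })
except RuntimeError as err:
    print(err)
```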

vllm_ascend/platform.py

Lines changed: 12 additions & 1 deletion
@@ -28,7 +28,10 @@
 from vllm.platforms import Platform, PlatformEnum

 from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config
-from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
+from vllm_ascend.utils import (ASCEND_QUATIZATION_METHOD,
+                               check_torchair_cache_exist,
+                               delete_torchair_cache_file,
+                               update_aclgraph_sizes)

 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -157,6 +160,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 "Torchair compilation enabled on NPU. Setting level to NO_COMPILATION"
             )
             compilation_config.level = CompilationLevel.NO_COMPILATION
+            # Note: we delete the torchair cache folder here to prevent runtime issues caused by
+            # dimension mismatches or configuration inconsistencies when users reuse cached
+            # computation graphs. Though this increases graph compilation time, it significantly
+            # improves robustness and decreases graph launch time during inference. To shorten
+            # torchair graph compilation, users can enable both `use_cached_graph` and
+            # `use_cached_kv_cache_bytes` in torchair_graph_config.
+            if check_torchair_cache_exist(
+            ) and not ascend_config.torchair_graph_config.use_cached_kv_cache_bytes:
+                delete_torchair_cache_file()
         elif parallel_config.distributed_executor_backend == "ray":
             logger.warning(
                 "Ray distributed executor backend is not compatible with ACL Graph mode "

vllm_ascend/worker/model_runner_v1.py

Lines changed: 14 additions & 7 deletions
@@ -84,7 +84,7 @@
 from vllm_ascend.multistream.ms_split import compute_split_seq_index
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
-from vllm_ascend.utils import (ProfileExecuteDuration,
+from vllm_ascend.utils import (TORCHAIR_CACHE_DIR, ProfileExecuteDuration,
                                check_torchair_cache_exist,
                                write_kv_cache_bytes_to_file)
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
@@ -360,6 +360,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled and self.vllm_config.model_config.use_mla
         self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
+        self.use_cached_kv_cache_bytes = ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
         self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes

         if ascend_config.torchair_graph_config.graph_batch_sizes_init:
@@ -1904,6 +1905,7 @@ def _get_torchair_lazy_compiled_model(self, batch_size: int):
                 self.model.__dict__[forward_proxy_name],
                 dynamic=True,
                 fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                cache_dir=TORCHAIR_CACHE_DIR,
                 config=config,
                 ge_cache=False)
         return self.torchair_compiled_models[batch_size]
@@ -2082,14 +2084,20 @@ def capture_model(self) -> None:
             torchair_graph_batch_sizes = self.torchair_graph_batch_sizes
             graph_num = len(torchair_graph_batch_sizes)
             if self.use_cached_npu_graph and not check_torchair_cache_exist():
-                # If caching is enabled but does not exist, we will compile the model twice. The first
-                # time is used to generate the cache, and the second time is used to load the cache to
-                # skip the overhead caused by Dynamo guard mechanism.
+                # If caching is enabled but the cache does not exist (either
+                # use_cached_kv_cache_bytes is disabled or kv_cache_bytes are
+                # different), we will compile the model twice. The first time is
+                # used to generate the cache, and the second time is used to load the
+                # cache to skip the overhead caused by the Dynamo guard mechanism.
                 logger.info(
-                    "Use cached npu graph but cache doesn't exist! Now we compile graph to genetate torchair cache, this usually takes %.1f~%.1f mins.",
+                    "Cache compilation for torchair graph is enabled. Now we compile graph to generate"
+                    " torchair cache, this usually takes %.1f~%.1f mins.",
                     0.5 * graph_num, 1.5 * graph_num)
                 self._compile_torchair_graph(torchair_graph_batch_sizes)
                 NPUPlatform.synchronize()
+                # Note: we reset dynamo below and reload the torchair computation graph that was
+                # compiled and cached above. This reduces graph launch time by 2-4ms and avoids
+                # runtime errors caused by configuration mismatches in graph mode.
                 torch._dynamo.reset()
                 self.torchair_compiled_models.clear()
             if self.speculative_config and self.speculative_config.method == "deepseek_mtp":
@@ -2104,8 +2112,7 @@ def capture_model(self) -> None:
                     "Capturing torchair graph, this usually takes %.1f~%.1f mins.",
                     0.5 * graph_num, 1.5 * graph_num)
                 self._compile_torchair_graph(torchair_graph_batch_sizes)
-
-            if self.new_kv_cache_bytes > 0:
+            if self.use_cached_kv_cache_bytes and self.new_kv_cache_bytes > 0:
                 write_kv_cache_bytes_to_file(torch.distributed.get_rank(),
                                              self.new_kv_cache_bytes)
         elif self.use_aclgraph:
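The overall effect of this change on `capture_model()` can be summarized by the following condensed sketch; it paraphrases the flow shown above rather than reproducing the method, and the function name is made up for illustration:

```python
import torch

from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import check_torchair_cache_exist, write_kv_cache_bytes_to_file


def capture_torchair_graphs(runner, batch_sizes):
    """Condensed paraphrase of the compile-twice strategy in capture_model()."""
    if runner.use_cached_npu_graph and not check_torchair_cache_exist():
        # Pass 1: compile every batch size once to populate the on-disk torchair cache.
        runner._compile_torchair_graph(batch_sizes)
        NPUPlatform.synchronize()
        # Reset Dynamo and drop the in-memory compiled models so pass 2 reloads from
        # the freshly written cache, avoiding Dynamo guard overhead at runtime.
        torch._dynamo.reset()
        runner.torchair_compiled_models.clear()
    # Pass 2 (or the only pass when caching is off / the cache already exists).
    runner._compile_torchair_graph(batch_sizes)
    # Persist kv_cache_bytes only when the user opted into reusing them across runs.
    if runner.use_cached_kv_cache_bytes and runner.new_kv_cache_bytes > 0:
        write_kv_cache_bytes_to_file(torch.distributed.get_rank(),
                                     runner.new_kv_cache_bytes)
```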

vllm_ascend/worker/worker_v1.py

Lines changed: 3 additions & 1 deletion
@@ -193,7 +193,9 @@ def determine_available_memory(self) -> int:
         logger.info(
             f"Available memory: {available_kv_cache_memory}, total memory: {total_npu_memory}"
         )
-        if get_ascend_config().torchair_graph_config.enabled:
+        if (get_ascend_config().torchair_graph_config.enabled
+                and get_ascend_config(
+                ).torchair_graph_config.use_cached_kv_cache_bytes):
             if check_torchair_cache_exist(
             ) and check_kv_cache_bytes_cache_exist():
                 old_kv_cache_bytes = read_kv_cache_bytes_from_file(
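A hedged sketch of the decision this gate implements in `determine_available_memory()`; the helper name and the final "cached value must still fit" comparison are assumptions based on surrounding code that the hunk above does not show in full:

```python
from typing import Optional


def choose_kv_cache_bytes(measured_bytes: int,
                          torchair_enabled: bool,
                          use_cached_kv_cache_bytes: bool,
                          cached_bytes: Optional[int]) -> int:
    # The previously recorded kv_cache_bytes are only consulted when torchair graph
    # mode AND use_cached_kv_cache_bytes are both on; otherwise the value measured
    # for this run wins and the torchair graph is recompiled against it.
    if torchair_enabled and use_cached_kv_cache_bytes and cached_bytes is not None:
        if 0 < cached_bytes <= measured_bytes:
            # Reuse the recorded size so the cached torchair graph stays valid.
            return cached_bytes
    return measured_bytes
```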
