diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 04354c9ed0b9..b53014709117 100755
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -249,6 +249,11 @@ def execute_model(
             'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
         log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
                                            '0') != '0' or log_cpu_fallbacks_all
+
+        # Empty the Mamba cache table when all the requests are clear
+        if execute_model_req is None:
+            self.model_runner.mamba_cache_table.clear()
+
         if (log_graph_compilation or log_cpu_fallbacks) and \
                 execute_model_req is not None:
             from habana_frameworks.torch.hpu.metrics import metric_localcontext
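
For context, below is a minimal, self-contained sketch of the behaviour this hunk adds. It assumes `mamba_cache_table` is a dict-like container of per-request Mamba state (it only needs a `.clear()` method) and that the engine calls `execute_model` with `execute_model_req=None` once no requests remain. The `_ModelRunnerStub` and `_WorkerStub` names are hypothetical stand-ins for illustration, not vLLM classes.

```python
from typing import Any, Dict, List, Optional


class _ModelRunnerStub:
    """Hypothetical stand-in for the HPU model runner; only the cache table matters here."""

    def __init__(self) -> None:
        # Assumed dict-like mapping from request/sequence ids to Mamba state slots.
        self.mamba_cache_table: Dict[str, Any] = {}


class _WorkerStub:
    """Minimal sketch of the execute_model path touched by the hunk (simplified names)."""

    def __init__(self) -> None:
        self.model_runner = _ModelRunnerStub()

    def execute_model(self, execute_model_req: Optional[Any]) -> List[Any]:
        # Mirror of the added lines: when the engine passes no request batch,
        # all requests have finished, so the Mamba cache table is emptied to
        # release the per-request state slots.
        if execute_model_req is None:
            self.model_runner.mamba_cache_table.clear()
            return []
        # ... the normal forward pass would run here ...
        return []


if __name__ == "__main__":
    worker = _WorkerStub()
    worker.model_runner.mamba_cache_table["req-0"] = object()
    worker.execute_model(None)  # no pending requests -> cache table cleared
    assert not worker.model_runner.mamba_cache_table
```

The design point is simply that the cleanup hooks into the existing "no work scheduled" signal (`execute_model_req is None`) rather than tracking request completion separately, so stale Mamba state cannot outlive the batch that produced it.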