From a77980c94d430c9731094cd85ea2501c9eba6e06 Mon Sep 17 00:00:00 2001
From: Shen Xu
Date: Fri, 22 Aug 2025 15:32:21 -0700
Subject: [PATCH] Call .detach() in static attention cache update helper

Summary: This reduces memory usage.

Differential Revision: D80853817
---
 examples/models/llama/static_attention.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/models/llama/static_attention.py b/examples/models/llama/static_attention.py
index 5ffd25f2c7f..fb1a05f4cc9 100644
--- a/examples/models/llama/static_attention.py
+++ b/examples/models/llama/static_attention.py
@@ -549,7 +549,7 @@ def _update_states(self, attn_updates, update_pos, update_len):
                 style=self.style,
                 update_pos=update_pos,
                 update_len=update_len,
-            )
+            ).detach()
         for cache_id, update in v_cache_updates.items():
             self.v_caches[cache_id] = StaticKVCache.apply_update(
                 self.v_caches[cache_id],
@@ -558,7 +558,7 @@ def _update_states(self, attn_updates, update_pos, update_len):
                 style=self.style,
                 update_pos=update_pos,
                 update_len=update_len,
-            )
+            ).detach()
         self.pos += update_len

     def _get_lookahead_decoding_mask(
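
A minimal sketch of why the memory saving occurs (not part of the patch; it assumes only standard PyTorch autograd semantics, not the StaticKVCache API): a tensor produced inside an autograd graph keeps that graph, and the intermediate activations it references, alive for as long as the tensor is stored. Detaching before storing the result back into self.k_caches / self.v_caches keeps only the cache data.

    # Illustration only: shows the effect of .detach() on what a stored
    # tensor keeps alive; names here are hypothetical examples.
    import torch

    x = torch.randn(4, 8, requires_grad=True)
    y = (x * 2).sum(dim=-1)        # y's grad_fn references the graph built from x

    cached = y                     # storing y keeps the whole graph reachable
    detached = y.detach()          # storing this keeps only the values

    print(cached.grad_fn is not None)   # True: graph still referenced
    print(detached.grad_fn is None)     # True: no graph attached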