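Fix lookahead decoding cache buffer size: SuffixCache::add() expects suffixes of n - 1 tokens, so allocate (n - 1) * capacity entries instead of n * capacity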

sxu · web-flow · commit b562f3694345 · 2025-07-22T22:24:51.000-07:00
Differential Revision: D78759433 Pull Request resolved: #12725
diff --git a/examples/models/llama/runner/static_attention_io_manager.h b/examples/models/llama/runner/static_attention_io_manager.h
@@ -396,7 +396,7 @@ template <typename TokenT>
 class SuffixCache {
  public:
   SuffixCache(size_t n, size_t capacity)
-      : n_(n), capacity_(capacity), pos_(0), cache_(n_ * capacity_) {}
+      : n_(n), capacity_(capacity), pos_(0), cache_((n_ - 1) * capacity_) {}
 
   void add(executorch::runtime::Span<TokenT> suffix) {
     if (suffix.size() != n_ - 1) {