From 831688b0e24134bc821dc36821b877825da1aee4 Mon Sep 17 00:00:00 2001
From: Shen Xu <shenchenxu@meta.com>
Date: Tue, 22 Jul 2025 20:25:26 -0700
Subject: [PATCH] Fix lookahead decoding cache buffer size (#12725)

Summary:

For n-grams we only store the suffixes of size n-1, so the cache buffer needs (n-1) * capacity elements rather than n * capacity. Oversizing the buffer here leads to subsequent memcpy calls copying too much.

Reviewed By: limintang, billmguo

Differential Revision: D78759433
---
 examples/models/llama/runner/static_attention_io_manager.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/runner/static_attention_io_manager.h b/examples/models/llama/runner/static_attention_io_manager.h
index 14182bd9cb3..74925a777a2 100644
--- a/examples/models/llama/runner/static_attention_io_manager.h
+++ b/examples/models/llama/runner/static_attention_io_manager.h
@@ -396,7 +396,7 @@ template <typename TokenT>
 class SuffixCache {
  public:
   SuffixCache(size_t n, size_t capacity)
-      : n_(n), capacity_(capacity), pos_(0), cache_(n_ * capacity_) {}
+      : n_(n), capacity_(capacity), pos_(0), cache_((n_ - 1) * capacity_) {}
 
   void add(executorch::runtime::Span<TokenT> suffix) {
     if (suffix.size() != n_ - 1) {