From 831688b0e24134bc821dc36821b877825da1aee4 Mon Sep 17 00:00:00 2001 From: Shen Xu Date: Tue, 22 Jul 2025 20:25:26 -0700 Subject: [PATCH] Fix lookahead decoding cache buffer size (#12725) Summary: For n-grams we are only storing the suffixes of size n-1. Oversizing the buffer here causes the subsequent memcpy to copy too much. Reviewed By: limintang, billmguo Differential Revision: D78759433 --- examples/models/llama/runner/static_attention_io_manager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/runner/static_attention_io_manager.h b/examples/models/llama/runner/static_attention_io_manager.h index 14182bd9cb3..74925a777a2 100644 --- a/examples/models/llama/runner/static_attention_io_manager.h +++ b/examples/models/llama/runner/static_attention_io_manager.h @@ -396,7 +396,7 @@ template class SuffixCache { public: SuffixCache(size_t n, size_t capacity) - : n_(n), capacity_(capacity), pos_(0), cache_(n_ * capacity_) {} + : n_(n), capacity_(capacity), pos_(0), cache_((n_ - 1) * capacity_) {} void add(executorch::runtime::Span suffix) { if (suffix.size() != n_ - 1) {