Internal change: move the `ran_decode` flag from LlmLiteRtCompiledModelExecutorBase into RuntimeState

ai-edge-bot · copybara-github · commit 060cc9dc6725 · 2026-01-27T17:21:38.000-08:00
LiteRT-LM-PiperOrigin-RevId: 861951627
diff --git a/runtime/executor/llm_executor_io_types.h b/runtime/executor/llm_executor_io_types.h
@@ -64,6 +64,11 @@ struct RuntimeState {
 
   // Random generator for sampling step.
   std::shared_ptr<std::default_random_engine> rand_gen;
+
+  // Whether decode has run at least once since the most recent prefill.
+  // Used only by the compiled model executor to decide whether the KV cache
+  // must be prepared for prefill or for decode.
+  bool ran_decode = false;
 };
 
 // A resource interface to hold the llm context.
diff --git a/runtime/executor/llm_litert_compiled_model_executor.cc b/runtime/executor/llm_litert_compiled_model_executor.cc
@@ -502,11 +502,11 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::RollBackProcessedTokens() {
 
 absl::Status LlmLiteRtCompiledModelExecutorBase::PrepareFirstPrefillAfterDecode(
     int token_index_to_reduce) {
-  if (!ran_decode_) {
+  if (!llm_context_->runtime_state().ran_decode) {
     return absl::OkStatus();
   }
 
-  ran_decode_ = false;
+  llm_context_->runtime_state().ran_decode = false;
 
   int output_heads = 1;
   if (llm_context_->runtime_config().output_heads.has_value()) {
@@ -939,11 +939,11 @@ int LlmLiteRtCompiledModelExecutorBase::BindTensorsAndRunDecodeStatic(
 }
 
 absl::Status LlmLiteRtCompiledModelExecutorBase::PrepareFirstDecode() {
-  if (ran_decode_) {
+  if (llm_context_->runtime_state().ran_decode) {
     return absl::OkStatus();
   }
   // Mark that we have run decode at least once.
-  ran_decode_ = true;
+  llm_context_->runtime_state().ran_decode = true;
 
   int output_heads = 1;
   if (llm_context_->runtime_config().output_heads.has_value()) {
@@ -1050,7 +1050,7 @@ LlmLiteRtCompiledModelExecutorBase::DecodeLogits(
       auto output_logits,
       decode_output_buffers_[signatures_.output_logits].Duplicate());
 
-  bool last_run_is_decode = ran_decode_;
+  bool last_run_is_decode = llm_context_->runtime_state().ran_decode;
   RETURN_IF_ERROR(PrepareFirstDecode());
   ASSIGN_OR_RETURN(auto step_and_token, GetTokenToDecode(inputs));
   RETURN_IF_ERROR(DecodeInternal(step_and_token.token, output_logits));
diff --git a/runtime/executor/llm_litert_compiled_model_executor.h b/runtime/executor/llm_litert_compiled_model_executor.h
@@ -284,10 +284,6 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor {
   // 3. The processed tokens.(e.g. KVCache)
   std::unique_ptr<LlmContext> llm_context_;
 
-  // Whether decode has been run ever after prefill.
-  // TODO: b/409401231 - Make sure this state is session dependent.
-  bool ran_decode_ = false;
-
   // Sampler for sampling logits.
   // For now, only CPU sampler is supported.
   std::unique_ptr<Sampler> sampler_;