From 541593a7f84433b66a2220c14924e52a6d2c0bda Mon Sep 17 00:00:00 2001
From: Djordje Lukic <djordje.lukic@docker.com>
Date: Tue, 9 Jun 2026 21:30:44 +0200
Subject: [PATCH 1/2] fix(runtime): don't count sub-session tokens in the
 compaction trigger

compactIfNeeded estimated the token impact of newly added messages via
sess.GetAllMessages(), which recurses into sub-sessions. In multi-agent
runs the content produced by a transfer_task child was therefore
attributed to the parent session even though it never enters the
parent's prompt (GetMessages skips sub-session items).

The phantom tokens triggered a compaction of a parent conversation that
was actually tiny; with everything fitting the keep budget the split
resolved to the 'compact everything, keep nothing' sentinel, so the
user's task and the in-flight tool exchange were wiped. The agent's next
prompt was literally just 'Session Summary: ...', which models interpret
as the user asking for a summary; they answer with a confused 'I see no
conversation history' reply and halt mid-task.

Add Session.OwnMessages() (direct messages only, no sub-session
recursion) and use it for the trigger's before/after counts so the
estimate matches what the session actually sends.

Fixes #2871

Assisted-By: docker-agent
Signed-off-by: Djordje Lukic <djordje.lukic@docker.com>
---
 pkg/runtime/compaction_trigger_test.go | 106 +++++++++++++++++++++++++
 pkg/runtime/loop.go                    |  11 ++-
 pkg/session/session.go                 |  18 +++++
 3 files changed, 132 insertions(+), 3 deletions(-)
 create mode 100644 pkg/runtime/compaction_trigger_test.go

diff --git a/pkg/runtime/compaction_trigger_test.go b/pkg/runtime/compaction_trigger_test.go
new file mode 100644
index 000000000..693c7a787
--- /dev/null
+++ b/pkg/runtime/compaction_trigger_test.go
@@ -0,0 +1,106 @@
+package runtime
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/docker/docker-agent/pkg/agent"
+	"github.com/docker/docker-agent/pkg/chat"
+	"github.com/docker/docker-agent/pkg/session"
+	"github.com/docker/docker-agent/pkg/team"
+	"github.com/docker/docker-agent/pkg/tools"
+)
+
+// TestCompactIfNeeded_IgnoresSubSessionTokens is a regression test for
+// issue #2871: in a multi-agent run, the tokens accumulated inside a
+// transfer_task sub-session were counted by the proactive compaction
+// trigger (GetAllMessages recurses into sub-sessions) even though they
+// never enter the parent's prompt (GetMessages skips sub-session items).
+// The phantom tokens made the parent compact its own tiny conversation;
+// with everything fitting the keep budget that meant "compact
+// everything, keep nothing" — the agent's next prompt was just the
+// summary and it halted with a confused "no conversation history" reply.
+func TestCompactIfNeeded_IgnoresSubSessionTokens(t *testing.T) {
+	prov := &mockProvider{id: "test/model", stream: &mockStream{}}
+	root := agent.New("root", "agent", agent.WithModel(prov))
+	tm := team.New(team.WithAgents(root))
+
+	rt, err := NewLocalRuntime(tm,
+		WithSessionCompaction(true),
+		WithModelStore(mockModelStoreWithLimit{limit: 100_000}))
+	require.NoError(t, err)
+
+	sess := session.New(session.WithUserMessage("build the app"))
+	messageCountBefore := len(sess.OwnMessages()) // baseline taken before the tool-call messages below are added
+
+	// Simulate a completed transfer_task tool call: a sub-session holding
+	// far more content than the parent's context limit, plus a small
+	// tool-result message on the parent itself.
+	sub := session.New(session.WithUserMessage("subtask"))
+	sub.AddMessage(session.NewAgentMessage("worker", &chat.Message{
+		Role:    chat.MessageRoleAssistant,
+		Content: strings.Repeat("z", 600_000), // ~150k estimated tokens
+	}))
+	sess.AddMessage(session.NewAgentMessage("root", &chat.Message{
+		Role:      chat.MessageRoleAssistant,
+		ToolCalls: []tools.ToolCall{{ID: "t1", Function: tools.FunctionCall{Name: "transfer_task"}}},
+	}))
+	sess.AddSubSession(sub)
+	sess.AddMessage(session.NewAgentMessage("root", &chat.Message{
+		Role:       chat.MessageRoleTool,
+		ToolCallID: "t1",
+		Content:    "subtask done",
+	}))
+
+	events := make(chan Event, 16) // buffered: nothing reads from it until compactIfNeeded has returned
+	rt.compactIfNeeded(t.Context(), sess, root, 100_000, messageCountBefore, NewChannelSink(events))
+	close(events) // lets the range loop below terminate once the buffer is drained
+
+	for ev := range events {
+		_, isCompaction := ev.(*SessionCompactionEvent)
+		assert.False(t, isCompaction,
+			"sub-session tokens must not trigger compaction of the parent session")
+	}
+}
+
+// TestCompactIfNeeded_TriggersOnOwnMessages pins the complementary case:
+// large tool results recorded directly on the session still trigger the
+// proactive compaction.
+func TestCompactIfNeeded_TriggersOnOwnMessages(t *testing.T) {
+	prov := &mockProvider{id: "test/model", stream: &mockStream{}}
+	root := agent.New("root", "agent", agent.WithModel(prov))
+	tm := team.New(team.WithAgents(root))
+
+	rt, err := NewLocalRuntime(tm,
+		WithSessionCompaction(true),
+		WithModelStore(mockModelStoreWithLimit{limit: 100_000}))
+	require.NoError(t, err)
+
+	sess := session.New(session.WithUserMessage("build the app"))
+	messageCountBefore := len(sess.OwnMessages()) // baseline taken before the tool-call messages below are added
+
+	sess.AddMessage(session.NewAgentMessage("root", &chat.Message{
+		Role:      chat.MessageRoleAssistant,
+		ToolCalls: []tools.ToolCall{{ID: "t1", Function: tools.FunctionCall{Name: "shell"}}},
+	}))
+	sess.AddMessage(session.NewAgentMessage("root", &chat.Message{
+		Role:       chat.MessageRoleTool,
+		ToolCallID: "t1",
+		Content:    strings.Repeat("z", 600_000), // ~150k estimated tokens
+	}))
+
+	events := make(chan Event, 16) // buffered: nothing reads from it until compactIfNeeded has returned
+	rt.compactIfNeeded(t.Context(), sess, root, 100_000, messageCountBefore, NewChannelSink(events))
+	close(events) // lets the range loop below terminate once the buffer is drained
+
+	sawCompaction := false
+	for ev := range events {
+		if _, ok := ev.(*SessionCompactionEvent); ok {
+			sawCompaction = true
+		}
+	}
+	assert.True(t, sawCompaction, "large own tool results must still trigger compaction")
+}
diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go
index d27e7656f..fdbb4125f 100644
--- a/pkg/runtime/loop.go
+++ b/pkg/runtime/loop.go
@@ -84,7 +84,7 @@ func (r *LocalRuntime) drainAndEmitSteered(ctx context.Context, sess *session.Se
 	if len(steered) == 0 {
 		return steerResult{}
 	}
-	messageCountBefore := len(sess.GetAllMessages())
+	messageCountBefore := len(sess.OwnMessages())
 	contents := make([]string, 0, len(steered))
 	for i, sm := range steered {
 		contents = append(contents, sm.Content)
@@ -627,7 +627,7 @@ func (r *LocalRuntime) runTurn(
 
 	// Record the message count before tool calls so we can
 	// measure how much content was added by tool results.
-	messageCountBeforeTools := len(sess.GetAllMessages())
+	messageCountBeforeTools := len(sess.OwnMessages())
 
 	stopRun, stopMsg := r.processToolCalls(ctx, sess, res.Calls, agentTools, events)
 
@@ -851,7 +851,12 @@ func (r *LocalRuntime) compactIfNeeded(
 		return
 	}
 
-	newMessages := sess.GetAllMessages()[messageCountBefore:]
+	// Estimate only over the session's own new messages: sub-session
+	// content recorded during tool calls (transfer_task and friends)
+	// never enters this session's prompt, so counting it here would
+	// attribute phantom tokens to a small parent conversation and
+	// trigger a compaction that wipes it (see issue #2871).
+	newMessages := sess.OwnMessages()[messageCountBefore:]
 	var addedTokens int64
 	for _, msg := range newMessages {
 		addedTokens += compaction.EstimateMessageTokens(&msg.Message)
diff --git a/pkg/session/session.go b/pkg/session/session.go
index 30303f030..be35198f8 100644
--- a/pkg/session/session.go
+++ b/pkg/session/session.go
@@ -541,6 +541,24 @@ func (s *Session) GetAllMessages() []Message {
 	return messages
 }
 
+// OwnMessages extracts this session's direct messages, excluding system
+// messages and WITHOUT recursing into sub-sessions. This is the set of
+// messages that actually enters this session's prompt (GetMessages skips
+// sub-session items), so token accounting that drives compaction must
+// use it: counting sub-session content would attribute phantom tokens
+// to the parent and compact a conversation that isn't actually large.
+func (s *Session) OwnMessages() []Message {
+	items := s.snapshotItems()
+
+	var messages []Message // stays nil (length 0) when no item qualifies
+	for _, item := range items {
+		if item.IsMessage() && item.Message.Message.Role != chat.MessageRoleSystem {
+			messages = append(messages, *item.Message) // appends a copy of the Message value
+		}
+	}
+	return messages
+}
+
 func (s *Session) GetLastAssistantMessageContent() string {
 	return s.getLastMessageContentByRole(chat.MessageRoleAssistant)
 }

From 6996a021cfeed79dad30cdffd05d3d25a3dc8d63 Mon Sep 17 00:00:00 2001
From: Djordje Lukic <djordje.lukic@docker.com>
Date: Tue, 9 Jun 2026 21:30:57 +0200
Subject: [PATCH 2/2] fix(compactor): scale compaction budgets to the context
 window

The compactor used fixed absolute budgets: MaxSummaryTokens (16k) was
subtracted from the window when sizing the summarizer's input, and
maxKeepTokens (20k) sized the verbatim-kept tail. Since ead97457 made
compaction activate for models whose window resolves from
provider_opts.context_size, both constants could exceed the entire
window: contextAvailable went to zero, FirstIndexInBudget dropped every
conversation message, and the summarizer received only its own prompts.
It then fabricated an 'I see no conversation history' non-summary that
replaced the real session history.

Scale both budgets to the window (min(16k, limit/4) for the summary
cap, min(20k, limit/5) for the kept tail) so the kept tail plus the
summary always land well under the compaction threshold, and use the
scaled cap for the summary call's max_tokens so small-window providers
don't reject the request.

As a safety net, RunLLM now no-ops when not a single conversation
message fits the summarization budget (e.g. one giant tool result)
instead of running the summarizer on an empty conversation and wiping
the history with the result.

ComputeFirstKeptEntry gains a contextLimit parameter so hook-supplied
summaries share the same kept-tail policy; a non-positive limit falls
back to the unscaled budget.

Related to #2871

Assisted-By: docker-agent
Signed-off-by: Djordje Lukic <djordje.lukic@docker.com>
---
 pkg/runtime/compactor/compactor.go      |  63 ++++++++++++---
 pkg/runtime/compactor/compactor_test.go | 103 ++++++++++++++++++++++--
 pkg/runtime/session_compaction.go       |   6 +-
 3 files changed, 151 insertions(+), 21 deletions(-)

diff --git a/pkg/runtime/compactor/compactor.go b/pkg/runtime/compactor/compactor.go
index f810296ca..501b87e4a 100644
--- a/pkg/runtime/compactor/compactor.go
+++ b/pkg/runtime/compactor/compactor.go
@@ -22,6 +22,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"log/slog"
 	"time"
 
 	"github.com/docker/docker-agent/pkg/agent"
@@ -35,15 +36,43 @@ import (
 // MaxSummaryTokens caps the summary's output length when using the
 // default LLM strategy. Exposed because the runtime subtracts it from
 // the model's context budget when deciding whether the model lookup
-// produced a workable limit.
+// produced a workable limit. For small context windows the effective
+// cap is scaled down via [summaryTokenBudget] so the summary call
+// never consumes more than a quarter of the window.
 const MaxSummaryTokens = 16_000
 
 // maxKeepTokens is the runtime's policy for how much recent
 // conversation to preserve verbatim across a compaction. Messages
 // fitting in this window are kept aside; the rest are the candidates
-// to summarize.
+// to summarize. For small context windows the effective budget is
+// scaled down via [keepTokenBudget] so the kept tail never occupies
+// more than a fifth of the window.
 const maxKeepTokens = 20_000
 
+// summaryTokenBudget returns the output-token cap for the summary
+// call, scaled to the context window. The fixed [MaxSummaryTokens]
+// cap works for large windows but exceeds small ones entirely (e.g. a
+// local model with provider_opts.context_size of 8k), which used to
+// leave no room for the conversation being summarized — the
+// summarizer then received an empty conversation and produced a
+// confused non-summary that wiped the session history.
+func summaryTokenBudget(contextLimit int64) int64 {
+	return min(MaxSummaryTokens, contextLimit/4) // NOTE(review): <= 0 when contextLimit < 4; unlike keepTokenBudget there is no fallback
+}
+
+// keepTokenBudget returns the verbatim-keep budget for a compaction,
+// scaled to the context window so that the kept tail plus the summary
+// always leave the post-compaction session well under the compaction
+// threshold. A non-positive contextLimit (hook-supplied summaries may
+// run without a resolvable model definition) falls back to the
+// unscaled policy.
+func keepTokenBudget(contextLimit int64) int64 {
+	if contextLimit <= 0 {
+		return maxKeepTokens
+	}
+	return min(maxKeepTokens, contextLimit/5) // integer division: a limit of 1-4 yields a zero keep budget
+}
+
 // Result is the structural outcome of running a compaction strategy.
 // The runtime applies it to the parent session by appending a
 // session.Item with FirstKeptEntry set, resetting the running
@@ -120,12 +149,24 @@ func RunLLM(ctx context.Context, args LLMArgs) (*Result, error) {
 
 	summaryModel := provider.CloneWithOptions(ctx, args.Agent.Model(ctx),
 		options.WithStructuredOutput(nil),
-		options.WithMaxTokens(MaxSummaryTokens),
+		options.WithMaxTokens(summaryTokenBudget(args.ContextLimit)),
 	)
 	compactionAgent := agent.New("root", "", agent.WithModel(summaryModel))
 
 	messages, firstKeptEntry := extractMessages(args.Session, compactionAgent, args.ContextLimit, args.AdditionalPrompt)
 
+	// The first and last entries are the synthesized compaction
+	// system/user prompts; anything between them is the conversation to
+	// summarize. Running the summarizer without a conversation would
+	// make it fabricate a "there is no history" reply that then
+	// REPLACES the real session history, so treat this as a no-op
+	// instead (the session is left untouched).
+	if len(messages) <= 2 {
+		slog.WarnContext(ctx, "Compaction skipped: no conversation messages fit the summarization budget",
+			"session_id", args.Session.ID, "context_limit", args.ContextLimit)
+		return nil, nil
+	}
+
 	compactionSession := session.New(
 		session.WithTitle("Generating summary"),
 		session.WithMessages(toItems(messages)),
@@ -150,12 +191,12 @@ func RunLLM(ctx context.Context, args LLMArgs) (*Result, error) {
 
 // ComputeFirstKeptEntry returns the index in sess.Messages of the
 // first message preserved verbatim after compaction, given the
-// [maxKeepTokens] window. Used by the runtime when a hook supplies
-// its own summary so the kept-tail policy stays consistent across
-// the two strategies.
-func ComputeFirstKeptEntry(sess *session.Session) int {
+// [keepTokenBudget] window for contextLimit. Used by the runtime when
+// a hook supplies its own summary so the kept-tail policy stays
+// consistent across the two strategies.
+func ComputeFirstKeptEntry(sess *session.Session, contextLimit int64) int {
 	messages, sessIndices := gatherCompactionInput(sess)
-	return firstKeptSessionIndex(sess, sessIndices, compaction.SplitIndexForKeep(messages, maxKeepTokens))
+	return firstKeptSessionIndex(sess, sessIndices, compaction.SplitIndexForKeep(messages, keepTokenBudget(contextLimit)))
 }
 
 // gatherCompactionInput is a thin wrapper around
@@ -197,12 +238,12 @@ func gatherCompactionInput(sess *session.Session) ([]chat.Message, []int) {
 // a cache checkpoint or accrue duplicate cost.
 //
 // If the conversation tail itself doesn't fit in
-// (contextLimit − MaxSummaryTokens − prompt-overhead), older messages
+// (contextLimit − summary budget − prompt-overhead), older messages
 // are dropped from the front of the to-compact list to make room.
 func extractMessages(sess *session.Session, _ *agent.Agent, contextLimit int64, additionalPrompt string) ([]chat.Message, int) {
 	messages, sessIndices := gatherCompactionInput(sess)
 
-	splitIdx := compaction.SplitIndexForKeep(messages, maxKeepTokens)
+	splitIdx := compaction.SplitIndexForKeep(messages, keepTokenBudget(contextLimit))
 	firstKeptEntry := firstKeptSessionIndex(sess, sessIndices, splitIdx)
 	messages = messages[:splitIdx]
 
@@ -222,7 +263,7 @@ func extractMessages(sess *session.Session, _ *agent.Agent, contextLimit int64,
 	}
 
 	contextAvailable := max(int64(0),
-		contextLimit-MaxSummaryTokens-
+		contextLimit-summaryTokenBudget(contextLimit)-
 			compaction.EstimateMessageTokens(&systemPromptMessage)-
 			compaction.EstimateMessageTokens(&userPromptMessage))
 	firstIndex := compaction.FirstIndexInBudget(messages, contextAvailable)
diff --git a/pkg/runtime/compactor/compactor_test.go b/pkg/runtime/compactor/compactor_test.go
index 492737831..7d65cb52f 100644
--- a/pkg/runtime/compactor/compactor_test.go
+++ b/pkg/runtime/compactor/compactor_test.go
@@ -3,6 +3,7 @@ package compactor
 import (
 	"context"
 	"errors"
+	"fmt"
 	"strings"
 	"testing"
 
@@ -76,15 +77,19 @@ func TestExtractMessages(t *testing.T) {
 			wantConversationMsgCount: 4,
 		},
 		{
-			name: "truncation when context limit is very small",
+			name: "older messages dropped when they exceed the summarization budget",
 			messages: []session.Item{
-				newMsg(chat.MessageRoleUser, "first message with lots of content that takes tokens"),
-				newMsg(chat.MessageRoleAssistant, "first response with lots of content that takes tokens"),
+				newMsg(chat.MessageRoleUser, strings.Repeat("a", 80_000)),      // ~20k tokens
+				newMsg(chat.MessageRoleAssistant, strings.Repeat("b", 80_000)), // ~20k tokens
 				newMsg(chat.MessageRoleUser, "second message"),
 				newMsg(chat.MessageRoleAssistant, "second response"),
 			},
-			contextLimit:             MaxSummaryTokens + 50,
-			wantConversationMsgCount: 0,
+			// The two small messages form the kept tail (keep budget
+			// 32k/5). Of the two ~20k-token compact candidates only the
+			// newest fits contextAvailable ≈ 0.75×32k − prompts ≈ 23.8k;
+			// the older one is dropped from the summarizer's input.
+			contextLimit:             32_000,
+			wantConversationMsgCount: 1,
 		},
 		{
 			name: "additional prompt is appended",
@@ -183,7 +188,7 @@ func TestComputeFirstKeptEntry(t *testing.T) {
 	t.Run("empty session returns 0", func(t *testing.T) {
 		t.Parallel()
 		sess := session.New()
-		assert.Equal(t, 0, ComputeFirstKeptEntry(sess))
+		assert.Equal(t, 0, ComputeFirstKeptEntry(sess, 100_000))
 	})
 
 	t.Run("short conversation: split at end (compact everything)", func(t *testing.T) {
@@ -193,7 +198,7 @@ func TestComputeFirstKeptEntry(t *testing.T) {
 			session.NewMessageItem(&session.Message{Message: chat.Message{Role: chat.MessageRoleUser, Content: "hi"}}),
 			session.NewMessageItem(&session.Message{Message: chat.Message{Role: chat.MessageRoleAssistant, Content: "hello"}}),
 		}))
-		assert.Equal(t, len(sess.Messages), ComputeFirstKeptEntry(sess))
+		assert.Equal(t, len(sess.Messages), ComputeFirstKeptEntry(sess, 100_000))
 	})
 }
 
@@ -345,6 +350,90 @@ func TestGatherCompactionInput_PriorSummaryWithoutFirstKeptEntry(t *testing.T) {
 	assert.Equal(t, []int{2, 3, 4}, sessIndices)
 }
 
+// TestRunLLM_SmallContextWindow is a regression test for issue #2871:
+// with a small context window (e.g. a local model whose size comes from
+// provider_opts.context_size), the fixed MaxSummaryTokens budget used to
+// consume the whole window, so the summarizer received zero conversation
+// messages, fabricated an "I see no conversation history" reply, and that
+// text then replaced the entire session history. The budgets must scale
+// with the window so the summarizer always sees real conversation.
+func TestRunLLM_SmallContextWindow(t *testing.T) {
+	t.Parallel()
+
+	big := strings.Repeat("x", 4_000) // ~1k estimated tokens per tool result
+	sess := session.New(session.WithUserMessage("please do the big task"))
+	for i := range 8 {
+		id := fmt.Sprintf("tc%d", i)
+		sess.AddMessage(session.NewAgentMessage("root", &chat.Message{
+			Role:      chat.MessageRoleAssistant,
+			ToolCalls: []tools.ToolCall{{ID: id, Function: tools.FunctionCall{Name: "shell", Arguments: `{"cmd":"ls"}`}}},
+		}))
+		sess.AddMessage(session.NewAgentMessage("root", &chat.Message{
+			Role:       chat.MessageRoleTool,
+			ToolCallID: id,
+			Content:    big,
+		}))
+	}
+	a := agent.New("root", "instr", agent.WithModel(fakeProvider{id: modelsdev.NewID("fake", "model")}))
+
+	var conversationCount int
+	result, err := RunLLM(t.Context(), LLMArgs{
+		Session:      sess,
+		Agent:        a,
+		ContextLimit: 8_192, // smaller than the fixed MaxSummaryTokens (16_000) cap
+		RunAgent: func(_ context.Context, _ *agent.Agent, cs *session.Session) error {
+			msgs := cs.GetAllMessages()
+			// All non-system messages minus the trailing compaction user
+			// prompt are the conversation handed to the summarizer.
+			conversationCount = len(msgs) - 1
+			cs.AddMessage(session.NewAgentMessage("root", &chat.Message{
+				Role:    chat.MessageRoleAssistant,
+				Content: "the summary",
+			}))
+			return nil
+		},
+	})
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	assert.Positive(t, conversationCount, "summarizer must receive conversation messages even on small context windows")
+	assert.Equal(t, "the summary", result.Summary)
+	assert.Less(t, result.FirstKeptEntry, len(sess.Messages), "a recent tail must be kept verbatim")
+	assert.Positive(t, result.FirstKeptEntry) // the kept tail must not start at index 0
+}
+
+// TestRunLLM_NoConversationFits_NoOps pins the safety net behind the
+// scaled budgets: when not a single conversation message fits the
+// summarization budget (e.g. one giant tool result), RunLLM must no-op
+// instead of running the summarizer on an empty conversation — the
+// resulting non-summary would otherwise wipe the session history.
+func TestRunLLM_NoConversationFits_NoOps(t *testing.T) {
+	t.Parallel()
+
+	sess := session.New(session.WithMessages([]session.Item{
+		session.NewMessageItem(&session.Message{Message: chat.Message{
+			Role:    chat.MessageRoleUser,
+			Content: strings.Repeat("x", 200_000), // ~50k tokens, exceeds the whole window
+		}}),
+	}))
+	a := agent.New("root", "instr", agent.WithModel(fakeProvider{id: modelsdev.NewID("fake", "model")}))
+
+	runAgentCalled := false
+	result, err := RunLLM(t.Context(), LLMArgs{
+		Session:      sess,
+		Agent:        a,
+		ContextLimit: 8_192, // far below the single ~50k-token message above
+		RunAgent: func(context.Context, *agent.Agent, *session.Session) error {
+			runAgentCalled = true
+			return nil
+		},
+	})
+
+	require.NoError(t, err)
+	assert.Nil(t, result, "compaction must be a no-op when nothing fits the budget")
+	assert.False(t, runAgentCalled, "the summarizer must not run on an empty conversation")
+}
+
 func TestRunLLM_DoesNotDuplicateSystemPrompt(t *testing.T) {
 	t.Parallel()
 
diff --git a/pkg/runtime/session_compaction.go b/pkg/runtime/session_compaction.go
index 6132c2d5d..115c209ce 100644
--- a/pkg/runtime/session_compaction.go
+++ b/pkg/runtime/session_compaction.go
@@ -76,7 +76,7 @@ func (r *LocalRuntime) doCompact(ctx context.Context, sess *session.Session, a *
 
 	// Choose the strategy: a hook-supplied summary if before_compaction
 	// returned one, otherwise the default LLM strategy.
-	result := summaryFromHook(sess, a, pre)
+	result := summaryFromHook(sess, a, pre, contextLimit)
 	if result == nil {
 		if contextLimit <= 0 {
 			slog.ErrorContext(ctx, "Failed to generate session summary",
@@ -143,7 +143,7 @@ func (r *LocalRuntime) doCompact(ctx context.Context, sess *session.Session, a *
 // summary's token count for session bookkeeping. The Result.Cost is
 // left at its zero value because no LLM call ran — the hook produced
 // the summary itself, so there's nothing to bill.
-func summaryFromHook(sess *session.Session, a *agent.Agent, pre *hooks.Result) *compactor.Result {
+func summaryFromHook(sess *session.Session, a *agent.Agent, pre *hooks.Result, contextLimit int64) *compactor.Result {
 	if pre == nil || pre.Summary == "" {
 		return nil
 	}
@@ -151,7 +151,7 @@ func summaryFromHook(sess *session.Session, a *agent.Agent, pre *hooks.Result) *
 		"session_id", sess.ID, "agent", a.Name(), "summary_length", len(pre.Summary))
 	return &compactor.Result{
 		Summary:        pre.Summary,
-		FirstKeptEntry: compactor.ComputeFirstKeptEntry(sess),
+		FirstKeptEntry: compactor.ComputeFirstKeptEntry(sess, contextLimit),
 		// Estimate the summary's token count for session bookkeeping;
 		// no LLM was called so Cost stays at the zero value.
 		InputTokens: compaction.EstimateMessageTokens(&chat.Message{