From 541593a7f84433b66a2220c14924e52a6d2c0bda Mon Sep 17 00:00:00 2001 From: Djordje Lukic Date: Tue, 9 Jun 2026 21:30:44 +0200 Subject: [PATCH 1/2] fix(runtime): don't count sub-session tokens in the compaction trigger compactIfNeeded estimated the token impact of newly added messages via sess.GetAllMessages(), which recurses into sub-sessions. In multi-agent runs the content produced by a transfer_task child was therefore attributed to the parent session even though it never enters the parent's prompt (GetMessages skips sub-session items). The phantom tokens triggered a compaction of a parent conversation that was actually tiny; with everything fitting the keep budget the split resolved to the 'compact everything, keep nothing' sentinel, so the user's task and the in-flight tool exchange were wiped. The agent's next prompt was literally just 'Session Summary: ...', which models read as the user asking for a summary and answer with a confused 'I see no conversation history' reply, halting mid-task. Add Session.OwnMessages() (direct messages only, no sub-session recursion) and use it for the trigger's before/after counts so the estimate matches what the session actually sends. Fixes #2871 Assisted-By: docker-agent Signed-off-by: Djordje Lukic --- pkg/runtime/compaction_trigger_test.go | 106 +++++++++++++++++++++++++ pkg/runtime/loop.go | 11 ++- pkg/session/session.go | 18 +++++ 3 files changed, 132 insertions(+), 3 deletions(-) create mode 100644 pkg/runtime/compaction_trigger_test.go diff --git a/pkg/runtime/compaction_trigger_test.go b/pkg/runtime/compaction_trigger_test.go new file mode 100644 index 000000000..693c7a787 --- /dev/null +++ b/pkg/runtime/compaction_trigger_test.go @@ -0,0 +1,106 @@ +package runtime + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/docker/docker-agent/pkg/agent" + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/team" + "github.com/docker/docker-agent/pkg/tools" +) + +// TestCompactIfNeeded_IgnoresSubSessionTokens is a regression test for +// issue #2871: in a multi-agent run, the tokens accumulated inside a +// transfer_task sub-session were counted by the proactive compaction +// trigger (GetAllMessages recurses into sub-sessions) even though they +// never enter the parent's prompt (GetMessages skips sub-session items). +// The phantom tokens made the parent compact its own tiny conversation; +// with everything fitting the keep budget that meant "compact +// everything, keep nothing" — the agent's next prompt was just the +// summary and it halted with a confused "no conversation history" reply. +func TestCompactIfNeeded_IgnoresSubSessionTokens(t *testing.T) { + prov := &mockProvider{id: "test/model", stream: &mockStream{}} + root := agent.New("root", "agent", agent.WithModel(prov)) + tm := team.New(team.WithAgents(root)) + + rt, err := NewLocalRuntime(tm, + WithSessionCompaction(true), + WithModelStore(mockModelStoreWithLimit{limit: 100_000})) + require.NoError(t, err) + + sess := session.New(session.WithUserMessage("build the app")) + messageCountBefore := len(sess.OwnMessages()) + + // Simulate a completed transfer_task tool call: a sub-session holding + // far more content than the parent's context limit, plus a small + // tool-result message on the parent itself. + sub := session.New(session.WithUserMessage("subtask")) + sub.AddMessage(session.NewAgentMessage("worker", &chat.Message{ + Role: chat.MessageRoleAssistant, + Content: strings.Repeat("z", 600_000), // ~150k estimated tokens + })) + sess.AddMessage(session.NewAgentMessage("root", &chat.Message{ + Role: chat.MessageRoleAssistant, + ToolCalls: []tools.ToolCall{{ID: "t1", Function: tools.FunctionCall{Name: "transfer_task"}}}, + })) + sess.AddSubSession(sub) + sess.AddMessage(session.NewAgentMessage("root", &chat.Message{ + Role: chat.MessageRoleTool, + ToolCallID: "t1", + Content: "subtask done", + })) + + events := make(chan Event, 16) + rt.compactIfNeeded(t.Context(), sess, root, 100_000, messageCountBefore, NewChannelSink(events)) + close(events) + + for ev := range events { + _, isCompaction := ev.(*SessionCompactionEvent) + assert.False(t, isCompaction, + "sub-session tokens must not trigger compaction of the parent session") + } +} + +// TestCompactIfNeeded_TriggersOnOwnMessages pins the complementary case: +// large tool results recorded directly on the session still trigger the +// proactive compaction. +func TestCompactIfNeeded_TriggersOnOwnMessages(t *testing.T) { + prov := &mockProvider{id: "test/model", stream: &mockStream{}} + root := agent.New("root", "agent", agent.WithModel(prov)) + tm := team.New(team.WithAgents(root)) + + rt, err := NewLocalRuntime(tm, + WithSessionCompaction(true), + WithModelStore(mockModelStoreWithLimit{limit: 100_000})) + require.NoError(t, err) + + sess := session.New(session.WithUserMessage("build the app")) + messageCountBefore := len(sess.OwnMessages()) + + sess.AddMessage(session.NewAgentMessage("root", &chat.Message{ + Role: chat.MessageRoleAssistant, + ToolCalls: []tools.ToolCall{{ID: "t1", Function: tools.FunctionCall{Name: "shell"}}}, + })) + sess.AddMessage(session.NewAgentMessage("root", &chat.Message{ + Role: chat.MessageRoleTool, + ToolCallID: "t1", + Content: strings.Repeat("z", 600_000), // ~150k estimated tokens + })) + + events := make(chan Event, 16) + rt.compactIfNeeded(t.Context(), sess, root, 100_000, messageCountBefore, NewChannelSink(events)) + close(events) + + sawCompaction := false + for ev := range events { + if _, ok := ev.(*SessionCompactionEvent); ok { + sawCompaction = true + } + } + assert.True(t, sawCompaction, "large own tool results must still trigger compaction") +} diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go index d27e7656f..fdbb4125f 100644 --- a/pkg/runtime/loop.go +++ b/pkg/runtime/loop.go @@ -84,7 +84,7 @@ func (r *LocalRuntime) drainAndEmitSteered(ctx context.Context, sess *session.Se if len(steered) == 0 { return steerResult{} } - messageCountBefore := len(sess.GetAllMessages()) + messageCountBefore := len(sess.OwnMessages()) contents := make([]string, 0, len(steered)) for i, sm := range steered { contents = append(contents, sm.Content) @@ -627,7 +627,7 @@ func (r *LocalRuntime) runTurn( // Record the message count before tool calls so we can // measure how much content was added by tool results. - messageCountBeforeTools := len(sess.GetAllMessages()) + messageCountBeforeTools := len(sess.OwnMessages()) stopRun, stopMsg := r.processToolCalls(ctx, sess, res.Calls, agentTools, events) @@ -851,7 +851,12 @@ func (r *LocalRuntime) compactIfNeeded( return } - newMessages := sess.GetAllMessages()[messageCountBefore:] + // Estimate only over the session's own new messages: sub-session + // content recorded during tool calls (transfer_task and friends) + // never enters this session's prompt, so counting it here would + // attribute phantom tokens to a small parent conversation and + // trigger a compaction that wipes it (see issue #2871). + newMessages := sess.OwnMessages()[messageCountBefore:] var addedTokens int64 for _, msg := range newMessages { addedTokens += compaction.EstimateMessageTokens(&msg.Message) diff --git a/pkg/session/session.go b/pkg/session/session.go index 30303f030..be35198f8 100644 --- a/pkg/session/session.go +++ b/pkg/session/session.go @@ -541,6 +541,24 @@ func (s *Session) GetAllMessages() []Message { return messages } +// OwnMessages extracts this session's direct messages, excluding system +// messages and WITHOUT recursing into sub-sessions. This is the set of +// messages that actually enters this session's prompt (GetMessages skips +// sub-session items), so token accounting that drives compaction must +// use it: counting sub-session content would attribute phantom tokens +// to the parent and compact a conversation that isn't actually large. +func (s *Session) OwnMessages() []Message { + items := s.snapshotItems() + + var messages []Message + for _, item := range items { + if item.IsMessage() && item.Message.Message.Role != chat.MessageRoleSystem { + messages = append(messages, *item.Message) + } + } + return messages +} + func (s *Session) GetLastAssistantMessageContent() string { return s.getLastMessageContentByRole(chat.MessageRoleAssistant) } From 6996a021cfeed79dad30cdffd05d3d25a3dc8d63 Mon Sep 17 00:00:00 2001 From: Djordje Lukic Date: Tue, 9 Jun 2026 21:30:57 +0200 Subject: [PATCH 2/2] fix(compactor): scale compaction budgets to the context window The compactor used fixed absolute budgets: MaxSummaryTokens (16k) was subtracted from the window when sizing the summarizer's input, and maxKeepTokens (20k) sized the verbatim-kept tail. Since ead97457 made compaction activate for models whose window resolves from provider_opts.context_size, both constants can exceed the entire window: contextAvailable went to zero, FirstIndexInBudget dropped every conversation message, and the summarizer received only its own prompts. It then fabricated an 'I see no conversation history' non-summary that replaced the real session history. Scale both budgets to the window (min(16k, limit/4) for the summary cap, min(20k, limit/5) for the kept tail) so the kept tail plus the summary always land well under the compaction threshold, and use the scaled cap for the summary call's max_tokens so small-window providers don't reject the request. As a safety net, RunLLM now no-ops when not a single conversation message fits the summarization budget (e.g. one giant tool result) instead of running the summarizer on an empty conversation and wiping the history with the result. ComputeFirstKeptEntry gains a contextLimit parameter so hook-supplied summaries share the same kept-tail policy; a non-positive limit falls back to the unscaled budget. Related to #2871 Assisted-By: docker-agent Signed-off-by: Djordje Lukic --- pkg/runtime/compactor/compactor.go | 63 ++++++++++++--- pkg/runtime/compactor/compactor_test.go | 103 ++++++++++++++++++++++-- pkg/runtime/session_compaction.go | 6 +- 3 files changed, 151 insertions(+), 21 deletions(-) diff --git a/pkg/runtime/compactor/compactor.go b/pkg/runtime/compactor/compactor.go index f810296ca..501b87e4a 100644 --- a/pkg/runtime/compactor/compactor.go +++ b/pkg/runtime/compactor/compactor.go @@ -22,6 +22,7 @@ import ( "context" "errors" "fmt" + "log/slog" "time" "github.com/docker/docker-agent/pkg/agent" @@ -35,15 +36,43 @@ import ( // MaxSummaryTokens caps the summary's output length when using the // default LLM strategy. Exposed because the runtime subtracts it from // the model's context budget when deciding whether the model lookup -// produced a workable limit. +// produced a workable limit. For small context windows the effective +// cap is scaled down via [summaryTokenBudget] so the summary call +// never consumes more than a quarter of the window. const MaxSummaryTokens = 16_000 // maxKeepTokens is the runtime's policy for how much recent // conversation to preserve verbatim across a compaction. Messages // fitting in this window are kept aside; the rest are the candidates -// to summarize. +// to summarize. For small context windows the effective budget is +// scaled down via [keepTokenBudget] so the kept tail never occupies +// more than a fifth of the window. const maxKeepTokens = 20_000 +// summaryTokenBudget returns the output-token cap for the summary +// call, scaled to the context window. The fixed [MaxSummaryTokens] +// cap works for large windows but exceeds small ones entirely (e.g. a +// local model with provider_opts.context_size of 8k), which used to +// leave no room for the conversation being summarized — the +// summarizer then received an empty conversation and produced a +// confused non-summary that wiped the session history. +func summaryTokenBudget(contextLimit int64) int64 { + return min(MaxSummaryTokens, contextLimit/4) +} + +// keepTokenBudget returns the verbatim-keep budget for a compaction, +// scaled to the context window so that the kept tail plus the summary +// always leave the post-compaction session well under the compaction +// threshold. A non-positive contextLimit (hook-supplied summaries may +// run without a resolvable model definition) falls back to the +// unscaled policy. +func keepTokenBudget(contextLimit int64) int64 { + if contextLimit <= 0 { + return maxKeepTokens + } + return min(maxKeepTokens, contextLimit/5) +} + // Result is the structural outcome of running a compaction strategy. // The runtime applies it to the parent session by appending a // session.Item with FirstKeptEntry set, resetting the running @@ -120,12 +149,24 @@ func RunLLM(ctx context.Context, args LLMArgs) (*Result, error) { summaryModel := provider.CloneWithOptions(ctx, args.Agent.Model(ctx), options.WithStructuredOutput(nil), - options.WithMaxTokens(MaxSummaryTokens), + options.WithMaxTokens(summaryTokenBudget(args.ContextLimit)), ) compactionAgent := agent.New("root", "", agent.WithModel(summaryModel)) messages, firstKeptEntry := extractMessages(args.Session, compactionAgent, args.ContextLimit, args.AdditionalPrompt) + // The first and last entries are the synthesized compaction + // system/user prompts; anything between them is the conversation to + // summarize. Running the summarizer without a conversation would + // make it fabricate a "there is no history" reply that then + // REPLACES the real session history, so treat this as a no-op + // instead (the session is left untouched). + if len(messages) <= 2 { + slog.WarnContext(ctx, "Compaction skipped: no conversation messages fit the summarization budget", + "session_id", args.Session.ID, "context_limit", args.ContextLimit) + return nil, nil + } + compactionSession := session.New( session.WithTitle("Generating summary"), session.WithMessages(toItems(messages)), @@ -150,12 +191,12 @@ func RunLLM(ctx context.Context, args LLMArgs) (*Result, error) { // ComputeFirstKeptEntry returns the index in sess.Messages of the // first message preserved verbatim after compaction, given the -// [maxKeepTokens] window. Used by the runtime when a hook supplies -// its own summary so the kept-tail policy stays consistent across -// the two strategies. -func ComputeFirstKeptEntry(sess *session.Session) int { +// [keepTokenBudget] window for contextLimit. Used by the runtime when +// a hook supplies its own summary so the kept-tail policy stays +// consistent across the two strategies. +func ComputeFirstKeptEntry(sess *session.Session, contextLimit int64) int { messages, sessIndices := gatherCompactionInput(sess) - return firstKeptSessionIndex(sess, sessIndices, compaction.SplitIndexForKeep(messages, maxKeepTokens)) + return firstKeptSessionIndex(sess, sessIndices, compaction.SplitIndexForKeep(messages, keepTokenBudget(contextLimit))) } // gatherCompactionInput is a thin wrapper around @@ -197,12 +238,12 @@ func gatherCompactionInput(sess *session.Session) ([]chat.Message, []int) { // a cache checkpoint or accrue duplicate cost. // // If the conversation tail itself doesn't fit in -// (contextLimit − MaxSummaryTokens − prompt-overhead), older messages +// (contextLimit − summary budget − prompt-overhead), older messages // are dropped from the front of the to-compact list to make room. func extractMessages(sess *session.Session, _ *agent.Agent, contextLimit int64, additionalPrompt string) ([]chat.Message, int) { messages, sessIndices := gatherCompactionInput(sess) - splitIdx := compaction.SplitIndexForKeep(messages, maxKeepTokens) + splitIdx := compaction.SplitIndexForKeep(messages, keepTokenBudget(contextLimit)) firstKeptEntry := firstKeptSessionIndex(sess, sessIndices, splitIdx) messages = messages[:splitIdx] @@ -222,7 +263,7 @@ func extractMessages(sess *session.Session, _ *agent.Agent, contextLimit int64, } contextAvailable := max(int64(0), - contextLimit-MaxSummaryTokens- + contextLimit-summaryTokenBudget(contextLimit)- compaction.EstimateMessageTokens(&systemPromptMessage)- compaction.EstimateMessageTokens(&userPromptMessage)) firstIndex := compaction.FirstIndexInBudget(messages, contextAvailable) diff --git a/pkg/runtime/compactor/compactor_test.go b/pkg/runtime/compactor/compactor_test.go index 492737831..7d65cb52f 100644 --- a/pkg/runtime/compactor/compactor_test.go +++ b/pkg/runtime/compactor/compactor_test.go @@ -3,6 +3,7 @@ package compactor import ( "context" "errors" + "fmt" "strings" "testing" @@ -76,15 +77,19 @@ func TestExtractMessages(t *testing.T) { wantConversationMsgCount: 4, }, { - name: "truncation when context limit is very small", + name: "older messages dropped when they exceed the summarization budget", messages: []session.Item{ - newMsg(chat.MessageRoleUser, "first message with lots of content that takes tokens"), - newMsg(chat.MessageRoleAssistant, "first response with lots of content that takes tokens"), + newMsg(chat.MessageRoleUser, strings.Repeat("a", 80_000)), // ~20k tokens + newMsg(chat.MessageRoleAssistant, strings.Repeat("b", 80_000)), // ~20k tokens newMsg(chat.MessageRoleUser, "second message"), newMsg(chat.MessageRoleAssistant, "second response"), }, - contextLimit: MaxSummaryTokens + 50, - wantConversationMsgCount: 0, + // The two small messages form the kept tail (keep budget + // 32k/5). Of the two ~20k-token compact candidates only the + // newest fits contextAvailable ≈ 0.75×32k − prompts ≈ 23.8k; + // the older one is dropped from the summarizer's input. + contextLimit: 32_000, + wantConversationMsgCount: 1, }, { name: "additional prompt is appended", @@ -183,7 +188,7 @@ func TestComputeFirstKeptEntry(t *testing.T) { t.Run("empty session returns 0", func(t *testing.T) { t.Parallel() sess := session.New() - assert.Equal(t, 0, ComputeFirstKeptEntry(sess)) + assert.Equal(t, 0, ComputeFirstKeptEntry(sess, 100_000)) }) t.Run("short conversation: split at end (compact everything)", func(t *testing.T) { @@ -193,7 +198,7 @@ func TestComputeFirstKeptEntry(t *testing.T) { session.NewMessageItem(&session.Message{Message: chat.Message{Role: chat.MessageRoleUser, Content: "hi"}}), session.NewMessageItem(&session.Message{Message: chat.Message{Role: chat.MessageRoleAssistant, Content: "hello"}}), })) - assert.Equal(t, len(sess.Messages), ComputeFirstKeptEntry(sess)) + assert.Equal(t, len(sess.Messages), ComputeFirstKeptEntry(sess, 100_000)) }) } @@ -345,6 +350,90 @@ func TestGatherCompactionInput_PriorSummaryWithoutFirstKeptEntry(t *testing.T) { assert.Equal(t, []int{2, 3, 4}, sessIndices) } +// TestRunLLM_SmallContextWindow is a regression test for issue #2871: +// with a small context window (e.g. a local model whose size comes from +// provider_opts.context_size), the fixed MaxSummaryTokens budget used to +// consume the whole window, so the summarizer received zero conversation +// messages, fabricated an "I see no conversation history" reply, and that +// text then replaced the entire session history. The budgets must scale +// with the window so the summarizer always sees real conversation. +func TestRunLLM_SmallContextWindow(t *testing.T) { + t.Parallel() + + big := strings.Repeat("x", 4_000) // ~1k estimated tokens per tool result + sess := session.New(session.WithUserMessage("please do the big task")) + for i := range 8 { + id := fmt.Sprintf("tc%d", i) + sess.AddMessage(session.NewAgentMessage("root", &chat.Message{ + Role: chat.MessageRoleAssistant, + ToolCalls: []tools.ToolCall{{ID: id, Function: tools.FunctionCall{Name: "shell", Arguments: `{"cmd":"ls"}`}}}, + })) + sess.AddMessage(session.NewAgentMessage("root", &chat.Message{ + Role: chat.MessageRoleTool, + ToolCallID: id, + Content: big, + })) + } + a := agent.New("root", "instr", agent.WithModel(fakeProvider{id: modelsdev.NewID("fake", "model")})) + + var conversationCount int + result, err := RunLLM(t.Context(), LLMArgs{ + Session: sess, + Agent: a, + ContextLimit: 8_192, + RunAgent: func(_ context.Context, _ *agent.Agent, cs *session.Session) error { + msgs := cs.GetAllMessages() + // All non-system messages minus the trailing compaction user + // prompt are the conversation handed to the summarizer. + conversationCount = len(msgs) - 1 + cs.AddMessage(session.NewAgentMessage("root", &chat.Message{ + Role: chat.MessageRoleAssistant, + Content: "the summary", + })) + return nil + }, + }) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Positive(t, conversationCount, "summarizer must receive conversation messages even on small context windows") + assert.Equal(t, "the summary", result.Summary) + assert.Less(t, result.FirstKeptEntry, len(sess.Messages), "a recent tail must be kept verbatim") + assert.Positive(t, result.FirstKeptEntry) +} + +// TestRunLLM_NoConversationFits_NoOps pins the safety net behind the +// scaled budgets: when not a single conversation message fits the +// summarization budget (e.g. one giant tool result), RunLLM must no-op +// instead of running the summarizer on an empty conversation — the +// resulting non-summary would otherwise wipe the session history. +func TestRunLLM_NoConversationFits_NoOps(t *testing.T) { + t.Parallel() + + sess := session.New(session.WithMessages([]session.Item{ + session.NewMessageItem(&session.Message{Message: chat.Message{ + Role: chat.MessageRoleUser, + Content: strings.Repeat("x", 200_000), // ~50k tokens, exceeds the whole window + }}), + })) + a := agent.New("root", "instr", agent.WithModel(fakeProvider{id: modelsdev.NewID("fake", "model")})) + + runAgentCalled := false + result, err := RunLLM(t.Context(), LLMArgs{ + Session: sess, + Agent: a, + ContextLimit: 8_192, + RunAgent: func(context.Context, *agent.Agent, *session.Session) error { + runAgentCalled = true + return nil + }, + }) + + require.NoError(t, err) + assert.Nil(t, result, "compaction must be a no-op when nothing fits the budget") + assert.False(t, runAgentCalled, "the summarizer must not run on an empty conversation") +} + func TestRunLLM_DoesNotDuplicateSystemPrompt(t *testing.T) { t.Parallel() diff --git a/pkg/runtime/session_compaction.go b/pkg/runtime/session_compaction.go index 6132c2d5d..115c209ce 100644 --- a/pkg/runtime/session_compaction.go +++ b/pkg/runtime/session_compaction.go @@ -76,7 +76,7 @@ func (r *LocalRuntime) doCompact(ctx context.Context, sess *session.Session, a * // Choose the strategy: a hook-supplied summary if before_compaction // returned one, otherwise the default LLM strategy. - result := summaryFromHook(sess, a, pre) + result := summaryFromHook(sess, a, pre, contextLimit) if result == nil { if contextLimit <= 0 { slog.ErrorContext(ctx, "Failed to generate session summary", @@ -143,7 +143,7 @@ func (r *LocalRuntime) doCompact(ctx context.Context, sess *session.Session, a * // summary's token count for session bookkeeping. The Result.Cost is // left at its zero value because no LLM call ran — the hook produced // the summary itself, so there's nothing to bill. -func summaryFromHook(sess *session.Session, a *agent.Agent, pre *hooks.Result) *compactor.Result { +func summaryFromHook(sess *session.Session, a *agent.Agent, pre *hooks.Result, contextLimit int64) *compactor.Result { if pre == nil || pre.Summary == "" { return nil } @@ -151,7 +151,7 @@ func summaryFromHook(sess *session.Session, a *agent.Agent, pre *hooks.Result) * "session_id", sess.ID, "agent", a.Name(), "summary_length", len(pre.Summary)) return &compactor.Result{ Summary: pre.Summary, - FirstKeptEntry: compactor.ComputeFirstKeptEntry(sess), + FirstKeptEntry: compactor.ComputeFirstKeptEntry(sess, contextLimit), // Estimate the summary's token count for session bookkeeping; // no LLM was called so Cost stays at the zero value. InputTokens: compaction.EstimateMessageTokens(&chat.Message{