Fix uncalibrated overflow and trailing-message prefill error

BYK · BYK · commit 6af2b88f74f5 · 2026-02-24T22:37:39.000Z
- gradient: on the first turn (no calibration data), apply a 1.5x safety
  multiplier to tryFit's estimated token total before accepting a layer.
  chars/4 estimates undercount by up to 1.8x on sessions with large tool
  outputs, causing tryFit to pack too many messages — the window is
  estimated to fit but actually overflows the API limit. With the
  multiplier, the gradient escalates to the next layer until the estimated
  total * 1.5 fits within maxInput. Once calibrated (exact API counts
  available), the multiplier is not applied.

- index: remove the 'break on tool parts' from the trailing-message drop
  loop. The break left trailing assistant messages with tool parts at the
  end of the compressed window, causing 'conversation must end with a user
  message' prefill errors. Always drop trailing non-user messages
  regardless of tool content — a hard API error is worse than the model
  re-invoking a tool.
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "opencode-lore",
-  "version": "0.2.1",
+  "version": "0.2.2",
   "type": "module",
   "license": "MIT",
   "description": "Three-tier memory architecture for OpenCode — distillation, not summarization",
diff --git a/src/gradient.ts b/src/gradient.ts
@@ -722,8 +722,27 @@ export function transform(input: {
   const maxInput = contextLimit - outputReserved;
   const sid = input.sessionID ?? input.messages[0]?.info.sessionID;
 
+  // True when we have real API token data from a previous turn in this session.
+  // When false (first turn / session change), chars/4 estimates can undercount by
+  // up to 1.8x — so tryFit output must be validated with a safety multiplier before
+  // being used, to prevent sending an apparently-fitting window that actually overflows.
+  const calibrated = lastKnownInput > 0 && sid === lastKnownSessionID;
+
+  // On uncalibrated turns, apply this multiplier to tryFit's estimated total to
+  // approximate the real token count. 1.5 is conservative but not so aggressive
+  // that it forces layer 4 on modestly-sized sessions.
+  const UNCALIBRATED_SAFETY = 1.5;
+
+  // Returns true if the tryFit result is safe to use: either we have calibrated
+  // data (exact) or the estimated total * safety factor fits within maxInput.
+  function fitsWithSafetyMargin(result: { totalTokens: number } | null): boolean {
+    if (!result) return false;
+    if (calibrated) return true;
+    return result.totalTokens * UNCALIBRATED_SAFETY <= maxInput;
+  }
+
   let expectedInput: number;
-  if (lastKnownInput > 0 && sid === lastKnownSessionID) {
+  if (calibrated) {
     // Exact approach: prior API count + estimate of only the new messages.
     const newMsgCount = Math.max(0, input.messages.length - lastKnownMessageCount);
     const newMsgTokens = newMsgCount > 0
@@ -793,7 +812,7 @@ export function transform(input: {
           rawBudget,
           strip: "none",
         });
-    if (layer1) return { ...layer1, layer: 1, usable, distilledBudget, rawBudget };
+    if (fitsWithSafetyMargin(layer1)) return { ...layer1!, layer: 1, usable, distilledBudget, rawBudget };
   }
 
   // Layer 1 didn't fit (or was force-skipped) — reset the raw window cache.
@@ -812,9 +831,9 @@ export function transform(input: {
       strip: "old-tools",
       protectedTurns: 2,
     });
-    if (layer2) {
+    if (fitsWithSafetyMargin(layer2)) {
       urgentDistillation = true;
-      return { ...layer2, layer: 2, usable, distilledBudget, rawBudget };
+      return { ...layer2!, layer: 2, usable, distilledBudget, rawBudget };
     }
   }
 
@@ -833,9 +852,9 @@ export function transform(input: {
     rawBudget: Math.floor(usable * 0.55),
     strip: "all-tools",
   });
-  if (layer3) {
+  if (fitsWithSafetyMargin(layer3)) {
     urgentDistillation = true;
-    return { ...layer3, layer: 3, usable, distilledBudget, rawBudget };
+    return { ...layer3!, layer: 3, usable, distilledBudget, rawBudget };
   }
 
   // Layer 4: Emergency — last 2 distillations, last 3 raw messages with tool parts intact.
diff --git a/src/index.ts b/src/index.ts
@@ -389,12 +389,13 @@ export const LorePlugin: Plugin = async (ctx) => {
       // Layer 0 means all messages fit within the context budget — leave them alone
       // so the append-only sequence stays intact for prompt caching.
       if (result.layer > 0) {
+        // The API requires the conversation to end with a user message.
+        // Always drop trailing non-user messages — even assistant messages with
+        // tool parts. A hard API error is worse than the model re-invoking a tool.
         while (
           result.messages.length > 0 &&
           result.messages.at(-1)!.info.role !== "user"
         ) {
-          const last = result.messages.at(-1)!;
-          if (last.parts.some((p) => p.type === "tool")) break;
           const dropped = result.messages.pop()!;
           console.error(
             "[lore] WARN: dropping trailing",

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,6 +1,6 @@` |
| `1` | `1` | `{` |
| `2` | `2` | `"name": "opencode-lore",` |
| `3` | | `- "version": "0.2.1",` |
| | `3` | `+ "version": "0.2.2",` |
| `4` | `4` | `"type": "module",` |
| `5` | `5` | `"license": "MIT",` |
| `6` | `6` | `"description": "Three-tier memory architecture for OpenCode — distillation, not summarization",` |