Merge pull request #193 from Opencode-DCP/refactor/simplify-extract-distillation

Tarquinen · web-flow · commit 82285142b73d · 2025-12-23T15:53:00.000-05:00
Simplify extract distillation to array format
diff --git a/lib/messages/prune.ts b/lib/messages/prune.ts
@@ -6,7 +6,8 @@ import { extractParameterKey, buildToolIdList } from "./utils"
 import { getLastAssistantMessage, getLastUserMessage, isMessageCompacted } from "../shared-utils"
 import { AssistantMessage, UserMessage } from "@opencode-ai/sdk"
 
-const PRUNED_TOOL_INPUT_REPLACEMENT = "[Input removed to save context]"
+const PRUNED_TOOL_INPUT_REPLACEMENT =
+    "[content removed to save context, this is not what was written to the file, but a placeholder]"
 const PRUNED_TOOL_OUTPUT_REPLACEMENT =
     "[Output removed to save context - information superseded or no longer needed]"
 const getNudgeString = (config: PluginConfig): string => {
@@ -50,7 +51,7 @@ const SYNTHETIC_MESSAGE_ID = "msg_01234567890123456789012345"
 const SYNTHETIC_PART_ID = "prt_01234567890123456789012345"
 const SYNTHETIC_USER_MESSAGE_ID = "msg_01234567890123456789012346"
 const SYNTHETIC_USER_PART_ID = "prt_01234567890123456789012346"
-const REASONING_MODEL_USER_MESSAGE_CONTENT = "<system-context-injection/>"
+const REASONING_MODEL_USER_MESSAGE_CONTENT = "[internal: context sync - no response needed]"
 
 const buildPrunableToolsList = (
     state: SessionState,
diff --git a/lib/prompts/discard-tool-spec.txt b/lib/prompts/discard-tool-spec.txt
@@ -3,34 +3,29 @@ Discards tool outputs from context to manage conversation size and reduce noise.
 ## IMPORTANT: The Prunable List
 A `<prunable-tools>` list is provided to you showing available tool outputs you can discard when there are tools available for pruning. Each line has the format `ID: tool, parameter` (e.g., `20: read, /path/to/file.ts`). You MUST only use numeric IDs that appear in this list to select which tools to discard.
 
-**Note:** For `write` and `edit` tools, discarding removes the input content (the code being written/edited) while preserving the output confirmation. This is useful after completing a file modification when you no longer need the raw content in context.
-
 ## When to Use This Tool
 
-Use `discard` for removing tool outputs that are no longer needed **without preserving their content**:
-
-### 1. Task Completion (Clean Up)
-**When:** You have successfully completed a specific unit of work (e.g., fixed a bug, wrote a file, answered a question).
-**Action:** Discard the tools used for that task with reason `completion`.
+Use `discard` for removing tool content that is no longer needed:
 
-### 2. Removing Noise (Garbage Collection)
-**When:** You have read files or run commands that turned out to be irrelevant, unhelpful, or outdated (meaning later tools have provided fresher, more valid information).
-**Action:** Discard these specific tool outputs immediately with reason `noise`.
+- **Noise:** Irrelevant, unhelpful, or superseded outputs that provide no value.
+- **Task Completion:** Work is complete and there's no valuable information worth preserving.
 
 ## When NOT to Use This Tool
 
-- **If you need to preserve information:** Keep the raw output in context rather than discarding it.
-- **If you'll need the output later:** Don't discard files you plan to edit, or context you'll need for implementation.
+- **If the output contains useful information:** Use `extract` instead to preserve key findings.
+- **If you'll need the output later:** Don't discard files you plan to edit or context you'll need for implementation.
 
 ## Best Practices
 - **Strategic Batching:** Don't discard single small tool outputs (like short bash commands) unless they are pure noise. Wait until you have several items to perform high-impact discards.
 - **Think ahead:** Before discarding, ask: "Will I need this output for an upcoming task?" If yes, keep it.
 
 ## Format
-The `ids` parameter is an array where the first element is the reason, followed by numeric IDs:
-`ids: ["reason", "id1", "id2", ...]`
 
-## Examples
+- `ids`: Array where the first element is the reason, followed by numeric IDs from the `<prunable-tools>` list
+
+Reasons: `noise` | `completion`
+
+## Example
 
 <example_noise>
 Assistant: [Reads 'wrong_file.ts']
@@ -40,17 +35,7 @@ This file isn't relevant to the auth system. I'll remove it to clear the context
 
 <example_completion>
 Assistant: [Runs tests, they pass]
-The tests passed. I'll clean up now.
+The tests passed and I don't need to preserve any details. I'll clean up now.
 [Uses discard with ids: ["completion", "20", "21"]]
 </example_completion>
 
-<example_keep>
-Assistant: [Reads 'auth.ts' to understand the login flow]
-I've understood the auth flow. I'll need to modify this file to add the new validation, so I'm keeping this read in context rather than discarding.
-</example_keep>
-
-<example_edit_completion>
-Assistant: [Edits 'auth.ts' to add validation]
-The edit was successful. I no longer need the raw edit content in context.
-[Uses discard with ids: ["completion", "15"]]
-</example_edit_completion>
diff --git a/lib/prompts/extract-tool-spec.txt b/lib/prompts/extract-tool-spec.txt
@@ -7,70 +7,38 @@ A `<prunable-tools>` list is provided to you showing available tool outputs you
 
 Use `extract` when you have gathered useful information that you want to **preserve in distilled form** before removing the raw outputs:
 
-### 1. Task Completion
-**When:** You have completed a unit of work and want to preserve key findings.
-**Action:** Extract with distillation scaled to the value of the content. High-value insights require comprehensive capture; routine completions can use lighter distillation.
-
-### 2. Knowledge Preservation
-**When:** You have read files, run commands, or gathered context that contains valuable information you'll need to reference later, but the full raw output is too large to keep.
-**Action:** Convert raw data into distilled knowledge. This allows you to remove large outputs (like full file reads) while keeping only the specific parts you need (like a single function signature or constant).
-
-## CRITICAL: Distillation Requirements
-
-You MUST provide distilled findings in the `distillation` parameter. This is not optional.
-
-- **Comprehensive Capture:** Distillation is not just a summary. It must be a high-fidelity representation of the technical details. If you read a file, the distillation should include function signatures, specific logic flows, constant values, and any constraints or edge cases discovered.
-- **Task-Relevant Verbosity:** Be as verbose as necessary to ensure that the "distilled" version is a complete substitute for the raw output for the task at hand. If you will need to reference a specific algorithm or interface later, include it in its entirety within the distillation.
-- **Extract Per-ID:** When extracting from multiple tools, your `distillation` object MUST contain a corresponding entry for EVERY ID being extracted. You must capture high-fidelity findings for each tool individually to ensure no signal is lost.
-- **Structure:** Map EVERY `ID` from the `ids` array to its specific distilled findings.
-  Example: `{ "20": { ... }, "21": { ... } }`
-- Capture all relevant details (function names, logic, constraints) to ensure no signal is lost.
-- Prioritize information that is essential for the immediate next steps of your plan.
+- **Task Completion:** You completed a unit of work and want to preserve key findings.
+- **Knowledge Preservation:** You have context that contains valuable information, but also a lot of unnecessary detail - you only need to preserve some specifics.
 
 ## When NOT to Use This Tool
 
-- **If you need precise syntax:** If you'll need to edit a file, grep for exact strings, or reference precise syntax, keep the raw output. Distillation works for understanding; implementation often requires the original.
-- **If uncertain:** Prefer keeping over re-fetching. The cost of retaining context is lower than the cost of redundant tool calls.
+- **If you need precise syntax:** If you'll edit a file or grep for exact strings, keep the raw output.
+- **If uncertain:** Prefer keeping over re-fetching.
+
 
 ## Best Practices
-- **Technical Fidelity:** Ensure that types, parameters, and return values are preserved if they are relevant to upcoming implementation steps.
 - **Strategic Batching:** Wait until you have several items or a few large outputs to extract, rather than doing tiny, frequent extractions. Aim for high-impact extractions that significantly reduce context size.
 - **Think ahead:** Before extracting, ask: "Will I need the raw output for an upcoming task?" If you researched a file you'll later edit, do NOT extract it.
 
 ## Format
-The `ids` parameter is an array of numeric IDs as strings:
-`ids: ["id1", "id2", ...]`
 
-The `distillation` parameter is an object mapping each ID to its distilled findings:
-`distillation: { "id1": { ...findings... }, "id2": { ...findings... } }`
+- `ids`: Array of numeric IDs as strings from the `<prunable-tools>` list
+- `distillation`: Array of strings, one per ID (positional: distillation[0] is for ids[0], etc.)
+
+Each distillation string should capture the essential information you need to preserve - function signatures, logic, constraints, values, etc. Be as detailed as needed for your task.
 
 ## Example
 
 <example_extraction>
-Assistant: [Reads service implementation, types, and config]
-I'll preserve the full technical specification and implementation logic before extracting.
-[Uses extract with ids: ["10", "11", "12"], distillation: {
-  "10": {
-    "file": "src/services/auth.ts",
-    "signatures": [
-      "async function validateToken(token: string): Promise<User | null>",
-      "function hashPassword(password: string): string"
-    ],
-    "logic": "The validateToken function first checks the local cache before calling the external OIDC provider. It uses a 5-minute TTL for cached tokens.",
-    "dependencies": ["import { cache } from '../utils/cache'", "import { oidc } from '../config'"],
-    "constraints": "Tokens must be at least 128 chars long. hashPassword uses bcrypt with 12 rounds."
-  },
-  "11": {
-    "file": "src/types/user.ts",
-    "interface": "interface User { id: string; email: string; permissions: ('read' | 'write' | 'admin')[]; status: 'active' | 'suspended'; }",
-    "context": "The permissions array is strictly typed and used by the RBAC middleware."
-  },
-  "12": {
-    "file": "config/default.json",
-    "values": { "PORT": 3000, "RETRY_STRATEGY": "exponential", "MAX_ATTEMPTS": 5 },
-    "impact": "The retry strategy affects all outgoing HTTP clients in the core module."
-  }
-}]
+Assistant: [Reads auth service and user types]
+I'll preserve the key details before extracting.
+[Uses extract with:
+  ids: ["10", "11"],
+  distillation: [
+    "auth.ts: validateToken(token: string) -> User|null checks cache first (5min TTL) then OIDC. hashPassword uses bcrypt 12 rounds. Tokens must be 128+ chars.",
+    "user.ts: interface User { id: string; email: string; permissions: ('read'|'write'|'admin')[]; status: 'active'|'suspended' }"
+  ]
+]
 </example_extraction>
 
 <example_keep>
diff --git a/lib/prompts/system/system-prompt-both.txt b/lib/prompts/system/system-prompt-both.txt
@@ -9,10 +9,9 @@ TWO TOOLS FOR CONTEXT MANAGEMENT
 - `extract`: Extract key findings into distilled knowledge before removing raw outputs. Use when you need to preserve information.
 
 CHOOSING THE RIGHT TOOL
-Ask: "Do I need to preserve any information from this output?"
-- **No** → `discard` (default for cleanup)
-- **Yes** → `extract` (preserves distilled knowledge)
-- **Uncertain** → `extract` (safer, preserves signal)
+Ask: "Is this output clearly noise or irrelevant?"
+- **Yes** → `discard` (pure cleanup, no preservation)
+- **No** → `extract` (default - preserves key findings)
 
 Common scenarios:
 - Task complete, no valuable context → `discard`
@@ -39,5 +38,7 @@ When in doubt, keep it. Batch your actions and aim for high-impact prunes that s
 FAILURE TO PRUNE will result in context leakage and DEGRADED PERFORMANCES.
 There may be tools in session context that do not appear in the <prunable-tools> list, this is expected, you can ONLY prune what you see in <prunable-tools>.
 
+If you see a user message containing only `[internal: context sync - no response needed]`, this is an internal system marker used for context injection - it is NOT user input. Do not acknowledge it, do not respond to it, and do not mention it. Simply continue with your current task or wait for actual user input.
+
 </instruction>
 </system-reminder>
diff --git a/lib/prompts/system/system-prompt-discard.txt b/lib/prompts/system/system-prompt-discard.txt
@@ -30,5 +30,7 @@ When in doubt, keep it. Batch your actions and aim for high-impact discards that
 FAILURE TO DISCARD will result in context leakage and DEGRADED PERFORMANCES.
 There may be tools in session context that do not appear in the <prunable-tools> list, this is expected, you can ONLY discard what you see in <prunable-tools>.
 
+If you see a user message containing only `[internal: context sync - no response needed]`, this is an internal system marker used for context injection - it is NOT user input. Do not acknowledge it, do not respond to it, and do not mention it. Simply continue with your current task or wait for actual user input.
+
 </instruction>
 </system-reminder>
diff --git a/lib/prompts/system/system-prompt-extract.txt b/lib/prompts/system/system-prompt-extract.txt
@@ -30,5 +30,7 @@ When in doubt, keep it. Batch your actions and aim for high-impact extractions t
 FAILURE TO EXTRACT will result in context leakage and DEGRADED PERFORMANCES.
 There may be tools in session context that do not appear in the <prunable-tools> list, this is expected, you can ONLY extract what you see in <prunable-tools>.
 
+If you see a user message containing only `[internal: context sync - no response needed]`, this is an internal system marker used for context injection - it is NOT user input. Do not acknowledge it, do not respond to it, and do not mention it. Simply continue with your current task or wait for actual user input.
+
 </instruction>
 </system-reminder>
diff --git a/lib/strategies/tools.ts b/lib/strategies/tools.ts
@@ -32,13 +32,13 @@ async function executePruneOperation(
     ids: string[],
     reason: PruneReason,
     toolName: string,
-    distillation?: Record<string, any>,
+    distillation?: string[],
 ): Promise<string> {
     const { client, state, logger, config, workingDirectory } = ctx
     const sessionId = toolCtx.sessionID
 
     logger.info(`${toolName} tool invoked`)
-    logger.info(JSON.stringify({ ids, reason }))
+    logger.info(JSON.stringify(reason ? { ids, reason } : { ids }))
 
     if (!ids || ids.length === 0) {
         logger.debug(`${toolName} tool called but ids is empty or undefined`)
@@ -171,17 +171,17 @@ export function createExtractTool(ctx: PruneToolContext): ReturnType<typeof tool
                 .array(tool.schema.string())
                 .describe("Numeric IDs as strings to extract from the <prunable-tools> list"),
             distillation: tool.schema
-                .record(tool.schema.string(), tool.schema.any())
+                .array(tool.schema.string())
                 .describe(
-                    "REQUIRED. An object mapping each ID to its distilled findings. Must contain an entry for every ID being pruned.",
+                    "REQUIRED. Array of strings, one per ID (positional: distillation[0] is for ids[0], etc.)",
                 ),
         },
         async execute(args, toolCtx) {
-            if (!args.distillation || Object.keys(args.distillation).length === 0) {
+            if (!args.distillation || args.distillation.length === 0) {
                 ctx.logger.debug(
                     "Extract tool called without distillation: " + JSON.stringify(args),
                 )
-                return 'Missing distillation. You must provide distillation data when using extract. Format: distillation: { "id": { ...findings... } }'
+                return "Missing distillation. You must provide a distillation string for each ID."
             }
 
             // Log the distillation for debugging/analysis