Skip to content

Commit 311ef20

Browse files
committed
fix: implement read_file history deduplication (#6279)
- Add READ_FILE_DEDUPLICATION experimental feature flag
- Implement deduplicateReadFileHistory method in Task class
- Integrate deduplication into readFileTool after successful reads
- Add comprehensive unit tests for deduplication logic
- Update readFileTool tests to include mock deduplication method

This feature removes duplicate read_file entries from the conversation history while preserving the most recent content for each file. It respects a 5-minute cache window and handles single-file reads, multi-file reads, and the legacy result format.
1 parent 342ee70 commit 311ef20

File tree

7 files changed

+581
-1
lines changed

7 files changed

+581
-1
lines changed

packages/types/src/experiment.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import type { Keys, Equals, AssertEqual } from "./type-fu.js"
66
* ExperimentId
77
*/
88

9-
export const experimentIds = ["powerSteering", "multiFileApplyDiff"] as const
9+
export const experimentIds = ["powerSteering", "multiFileApplyDiff", "readFileDeduplication"] as const
1010

1111
export const experimentIdsSchema = z.enum(experimentIds)
1212

@@ -19,6 +19,7 @@ export type ExperimentId = z.infer<typeof experimentIdsSchema>
1919
export const experimentsSchema = z.object({
2020
powerSteering: z.boolean().optional(),
2121
multiFileApplyDiff: z.boolean().optional(),
22+
readFileDeduplication: z.boolean().optional(),
2223
})
2324

2425
export type Experiments = z.infer<typeof experimentsSchema>

src/core/task/Task.ts

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,110 @@ export class Task extends EventEmitter<ClineEvents> {
329329
return readApiMessages({ taskId: this.taskId, globalStoragePath: this.globalStoragePath })
330330
}
331331

332+
/**
 * Removes superseded `read_file` results from the API conversation history.
 *
 * The history is scanned newest-first. The first (most recent) result seen
 * for a file is kept; any older text block whose files have ALL already been
 * seen is removed. A multi-file result is therefore only removed when every
 * file it contains has a newer read.
 *
 * Rules applied while scanning:
 * - Does nothing unless the READ_FILE_DEDUPLICATION experiment is enabled.
 * - Only `user` messages with array content are inspected.
 * - Messages whose `ts` is less than 5 minutes old are skipped entirely:
 *   they are neither removed nor counted as "seen". Messages without a
 *   `ts` are treated like old messages.
 * - File paths are taken from `<file><path>…</path>` entries in the result
 *   text, falling back to the path in the `[read_file for '…']` header
 *   (legacy single-file format).
 *
 * If at least one block was marked, the filtered history is handed to
 * `overwriteApiConversationHistory`; otherwise the history is left untouched.
 *
 * NOTE(review): a message whose blocks are all removed is dropped from the
 * history altogether — confirm that consumers of the history tolerate the
 * resulting sequence of roles.
 *
 * @returns A promise that resolves once the history has been processed.
 */
public async deduplicateReadFileHistory(): Promise<void> {
	// The feature is opt-in: bail out unless the experiment flag is enabled.
	const state = await this.providerRef.deref()?.getState()
	if (!state?.experiments || !experiments.isEnabled(state.experiments, EXPERIMENT_IDS.READ_FILE_DEDUPLICATION)) {
		return
	}

	const cacheWindowMs = 5 * 60 * 1000 // 5 minutes
	const now = Date.now()

	// Hoisted so the patterns are built once instead of once per block.
	const readFileHeaderPattern = /\[read_file(?:\s+for\s+'([^']+)')?.*?\]\s*Result:/i
	const xmlFilePathPattern = /<file>\s*<path>([^<]+)<\/path>/g

	const seenFiles = new Set<string>()
	const blocksToRemove = new Map<number, Set<number>>() // messageIndex -> block indexes to remove

	// Walk messages newest-first so the most recent read of each file is the one kept.
	for (let i = this.apiConversationHistory.length - 1; i >= 0; i--) {
		const message = this.apiConversationHistory[i]

		if (message.role !== "user" || !Array.isArray(message.content)) {
			continue
		}

		// Leave messages inside the cache window alone.
		if (message.ts && now - message.ts < cacheWindowMs) {
			continue
		}

		// Blocks are walked newest-first as well, so that within a single message
		// the later (more recent) read of a file wins over an earlier one.
		for (let j = message.content.length - 1; j >= 0; j--) {
			const block = message.content[j]
			if (block.type !== "text" || typeof block.text !== "string") {
				continue
			}

			const readFileMatch = readFileHeaderPattern.exec(block.text)
			if (!readFileMatch) {
				continue
			}

			// Slice from the end of the actual match. The header pattern is
			// case-insensitive, so searching for the literal "Result:" could miss
			// it or land on an earlier, unrelated occurrence.
			const resultContent = block.text.slice(readFileMatch.index + readFileMatch[0].length).trim()

			// New XML format: one <file><path>…</path> entry per file read.
			let filePaths = Array.from(resultContent.matchAll(xmlFilePathPattern), (match) => match[1].trim())

			// Legacy format: a single path carried in the header itself.
			if (filePaths.length === 0 && readFileMatch[1]) {
				filePaths = [readFileMatch[1]]
			}

			if (filePaths.length === 0) {
				continue
			}

			if (filePaths.every((filePath) => seenFiles.has(filePath))) {
				// Every file in this block has a newer read: mark the block for removal.
				let blockIndexes = blocksToRemove.get(i)
				if (!blockIndexes) {
					blockIndexes = new Set<number>()
					blocksToRemove.set(i, blockIndexes)
				}
				blockIndexes.add(j)
			} else {
				// At least one file is new here: keep the block and record its files.
				for (const filePath of filePaths) {
					seenFiles.add(filePath)
				}
			}
		}
	}

	// Nothing to remove: avoid rewriting an unchanged history.
	if (blocksToRemove.size === 0) {
		return
	}

	// Rebuild the history without the marked blocks.
	const updatedHistory: ApiMessage[] = []
	for (let i = 0; i < this.apiConversationHistory.length; i++) {
		const message = this.apiConversationHistory[i]
		const blockIndexes = blocksToRemove.get(i)

		if (!blockIndexes || blockIndexes.size === 0 || !Array.isArray(message.content)) {
			// Untouched message: keep as-is.
			updatedHistory.push(message)
			continue
		}

		const filteredContent = message.content.filter((_, j) => !blockIndexes.has(j))

		// Only keep the message if something is left after filtering.
		if (filteredContent.length > 0) {
			updatedHistory.push({ ...message, content: filteredContent })
		}
	}

	await this.overwriteApiConversationHistory(updatedHistory)
}
435+
332436
private async addToApiConversationHistory(message: Anthropic.MessageParam) {
333437
const messageWithTs = { ...message, ts: Date.now() }
334438
this.apiConversationHistory.push(messageWithTs)

0 commit comments

Comments
 (0)