Skip to content

Commit 15fc448

Browse files
committed
feat(task): add file deduplication to reduce token usage
- Implement deduplicateReadFileHistory() method in Task.ts
- Add support for partial file reads with line ranges
- Preserve @mention file content from deduplication
- Make feature configurable via the contextDeduplication experiment flag
- Add comprehensive test coverage for all deduplication scenarios

Re-implements PR RooCodeInc#1374 functionality on current codebase structure
1 parent 39c5cf7 commit 15fc448

File tree

4 files changed

+610
-1
lines changed

4 files changed

+610
-1
lines changed

packages/types/src/experiment.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import type { Keys, Equals, AssertEqual } from "./type-fu.js"
66
* ExperimentId
77
*/
88

9-
export const experimentIds = ["powerSteering", "multiFileApplyDiff"] as const
9+
// Identifiers of the available experiments; `as const` keeps the literal strings so `z.enum` can consume the tuple.
export const experimentIds = ["powerSteering", "multiFileApplyDiff", "contextDeduplication"] as const
1010

1111
export const experimentIdsSchema = z.enum(experimentIds)
1212

@@ -19,6 +19,7 @@ export type ExperimentId = z.infer<typeof experimentIdsSchema>
1919
// Zod object schema for experiment settings: one optional boolean per experiment key listed below.
export const experimentsSchema = z.object({
2020
powerSteering: z.boolean().optional(),
2121
multiFileApplyDiff: z.boolean().optional(),
22+
contextDeduplication: z.boolean().optional(),
2223
})
2324

2425
export type Experiments = z.infer<typeof experimentsSchema>

src/core/task/Task.ts

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,187 @@ export class Task extends EventEmitter<ClineEvents> {
348348
}
349349
}
350350

351+
// Context deduplication methods
352+
/**
 * Replaces repeated file contents in `apiConversationHistory` with short notices.
 *
 * The history is scanned from oldest to newest. The first time a file's content is
 * seen it is kept; later copies that `shouldRemoveContent` judges redundant are
 * replaced in place. Two sources are handled:
 *  - text blocks inside `tool_result` content arrays that contain `<file><path>…` entries
 *  - `<file_content path="…">` elements inside plain user text blocks
 *
 * A tool result text block is only replaced when *every* file entry in it is
 * redundant, because the replacement discards the whole block.
 *
 * The history is mutated in place. Running the method again does not re-count
 * content that was already replaced.
 *
 * NOTE(review): keeping the oldest copy means that a re-read after the file was
 * edited can be dropped in favour of stale content — confirm this is intended.
 *
 * @returns `removedCount` — number of duplicate file reads removed;
 *          `tokensSaved` — rough estimate (characters / 4) of the text removed.
 */
private deduplicateReadFileHistory(): { removedCount: number; tokensSaved: number } {
	type LineRange = { start: number; end: number }
	type FileRead = { path: string; ranges?: LineRange[]; isFullRead: boolean }
	type Location = { messageIndex: number; contentIndex: number; blockIndex?: number }
	type SeenFile = Location & { ranges?: LineRange[]; isFullRead: boolean }

	const MENTION_PLACEHOLDER = "[Content removed - already included]"
	const fileContentPattern = /<file_content\s+path="([^"]+)"[^>]*>([\s\S]*?)<\/file_content>/g

	const seenFiles = new Map<string, SeenFile>()
	let removedCount = 0
	let tokensSaved = 0

	// Remember that `read` stays in the history so later reads are compared against
	// everything that is actually still visible, not just the very first read.
	const recordKept = (read: FileRead, location: Location): void => {
		const existing = seenFiles.get(read.path)
		if (!existing) {
			seenFiles.set(read.path, { ...location, ranges: read.ranges, isFullRead: read.isFullRead })
			return
		}
		if (read.isFullRead) {
			existing.isFullRead = true
			existing.ranges = []
		} else if (!existing.isFullRead) {
			existing.ranges = [...(existing.ranges ?? []), ...(read.ranges ?? [])]
		}
	}

	for (let i = 0; i < this.apiConversationHistory.length; i++) {
		const message = this.apiConversationHistory[i]
		if (message.role !== "user" || typeof message.content === "string") continue

		for (let j = 0; j < message.content.length; j++) {
			const content = message.content[j]

			if (content.type === "tool_result" && content.content) {
				// Plain-string tool results are left untouched; only block arrays are rewritten.
				if (!Array.isArray(content.content)) continue
				const blocks = content.content

				for (let k = 0; k < blocks.length; k++) {
					const block = blocks[k]
					if (block.type !== "text") continue

					const fileReads = this.parseFileReads(block.text)
					if (fileReads.length === 0) continue

					const allRedundant = fileReads.every((read) => {
						const existing = seenFiles.get(read.path)
						return existing !== undefined && this.shouldRemoveContent(read, existing)
					})

					if (!allRedundant) {
						// At least one entry is new information: keep the whole block.
						for (const read of fileReads) {
							recordKept(read, { messageIndex: i, contentIndex: j, blockIndex: k })
						}
						continue
					}

					// Count the block's text once, however many file entries it held.
					tokensSaved += Math.ceil(block.text.length / 4) // Rough token estimate
					removedCount += fileReads.length

					const paths = Array.from(new Set(fileReads.map((read) => read.path))).join(", ")
					blocks[k] = {
						type: "text",
						text: `[File content removed - already read ${paths}]`,
					}
				}
			} else if (content.type === "text") {
				// A replacer function rewrites exactly the matched occurrence and is immune
				// to `$` sequences in the path being treated as replacement patterns.
				content.text = content.text.replace(
					fileContentPattern,
					(fullMatch: string, filePath: string, fileContent: string): string => {
						// Already replaced by an earlier run; do not count it again.
						if (fileContent === MENTION_PLACEHOLDER) return fullMatch

						const mention: FileRead = { path: filePath, isFullRead: true }
						const existing = seenFiles.get(filePath)
						if (!existing || !this.shouldRemoveContent(mention, existing)) {
							recordKept(mention, { messageIndex: i, contentIndex: j })
							return fullMatch
						}

						tokensSaved += Math.ceil(fileContent.length / 4)
						removedCount++
						return `<file_content path="${filePath}">${MENTION_PLACEHOLDER}</file_content>`
					},
				)
			}
		}
	}

	return { removedCount, tokensSaved }
}
449+
450+
/**
 * Extracts the file reads described by a read_file tool result.
 *
 * Every `<file><path>…</path>` occurrence yields one entry. The text between that
 * occurrence and the next `</file>` is searched for `<content lines="A-B">` tags;
 * each tag adds a `{ start: A, end: B }` range and marks the read as partial.
 * An entry without such tags (or without a closing `</file>`) is reported as a
 * full read with an empty `ranges` array.
 *
 * Ranges are taken from the `<file>` element the path was found in, so the same
 * path appearing several times in one result gets the ranges of each occurrence.
 *
 * @param text Text of a tool result block.
 * @returns One entry per `<file>` occurrence, in document order.
 */
private parseFileReads(text: string): Array<{
	path: string
	ranges?: Array<{ start: number; end: number }>
	isFullRead: boolean
}> {
	const results: Array<{
		path: string
		ranges?: Array<{ start: number; end: number }>
		isFullRead: boolean
	}> = []

	// Match read_file results in the format from readFileTool
	// Pattern for: Result:<files><file><path>filepath</path>
	const filePathPattern = /<file><path>([^<]+)<\/path>/g
	const rangePattern = /<content\s+lines="(\d+)-(\d+)">/g
	const fileCloseTag = "</file>"

	for (const pathMatch of text.matchAll(filePathPattern)) {
		const ranges: Array<{ start: number; end: number }> = []

		// Limit the range search to this occurrence's own <file> element.
		const blockStart = pathMatch.index ?? 0
		const headerEnd = blockStart + pathMatch[0].length
		const blockEnd = text.indexOf(fileCloseTag, headerEnd)

		if (blockEnd !== -1) {
			const fileBlock = text.slice(blockStart, blockEnd)
			for (const [, start, end] of fileBlock.matchAll(rangePattern)) {
				ranges.push({
					start: Number.parseInt(start, 10),
					end: Number.parseInt(end, 10),
				})
			}
		}

		results.push({
			path: pathMatch[1],
			ranges,
			isFullRead: ranges.length === 0,
		})
	}

	return results
}
498+
499+
/**
 * Decides whether a later read of a file is redundant given what was already kept.
 *
 * - Earlier read was a full read: the later read is always redundant.
 * - Later read is a full read but the earlier one was partial: keep the later read.
 * - Both are partial: the later read is redundant only when each of its ranges is
 *   completely covered by the earlier ranges. A read that merely overlaps still
 *   carries lines that have not been seen, so it is kept.
 * - Range information missing on either side: the later read is treated as redundant.
 *
 * @param current  The later read being considered for removal.
 * @param existing What has already been kept for the same path.
 * @returns `true` when the later read can be replaced by a notice.
 */
private shouldRemoveContent(
	current: { ranges?: Array<{ start: number; end: number }>; isFullRead: boolean },
	existing: { ranges?: Array<{ start: number; end: number }>; isFullRead: boolean },
): boolean {
	// If existing is full read, remove all later content
	if (existing.isFullRead) return true

	// Existing is partial from here on; a full read adds information, so keep it
	if (current.isFullRead) return false

	const known = existing.ranges ?? []
	const wanted = current.ranges ?? []

	// Default to removing if we can't determine overlap
	if (known.length === 0 || wanted.length === 0) return true

	// Cheap rejection: completely disjoint ranges are never redundant
	if (!this.hasOverlap(known, wanted)) return false

	// Merge the known ranges (inclusive line numbers, so touching ranges join)
	// and require every wanted range to fit inside one merged range.
	const merged: Array<{ start: number; end: number }> = []
	for (const range of [...known].sort((a, b) => a.start - b.start)) {
		const last = merged[merged.length - 1]
		if (last && range.start <= last.end + 1) {
			last.end = Math.max(last.end, range.end)
		} else {
			merged.push({ start: range.start, end: range.end })
		}
	}

	return wanted.every((w) => merged.some((m) => m.start <= w.start && w.end <= m.end))
}
517+
518+
/**
 * Reports whether any range of the first list intersects any range of the second.
 * Bounds are inclusive: ranges that share a single line count as overlapping.
 *
 * @param rangesA First list of line ranges.
 * @param rangesB Second list of line ranges.
 * @returns `true` if at least one pair of ranges intersects, otherwise `false`.
 */
private hasOverlap(
	rangesA: Array<{ start: number; end: number }>,
	rangesB: Array<{ start: number; end: number }>,
): boolean {
	return rangesA.some((left) =>
		rangesB.some((right) => left.start <= right.end && right.start <= left.end),
	)
}
531+
351532
// Cline Messages
352533

353534
private async getSavedClineMessages(): Promise<ClineMessage[]> {
@@ -1724,6 +1905,16 @@ export class Task extends EventEmitter<ClineEvents> {
17241905
state?.listApiConfigMeta.find((profile) => profile.name === state?.currentApiConfigName)?.id ??
17251906
"default"
17261907

1908+
// Apply context deduplication if enabled
1909+
if (state?.experiments?.contextDeduplication) {
1910+
const { removedCount, tokensSaved } = this.deduplicateReadFileHistory()
1911+
if (removedCount > 0) {
1912+
console.log(
1913+
`Context deduplication: removed ${removedCount} duplicate file reads, saved ~${tokensSaved} tokens`,
1914+
)
1915+
}
1916+
}
1917+
17271918
const truncateResult = await truncateConversationIfNeeded({
17281919
messages: this.apiConversationHistory,
17291920
totalTokens: contextTokens,

0 commit comments

Comments
 (0)