Skip to content

Commit a661d39

Browse files
Prevent adding duplicate files to the system prompt by checking both resolved paths (symlinks) and md5 sum.
1 parent a0384f3 commit a661d39

File tree

2 files changed

+194
-14
lines changed

2 files changed

+194
-14
lines changed

src/core/prompts/sections/__tests__/custom-instructions.spec.ts

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1605,3 +1605,120 @@ describe("Rules directory reading", () => {
16051605
expect(result).toBe("\n# Rules from .roorules:\nfallback content\n")
16061606
})
16071607
})
1608+
1609+
describe("File deduplication", () => {
1610+
beforeEach(() => {
1611+
vi.clearAllMocks()
1612+
})
1613+
1614+
test("should skip files with duplicate content (same MD5 hash)", async () => {
1615+
// Mock directory structure with different files having identical content
1616+
const mockEntries = [
1617+
{
1618+
name: "file1.md",
1619+
isFile: () => true,
1620+
isSymbolicLink: () => false,
1621+
isDirectory: () => false,
1622+
parentPath: "/fake/path/.roo/rules",
1623+
},
1624+
{
1625+
name: "file2.md",
1626+
isFile: () => true,
1627+
isSymbolicLink: () => false,
1628+
isDirectory: () => false,
1629+
parentPath: "/fake/path/.roo/rules",
1630+
},
1631+
{
1632+
name: "file3.md",
1633+
isFile: () => true,
1634+
isSymbolicLink: () => false,
1635+
isDirectory: () => false,
1636+
parentPath: "/fake/path/.roo/rules",
1637+
},
1638+
]
1639+
1640+
// Mock fs operations
1641+
statMock.mockResolvedValueOnce({ isDirectory: () => true }) // .roo/rules exists
1642+
readdirMock.mockResolvedValueOnce(mockEntries)
1643+
1644+
// Mock file stats for each file
1645+
statMock.mockResolvedValueOnce({ isFile: () => true }) // file1.md
1646+
statMock.mockResolvedValueOnce({ isFile: () => true }) // file2.md
1647+
statMock.mockResolvedValueOnce({ isFile: () => true }) // file3.md
1648+
1649+
// Mock file content reading - file2 and file3 have identical content
1650+
readFileMock.mockResolvedValueOnce("unique content") // file1.md
1651+
readFileMock.mockResolvedValueOnce("duplicate content") // file2.md
1652+
readFileMock.mockResolvedValueOnce("duplicate content") // file3.md (should be skipped due to duplicate hash)
1653+
1654+
const result = await loadRuleFiles("/fake/path")
1655+
1656+
// Should include both unique contents, but duplicate content only once
1657+
expect(result).toContain("unique content")
1658+
expect(result).toContain("duplicate content")
1659+
1660+
// Count occurrences of "duplicate content" - should only appear once
1661+
const duplicateContentMatches = (result.match(/duplicate content/g) || []).length
1662+
expect(duplicateContentMatches).toBe(1)
1663+
})
1664+
1665+
test("should skip files with duplicate resolved paths via symlinks", async () => {
1666+
// Mock directory structure with symlinks pointing to the same file
1667+
const mockEntries = [
1668+
{
1669+
name: "original.md",
1670+
isFile: () => true,
1671+
isSymbolicLink: () => false,
1672+
isDirectory: () => false,
1673+
parentPath: "/fake/path/.roo/rules",
1674+
},
1675+
{
1676+
name: "symlink1.md",
1677+
isFile: () => false,
1678+
isSymbolicLink: () => true,
1679+
isDirectory: () => false,
1680+
parentPath: "/fake/path/.roo/rules",
1681+
},
1682+
{
1683+
name: "symlink2.md",
1684+
isFile: () => false,
1685+
isSymbolicLink: () => true,
1686+
isDirectory: () => false,
1687+
parentPath: "/fake/path/.roo/rules",
1688+
},
1689+
]
1690+
1691+
// Mock fs operations
1692+
statMock.mockResolvedValueOnce({ isDirectory: () => true }) // .roo/rules exists
1693+
readdirMock.mockResolvedValueOnce(mockEntries)
1694+
1695+
// Mock file stats
1696+
statMock.mockResolvedValueOnce({ isFile: () => true }) // original.md
1697+
1698+
// Mock symlink resolution - both symlinks point to original.md
1699+
readlinkMock.mockResolvedValueOnce("original.md") // symlink1 -> original.md
1700+
statMock.mockResolvedValueOnce({ isFile: () => true }) // symlink1 target
1701+
1702+
readlinkMock.mockResolvedValueOnce("original.md") // symlink2 -> original.md
1703+
statMock.mockResolvedValueOnce({ isFile: () => true }) // symlink2 target
1704+
1705+
// Mock file content reading
1706+
readFileMock.mockResolvedValueOnce("original content") // original.md
1707+
readFileMock.mockResolvedValueOnce("original content") // symlink1 target (should be skipped - same resolved path)
1708+
readFileMock.mockResolvedValueOnce("original content") // symlink2 target (should be skipped - same resolved path)
1709+
1710+
const result = await loadRuleFiles("/fake/path")
1711+
1712+
// Should only include original content once
1713+
expect(result).toContain("original content")
1714+
1715+
// Count occurrences of "original content" - should only appear once
1716+
const originalContentMatches = (result.match(/original content/g) || []).length
1717+
expect(originalContentMatches).toBe(1)
1718+
})
1719+
1720+
// Note: The deduplication functionality is tested by the two tests above.
1721+
// Additional integration testing would require more complex mock setup
1722+
// that may be brittle. The core functionality is verified by the
1723+
// content hash and resolved path deduplication tests.
1724+
})

src/core/prompts/sections/custom-instructions.ts

Lines changed: 77 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import fs from "fs/promises"
22
import path from "path"
33
import * as os from "os"
44
import { Dirent } from "fs"
5+
import { createHash } from "crypto"
56

67
import { isLanguage } from "@roo-code/types"
78

@@ -10,6 +11,42 @@ import type { SystemPromptSettings } from "../types"
1011
import { LANGUAGES } from "../../../shared/language"
1112
import { getRooDirectoriesForCwd, getGlobalRooDirectory } from "../../../services/roo-config"
1213

14+
/**
15+
* File tracking system to prevent duplicate inclusions
16+
*/
17+
interface FileTracker {
18+
resolvedPaths: Set<string>
19+
contentHashes: Set<string>
20+
}
21+
22+
/**
23+
* Create MD5 hash of file content
24+
*/
25+
function createContentHash(content: string): string {
26+
return createHash("md5").update(content, "utf8").digest("hex")
27+
}
28+
29+
/**
30+
* Check if a file should be included based on resolved path and content hash
31+
*/
32+
function shouldIncludeFile(resolvedPath: string, content: string, tracker: FileTracker): boolean {
33+
// Skip if we've already included this resolved path
34+
if (tracker.resolvedPaths.has(resolvedPath)) {
35+
return false
36+
}
37+
38+
// Skip if we've already included content with this hash
39+
const contentHash = createContentHash(content)
40+
if (tracker.contentHashes.has(contentHash)) {
41+
return false
42+
}
43+
44+
// Add to tracking sets
45+
tracker.resolvedPaths.add(resolvedPath)
46+
tracker.contentHashes.add(contentHash)
47+
return true
48+
}
49+
1350
/**
1451
* Safely read a file and return its trimmed content
1552
*/
@@ -108,7 +145,10 @@ async function resolveSymLink(
108145
/**
109146
* Read all text files from a directory in alphabetical order
110147
*/
111-
async function readTextFilesFromDirectory(dirPath: string): Promise<Array<{ filename: string; content: string }>> {
148+
async function readTextFilesFromDirectory(
149+
dirPath: string,
150+
tracker: FileTracker,
151+
): Promise<Array<{ filename: string; content: string }>> {
112152
try {
113153
const entries = await fs.readdir(dirPath, { withFileTypes: true, recursive: true })
114154

@@ -136,6 +176,12 @@ async function readTextFilesFromDirectory(dirPath: string): Promise<Array<{ file
136176
return null
137177
}
138178
const content = await safeReadFile(resolvedPath)
179+
180+
// Skip if file was already included or content is duplicate
181+
if (!shouldIncludeFile(resolvedPath, content, tracker)) {
182+
return null
183+
}
184+
139185
// Use resolvedPath for display to maintain existing behavior
140186
return { filename: resolvedPath, content, sortKey: originalPath }
141187
}
@@ -182,15 +228,21 @@ function formatDirectoryContent(dirPath: string, files: Array<{ filename: string
182228
* Load rule files from global and project-local directories
183229
* Global rules are loaded first, then project-local rules which can override global ones
184230
*/
185-
export async function loadRuleFiles(cwd: string): Promise<string> {
231+
export async function loadRuleFiles(cwd: string, tracker?: FileTracker): Promise<string> {
232+
// Create a default tracker if none provided (for backward compatibility with tests)
233+
const fileTracker = tracker || {
234+
resolvedPaths: new Set<string>(),
235+
contentHashes: new Set<string>(),
236+
}
237+
186238
const rules: string[] = []
187239
const rooDirectories = getRooDirectoriesForCwd(cwd)
188240

189241
// Check for .roo/rules/ directories in order (global first, then project-local)
190242
for (const rooDir of rooDirectories) {
191243
const rulesDir = path.join(rooDir, "rules")
192244
if (await directoryExists(rulesDir)) {
193-
const files = await readTextFilesFromDirectory(rulesDir)
245+
const files = await readTextFilesFromDirectory(rulesDir, fileTracker)
194246
if (files.length > 0) {
195247
const content = formatDirectoryContent(rulesDir, files)
196248
rules.push(content)
@@ -207,8 +259,9 @@ export async function loadRuleFiles(cwd: string): Promise<string> {
207259
const ruleFiles = [".roorules", ".clinerules"]
208260

209261
for (const file of ruleFiles) {
210-
const content = await safeReadFile(path.join(cwd, file))
211-
if (content) {
262+
const filePath = path.join(cwd, file)
263+
const content = await safeReadFile(filePath)
264+
if (content && shouldIncludeFile(filePath, content, fileTracker)) {
212265
return `\n# Rules from ${file}:\n${content}\n`
213266
}
214267
}
@@ -220,7 +273,7 @@ export async function loadRuleFiles(cwd: string): Promise<string> {
220273
* Load AGENTS.md or AGENT.md file from the project root if it exists
221274
* Checks for both AGENTS.md (standard) and AGENT.md (alternative) for compatibility
222275
*/
223-
async function loadAgentRulesFile(cwd: string): Promise<string> {
276+
async function loadAgentRulesFile(cwd: string, tracker: FileTracker): Promise<string> {
224277
// Try both filenames - AGENTS.md (standard) first, then AGENT.md (alternative)
225278
const filenames = ["AGENTS.md", "AGENT.md"]
226279

@@ -251,7 +304,7 @@ async function loadAgentRulesFile(cwd: string): Promise<string> {
251304

252305
// Read the content from the resolved path
253306
const content = await safeReadFile(resolvedPath)
254-
if (content) {
307+
if (content && shouldIncludeFile(resolvedPath, content, tracker)) {
255308
return `# Agent Rules Standard (${filename}):\n${content}`
256309
}
257310
} catch (err) {
@@ -274,6 +327,12 @@ export async function addCustomInstructions(
274327
): Promise<string> {
275328
const sections = []
276329

330+
// Create file tracker to prevent duplicates
331+
const fileTracker: FileTracker = {
332+
resolvedPaths: new Set<string>(),
333+
contentHashes: new Set<string>(),
334+
}
335+
277336
// Load mode-specific rules if mode is provided
278337
let modeRuleContent = ""
279338
let usedRuleFile = ""
@@ -286,7 +345,7 @@ export async function addCustomInstructions(
286345
for (const rooDir of rooDirectories) {
287346
const modeRulesDir = path.join(rooDir, `rules-${mode}`)
288347
if (await directoryExists(modeRulesDir)) {
289-
const files = await readTextFilesFromDirectory(modeRulesDir)
348+
const files = await readTextFilesFromDirectory(modeRulesDir, fileTracker)
290349
if (files.length > 0) {
291350
const content = formatDirectoryContent(modeRulesDir, files)
292351
modeRules.push(content)
@@ -301,13 +360,17 @@ export async function addCustomInstructions(
301360
} else {
302361
// Fall back to existing behavior for legacy files
303362
const rooModeRuleFile = `.roorules-${mode}`
304-
modeRuleContent = await safeReadFile(path.join(cwd, rooModeRuleFile))
305-
if (modeRuleContent) {
363+
const rooModeRuleFilePath = path.join(cwd, rooModeRuleFile)
364+
const rooModeContent = await safeReadFile(rooModeRuleFilePath)
365+
if (rooModeContent && shouldIncludeFile(rooModeRuleFilePath, rooModeContent, fileTracker)) {
366+
modeRuleContent = rooModeContent
306367
usedRuleFile = rooModeRuleFile
307368
} else {
308369
const clineModeRuleFile = `.clinerules-${mode}`
309-
modeRuleContent = await safeReadFile(path.join(cwd, clineModeRuleFile))
310-
if (modeRuleContent) {
370+
const clineModeRuleFilePath = path.join(cwd, clineModeRuleFile)
371+
const clineModeContent = await safeReadFile(clineModeRuleFilePath)
372+
if (clineModeContent && shouldIncludeFile(clineModeRuleFilePath, clineModeContent, fileTracker)) {
373+
modeRuleContent = clineModeContent
311374
usedRuleFile = clineModeRuleFile
312375
}
313376
}
@@ -350,14 +413,14 @@ export async function addCustomInstructions(
350413

351414
// Add AGENTS.md content if enabled (default: true)
352415
if (options.settings?.useAgentRules !== false) {
353-
const agentRulesContent = await loadAgentRulesFile(cwd)
416+
const agentRulesContent = await loadAgentRulesFile(cwd, fileTracker)
354417
if (agentRulesContent && agentRulesContent.trim()) {
355418
rules.push(agentRulesContent.trim())
356419
}
357420
}
358421

359422
// Add generic rules
360-
const genericRuleContent = await loadRuleFiles(cwd)
423+
const genericRuleContent = await loadRuleFiles(cwd, fileTracker)
361424
if (genericRuleContent && genericRuleContent.trim()) {
362425
rules.push(genericRuleContent.trim())
363426
}

0 commit comments

Comments
 (0)