2 changes: 2 additions & 0 deletions src/services/code-index/constants/index.ts
@@ -13,6 +13,8 @@ export const DEFAULT_MAX_SEARCH_RESULTS = CODEBASE_INDEX_DEFAULTS.DEFAULT_SEARCH
/**File Watcher */
export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479"
export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB
export const MAX_SWIFT_FILE_SIZE_BYTES = 512 * 1024 // 512KB - Swift files can be memory intensive
export const MEMORY_CHECK_INTERVAL_FILES = 10 // Check memory every N files

/**Directory Scanner */
export const MAX_LIST_FILES_LIMIT = 3_000
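Both files below import a MemoryMonitor utility from ../utils/memoryMonitor, which is not part of this excerpt. A minimal sketch of the interface its call sites imply (getInstance, getMemoryUsageMB, isMemoryPressure, checkAndCleanup) might look like the following; the thresholds are illustrative assumptions, not the PR's actual values.

// Hypothetical sketch only: the PR's real implementation lives in
// src/services/code-index/utils/memoryMonitor.ts and is not shown here.
export class MemoryMonitor {
	private static instance: MemoryMonitor
	private static readonly HIGH_MEMORY_MB = 1024 // assumed "high usage" warning threshold
	private static readonly PRESSURE_MB = 1536 // assumed hard-stop threshold

	private constructor() {}

	static getInstance(): MemoryMonitor {
		if (!MemoryMonitor.instance) {
			MemoryMonitor.instance = new MemoryMonitor()
		}
		return MemoryMonitor.instance
	}

	// Heap usage in whole megabytes; heapUsed is a conservative proxy for parser memory
	getMemoryUsageMB(): number {
		return Math.round(process.memoryUsage().heapUsed / (1024 * 1024))
	}

	// True when processing should stop or batches should flush early
	isMemoryPressure(): boolean {
		return this.getMemoryUsageMB() > MemoryMonitor.PRESSURE_MB
	}

	// Returns true when usage is high; triggers GC if Node was started with --expose-gc
	checkAndCleanup(): boolean {
		const high = this.getMemoryUsageMB() > MemoryMonitor.HIGH_MEMORY_MB
		if (high && global.gc) {
			global.gc()
		}
		return high
	}
}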
218 changes: 151 additions & 67 deletions src/services/code-index/processors/parser.ts
@@ -6,7 +6,15 @@ import { LanguageParser, loadRequiredLanguageParsers } from "../../tree-sitter/l
import { parseMarkdown } from "../../tree-sitter/markdownParser"
import { ICodeParser, CodeBlock } from "../interfaces"
import { scannerExtensions } from "../shared/supported-extensions"
import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MIN_CHUNK_REMAINDER_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../constants"
import {
MAX_BLOCK_CHARS,
MIN_BLOCK_CHARS,
MIN_CHUNK_REMAINDER_CHARS,
MAX_CHARS_TOLERANCE_FACTOR,
MAX_SWIFT_FILE_SIZE_BYTES,
MEMORY_CHECK_INTERVAL_FILES,
} from "../constants"
import { MemoryMonitor } from "../utils/memoryMonitor"
import { TelemetryService } from "@roo-code/telemetry"
import { TelemetryEventName } from "@roo-code/types"
import { sanitizeErrorMessage } from "../shared/validation-helpers"
@@ -17,6 +25,8 @@ import { sanitizeErrorMessage } from "../shared/validation-helpers"
export class CodeParser implements ICodeParser {
private loadedParsers: LanguageParser = {}
private pendingLoads: Map<string, Promise<LanguageParser>> = new Map()
private memoryMonitor = MemoryMonitor.getInstance()
private filesProcessed = 0
// Markdown files are now supported using the custom markdown parser
// which extracts headers and sections for semantic indexing

@@ -33,6 +43,17 @@ export class CodeParser implements ICodeParser {
fileHash?: string
},
): Promise<CodeBlock[]> {
// Periodic memory monitoring
this.filesProcessed++
if (this.filesProcessed % MEMORY_CHECK_INTERVAL_FILES === 0) {
const isHighMemory = this.memoryMonitor.checkAndCleanup()
if (isHighMemory) {
console.warn(
`High memory usage detected (${this.memoryMonitor.getMemoryUsageMB()}MB) after processing ${this.filesProcessed} files`,
)
}
}

// Get file extension
const ext = path.extname(filePath).toLowerCase()

@@ -50,6 +71,23 @@
fileHash = options.fileHash || this.createFileHash(content)
} else {
try {
// Check Swift file size before parsing (the file is read once here to measure its byte length)
if (ext === ".swift") {
const stats = await readFile(filePath, "utf8")
.then((content) => ({ size: Buffer.byteLength(content, "utf8") }))
.catch(() => null)
if (stats && stats.size > MAX_SWIFT_FILE_SIZE_BYTES) {
console.warn(
`Skipping large Swift file ${filePath} (${Math.round(stats.size / 1024)}KB > ${Math.round(MAX_SWIFT_FILE_SIZE_BYTES / 1024)}KB limit)`,
)
TelemetryService.instance.captureEvent(TelemetryEventName.CODE_INDEX_ERROR, {
error: `Swift file too large: ${stats.size} bytes`,
location: "parseFile:fileSizeCheck",
})
return []
}
}

Review comment from @adamhill (Contributor), Jul 15, 2025:
@roomote-agent Why not just truncate the Swift file? It's better to get some of the file indexed rather than none of it.
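A truncation-based variant along the lines the reviewer suggests — a minimal sketch, not part of this PR — could read roughly as follows. It would also avoid reading the file twice (once for the size probe, once for parsing); note that slicing by characters only approximates the byte limit for non-ASCII sources.

// Hypothetical alternative: truncate oversized Swift files instead of skipping
// them, so at least the beginning of the file gets indexed. Not part of this PR.
content = await readFile(filePath, "utf8")
if (ext === ".swift" && Buffer.byteLength(content, "utf8") > MAX_SWIFT_FILE_SIZE_BYTES) {
	console.warn(`Truncating large Swift file ${filePath} to ${MAX_SWIFT_FILE_SIZE_BYTES} bytes for indexing`)
	// Character-based slice approximates the byte limit for mostly-ASCII source
	content = content.slice(0, MAX_SWIFT_FILE_SIZE_BYTES)
}
fileHash = this.createFileHash(content)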

content = await readFile(filePath, "utf8")
fileHash = this.createFileHash(content)
} catch (error) {
@@ -63,6 +101,14 @@
}
}

// Additional memory check before parsing large files
if (content.length > MAX_SWIFT_FILE_SIZE_BYTES && this.memoryMonitor.isMemoryPressure()) {
console.warn(
`Skipping file ${filePath} due to memory pressure (${this.memoryMonitor.getMemoryUsageMB()}MB used)`,
)
return []
}

// Parse the file
return this.parseContent(filePath, content, fileHash)
}
@@ -144,84 +190,122 @@
return []
}

const tree = language.parser.parse(content)
let tree: any = null
let captures: any[] = []

// We don't need to get the query string from languageQueries since it's already loaded
// in the language object
const captures = tree ? language.query.captures(tree.rootNode) : []

// Check if captures are empty
if (captures.length === 0) {
if (content.length >= MIN_BLOCK_CHARS) {
// Perform fallback chunking if content is large enough
const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
return blocks
} else {
// Return empty if content is too small for fallback
try {
// Check memory before parsing
if (this.memoryMonitor.isMemoryPressure()) {
console.warn(`Skipping parsing ${filePath} due to memory pressure`)
return []
}
}

const results: CodeBlock[] = []
tree = language.parser.parse(content)

// Process captures if not empty
const queue: Node[] = Array.from(captures).map((capture) => capture.node)
// We don't need to get the query string from languageQueries since it's already loaded
// in the language object
captures = tree ? language.query.captures(tree.rootNode) : []

while (queue.length > 0) {
const currentNode = queue.shift()!
// const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
// Check if captures are empty
if (captures.length === 0) {
if (content.length >= MIN_BLOCK_CHARS) {
// Perform fallback chunking if content is large enough
const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
return blocks
} else {
// Return empty if content is too small for fallback
return []
}
}

const results: CodeBlock[] = []

// Process captures if not empty
const queue: Node[] = Array.from(captures).map((capture) => capture.node)
let processedNodes = 0
const maxNodesToProcess = 1000 // Limit to prevent excessive memory usage

while (queue.length > 0 && processedNodes < maxNodesToProcess) {
// Periodic memory check during processing
if (processedNodes % 100 === 0 && this.memoryMonitor.isMemoryPressure()) {
console.warn(
`Stopping node processing for ${filePath} due to memory pressure after ${processedNodes} nodes`,
)
break
}

// Check if the node meets the minimum character requirement
if (currentNode.text.length >= MIN_BLOCK_CHARS) {
// If it also exceeds the maximum character limit, try to break it down
if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
if (currentNode.children.filter((child) => child !== null).length > 0) {
// If it has children, process them instead
queue.push(...currentNode.children.filter((child) => child !== null))
const currentNode = queue.shift()!
processedNodes++

// Check if the node meets the minimum character requirement
if (currentNode.text.length >= MIN_BLOCK_CHARS) {
// If it also exceeds the maximum character limit, try to break it down
if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
if (currentNode.children.filter((child) => child !== null).length > 0) {
// If it has children, process them instead (but limit queue growth)
const validChildren = currentNode.children.filter((child) => child !== null)
if (queue.length + validChildren.length < maxNodesToProcess) {
queue.push(...validChildren)
}
} else {
// If it's a leaf node, chunk it
const chunkedBlocks = this._chunkLeafNodeByLines(
currentNode,
filePath,
fileHash,
seenSegmentHashes,
)
results.push(...chunkedBlocks)
}
} else {
// If it's a leaf node, chunk it
const chunkedBlocks = this._chunkLeafNodeByLines(
currentNode,
filePath,
fileHash,
seenSegmentHashes,
)
results.push(...chunkedBlocks)
}
} else {
// Node meets min chars and is within max chars, create a block
const identifier =
currentNode.childForFieldName("name")?.text ||
currentNode.children.find((c) => c?.type === "identifier")?.text ||
null
const type = currentNode.type
const start_line = currentNode.startPosition.row + 1
const end_line = currentNode.endPosition.row + 1
const content = currentNode.text
const contentPreview = content.slice(0, 100)
const segmentHash = createHash("sha256")
.update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`)
.digest("hex")

if (!seenSegmentHashes.has(segmentHash)) {
seenSegmentHashes.add(segmentHash)
results.push({
file_path: filePath,
identifier,
type,
start_line,
end_line,
content,
segmentHash,
fileHash,
})
// Node meets min chars and is within max chars, create a block
const identifier =
currentNode.childForFieldName("name")?.text ||
currentNode.children.find((c) => c?.type === "identifier")?.text ||
null
const type = currentNode.type
const start_line = currentNode.startPosition.row + 1
const end_line = currentNode.endPosition.row + 1
const nodeContent = currentNode.text
const contentPreview = nodeContent.slice(0, 100)
const segmentHash = createHash("sha256")
.update(`${filePath}-${start_line}-${end_line}-${nodeContent.length}-${contentPreview}`)
.digest("hex")

if (!seenSegmentHashes.has(segmentHash)) {
seenSegmentHashes.add(segmentHash)
results.push({
file_path: filePath,
identifier,
type,
start_line,
end_line,
content: nodeContent,
segmentHash,
fileHash,
})
}
}
}
// Nodes smaller than minBlockChars are ignored
}
// Nodes smaller than minBlockChars are ignored
}

return results
return results
} finally {
// Clean up tree-sitter resources
if (tree) {
try {
tree.delete?.()
} catch (e) {
// Ignore cleanup errors
}
}

// Force garbage collection for Swift files if available
// (global.gc is only defined when Node runs with --expose-gc)
if (ext === ".swift" && global.gc) {
global.gc()
}
}
}

/**
71 changes: 67 additions & 4 deletions src/services/code-index/processors/scanner.ts
@@ -17,19 +17,25 @@ import { t } from "../../../i18n"
import {
QDRANT_CODE_BLOCK_NAMESPACE,
MAX_FILE_SIZE_BYTES,
MAX_SWIFT_FILE_SIZE_BYTES,
MAX_LIST_FILES_LIMIT,
BATCH_SEGMENT_THRESHOLD,
MAX_BATCH_RETRIES,
INITIAL_RETRY_DELAY_MS,
PARSING_CONCURRENCY,
BATCH_PROCESSING_CONCURRENCY,
MEMORY_CHECK_INTERVAL_FILES,
} from "../constants"
import { MemoryMonitor } from "../utils/memoryMonitor"
import { isPathInIgnoredDirectory } from "../../glob/ignore-utils"
import { TelemetryService } from "@roo-code/telemetry"
import { TelemetryEventName } from "@roo-code/types"
import { sanitizeErrorMessage } from "../shared/validation-helpers"

export class DirectoryScanner implements IDirectoryScanner {
private memoryMonitor = MemoryMonitor.getInstance()
private filesProcessed = 0

constructor(
private readonly embedder: IEmbedder,
private readonly qdrantClient: IVectorStore,
@@ -107,9 +113,35 @@ export class DirectoryScanner implements IDirectoryScanner {
const parsePromises = supportedPaths.map((filePath) =>
parseLimiter(async () => {
try {
// Check file size
// Periodic memory monitoring
this.filesProcessed++
if (this.filesProcessed % MEMORY_CHECK_INTERVAL_FILES === 0) {
const isHighMemory = this.memoryMonitor.checkAndCleanup()
if (isHighMemory) {
console.warn(
`High memory usage detected (${this.memoryMonitor.getMemoryUsageMB()}MB) during directory scan after ${this.filesProcessed} files`,
)
}
}

// Check if memory pressure should stop processing
if (this.memoryMonitor.isMemoryPressure()) {
console.warn(
`Skipping file ${filePath} due to memory pressure (${this.memoryMonitor.getMemoryUsageMB()}MB used)`,
)
skippedCount++
return
}

// Check file size with Swift-specific limits
const stats = await stat(filePath)
if (stats.size > MAX_FILE_SIZE_BYTES) {
const ext = path.extname(filePath).toLowerCase()
const maxSize = ext === ".swift" ? MAX_SWIFT_FILE_SIZE_BYTES : MAX_FILE_SIZE_BYTES

if (stats.size > maxSize) {
console.warn(
`Skipping large ${ext} file ${filePath} (${Math.round(stats.size / 1024)}KB > ${Math.round(maxSize / 1024)}KB limit)`,
)
skippedCount++ // Skip large files
return
}
@@ -148,6 +180,34 @@
const release = await mutex.acquire()
totalBlockCount += fileBlockCount
try {
// Check memory before adding to batch
if (this.memoryMonitor.isMemoryPressure()) {
console.warn(
`Memory pressure detected, forcing batch processing early (${currentBatchBlocks.length} blocks)`,
)
// Force process current batch before adding more
if (currentBatchBlocks.length > 0) {
const batchBlocks = [...currentBatchBlocks]
const batchTexts = [...currentBatchTexts]
const batchFileInfos = [...currentBatchFileInfos]
currentBatchBlocks = []
currentBatchTexts = []
currentBatchFileInfos = []

const batchPromise = batchLimiter(() =>
this.processBatch(
batchBlocks,
batchTexts,
batchFileInfos,
scanWorkspace,
onError,
onBlocksIndexed,
),
)
activeBatchPromises.push(batchPromise)
}
}

currentBatchBlocks.push(block)
currentBatchTexts.push(trimmedContent)
addedBlocksFromFile = true
@@ -160,8 +220,11 @@
})
}

// Check if batch threshold is met
if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) {
// Check if batch threshold is met or memory pressure
if (
currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD ||
this.memoryMonitor.isMemoryPressure()
) {
// Copy current batch data and clear accumulators
const batchBlocks = [...currentBatchBlocks]
const batchTexts = [...currentBatchTexts]