Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/services/code-index/constants/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479
export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB

/**Directory Scanner */
export const MAX_LIST_FILES_LIMIT = 3_000
export const MAX_LIST_FILES_LIMIT_CODE_INDEX = 50_000
export const BATCH_SEGMENT_THRESHOLD = 60 // Number of code segments to batch for embeddings/upserts
export const MAX_BATCH_RETRIES = 3
export const INITIAL_RETRY_DELAY_MS = 500
Expand Down
1 change: 0 additions & 1 deletion src/services/code-index/interfaces/file-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ export interface IDirectoryScanner {
onBlocksIndexed?: (indexedCount: number) => void,
onFileParsed?: (fileBlockCount: number) => void,
): Promise<{
codeBlocks: CodeBlock[]
stats: {
processed: number
skipped: number
Expand Down
36 changes: 22 additions & 14 deletions src/services/code-index/processors/__tests__/scanner.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,16 @@ describe("DirectoryScanner", () => {
expect(mockCodeParser.parseFile).not.toHaveBeenCalled()
})

it("should parse changed files and return code blocks", async () => {
it("should parse changed files and return empty codeBlocks array", async () => {
// Create scanner without embedder to test the non-embedding path
const scannerNoEmbeddings = new DirectoryScanner(
null as any, // No embedder
null as any, // No vector store
mockCodeParser,
mockCacheManager,
mockIgnoreInstance,
)

const { listFiles } = await import("../../../glob/list-files")
vi.mocked(listFiles).mockResolvedValue([["test/file1.js"], false])
const mockBlocks: any[] = [
Expand All @@ -185,8 +194,7 @@ describe("DirectoryScanner", () => {
]
;(mockCodeParser.parseFile as any).mockResolvedValue(mockBlocks)

const result = await scanner.scanDirectory("/test")
expect(result.codeBlocks).toEqual(mockBlocks)
const result = await scannerNoEmbeddings.scanDirectory("/test")
expect(result.stats.processed).toBe(1)
})

Expand Down Expand Up @@ -252,6 +260,15 @@ describe("DirectoryScanner", () => {
})

it("should process markdown files alongside code files", async () => {
// Create scanner without embedder to test the non-embedding path
const scannerNoEmbeddings = new DirectoryScanner(
null as any, // No embedder
null as any, // No vector store
mockCodeParser,
mockCacheManager,
mockIgnoreInstance,
)

const { listFiles } = await import("../../../glob/list-files")
vi.mocked(listFiles).mockResolvedValue([["test/README.md", "test/app.js", "docs/guide.markdown"], false])

Expand Down Expand Up @@ -306,24 +323,15 @@ describe("DirectoryScanner", () => {
return []
})

const result = await scanner.scanDirectory("/test")
const result = await scannerNoEmbeddings.scanDirectory("/test")

// Verify all files were processed
expect(mockCodeParser.parseFile).toHaveBeenCalledTimes(3)
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/README.md", expect.any(Object))
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/app.js", expect.any(Object))
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("docs/guide.markdown", expect.any(Object))

// Verify code blocks include both markdown and code content
expect(result.codeBlocks).toHaveLength(3)
expect(result.codeBlocks).toEqual(
expect.arrayContaining([
expect.objectContaining({ type: "markdown_header_h1" }),
expect.objectContaining({ type: "function" }),
expect.objectContaining({ type: "markdown_header_h2" }),
]),
)

// Verify processing still works without codeBlocks accumulation
expect(result.stats.processed).toBe(3)
})

Expand Down
50 changes: 32 additions & 18 deletions src/services/code-index/processors/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import { t } from "../../../i18n"
import {
QDRANT_CODE_BLOCK_NAMESPACE,
MAX_FILE_SIZE_BYTES,
MAX_LIST_FILES_LIMIT,
MAX_LIST_FILES_LIMIT_CODE_INDEX,
BATCH_SEGMENT_THRESHOLD,
MAX_BATCH_RETRIES,
INITIAL_RETRY_DELAY_MS,
Expand Down Expand Up @@ -51,13 +51,13 @@ export class DirectoryScanner implements IDirectoryScanner {
onError?: (error: Error) => void,
onBlocksIndexed?: (indexedCount: number) => void,
onFileParsed?: (fileBlockCount: number) => void,
): Promise<{ codeBlocks: CodeBlock[]; stats: { processed: number; skipped: number }; totalBlockCount: number }> {
): Promise<{ stats: { processed: number; skipped: number }; totalBlockCount: number }> {
const directoryPath = directory
// Capture workspace context at scan start
const scanWorkspace = getWorkspacePathForContext(directoryPath)

// Get all files recursively (handles .gitignore automatically)
const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT)
const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT_CODE_INDEX)

// Filter out directories (marked with trailing '/')
const filePaths = allPaths.filter((p) => !p.endsWith("/"))
Expand Down Expand Up @@ -85,7 +85,6 @@ export class DirectoryScanner implements IDirectoryScanner {

// Initialize tracking variables
const processedFiles = new Set<string>()
const codeBlocks: CodeBlock[] = []
let processedCount = 0
let skippedCount = 0

Expand All @@ -98,7 +97,7 @@ export class DirectoryScanner implements IDirectoryScanner {
let currentBatchBlocks: CodeBlock[] = []
let currentBatchTexts: string[] = []
let currentBatchFileInfos: { filePath: string; fileHash: string; isNew: boolean }[] = []
const activeBatchPromises: Promise<void>[] = []
const activeBatchPromises = new Set<Promise<void>>()

// Initialize block counter
let totalBlockCount = 0
Expand All @@ -125,6 +124,7 @@ export class DirectoryScanner implements IDirectoryScanner {

// Check against cache
const cachedFileHash = this.cacheManager.getHash(filePath)
const isNewFile = !cachedFileHash
if (cachedFileHash === currentFileHash) {
// File is unchanged
skippedCount++
Expand All @@ -135,7 +135,6 @@ export class DirectoryScanner implements IDirectoryScanner {
const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: currentFileHash })
const fileBlockCount = blocks.length
onFileParsed?.(fileBlockCount)
codeBlocks.push(...blocks)
processedCount++

// Process embeddings if configured
Expand All @@ -146,20 +145,11 @@ export class DirectoryScanner implements IDirectoryScanner {
const trimmedContent = block.content.trim()
if (trimmedContent) {
const release = await mutex.acquire()
totalBlockCount += fileBlockCount
try {
currentBatchBlocks.push(block)
currentBatchTexts.push(trimmedContent)
addedBlocksFromFile = true

if (addedBlocksFromFile) {
currentBatchFileInfos.push({
filePath,
fileHash: currentFileHash,
isNew: !this.cacheManager.getHash(filePath),
})
}

// Check if batch threshold is met
if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) {
// Copy current batch data and clear accumulators
Expand All @@ -181,13 +171,33 @@ export class DirectoryScanner implements IDirectoryScanner {
onBlocksIndexed,
),
)
activeBatchPromises.push(batchPromise)
activeBatchPromises.add(batchPromise)

// Clean up completed promises to prevent memory accumulation
batchPromise.finally(() => {
activeBatchPromises.delete(batchPromise)
})
}
} finally {
release()
}
}
}

// Add file info once per file (outside the block loop)
if (addedBlocksFromFile) {
const release = await mutex.acquire()
try {
totalBlockCount += fileBlockCount
currentBatchFileInfos.push({
filePath,
fileHash: currentFileHash,
isNew: isNewFile,
})
} finally {
release()
}
}
} else {
// Only update hash if not being processed in a batch
await this.cacheManager.updateHash(filePath, currentFileHash)
Expand Down Expand Up @@ -232,7 +242,12 @@ export class DirectoryScanner implements IDirectoryScanner {
const batchPromise = batchLimiter(() =>
this.processBatch(batchBlocks, batchTexts, batchFileInfos, scanWorkspace, onError, onBlocksIndexed),
)
activeBatchPromises.push(batchPromise)
activeBatchPromises.add(batchPromise)

// Clean up completed promises to prevent memory accumulation
batchPromise.finally(() => {
activeBatchPromises.delete(batchPromise)
})
} finally {
release()
}
Expand Down Expand Up @@ -280,7 +295,6 @@ export class DirectoryScanner implements IDirectoryScanner {
}

return {
codeBlocks,
stats: {
processed: processedCount,
skipped: skippedCount,
Expand Down