Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/services/code-index/constants/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479
export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB

/** Directory Scanner */
export const MAX_LIST_FILES_LIMIT = 3_000
export const MAX_LIST_FILES_LIMIT_CODE_INDEX = 50_000
export const BATCH_SEGMENT_THRESHOLD = 60 // Number of code segments to batch for embeddings/upserts
export const MAX_BATCH_RETRIES = 3
export const INITIAL_RETRY_DELAY_MS = 500
Expand Down
1 change: 0 additions & 1 deletion src/services/code-index/interfaces/file-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ export interface IDirectoryScanner {
onBlocksIndexed?: (indexedCount: number) => void,
onFileParsed?: (fileBlockCount: number) => void,
): Promise<{
codeBlocks: CodeBlock[]
stats: {
processed: number
skipped: number
Expand Down
36 changes: 22 additions & 14 deletions src/services/code-index/processors/__tests__/scanner.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,16 @@ describe("DirectoryScanner", () => {
expect(mockCodeParser.parseFile).not.toHaveBeenCalled()
})

it("should parse changed files and return code blocks", async () => {
it("should parse changed files and return empty codeBlocks array", async () => {
// Create scanner without embedder to test the non-embedding path
const scannerNoEmbeddings = new DirectoryScanner(
null as any, // No embedder
null as any, // No vector store
mockCodeParser,
mockCacheManager,
mockIgnoreInstance,
)

const { listFiles } = await import("../../../glob/list-files")
vi.mocked(listFiles).mockResolvedValue([["test/file1.js"], false])
const mockBlocks: any[] = [
Expand All @@ -185,8 +194,7 @@ describe("DirectoryScanner", () => {
]
;(mockCodeParser.parseFile as any).mockResolvedValue(mockBlocks)

const result = await scanner.scanDirectory("/test")
expect(result.codeBlocks).toEqual(mockBlocks)
const result = await scannerNoEmbeddings.scanDirectory("/test")
expect(result.stats.processed).toBe(1)
})

Expand Down Expand Up @@ -252,6 +260,15 @@ describe("DirectoryScanner", () => {
})

it("should process markdown files alongside code files", async () => {
// Create scanner without embedder to test the non-embedding path
const scannerNoEmbeddings = new DirectoryScanner(
null as any, // No embedder
null as any, // No vector store
mockCodeParser,
mockCacheManager,
mockIgnoreInstance,
)

const { listFiles } = await import("../../../glob/list-files")
vi.mocked(listFiles).mockResolvedValue([["test/README.md", "test/app.js", "docs/guide.markdown"], false])

Expand Down Expand Up @@ -306,24 +323,15 @@ describe("DirectoryScanner", () => {
return []
})

const result = await scanner.scanDirectory("/test")
const result = await scannerNoEmbeddings.scanDirectory("/test")

// Verify all files were processed
expect(mockCodeParser.parseFile).toHaveBeenCalledTimes(3)
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/README.md", expect.any(Object))
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/app.js", expect.any(Object))
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("docs/guide.markdown", expect.any(Object))

// Verify code blocks include both markdown and code content
expect(result.codeBlocks).toHaveLength(3)
expect(result.codeBlocks).toEqual(
expect.arrayContaining([
expect.objectContaining({ type: "markdown_header_h1" }),
expect.objectContaining({ type: "function" }),
expect.objectContaining({ type: "markdown_header_h2" }),
]),
)

// Verify processing still works without codeBlocks accumulation
expect(result.stats.processed).toBe(3)
})

Expand Down
28 changes: 13 additions & 15 deletions src/services/code-index/processors/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import { t } from "../../../i18n"
import {
QDRANT_CODE_BLOCK_NAMESPACE,
MAX_FILE_SIZE_BYTES,
MAX_LIST_FILES_LIMIT,
MAX_LIST_FILES_LIMIT_CODE_INDEX,
BATCH_SEGMENT_THRESHOLD,
MAX_BATCH_RETRIES,
INITIAL_RETRY_DELAY_MS,
Expand Down Expand Up @@ -51,13 +51,13 @@ export class DirectoryScanner implements IDirectoryScanner {
onError?: (error: Error) => void,
onBlocksIndexed?: (indexedCount: number) => void,
onFileParsed?: (fileBlockCount: number) => void,
): Promise<{ codeBlocks: CodeBlock[]; stats: { processed: number; skipped: number }; totalBlockCount: number }> {
): Promise<{ stats: { processed: number; skipped: number }; totalBlockCount: number }> {
const directoryPath = directory
// Capture workspace context at scan start
const scanWorkspace = getWorkspacePathForContext(directoryPath)

// Get all files recursively (handles .gitignore automatically)
const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT)
const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT_CODE_INDEX)

// Filter out directories (marked with trailing '/')
const filePaths = allPaths.filter((p) => !p.endsWith("/"))
Expand Down Expand Up @@ -85,7 +85,6 @@ export class DirectoryScanner implements IDirectoryScanner {

// Initialize tracking variables
const processedFiles = new Set<string>()
const codeBlocks: CodeBlock[] = []
let processedCount = 0
let skippedCount = 0

Expand Down Expand Up @@ -135,7 +134,6 @@ export class DirectoryScanner implements IDirectoryScanner {
const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: currentFileHash })
const fileBlockCount = blocks.length
onFileParsed?.(fileBlockCount)
codeBlocks.push(...blocks)
processedCount++

// Process embeddings if configured
Expand All @@ -146,20 +144,11 @@ export class DirectoryScanner implements IDirectoryScanner {
const trimmedContent = block.content.trim()
if (trimmedContent) {
const release = await mutex.acquire()
totalBlockCount += fileBlockCount
try {
currentBatchBlocks.push(block)
currentBatchTexts.push(trimmedContent)
addedBlocksFromFile = true

if (addedBlocksFromFile) {
currentBatchFileInfos.push({
filePath,
fileHash: currentFileHash,
isNew: !this.cacheManager.getHash(filePath),
})
}

// Check if batch threshold is met
if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) {
// Copy current batch data and clear accumulators
Expand Down Expand Up @@ -188,6 +177,16 @@ export class DirectoryScanner implements IDirectoryScanner {
}
}
}

// Add file info once per file (outside the block loop)
if (addedBlocksFromFile) {
totalBlockCount += fileBlockCount
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The shared batch accumulators (`totalBlockCount` and `currentBatchFileInfos`) are now updated outside the mutex-protected section, which could lead to race conditions. Also, consider reusing the cached hash value rather than calling `cacheManager.getHash(filePath)` a second time.

currentBatchFileInfos.push({
filePath,
fileHash: currentFileHash,
isNew: !this.cacheManager.getHash(filePath),
})
}
} else {
// Only update hash if not being processed in a batch
await this.cacheManager.updateHash(filePath, currentFileHash)
Expand Down Expand Up @@ -280,7 +279,6 @@ export class DirectoryScanner implements IDirectoryScanner {
}

return {
codeBlocks,
stats: {
processed: processedCount,
skipped: skippedCount,
Expand Down