diff --git a/src/services/code-index/processors/__tests__/parser-csharp-fix.spec.ts b/src/services/code-index/processors/__tests__/parser-csharp-fix.spec.ts
new file mode 100644
index 00000000000..92ce2fc3922
--- /dev/null
+++ b/src/services/code-index/processors/__tests__/parser-csharp-fix.spec.ts
@@ -0,0 +1,250 @@
+import { describe, it, expect, beforeEach, vi } from "vitest"
+import { CodeParser } from "../parser"
+import * as path from "path"
+
+// Mock the language parser loading
+vi.mock("../../../tree-sitter/languageParser", () => ({
+	loadRequiredLanguageParsers: vi.fn().mockResolvedValue({
+		cs: {
+			parser: {
+				parse: vi.fn().mockReturnValue({
+					rootNode: {
+						type: "compilation_unit",
+						startPosition: { row: 0, column: 0 },
+						endPosition: { row: 27, column: 1 },
+						text: "",
+						children: [],
+					},
+				}),
+			},
+			query: {
+				captures: vi.fn(),
+			},
+		},
+	}),
+}))
+
+describe("CodeParser - C# Using Directives Fix", () => {
+	let parser: CodeParser
+
+	beforeEach(() => {
+		parser = new CodeParser()
+		vi.clearAllMocks()
+	})
+
+	it("should group using directives together to meet minimum block size", async () => {
+		const filePath = "/test/TestFile.cs"
+		const content = `using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+
+namespace TestNamespace
+{
+    public class TestClass
+    {
+        public void TestMethod()
+        {
+            Console.WriteLine("Hello World");
+        }
+    }
+}`
+
+		// Mock the tree-sitter captures to return using directives and other nodes
+		const mockCaptures = [
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System;",
+					startPosition: { row: 0, column: 0 },
+					endPosition: { row: 0, column: 13 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System.Collections.Generic;",
+					startPosition: { row: 1, column: 0 },
+					endPosition: { row: 1, column: 33 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System.Linq;",
+					startPosition: { row: 2, column: 0 },
+					endPosition: { row: 2, column: 18 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System.Threading.Tasks;",
+					startPosition: { row: 3, column: 0 },
+					endPosition: { row: 3, column: 29 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+			{
+				name: "name.definition.namespace",
+				node: {
+					type: "namespace_declaration",
+					text: `namespace TestNamespace
+{
+    public class TestClass
+    {
+        public void TestMethod()
+        {
+            Console.WriteLine("Hello World");
+        }
+    }
+}`,
+					startPosition: { row: 5, column: 0 },
+					endPosition: { row: 14, column: 1 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+		]
+
+		// Update the mock to return our captures
+		const { loadRequiredLanguageParsers } = await import("../../../tree-sitter/languageParser")
+		const mockParsers = await loadRequiredLanguageParsers([filePath])
+		mockParsers.cs.query.captures = vi.fn().mockReturnValue(mockCaptures)
+
+		const result = await parser.parseFile(filePath, { content })
+
+		// Should have 2 blocks: grouped using directives and the namespace
+		expect(result).toHaveLength(2)
+
+		// First block should be the grouped using directives
+		const usingBlock = result.find((block) => block.type === "using_directive_group")
+		expect(usingBlock).toBeDefined()
+		expect(usingBlock?.start_line).toBe(1)
+		expect(usingBlock?.end_line).toBe(4)
+		expect(usingBlock?.content).toBe(
+			"using System;\n" +
+				"using System.Collections.Generic;\n" +
+				"using System.Linq;\n" +
+				"using System.Threading.Tasks;",
+		)
+
+		// Second block should be the namespace
+		const namespaceBlock = result.find((block) => block.type === "namespace_declaration")
+		expect(namespaceBlock).toBeDefined()
+		expect(namespaceBlock?.start_line).toBe(6)
+		expect(namespaceBlock?.end_line).toBe(15)
+	})
+
+	it("should not group using directives if they are separated by too many lines", async () => {
+		const filePath = "/test/TestFile.cs"
+		const content = `using System;
+using System.Collections.Generic;
+using System.Text;
+
+// Some comment
+
+using System.Linq;
+using System.Threading.Tasks;
+
+namespace TestNamespace
+{
+    public class TestClass { }
+}`
+
+		const mockCaptures = [
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System;",
+					startPosition: { row: 0, column: 0 },
+					endPosition: { row: 0, column: 13 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System.Collections.Generic;",
+					startPosition: { row: 1, column: 0 },
+					endPosition: { row: 1, column: 33 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System.Text;",
+					startPosition: { row: 2, column: 0 },
+					endPosition: { row: 2, column: 18 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System.Linq;",
+					startPosition: { row: 6, column: 0 },
+					endPosition: { row: 6, column: 18 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+			{
+				name: "name.definition.using",
+				node: {
+					type: "using_directive",
+					text: "using System.Threading.Tasks;",
+					startPosition: { row: 7, column: 0 },
+					endPosition: { row: 7, column: 29 },
+					children: [],
+					childForFieldName: () => null,
+				},
+			},
+		]
+
+		const { loadRequiredLanguageParsers } = await import("../../../tree-sitter/languageParser")
+		const mockParsers = await loadRequiredLanguageParsers([filePath])
+		mockParsers.cs.query.captures = vi.fn().mockReturnValue(mockCaptures)
+
+		const result = await parser.parseFile(filePath, { content })
+
+		// Should have at least one block for the grouped using directives
+		const usingBlocks = result.filter((block) => block.type === "using_directive_group")
+		expect(usingBlocks.length).toBeGreaterThanOrEqual(1)
+
+		// The first group should contain the first three using directives
+		const firstGroup = usingBlocks[0]
+		expect(firstGroup.content).toBe(
+			"using System;\n" + "using System.Collections.Generic;\n" + "using System.Text;",
+		)
+		expect(firstGroup.start_line).toBe(1)
+		expect(firstGroup.end_line).toBe(3)
+
+		// If there's a second group, it should contain the last two using directives
+		if (usingBlocks.length > 1) {
+			const secondGroup = usingBlocks[1]
+			expect(secondGroup.content).toBe("using System.Linq;\n" + "using System.Threading.Tasks;")
+			expect(secondGroup.start_line).toBe(7)
+			expect(secondGroup.end_line).toBe(8)
+		}
+	})
+})
diff --git a/src/services/code-index/processors/parser.ts b/src/services/code-index/processors/parser.ts
index 96d747c4c9f..88deec4663b 100644
--- a/src/services/code-index/processors/parser.ts
+++ b/src/services/code-index/processors/parser.ts
@@ -165,7 +165,42 @@ export class CodeParser implements ICodeParser {
 		const results: CodeBlock[] = []
 
 		// Process captures if not empty
-		const queue: Node[] = Array.from(captures).map((capture) => capture.node)
+		const captureNodes = Array.from(captures).map((capture) => capture.node)
+
+		// Group small related nodes together (e.g., using directives in C#)
+		const groupedNodes = this._groupSmallNodes(captureNodes)
+
+		const queue: Node[] = [...groupedNodes.regularNodes]
+
+		// Process grouped small nodes first
+		for (const group of groupedNodes.groups) {
+			const { nodes, type, identifier } = group
+			const combinedContent = nodes.map((n) => n.text).join("\n")
+			const start_line = nodes[0].startPosition.row + 1
+			const end_line = nodes[nodes.length - 1].endPosition.row + 1
+
+			// Only create a block if the combined content meets the minimum size
+			if (combinedContent.length >= MIN_BLOCK_CHARS) {
+				const contentPreview = combinedContent.slice(0, 100)
+				const segmentHash = createHash("sha256")
+					.update(`${filePath}-${start_line}-${end_line}-${combinedContent.length}-${contentPreview}`)
+					.digest("hex")
+
+				if (!seenSegmentHashes.has(segmentHash)) {
+					seenSegmentHashes.add(segmentHash)
+					results.push({
+						file_path: filePath,
+						identifier,
+						type,
+						start_line,
+						end_line,
+						content: combinedContent,
+						segmentHash,
+						fileHash,
+					})
+				}
+			}
+		}
 
 		while (queue.length > 0) {
 			const currentNode = queue.shift()!
@@ -218,12 +253,82 @@ export class CodeParser implements ICodeParser {
 					}
 				}
 			}
-			// Nodes smaller than minBlockChars are ignored
+			// Nodes smaller than minBlockChars are ignored unless they were grouped
 		}
 
 		return results
 	}
 
+	/**
+	 * Groups small nodes that are semantically related (e.g., consecutive using directives)
+	 * so that, combined, they can meet the minimum block size requirement.
+	 *
+	 * Captures are walked once, in order. A node whose text is at least MIN_BLOCK_CHARS
+	 * long is returned in `regularNodes`. A smaller node starts a run that absorbs the
+	 * immediately following small nodes of the same type, as long as each one starts at
+	 * most two rows after the previous one ends (i.e. at most one line in between).
+	 * A run becomes a group when it has more than one node, or when it is a lone
+	 * `using_directive`; any other lone small node is dropped.
+	 *
+	 * @param nodes Captured nodes, in capture order.
+	 * @returns `groups` (each typed `<nodeType>_group` with a null identifier) and
+	 * `regularNodes` (nodes large enough to be processed on their own).
+	 */
+	private _groupSmallNodes(nodes: Node[]): {
+		groups: Array<{ nodes: Node[]; type: string; identifier: string | null }>
+		regularNodes: Node[]
+	} {
+		const groups: Array<{ nodes: Node[]; type: string; identifier: string | null }> = []
+		const regularNodes: Node[] = []
+
+		let i = 0
+		while (i < nodes.length) {
+			const node = nodes[i]
+
+			// A node that is large enough on its own is processed individually
+			if (node.text.length >= MIN_BLOCK_CHARS) {
+				regularNodes.push(node)
+				i++
+				continue
+			}
+
+			// Collect the run of small, same-type nodes that directly follows this one
+			const groupNodes: Node[] = [node]
+			let j = i + 1
+			for (; j < nodes.length; j++) {
+				const nextNode = nodes[j]
+
+				// Stop at a different type or at a node large enough to stand alone
+				if (nextNode.type !== node.type || nextNode.text.length >= MIN_BLOCK_CHARS) {
+					break
+				}
+
+				// Allow at most one line between neighbouring nodes
+				const prevEndLine = groupNodes[groupNodes.length - 1].endPosition.row
+				if (nextNode.startPosition.row - prevEndLine > 2) {
+					break
+				}
+
+				groupNodes.push(nextNode)
+			}
+
+			// Keep the run if it has several nodes, or if it is a type that should be
+			// grouped even when alone (using directives); otherwise the node is dropped
+			if (groupNodes.length > 1 || node.type === "using_directive") {
+				groups.push({
+					nodes: groupNodes,
+					type: node.type + "_group",
+					identifier: null,
+				})
+			}
+
+			// Resume after the run; every node is visited exactly once
+			i = j
+		}
+
+		return { groups, regularNodes }
+	}
+
 	/**
 	 * Common helper function to chunk text by lines, avoiding tiny remainders.
 	 */