
Commit 0df020f

fix: group small C# nodes to meet minimum block size for indexing
- Added _groupSmallNodes method to group consecutive small nodes of the same type
- This ensures using directives and other small constructs are properly indexed
- Fixes issue where only using directives were being indexed in C# files
- Added comprehensive tests to verify the fix

Fixes #6048
1 parent df6c57d commit 0df020f
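Editor's note for context: each individual using directive is shorter than the parser's MIN_BLOCK_CHARS threshold, so on its own it was dropped and never reached the index; after this change, consecutive directives are merged into a single "using_directive_group" block that can clear the threshold. A rough sketch of the block the parser now emits for the four directives used in the new test (illustration only; the hash values are placeholders, with the field names taken from parser.ts in this commit):

const groupedBlock = {
	file_path: "/test/TestFile.cs",
	identifier: null,
	type: "using_directive_group",
	start_line: 1,
	end_line: 4,
	content:
		"using System;\n" +
		"using System.Collections.Generic;\n" +
		"using System.Linq;\n" +
		"using System.Threading.Tasks;",
	segmentHash: "<sha256 over path, line range, content length, and a content preview>",
	fileHash: "<hash of the whole file>",
}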

File tree

2 files changed: +363 -2 lines changed

Lines changed: 250 additions & 0 deletions
@@ -0,0 +1,250 @@
import { describe, it, expect, beforeEach, vi } from "vitest"
import { CodeParser } from "../parser"
import * as path from "path"

// Mock the language parser loading
vi.mock("../../../tree-sitter/languageParser", () => ({
	loadRequiredLanguageParsers: vi.fn().mockResolvedValue({
		cs: {
			parser: {
				parse: vi.fn().mockReturnValue({
					rootNode: {
						type: "compilation_unit",
						startPosition: { row: 0, column: 0 },
						endPosition: { row: 27, column: 1 },
						text: "",
						children: [],
					},
				}),
			},
			query: {
				captures: vi.fn(),
			},
		},
	}),
}))

describe("CodeParser - C# Using Directives Fix", () => {
	let parser: CodeParser

	beforeEach(() => {
		parser = new CodeParser()
		vi.clearAllMocks()
	})

	it("should group using directives together to meet minimum block size", async () => {
		const filePath = "/test/TestFile.cs"
		const content = `using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

namespace TestNamespace
{
    public class TestClass
    {
        public void TestMethod()
        {
            Console.WriteLine("Hello World");
        }
    }
}`

		// Mock the tree-sitter captures to return using directives and other nodes
		const mockCaptures = [
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System;",
					startPosition: { row: 0, column: 0 },
					endPosition: { row: 0, column: 13 },
					children: [],
					childForFieldName: () => null,
				},
			},
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System.Collections.Generic;",
					startPosition: { row: 1, column: 0 },
					endPosition: { row: 1, column: 33 },
					children: [],
					childForFieldName: () => null,
				},
			},
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System.Linq;",
					startPosition: { row: 2, column: 0 },
					endPosition: { row: 2, column: 18 },
					children: [],
					childForFieldName: () => null,
				},
			},
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System.Threading.Tasks;",
					startPosition: { row: 3, column: 0 },
					endPosition: { row: 3, column: 29 },
					children: [],
					childForFieldName: () => null,
				},
			},
			{
				name: "name.definition.namespace",
				node: {
					type: "namespace_declaration",
					text: `namespace TestNamespace
{
    public class TestClass
    {
        public void TestMethod()
        {
            Console.WriteLine("Hello World");
        }
    }
}`,
					startPosition: { row: 5, column: 0 },
					endPosition: { row: 14, column: 1 },
					children: [],
					childForFieldName: () => null,
				},
			},
		]

		// Update the mock to return our captures
		const { loadRequiredLanguageParsers } = await import("../../../tree-sitter/languageParser")
		const mockParsers = await loadRequiredLanguageParsers([filePath])
		mockParsers.cs.query.captures = vi.fn().mockReturnValue(mockCaptures)

		const result = await parser.parseFile(filePath, { content })

		// Should have 2 blocks: grouped using directives and the namespace
		expect(result).toHaveLength(2)

		// First block should be the grouped using directives
		const usingBlock = result.find((block) => block.type === "using_directive_group")
		expect(usingBlock).toBeDefined()
		expect(usingBlock?.start_line).toBe(1)
		expect(usingBlock?.end_line).toBe(4)
		expect(usingBlock?.content).toBe(
			"using System;\n" +
				"using System.Collections.Generic;\n" +
				"using System.Linq;\n" +
				"using System.Threading.Tasks;",
		)

		// Second block should be the namespace
		const namespaceBlock = result.find((block) => block.type === "namespace_declaration")
		expect(namespaceBlock).toBeDefined()
		expect(namespaceBlock?.start_line).toBe(6)
		expect(namespaceBlock?.end_line).toBe(15)
	})

	it("should not group using directives if they are separated by too many lines", async () => {
		const filePath = "/test/TestFile.cs"
		const content = `using System;
using System.Collections.Generic;
using System.Text;

// Some comment

using System.Linq;
using System.Threading.Tasks;

namespace TestNamespace
{
    public class TestClass { }
}`

		const mockCaptures = [
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System;",
					startPosition: { row: 0, column: 0 },
					endPosition: { row: 0, column: 13 },
					children: [],
					childForFieldName: () => null,
				},
			},
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System.Collections.Generic;",
					startPosition: { row: 1, column: 0 },
					endPosition: { row: 1, column: 33 },
					children: [],
					childForFieldName: () => null,
				},
			},
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System.Text;",
					startPosition: { row: 2, column: 0 },
					endPosition: { row: 2, column: 18 },
					children: [],
					childForFieldName: () => null,
				},
			},
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System.Linq;",
					startPosition: { row: 6, column: 0 },
					endPosition: { row: 6, column: 18 },
					children: [],
					childForFieldName: () => null,
				},
			},
			{
				name: "name.definition.using",
				node: {
					type: "using_directive",
					text: "using System.Threading.Tasks;",
					startPosition: { row: 7, column: 0 },
					endPosition: { row: 7, column: 29 },
					children: [],
					childForFieldName: () => null,
				},
			},
		]

		const { loadRequiredLanguageParsers } = await import("../../../tree-sitter/languageParser")
		const mockParsers = await loadRequiredLanguageParsers([filePath])
		mockParsers.cs.query.captures = vi.fn().mockReturnValue(mockCaptures)

		const result = await parser.parseFile(filePath, { content })

		// Should have at least one block for the grouped using directives
		const usingBlocks = result.filter((block) => block.type === "using_directive_group")
		expect(usingBlocks.length).toBeGreaterThanOrEqual(1)

		// The first group should contain the first three using directives
		const firstGroup = usingBlocks[0]
		expect(firstGroup.content).toBe(
			"using System;\n" + "using System.Collections.Generic;\n" + "using System.Text;",
		)
		expect(firstGroup.start_line).toBe(1)
		expect(firstGroup.end_line).toBe(3)

		// If there's a second group, it should contain the last two using directives
		if (usingBlocks.length > 1) {
			const secondGroup = usingBlocks[1]
			expect(secondGroup.content).toBe("using System.Linq;\n" + "using System.Threading.Tasks;")
			expect(secondGroup.start_line).toBe(7)
			expect(secondGroup.end_line).toBe(8)
		}
	})
})
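
Editor's note on the expected line numbers, inferred from the diff below: tree-sitter reports 0-based rows, while the emitted blocks use 1-based start_line/end_line, and grouped content is joined with a single newline; that is why captures on rows 0 through 3 produce a block spanning lines 1 through 4:

// Rows from the first test's using-directive captures (0-based)
const rows = [0, 1, 2, 3]
// parser.ts converts the group's first/last rows to 1-based lines
const start_line = rows[0] + 1 // 1
const end_line = rows[rows.length - 1] + 1 // 4
// and joins each node's text with "\n" to form the block content
const content = ["using System;", "using System.Linq;"].join("\n")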

src/services/code-index/processors/parser.ts

Lines changed: 113 additions & 2 deletions
@@ -165,7 +165,42 @@ export class CodeParser implements ICodeParser {
 		const results: CodeBlock[] = []

 		// Process captures if not empty
-		const queue: Node[] = Array.from(captures).map((capture) => capture.node)
+		const captureNodes = Array.from(captures).map((capture) => capture.node)
+
+		// Group small related nodes together (e.g., using directives in C#)
+		const groupedNodes = this._groupSmallNodes(captureNodes)
+
+		const queue: Node[] = [...groupedNodes.regularNodes]
+
+		// Process grouped small nodes first
+		for (const group of groupedNodes.groups) {
+			const { nodes, type, identifier } = group
+			const combinedContent = nodes.map((n) => n.text).join("\n")
+			const start_line = nodes[0].startPosition.row + 1
+			const end_line = nodes[nodes.length - 1].endPosition.row + 1
+
+			// Only create a block if the combined content meets the minimum size
+			if (combinedContent.length >= MIN_BLOCK_CHARS) {
+				const contentPreview = combinedContent.slice(0, 100)
+				const segmentHash = createHash("sha256")
+					.update(`${filePath}-${start_line}-${end_line}-${combinedContent.length}-${contentPreview}`)
+					.digest("hex")
+
+				if (!seenSegmentHashes.has(segmentHash)) {
+					seenSegmentHashes.add(segmentHash)
+					results.push({
+						file_path: filePath,
+						identifier,
+						type,
+						start_line,
+						end_line,
+						content: combinedContent,
+						segmentHash,
+						fileHash,
+					})
+				}
+			}
+		}

 		while (queue.length > 0) {
 			const currentNode = queue.shift()!
@@ -218,12 +253,88 @@ export class CodeParser implements ICodeParser {
 					}
 				}
 			}
-			// Nodes smaller than minBlockChars are ignored
+			// Nodes smaller than minBlockChars are ignored unless they were grouped
 		}

 		return results
 	}

+	/**
+	 * Groups small nodes that are semantically related (e.g., consecutive using directives)
+	 * to ensure they meet the minimum block size requirement when combined.
+	 */
+	private _groupSmallNodes(nodes: Node[]): {
+		groups: Array<{ nodes: Node[]; type: string; identifier: string | null }>
+		regularNodes: Node[]
+	} {
+		const groups: Array<{ nodes: Node[]; type: string; identifier: string | null }> = []
+		const regularNodes: Node[] = []
+		const processedIndices = new Set<number>()
+
+		// Group consecutive nodes of the same type that are small
+		for (let i = 0; i < nodes.length; i++) {
+			if (processedIndices.has(i)) continue
+
+			const node = nodes[i]
+
+			// If node is large enough on its own, add to regular nodes
+			if (node.text.length >= MIN_BLOCK_CHARS) {
+				regularNodes.push(node)
+				processedIndices.add(i)
+				continue
+			}
+
+			// Try to group small nodes of the same type
+			const nodeType = node.type
+			const groupNodes: Node[] = [node]
+			processedIndices.add(i)
+
+			// Look for consecutive nodes of the same type
+			for (let j = i + 1; j < nodes.length; j++) {
+				if (processedIndices.has(j)) continue
+
+				const nextNode = nodes[j]
+
+				// Stop grouping if we encounter a different type or a large node
+				if (nextNode.type !== nodeType || nextNode.text.length >= MIN_BLOCK_CHARS) {
+					break
+				}
+
+				// Check if nodes are consecutive (no significant gap between them)
+				const prevEndLine = groupNodes[groupNodes.length - 1].endPosition.row
+				const nextStartLine = nextNode.startPosition.row
+
+				// Allow up to 1 empty line between grouped nodes
+				if (nextStartLine - prevEndLine <= 2) {
+					groupNodes.push(nextNode)
+					processedIndices.add(j)
+				} else {
+					break
+				}
+			}
+
+			// Only create a group if we have multiple nodes or if it's a special type
+			// that should be grouped even when alone (like using directives)
+			if (groupNodes.length > 1 || nodeType === "using_directive") {
+				groups.push({
+					nodes: groupNodes,
+					type: nodeType + "_group",
+					identifier: null,
+				})
+			}
+			// Otherwise, the single small node will be ignored
+		}
+
+		// Add any remaining unprocessed nodes to regular nodes
+		for (let i = 0; i < nodes.length; i++) {
+			if (!processedIndices.has(i) && nodes[i].text.length >= MIN_BLOCK_CHARS) {
+				regularNodes.push(nodes[i])
+			}
+		}
+
+		return { groups, regularNodes }
+	}
+
 	/**
 	 * Common helper function to chunk text by lines, avoiding tiny remainders.
 	 */
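
To summarize the new behaviour outside the class (editor's sketch, not part of the commit): nodes of the same type that are each under MIN_BLOCK_CHARS and at most one blank line apart are collected into one group whose type gets a "_group" suffix, while anything large enough stays an ordinary node. A simplified, self-contained approximation using hypothetical stub types and an assumed threshold value:

// Hypothetical minimal stand-in for the tree-sitter Node fields the method reads.
interface MiniNode {
	type: string
	text: string
	startRow: number
	endRow: number
}

// Assumed value for illustration; the real MIN_BLOCK_CHARS constant lives in parser.ts.
const MIN_BLOCK_CHARS = 50

// Collect runs of small, same-typed, near-adjacent nodes (at most one blank line apart).
function groupConsecutive(nodes: MiniNode[]): MiniNode[][] {
	const groups: MiniNode[][] = []
	for (const node of nodes) {
		const current = groups[groups.length - 1]
		const prev = current ? current[current.length - 1] : undefined
		if (
			current &&
			prev &&
			prev.type === node.type &&
			prev.text.length < MIN_BLOCK_CHARS &&
			node.text.length < MIN_BLOCK_CHARS &&
			node.startRow - prev.endRow <= 2
		) {
			current.push(node)
		} else {
			groups.push([node])
		}
	}
	return groups
}

// Two adjacent using directives form one group; the class declaration stays on its own.
const nodes: MiniNode[] = [
	{ type: "using_directive", text: "using System;", startRow: 0, endRow: 0 },
	{ type: "using_directive", text: "using System.Linq;", startRow: 1, endRow: 1 },
	{ type: "class_declaration", text: "public class TestClass { /* ... */ }".repeat(3), startRow: 3, endRow: 10 },
]
console.log(groupConsecutive(nodes).map((g) => g.length)) // [2, 1]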
