Skip to content

Commit 8ad8408

Browse files
committed
fix: resolve Go codebase indexing with language-specific thresholds
- Update Go tree-sitter queries to capture full declarations instead of just identifiers
- Implement language-specific character thresholds (50 chars for Go vs 100 default)
- Fix inspectGo.spec.ts test to match new query behavior
- Add comprehensive test coverage for Go indexing fix

This ensures Go files are properly indexed for semantic search while preventing duplicate references. All tests now pass.
1 parent 5209780 commit 8ad8408

File tree

4 files changed

+273
-83
lines changed

4 files changed

+273
-83
lines changed
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
import { describe, it, expect, beforeAll, vi } from "vitest"
import { CodeParser } from "../processors/parser"
import * as languageParserModule from "../../tree-sitter/languageParser"
import * as fs from "fs"
import * as path from "path"
5+
6+
describe("Go Indexing Fix", () => {
7+
let wasmDir: string | undefined
8+
9+
// Locate the directory holding the compiled tree-sitter grammars once for the
// whole suite, then route every parser load through it.
beforeAll(async () => {
	// Candidate build-output directories, checked in order.
	const possibleWasmDirs = [path.join(__dirname, "../../../dist"), path.join(process.cwd(), "dist")]

	// fs.existsSync returns false for a missing path instead of throwing,
	// so a plain search is enough; no try/catch is required.
	wasmDir = possibleWasmDirs.find((dir) => fs.existsSync(path.join(dir, "tree-sitter-go.wasm")))

	if (!wasmDir) {
		throw new Error(`Could not find WASM directory (searched: ${possibleWasmDirs.join(", ")})`)
	}

	// Capture the real loader BEFORE installing the spy, otherwise the mock
	// would end up calling itself.
	const originalLoad = languageParserModule.loadRequiredLanguageParsers

	// Forward to the real loader, supplying our WASM directory whenever the
	// caller did not pass one explicitly.
	vi.spyOn(languageParserModule, "loadRequiredLanguageParsers").mockImplementation(
		async (files: string[], customWasmDir?: string) => originalLoad(files, customWasmDir || wasmDir),
	)
})
38+
39+
// End-to-end check: a realistic Go file must yield function, method and type
// blocks that carry the full declaration text and sensible line numbers.
it("should correctly index Go functions, methods, and types", async () => {
	const parser = new CodeParser()

	// NOTE: this is a JS template literal, so the Go escape sequence in the
	// Printf format string must be written as `\\n` to reach the parser as `\n`.
	const goContent = `package main

import (
	"fmt"
	"strings"
)

// User represents a user in the system
type User struct {
	ID int
	Name string
	Email string
	IsActive bool
}

// NewUser creates a new user instance
func NewUser(id int, name, email string) *User {
	return &User{
		ID: id,
		Name: name,
		Email: email,
		IsActive: true,
	}
}

// GetDisplayName returns the user's display name
func (u *User) GetDisplayName() string {
	return fmt.Sprintf("%s <%s>", u.Name, u.Email)
}

// Validate checks if the user data is valid
func (u *User) Validate() error {
	if u.Name == "" {
		return fmt.Errorf("name cannot be empty")
	}
	if !strings.Contains(u.Email, "@") {
		return fmt.Errorf("invalid email format")
	}
	return nil
}

// ProcessUsers processes a list of users
func ProcessUsers(users []*User) {
	for _, user := range users {
		if err := user.Validate(); err != nil {
			fmt.Printf("Invalid user %d: %v\\n", user.ID, err)
			continue
		}
		fmt.Println(user.GetDisplayName())
	}
}

func main() {
	users := []*User{
		NewUser(1, "Alice", "alice@example.com"),
		NewUser(2, "Bob", "bob@example.com"),
	}
	ProcessUsers(users)
}`

	const blocks = await parser.parseFile("test.go", {
		content: goContent,
		fileHash: "test-hash",
	})

	// Verify we got blocks
	expect(blocks.length).toBeGreaterThan(0)

	// Check for specific function declarations
	const functionBlocks = blocks.filter((b) => b.type === "function_declaration")
	const functionNames = functionBlocks.map((b) => b.identifier)
	expect(functionNames).toContain("NewUser")
	expect(functionNames).toContain("ProcessUsers")
	// Note: `main` is deliberately not asserted; this test only pins the named helpers above.

	// Check for method declarations
	const methodBlocks = blocks.filter((b) => b.type === "method_declaration")
	const methodNames = methodBlocks.map((b) => b.identifier)
	expect(methodNames).toContain("GetDisplayName")
	expect(methodNames).toContain("Validate")

	// Check for type declarations
	const typeBlocks = blocks.filter((b) => b.type === "type_declaration")
	expect(typeBlocks.length).toBeGreaterThan(0)

	// Verify the block holds the whole declaration (signature and body), not just the name
	const newUserBlock = functionBlocks.find((b) => b.identifier === "NewUser")
	if (!newUserBlock) {
		throw new Error("NewUser function block was not indexed")
	}
	expect(newUserBlock.content).toContain("func NewUser")
	expect(newUserBlock.content).toContain("return &User{")

	// Verify line numbers: Validate is a multi-line method that starts well below line 1
	const validateBlock = methodBlocks.find((b) => b.identifier === "Validate")
	if (!validateBlock) {
		throw new Error("Validate method block was not indexed")
	}
	expect(validateBlock.start_line).toBeGreaterThan(1)
	expect(validateBlock.end_line).toBeGreaterThan(validateBlock.start_line)
})
139+
140+
// Threshold check with three sizes of function:
//   - `f`              : far below any threshold      -> must be dropped
//   - `sum`            : roughly 60 chars, i.e. above
//                        Go's 50 but below the 100
//                        default                      -> must be kept
//   - `calculateTotal` : above both thresholds        -> must be kept
// The mid-sized function is what makes this test fail if Go falls back to the default threshold.
it("should respect the 50-character threshold for Go", async () => {
	const parser = new CodeParser()

	const goContent = `package main

// Tiny function - should be filtered out
func f() {
	return
}

// Mid-sized function - only indexed thanks to the Go-specific threshold
func sum(a int, b int) int {
	result := a + b
	return result
}

// Longer function - should be included
func calculateTotal(items []int) int {
	total := 0
	for _, item := range items {
		total += item
	}
	return total
}`

	const blocks = await parser.parseFile("test.go", {
		content: goContent,
		fileHash: "test-hash",
	})

	const functionBlocks = blocks.filter((b) => b.type === "function_declaration")
	const functionNames = functionBlocks.map((b) => b.identifier).sort()

	// Exactly the two functions at or above 50 characters survive
	expect(functionNames).toEqual(["calculateTotal", "sum"])

	// Verify the tiny function was not included
	expect(functionNames).not.toContain("f")
})
173+
174+
// Guards against the regression where Go queries captured only the identifier
// node: both the struct and the method block must contain their full source text.
it("should capture full declaration content, not just identifiers", async () => {
	const parser = new CodeParser()

	const goContent = `package main

import "fmt"

type Config struct {
	Host string
	Port int
	Debug bool
	Timeout int
}

func (c *Config) GetAddress() string {
	return fmt.Sprintf("%s:%d", c.Host, c.Port)
}`

	const blocks = await parser.parseFile("test.go", {
		content: goContent,
		fileHash: "test-hash",
	})

	// Check that we capture the full struct declaration.
	// The block is required: a missing type block must fail the test rather
	// than silently skip the content assertions.
	const typeBlock = blocks.find((b) => b.type === "type_declaration")
	if (!typeBlock) {
		throw new Error("Config type declaration was not indexed")
	}
	expect(typeBlock.content).toContain("type Config struct")
	expect(typeBlock.content).toContain("Host string")
	expect(typeBlock.content).toContain("Port int")
	expect(typeBlock.content).toContain("Debug bool")
	expect(typeBlock.content).toContain("Timeout int")

	// Check that we capture the full method declaration
	const methodBlock = blocks.find((b) => b.type === "method_declaration" && b.identifier === "GetAddress")
	if (!methodBlock) {
		throw new Error("GetAddress method declaration was not indexed")
	}
	expect(methodBlock.content).toContain("func (c *Config) GetAddress() string")
	expect(methodBlock.content).toContain("return fmt.Sprintf")
})
211+
})

src/services/code-index/processors/parser.ts

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ import { ICodeParser, CodeBlock } from "../interfaces"
77
import { scannerExtensions } from "../shared/supported-extensions"
88
import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MIN_CHUNK_REMAINDER_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../constants"
99

10+
/**
 * Language-specific minimum block character thresholds.
 *
 * Keys are the lower-cased file extension used to select the parser (e.g. "go");
 * the special key `default` holds the fallback applied to every other language.
 * The table is shared module state, so it is typed read-only to prevent
 * accidental mutation.
 */
const LANGUAGE_THRESHOLDS: Readonly<Record<string, number>> = {
	go: 50, // Go has concise syntax, so meaningful declarations are often short
	default: MIN_BLOCK_CHARS, // Fallback for languages without a dedicated entry
}
17+
1018
/**
1119
* Implementation of the code parser interface
1220
*/
@@ -67,6 +75,15 @@ export class CodeParser implements ICodeParser {
6775
return scannerExtensions.includes(extension)
6876
}
6977

78+
/**
 * Gets the minimum block character threshold for a language.
 *
 * Only entries defined directly on LANGUAGE_THRESHOLDS are honoured, so a key
 * that happens to match an inherited Object.prototype member (for example
 * "constructor") falls back to the default instead of returning a non-number.
 * The fallback uses `??` so that an explicitly configured threshold of 0 is
 * respected rather than being replaced by the default.
 *
 * @param language Language identifier (the caller passes the lower-cased file extension)
 * @returns Minimum character threshold
 */
private getMinBlockChars(language: string): number {
	const hasOwnThreshold = Object.prototype.hasOwnProperty.call(LANGUAGE_THRESHOLDS, language)
	const threshold = hasOwnThreshold ? LANGUAGE_THRESHOLDS[language] : undefined
	return threshold ?? LANGUAGE_THRESHOLDS.default
}
86+
7087
/**
7188
* Creates a hash for a file
7289
* @param content File content
@@ -86,6 +103,7 @@ export class CodeParser implements ICodeParser {
86103
private async parseContent(filePath: string, content: string, fileHash: string): Promise<CodeBlock[]> {
87104
const ext = path.extname(filePath).slice(1).toLowerCase()
88105
const seenSegmentHashes = new Set<string>()
106+
const minBlockChars = this.getMinBlockChars(ext)
89107

90108
// Check if we already have the parser loaded
91109
if (!this.loadedParsers[ext]) {
@@ -128,9 +146,15 @@ export class CodeParser implements ICodeParser {
128146

129147
// Check if captures are empty
130148
if (captures.length === 0) {
131-
if (content.length >= MIN_BLOCK_CHARS) {
149+
if (content.length >= minBlockChars) {
132150
// Perform fallback chunking if content is large enough
133-
const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
151+
const blocks = this._performFallbackChunking(
152+
filePath,
153+
content,
154+
fileHash,
155+
seenSegmentHashes,
156+
minBlockChars,
157+
)
134158
return blocks
135159
} else {
136160
// Return empty if content is too small for fallback
@@ -148,20 +172,20 @@ export class CodeParser implements ICodeParser {
148172
// const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
149173

150174
// Check if the node meets the minimum character requirement
151-
if (currentNode.text.length >= MIN_BLOCK_CHARS) {
175+
if (currentNode.text.length >= minBlockChars) {
152176
// If it also exceeds the maximum character limit, try to break it down
153177
if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
154178
if (currentNode.children.filter((child) => child !== null).length > 0) {
155179
// If it has children, process them instead
156180
queue.push(...currentNode.children.filter((child) => child !== null))
157181
} else {
158-
// If it's a leaf node, chunk it (passing MIN_BLOCK_CHARS as per Task 1 Step 5)
159-
// Note: _chunkLeafNodeByLines logic might need further adjustment later
182+
// If it's a leaf node, chunk it
160183
const chunkedBlocks = this._chunkLeafNodeByLines(
161184
currentNode,
162185
filePath,
163186
fileHash,
164187
seenSegmentHashes,
188+
minBlockChars,
165189
)
166190
results.push(...chunkedBlocks)
167191
}
@@ -194,7 +218,7 @@ export class CodeParser implements ICodeParser {
194218
}
195219
}
196220
}
197-
// Nodes smaller than MIN_BLOCK_CHARS are ignored
221+
// Nodes smaller than minBlockChars are ignored
198222
}
199223

200224
return results
@@ -207,9 +231,9 @@ export class CodeParser implements ICodeParser {
207231
lines: string[],
208232
filePath: string,
209233
fileHash: string,
210-
211234
chunkType: string,
212235
seenSegmentHashes: Set<string>,
236+
minBlockChars: number,
213237
baseStartLine: number = 1, // 1-based start line of the *first* line in the `lines` array
214238
): CodeBlock[] {
215239
const chunks: CodeBlock[] = []
@@ -219,7 +243,7 @@ export class CodeParser implements ICodeParser {
219243
const effectiveMaxChars = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
220244

221245
const finalizeChunk = (endLineIndex: number) => {
222-
if (currentChunkLength >= MIN_BLOCK_CHARS && currentChunkLines.length > 0) {
246+
if (currentChunkLength >= minBlockChars && currentChunkLines.length > 0) {
223247
const chunkContent = currentChunkLines.join("\n")
224248
const startLine = baseStartLine + chunkStartLineIndex
225249
const endLine = baseStartLine + endLineIndex
@@ -300,7 +324,7 @@ export class CodeParser implements ICodeParser {
300324
}
301325

302326
if (
303-
currentChunkLength >= MIN_BLOCK_CHARS &&
327+
currentChunkLength >= minBlockChars &&
304328
remainderLength < MIN_CHUNK_REMAINDER_CHARS &&
305329
currentChunkLines.length > 1
306330
) {
@@ -311,7 +335,7 @@ export class CodeParser implements ICodeParser {
311335
const potentialNextChunkLength = potentialNextChunkLines.join("\n").length + 1
312336

313337
if (
314-
potentialChunkLength >= MIN_BLOCK_CHARS &&
338+
potentialChunkLength >= minBlockChars &&
315339
potentialNextChunkLength >= MIN_CHUNK_REMAINDER_CHARS
316340
) {
317341
splitIndex = k
@@ -348,16 +372,18 @@ export class CodeParser implements ICodeParser {
348372
content: string,
349373
fileHash: string,
350374
seenSegmentHashes: Set<string>,
375+
minBlockChars: number,
351376
): CodeBlock[] {
352377
const lines = content.split("\n")
353-
return this._chunkTextByLines(lines, filePath, fileHash, "fallback_chunk", seenSegmentHashes)
378+
return this._chunkTextByLines(lines, filePath, fileHash, "fallback_chunk", seenSegmentHashes, minBlockChars)
354379
}
355380

356381
private _chunkLeafNodeByLines(
357382
node: Node,
358383
filePath: string,
359384
fileHash: string,
360385
seenSegmentHashes: Set<string>,
386+
minBlockChars: number,
361387
): CodeBlock[] {
362388
const lines = node.text.split("\n")
363389
const baseStartLine = node.startPosition.row + 1
@@ -367,6 +393,7 @@ export class CodeParser implements ICodeParser {
367393
fileHash,
368394
node.type, // Use the node's type
369395
seenSegmentHashes,
396+
minBlockChars,
370397
baseStartLine,
371398
)
372399
}

0 commit comments

Comments
 (0)