
Commit c27a4c7

run against chonkie

1 parent 9e224f1

File tree

5 files changed: +326 −138 lines

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -22,3 +22,4 @@ todo.md
 plan.md
 eval/cache
 eval/results
+eval/data

eval/chunkers/chonkie.ts

Lines changed: 81 additions & 0 deletions

/**
 * Chonkie CodeChunker wrapper for evaluation.
 *
 * Wraps the Chonkie Python library's CodeChunker for use in the evaluation
 * harness by calling a Python helper script via subprocess.
 */

import { spawn } from 'node:child_process'
import { dirname, join } from 'node:path'

const EVAL_DIR = dirname(import.meta.dir)
const PYTHON_PATH = join(EVAL_DIR, '.venv', 'bin', 'python')
const SCRIPT_PATH = join(import.meta.dir, 'chonkie_chunk.py')

interface ChunkResult {
  id: string
  text: string
  startLine: number
  endLine: number
}

/**
 * Chunk a file using Chonkie's CodeChunker and return results in a format
 * compatible with the evaluation.
 *
 * @param filepath - Path to the file (used for language detection and chunk ids)
 * @param code - Source code content, written to the helper script's stdin
 * @param maxChunkSize - Maximum chunk size in non-whitespace characters (default: 1500)
 */
export async function chunkFile(
  filepath: string,
  code: string,
  maxChunkSize: number = 1500,
): Promise<ChunkResult[]> {
  return new Promise((resolve, reject) => {
    const proc = spawn(
      PYTHON_PATH,
      [SCRIPT_PATH, filepath, String(maxChunkSize)],
      {
        stdio: ['pipe', 'pipe', 'pipe'],
      },
    )

    let stdout = ''
    let stderr = ''

    proc.stdout.on('data', (data) => {
      stdout += data.toString()
    })

    proc.stderr.on('data', (data) => {
      stderr += data.toString()
    })

    // `exitCode` rather than `code`, to avoid shadowing the `code` parameter
    proc.on('close', (exitCode) => {
      if (exitCode !== 0) {
        reject(new Error(`Chonkie chunker failed: ${stderr}`))
        return
      }

      try {
        const result = JSON.parse(stdout)
        if (result.error) {
          reject(new Error(`Chonkie error: ${result.error}`))
          return
        }
        resolve(result)
      } catch {
        reject(new Error(`Failed to parse Chonkie output: ${stdout}`))
      }
    })

    proc.on('error', (err) => {
      reject(err)
    })

    // Write the source code to the subprocess's stdin
    proc.stdin.write(code)
    proc.stdin.end()
  })
}
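
For context, a minimal sketch of how this wrapper might be called from the evaluation harness; the input path, file contents, and logging below are hypothetical, not part of this commit:

import { readFile } from 'node:fs/promises'
import { chunkFile } from './chunkers/chonkie'

// Hypothetical driver: chunk one file and print the chunk boundaries
const filepath = 'src/example.ts' // hypothetical input file
const source = await readFile(filepath, 'utf8')
const chunks = await chunkFile(filepath, source, 1500)
for (const chunk of chunks) {
  console.log(`${chunk.id} (lines ${chunk.startLine}-${chunk.endLine})`)
}

Running the chunker out of process keeps the Python dependency isolated in eval/.venv instead of binding it into the TypeScript harness.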

eval/chunkers/chonkie_chunk.py

Lines changed: 92 additions & 0 deletions

#!/usr/bin/env python3
"""
Chonkie CodeChunker wrapper for evaluation.

Takes filepath and max_chunk_size as arguments; the source code is read
from stdin. Outputs a JSON array of chunks with id, text, startLine, endLine.
"""

import json
import sys

from chonkie import CodeChunker


def count_nws(text: str) -> int:
    """Count non-whitespace characters to match the evaluation's sizing."""
    return sum(1 for c in text if not c.isspace())


def main():
    if len(sys.argv) < 3:
        print("Usage: chonkie_chunk.py <filepath> <max_chunk_size>", file=sys.stderr)
        print("Code is read from stdin", file=sys.stderr)
        sys.exit(1)

    filepath = sys.argv[1]
    max_chunk_size = int(sys.argv[2])

    # Read code from stdin to handle large files and special characters
    code = sys.stdin.read()

    # Determine the language from the file extension
    ext = filepath.rsplit(".", 1)[-1].lower() if "." in filepath else ""
    lang_map = {
        "py": "python",
        "js": "javascript",
        "ts": "typescript",
        "tsx": "tsx",
        "jsx": "javascript",
        "rs": "rust",
        "go": "go",
        "java": "java",
        "c": "c",
        "cpp": "cpp",
        "h": "c",
        "hpp": "cpp",
        "rb": "ruby",
        "php": "php",
        "cs": "c_sharp",
        "swift": "swift",
        "kt": "kotlin",
        "scala": "scala",
    }

    language = lang_map.get(ext, "python")  # Fall back to python for unmapped extensions

    try:
        # Initialize CodeChunker with a non-whitespace character counter
        # so chunk sizes match the evaluation's NWS-based sizing
        chunker = CodeChunker(
            tokenizer_or_token_counter=count_nws,
            chunk_size=max_chunk_size,
            language=language,
            include_nodes=False,
        )

        chunks = chunker.chunk(code)

        # Convert to the evaluation's chunk format
        results = []

        for chunk in chunks:
            # Derive 0-indexed line numbers from the chunk's character offsets
            start_line = code[:chunk.start_index].count("\n")
            end_line = code[:chunk.end_index].count("\n")

            results.append({
                "id": f"{filepath}:{start_line}-{end_line}",
                "text": chunk.text,
                "startLine": start_line,
                "endLine": end_line,
            })

        print(json.dumps(results))

    except Exception as e:
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
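
For illustration, a successful run prints a JSON array shaped like the following to stdout (values are hypothetical; startLine and endLine are 0-indexed line numbers derived from character offsets):

[
  {
    "id": "src/example.py:0-2",
    "text": "def example():\n    return 42\n",
    "startLine": 0,
    "endLine": 2
  }
]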

eval/embeddings.ts

Lines changed: 9 additions & 6 deletions

@@ -110,12 +110,12 @@ async function embedBatch(texts: string[]): Promise<number[][]> {
  * Embed texts with caching
  *
  * @param texts - Array of texts to embed
- * @param onProgress - Optional callback for progress updates
+ * @param onProgress - Optional callback for progress updates (done, total, cachedCount)
  * @returns Array of embeddings (same order as input texts)
  */
 export async function embedTexts(
   texts: string[],
-  onProgress?: (done: number, total: number) => void,
+  onProgress?: (done: number, total: number, cached: number) => void,
 ): Promise<number[][]> {
   await mkdir(CACHE_DIR, { recursive: true })

@@ -135,8 +135,10 @@ export async function embedTexts(
   }

   const cachedCount = texts.length - uncachedTexts.length
-  if (cachedCount > 0) {
-    console.log(`  Found ${cachedCount}/${texts.length} embeddings in cache`)
+
+  // Report initial state if all cached
+  if (onProgress && uncachedTexts.length === 0) {
+    onProgress(texts.length, texts.length, cachedCount)
   }

   // Embed uncached texts in batches
@@ -155,8 +157,9 @@ export async function embedTexts(

     if (onProgress) {
       onProgress(
-        Math.min(i + BATCH_SIZE, uncachedTexts.length),
-        uncachedTexts.length,
+        cachedCount + Math.min(i + BATCH_SIZE, uncachedTexts.length),
+        texts.length,
+        cachedCount,
       )
     }
   }
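
A minimal sketch of a caller updated for the three-argument callback; the input texts and the logging format are illustrative only:

import { embedTexts } from './embeddings'

const texts = ['chunk one', 'chunk two'] // hypothetical inputs
const embeddings = await embedTexts(texts, (done, total, cached) => {
  // `done` now counts against the full input, including cache hits
  console.log(`  Embedded ${done}/${total} (${cached} from cache)`)
})

Since the console.log of cache hits was removed from embedTexts itself, reporting the cached count is now the caller's responsibility via the callback.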
