
Commit c27a4c7

run against chonkie

1 parent 9e224f1

File tree

5 files changed: +326 −138 lines

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -22,3 +22,4 @@ todo.md
 plan.md
 eval/cache
 eval/results
+eval/data

eval/chunkers/chonkie.ts

Lines changed: 81 additions & 0 deletions

/**
 * Chonkie CodeChunker wrapper for evaluation.
 *
 * Wraps the Chonkie Python library's CodeChunker for use in the evaluation
 * harness by calling a Python helper script via subprocess.
 */

import { spawn } from 'node:child_process'
import { dirname, join } from 'node:path'

const EVAL_DIR = dirname(import.meta.dir)
const PYTHON_PATH = join(EVAL_DIR, '.venv', 'bin', 'python')
const SCRIPT_PATH = join(import.meta.dir, 'chonkie_chunk.py')

interface ChunkResult {
  id: string
  text: string
  startLine: number
  endLine: number
}

/**
 * Chunk a file using Chonkie's CodeChunker and return results in a format
 * compatible with the evaluation.
 *
 * @param filepath - Path to the file (used for language detection and chunk ids)
 * @param code - Source code content, written to the helper script's stdin
 * @param maxChunkSize - Maximum chunk size in non-whitespace characters (default: 1500)
 */
export async function chunkFile(
  filepath: string,
  code: string,
  maxChunkSize: number = 1500,
): Promise<ChunkResult[]> {
  return new Promise((resolve, reject) => {
    const proc = spawn(
      PYTHON_PATH,
      [SCRIPT_PATH, filepath, String(maxChunkSize)],
      {
        stdio: ['pipe', 'pipe', 'pipe'],
      },
    )

    let stdout = ''
    let stderr = ''

    proc.stdout.on('data', (data) => {
      stdout += data.toString()
    })

    proc.stderr.on('data', (data) => {
      stderr += data.toString()
    })

    // `exitCode` rather than `code`, to avoid shadowing the `code` parameter
    proc.on('close', (exitCode) => {
      if (exitCode !== 0) {
        reject(new Error(`Chonkie chunker failed: ${stderr}`))
        return
      }

      try {
        const result = JSON.parse(stdout)
        if (result.error) {
          reject(new Error(`Chonkie error: ${result.error}`))
          return
        }
        resolve(result)
      } catch {
        reject(new Error(`Failed to parse Chonkie output: ${stdout}`))
      }
    })

    proc.on('error', (err) => {
      reject(err)
    })

    // Write the source code to the subprocess's stdin
    proc.stdin.write(code)
    proc.stdin.end()
  })
}
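
For context, a minimal sketch of how this wrapper might be called from the evaluation harness; the input path, file contents, and logging below are hypothetical, not part of this commit:

import { readFile } from 'node:fs/promises'
import { chunkFile } from './chunkers/chonkie'

// Hypothetical driver: chunk one file and print the chunk boundaries
const filepath = 'src/example.ts' // hypothetical input file
const source = await readFile(filepath, 'utf8')
const chunks = await chunkFile(filepath, source, 1500)
for (const chunk of chunks) {
  console.log(`${chunk.id} (lines ${chunk.startLine}-${chunk.endLine})`)
}

Running the chunker out of process keeps the Python dependency isolated in eval/.venv instead of binding it into the TypeScript harness.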

eval/chunkers/chonkie_chunk.py

Lines changed: 92 additions & 0 deletions

#!/usr/bin/env python3
"""
Chonkie CodeChunker wrapper for evaluation.

Takes filepath and max_chunk_size as arguments; the source code is read
from stdin. Outputs a JSON array of chunks with id, text, startLine, endLine.
"""

import json
import sys

from chonkie import CodeChunker


def count_nws(text: str) -> int:
    """Count non-whitespace characters to match the evaluation's sizing."""
    return sum(1 for c in text if not c.isspace())


def main():
    if len(sys.argv) < 3:
        print("Usage: chonkie_chunk.py <filepath> <max_chunk_size>", file=sys.stderr)
        print("Code is read from stdin", file=sys.stderr)
        sys.exit(1)

    filepath = sys.argv[1]
    max_chunk_size = int(sys.argv[2])

    # Read code from stdin to handle large files and special characters
    code = sys.stdin.read()

    # Determine the language from the file extension
    ext = filepath.rsplit(".", 1)[-1].lower() if "." in filepath else ""
    lang_map = {
        "py": "python",
        "js": "javascript",
        "ts": "typescript",
        "tsx": "tsx",
        "jsx": "javascript",
        "rs": "rust",
        "go": "go",
        "java": "java",
        "c": "c",
        "cpp": "cpp",
        "h": "c",
        "hpp": "cpp",
        "rb": "ruby",
        "php": "php",
        "cs": "c_sharp",
        "swift": "swift",
        "kt": "kotlin",
        "scala": "scala",
    }

    language = lang_map.get(ext, "python")  # Fall back to python for unmapped extensions

    try:
        # Initialize CodeChunker with a non-whitespace character counter
        # so chunk sizes match the evaluation's NWS-based sizing
        chunker = CodeChunker(
            tokenizer_or_token_counter=count_nws,
            chunk_size=max_chunk_size,
            language=language,
            include_nodes=False,
        )

        chunks = chunker.chunk(code)

        # Convert to the evaluation's chunk format
        results = []

        for chunk in chunks:
            # Derive 0-indexed line numbers from the chunk's character offsets
            start_line = code[:chunk.start_index].count("\n")
            end_line = code[:chunk.end_index].count("\n")

            results.append({
                "id": f"{filepath}:{start_line}-{end_line}",
                "text": chunk.text,
                "startLine": start_line,
                "endLine": end_line,
            })

        print(json.dumps(results))

    except Exception as e:
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
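
For illustration, a successful run prints a JSON array shaped like the following to stdout (values are hypothetical; startLine and endLine are 0-indexed line numbers derived from character offsets):

[
  {
    "id": "src/example.py:0-2",
    "text": "def example():\n    return 42\n",
    "startLine": 0,
    "endLine": 2
  }
]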

eval/embeddings.ts

Lines changed: 9 additions & 6 deletions

@@ -110,12 +110,12 @@ async function embedBatch(texts: string[]): Promise<number[][]> {
  * Embed texts with caching
  *
  * @param texts - Array of texts to embed
- * @param onProgress - Optional callback for progress updates
+ * @param onProgress - Optional callback for progress updates (done, total, cachedCount)
  * @returns Array of embeddings (same order as input texts)
  */
 export async function embedTexts(
   texts: string[],
-  onProgress?: (done: number, total: number) => void,
+  onProgress?: (done: number, total: number, cached: number) => void,
 ): Promise<number[][]> {
   await mkdir(CACHE_DIR, { recursive: true })

@@ -135,8 +135,10 @@ export async function embedTexts(
   }

   const cachedCount = texts.length - uncachedTexts.length
-  if (cachedCount > 0) {
-    console.log(`  Found ${cachedCount}/${texts.length} embeddings in cache`)
+
+  // Report initial state if all cached
+  if (onProgress && uncachedTexts.length === 0) {
+    onProgress(texts.length, texts.length, cachedCount)
   }

   // Embed uncached texts in batches
@@ -155,8 +157,9 @@ export async function embedTexts(

     if (onProgress) {
       onProgress(
-        Math.min(i + BATCH_SIZE, uncachedTexts.length),
-        uncachedTexts.length,
+        cachedCount + Math.min(i + BATCH_SIZE, uncachedTexts.length),
+        texts.length,
+        cachedCount,
       )
     }
   }
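
A minimal sketch of a caller updated for the three-argument callback; the input texts and the logging format are illustrative only:

import { embedTexts } from './embeddings'

const texts = ['chunk one', 'chunk two'] // hypothetical inputs
const embeddings = await embedTexts(texts, (done, total, cached) => {
  // `done` now counts against the full input, including cache hits
  console.log(`  Embedded ${done}/${total} (${cached} from cache)`)
})

Since the console.log of cache hits was removed from embedTexts itself, reporting the cached count is now the caller's responsibility via the callback.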
