
Commit 9e224f1

add missing eval files: fixed chunker, metrics
1 parent d6e7367 commit 9e224f1

File tree: 3 files changed, +164 −3 lines changed


eval/chunkers/fixed.ts

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
/**
 * Fixed-size chunker for evaluation baseline
 *
 * Simple line-based chunker that splits code into fixed-size chunks
 * based on non-whitespace character count. Used as a baseline comparison
 * for the AST-aware chunker.
 */

/**
 * Count non-whitespace characters in a string
 */
function countNws(text: string): number {
  let count = 0
  for (let i = 0; i < text.length; i++) {
    if (text.charCodeAt(i) > 32) count++
  }
  return count
}

/**
 * Chunk a file using fixed-size chunking based on NWS character count
 *
 * @param filepath - Path to the file (used for chunk IDs)
 * @param code - Source code content
 * @param maxNws - Maximum NWS characters per chunk (default: 1500)
 */
export async function chunkFile(
  filepath: string,
  code: string,
  maxNws: number = 1500,
): Promise<
  Array<{
    id: string
    text: string
    startLine: number
    endLine: number
  }>
> {
  const lines = code.split('\n')
  const chunks: Array<{
    id: string
    text: string
    startLine: number
    endLine: number
  }> = []

  let currentLines: string[] = []
  let currentNws = 0
  let startLine = 0

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] ?? ''
    const lineNws = countNws(line)

    if (currentNws + lineNws > maxNws && currentLines.length > 0) {
      // Flush current chunk
      const text = currentLines.join('\n')
      const endLine = startLine + currentLines.length - 1
      chunks.push({
        id: `${filepath}:${startLine}-${endLine}`,
        text,
        startLine,
        endLine,
      })

      // Start new chunk
      currentLines = [line]
      currentNws = lineNws
      startLine = i
    } else {
      currentLines.push(line)
      currentNws += lineNws
    }
  }

  // Flush remaining lines
  if (currentLines.length > 0) {
    const text = currentLines.join('\n')
    const endLine = startLine + currentLines.length - 1
    chunks.push({
      id: `${filepath}:${startLine}-${endLine}`,
      text,
      startLine,
      endLine,
    })
  }

  return chunks
}
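
A minimal usage sketch for the new baseline chunker (the file path and source string below are made-up inputs for illustration, not part of the commit):

import { chunkFile } from './chunkers/fixed'

// Hypothetical inputs for illustration only
const filepath = 'src/example.ts'
const code = [
  'function add(a: number, b: number) {',
  '  return a + b',
  '}',
].join('\n')

// With the default budget of 1500 NWS characters, this tiny file fits in one chunk
const chunks = await chunkFile(filepath, code)
console.log(chunks)
// [ { id: 'src/example.ts:0-2', text: '...', startLine: 0, endLine: 2 } ]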

eval/debug_chunks.ts

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 import { readFileSync } from 'node:fs'
 import { join } from 'node:path'
 import { chunk } from '../src'
-import { chunkFixed } from './chunkers/fixed'
+import { chunkFile as chunkFixed } from './chunkers/fixed'
 
 // Check deepmind_tracr/tracr/craft/transformers.py
 // Assume we're looking for lines 100-150
@@ -38,7 +38,7 @@ for (const maxSize of [1500, 1800]) {
   console.log(`\n=== Max chunk size: ${maxSize} ===`)
 
   const astChunks = await chunk(testFile, code, { maxChunkSize: maxSize })
-  const fixedChunks = chunkFixed(code, maxSize)
+  const fixedChunks = await chunkFixed(testFile, code, maxSize)
 
   console.log('\nAST chunks:')
   for (const c of astChunks) {
@@ -57,7 +57,7 @@ for (const maxSize of [1500, 1800]) {
   for (const c of fixedChunks) {
     const overlap = overlaps(c.startLine, c.endLine, targetStart, targetEnd)
     console.log(
-      ` Lines ${c.startLine}-${c.endLine} (${c.nwsCount} NWS) ${overlap ? '*** RELEVANT ***' : ''}`,
+      ` Lines ${c.startLine}-${c.endLine} (${countNws(c.text)} NWS) ${overlap ? '*** RELEVANT ***' : ''}`,
     )
   }
 
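
For the last hunk to type-check, countNws has to be in scope in eval/debug_chunks.ts. If it isn't already defined or imported there, one option (an assumption, not shown in this diff) is to export the helper from eval/chunkers/fixed.ts and import it alongside the renamed chunker:

// eval/chunkers/fixed.ts — add `export` to the existing helper (assumed change, not in this commit's diff)
export function countNws(text: string): number {
  let count = 0
  for (let i = 0; i < text.length; i++) {
    if (text.charCodeAt(i) > 32) count++
  }
  return count
}

// eval/debug_chunks.ts — import it next to the chunker
import { chunkFile as chunkFixed, countNws } from './chunkers/fixed'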

eval/metrics.ts

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
/**
 * Retrieval metrics for evaluation
 *
 * Computes precision, recall, and nDCG for retrieval evaluation.
 */

/**
 * Compute precision, recall, and nDCG for a single query
 *
 * @param retrievedIds - Ordered list of retrieved chunk IDs
 * @param relevantSet - Set of relevant (ground truth) chunk IDs
 * @param k - Number of results to consider
 */
export function computeMetrics(
  retrievedIds: string[],
  relevantSet: Set<string>,
  k: number,
): { precision: number; recall: number; ndcg: number } {
  const topK = retrievedIds.slice(0, k)

  // Precision@k: fraction of retrieved that are relevant
  const relevantInTopK = topK.filter((id) => relevantSet.has(id)).length
  const precision = relevantInTopK / k

  // Recall@k: fraction of relevant that are retrieved
  const recall = relevantSet.size > 0 ? relevantInTopK / relevantSet.size : 0

  // nDCG@k: normalized discounted cumulative gain
  const dcg = topK.reduce((sum, id, i) => {
    const rel = relevantSet.has(id) ? 1 : 0
    return sum + rel / Math.log2(i + 2) // i+2 because log2(1) = 0
  }, 0)

  // Ideal DCG: all relevant docs at top
  const idealK = Math.min(k, relevantSet.size)
  const idcg = Array.from({ length: idealK }).reduce<number>(
    (sum, _, i) => sum + 1 / Math.log2(i + 2),
    0,
  )

  const ndcg = idcg > 0 ? dcg / idcg : 0

  return { precision, recall, ndcg }
}

/**
 * Aggregate metrics across multiple queries
 *
 * @param metrics - Array of metric objects
 */
export function aggregateMetrics(
  metrics: Array<{ precision: number; recall: number; ndcg: number }>,
): { precision: number; recall: number; ndcg: number } {
  if (metrics.length === 0) {
    return { precision: 0, recall: 0, ndcg: 0 }
  }

  const sum = metrics.reduce(
    (acc, m) => ({
      precision: acc.precision + m.precision,
      recall: acc.recall + m.recall,
      ndcg: acc.ndcg + m.ndcg,
    }),
    { precision: 0, recall: 0, ndcg: 0 },
  )

  return {
    precision: sum.precision / metrics.length,
    recall: sum.recall / metrics.length,
    ndcg: sum.ndcg / metrics.length,
  }
}
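
A quick usage sketch with a worked example (the chunk IDs and second query's numbers below are illustrative, not from the eval dataset):

import { computeMetrics, aggregateMetrics } from './metrics'

// Hypothetical query: 3 results retrieved, ground truth has 3 relevant chunks
const retrieved = ['a.ts:0-10', 'b.ts:5-20', 'a.ts:11-30']
const relevant = new Set(['a.ts:0-10', 'a.ts:11-30', 'c.ts:0-40'])

const m = computeMetrics(retrieved, relevant, 3)
// precision = 2/3 ≈ 0.667 (two of the three retrieved are relevant)
// recall    = 2/3 ≈ 0.667 (two of the three relevant were retrieved)
// DCG  = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.5
// IDCG = 1/log2(2) + 1/log2(3) + 1/log2(4) ≈ 2.131
// ndcg ≈ 0.704
console.log(m)

// Aggregation is a plain arithmetic mean of each metric across queries
const overall = aggregateMetrics([m, { precision: 1, recall: 0.5, ndcg: 1 }])
console.log(overall) // { precision: ≈0.833, recall: ≈0.583, ndcg: ≈0.852 }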
