Skip to content

Commit e7fdfec

Browse files
committed
Add contextualizedText for embedding
1 parent c37bf0f commit e7fdfec

File tree

7 files changed

+198
-61
lines changed

7 files changed

+198
-61
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@ yarn-error.log*
2020
.turbo
2121
todo.md
2222
plan.md
23+
eval/cache
24+
eval/results

eval/chunkers/ast.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* AST-aware chunker wrapper for evaluation
33
*
44
* Wraps the astchunk library for use in the evaluation harness.
5+
* Uses the built-in contextualizedText for better embedding quality.
56
*/
67

78
import { chunk } from '../../src'
@@ -28,7 +29,7 @@ export async function chunkFile(
2829

2930
return chunks.map((c) => ({
3031
id: `${filepath}:${c.lineRange.start}-${c.lineRange.end}`,
31-
text: c.text,
32+
text: c.contextualizedText,
3233
startLine: c.lineRange.start,
3334
endLine: c.lineRange.end,
3435
}))

eval/run.ts

Lines changed: 87 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ import { embedTexts, topK } from './embeddings'
2323
import { aggregateMetrics, computeMetrics } from './metrics'
2424

2525
const RESULTS_DIR = join(import.meta.dir, 'results')
26-
const K = 5 // Top-k for retrieval
27-
const MAX_CHUNK_SIZE = 1800 // NWS characters per chunk
26+
const K_VALUES = [5, 10] // Top-k values for retrieval
27+
const MAX_CHUNK_SIZE = 1500 // NWS characters per chunk
2828

2929
interface ChunkInfo {
3030
id: string
@@ -34,22 +34,28 @@ interface ChunkInfo {
3434
filepath: string
3535
}
3636

37+
interface MetricsAtK {
38+
precision: number
39+
recall: number
40+
ndcg: number
41+
}
42+
3743
interface QueryResult {
3844
taskId: string
3945
prompt: string
4046
groundTruthLines: { start: number; end: number }
4147
groundTruthFile: string
4248
retrievedChunks: Array<{ id: string; score: number; rank: number }>
4349
relevantChunkIds: string[]
44-
metrics: { precision: number; recall: number; ndcg: number }
50+
metrics: Record<number, MetricsAtK> // metrics per k value
4551
}
4652

4753
interface EvalResult {
4854
chunker: 'ast' | 'fixed'
4955
repo: string
50-
summary: { precision: number; recall: number; ndcg: number }
56+
summary: Record<number, MetricsAtK> // summary per k value
5157
queryResults: QueryResult[]
52-
config: { k: number; maxChunkSize: number }
58+
config: { kValues: number[]; maxChunkSize: number }
5359
timestamp: string
5460
}
5561

@@ -161,12 +167,14 @@ async function evaluateRepo(
161167
)
162168
}
163169

170+
const maxK = Math.max(...K_VALUES)
171+
164172
for (let i = 0; i < tasks.length; i++) {
165173
const task = tasks[i]
166174
const queryEmb = queryEmbeddings[i]
167175

168-
// Get top-k chunks
169-
const topKResults = topK(queryEmb, chunkEmbeddings, K)
176+
// Get top-k chunks (use max k to get all we need)
177+
const topKResults = topK(queryEmb, chunkEmbeddings, maxK)
170178

171179
// Determine ground truth: chunks that overlap with target location
172180
// fpath_tuple is ["repo_name", "path", "to", "file.py"], skip first element
@@ -197,8 +205,11 @@ async function evaluateRepo(
197205
// Get retrieved chunk IDs
198206
const retrievedIds = topKResults.map((r) => allChunks[r.index].id)
199207

200-
// Compute metrics
201-
const metrics = computeMetrics(retrievedIds, relevantSet, K)
208+
// Compute metrics for each k value
209+
const metrics: Record<number, MetricsAtK> = {}
210+
for (const k of K_VALUES) {
211+
metrics[k] = computeMetrics(retrievedIds, relevantSet, k)
212+
}
202213

203214
queryResults.push({
204215
taskId: task.metadata.task_id,
@@ -215,29 +226,51 @@ async function evaluateRepo(
215226
})
216227
}
217228

218-
// Aggregate metrics
219-
const summary = aggregateMetrics(queryResults.map((q) => q.metrics))
229+
// Aggregate metrics for each k value
230+
const summary: Record<number, MetricsAtK> = {}
231+
for (const k of K_VALUES) {
232+
summary[k] = aggregateMetrics(queryResults.map((q) => q.metrics[k]))
233+
}
220234

221235
return {
222236
chunker: chunkerType,
223237
repo,
224238
summary,
225239
queryResults,
226-
config: { k: K, maxChunkSize: MAX_CHUNK_SIZE },
240+
config: { kValues: K_VALUES, maxChunkSize: MAX_CHUNK_SIZE },
227241
timestamp: new Date().toISOString(),
228242
}
229243
}
230244

231245
/**
232-
* Format metrics as a table row
246+
* Format metrics as a table row for a specific k
233247
*/
234-
function formatMetrics(
235-
label: string,
236-
metrics: { precision: number; recall: number; ndcg: number },
237-
): string {
248+
function formatMetricsRow(label: string, metrics: MetricsAtK): string {
238249
return `${label.padEnd(20)} | ${(metrics.ndcg * 100).toFixed(1).padStart(6)} | ${(metrics.precision * 100).toFixed(1).padStart(6)} | ${(metrics.recall * 100).toFixed(1).padStart(6)}`
239250
}
240251

252+
/**
253+
* Print metrics table for all k values
254+
*/
255+
function printMetricsTable(
256+
astSummary: Record<number, MetricsAtK>,
257+
fixedSummary: Record<number, MetricsAtK>,
258+
indent = '',
259+
): void {
260+
for (const k of K_VALUES) {
261+
console.log(`${indent}k=${k}:`)
262+
console.log(indent + '-'.repeat(50))
263+
console.log(
264+
`${indent}${'Chunker'.padEnd(20)} | ${'nDCG'.padStart(6)} | ${'P@k'.padStart(6)} | ${'R@k'.padStart(6)}`,
265+
)
266+
console.log(indent + '-'.repeat(50))
267+
console.log(indent + formatMetricsRow('AST', astSummary[k]))
268+
console.log(indent + formatMetricsRow('Fixed', fixedSummary[k]))
269+
console.log(indent + '-'.repeat(50))
270+
console.log('')
271+
}
272+
}
273+
241274
async function main() {
242275
console.log('RepoEval Retrieval Evaluation')
243276
console.log('=============================\n')
@@ -290,14 +323,7 @@ async function main() {
290323

291324
// Print comparison
292325
console.log(`\n Results for ${repo}:`)
293-
console.log(' ' + '-'.repeat(50))
294-
console.log(
295-
` ${'Chunker'.padEnd(20)} | ${'nDCG@5'.padStart(6)} | ${'P@5'.padStart(6)} | ${'R@5'.padStart(6)}`,
296-
)
297-
console.log(' ' + '-'.repeat(50))
298-
console.log(' ' + formatMetrics('AST', astResult.summary))
299-
console.log(' ' + formatMetrics('Fixed', fixedResult.summary))
300-
console.log(' ' + '-'.repeat(50))
326+
printMetricsTable(astResult.summary, fixedResult.summary, ' ')
301327
}
302328

303329
// Step 4: Compute overall summary
@@ -308,36 +334,42 @@ async function main() {
308334
const astResults = allResults.filter((r) => r.chunker === 'ast')
309335
const fixedResults = allResults.filter((r) => r.chunker === 'fixed')
310336

311-
const astOverall = aggregateMetrics(astResults.map((r) => r.summary))
312-
const fixedOverall = aggregateMetrics(fixedResults.map((r) => r.summary))
337+
// Aggregate metrics for each k value
338+
const astOverall: Record<number, MetricsAtK> = {}
339+
const fixedOverall: Record<number, MetricsAtK> = {}
340+
for (const k of K_VALUES) {
341+
astOverall[k] = aggregateMetrics(astResults.map((r) => r.summary[k]))
342+
fixedOverall[k] = aggregateMetrics(fixedResults.map((r) => r.summary[k]))
343+
}
313344

314-
console.log(
315-
`\n${'Chunker'.padEnd(20)} | ${'nDCG@5'.padStart(6)} | ${'P@5'.padStart(6)} | ${'R@5'.padStart(6)}`,
316-
)
317-
console.log('-'.repeat(50))
318-
console.log(formatMetrics('AST', astOverall))
319-
console.log(formatMetrics('Fixed', fixedOverall))
320-
console.log('-'.repeat(50))
321-
322-
// Compute improvements
323-
const ndcgImprovement =
324-
((astOverall.ndcg - fixedOverall.ndcg) / fixedOverall.ndcg) * 100
325-
const precImprovement =
326-
((astOverall.precision - fixedOverall.precision) / fixedOverall.precision) *
327-
100
328-
const recallImprovement =
329-
((astOverall.recall - fixedOverall.recall) / fixedOverall.recall) * 100
330-
331-
console.log(`\nImprovement (AST vs Fixed):`)
332-
console.log(
333-
` nDCG@5: ${ndcgImprovement >= 0 ? '+' : ''}${ndcgImprovement.toFixed(1)}%`,
334-
)
335-
console.log(
336-
` Precision@5: ${precImprovement >= 0 ? '+' : ''}${precImprovement.toFixed(1)}%`,
337-
)
338-
console.log(
339-
` Recall@5: ${recallImprovement >= 0 ? '+' : ''}${recallImprovement.toFixed(1)}%`,
340-
)
345+
console.log('')
346+
printMetricsTable(astOverall, fixedOverall)
347+
348+
// Compute improvements for each k
349+
console.log('Improvement (AST vs Fixed):')
350+
for (const k of K_VALUES) {
351+
const ndcgImprovement =
352+
((astOverall[k].ndcg - fixedOverall[k].ndcg) / fixedOverall[k].ndcg) * 100
353+
const precImprovement =
354+
((astOverall[k].precision - fixedOverall[k].precision) /
355+
fixedOverall[k].precision) *
356+
100
357+
const recallImprovement =
358+
((astOverall[k].recall - fixedOverall[k].recall) /
359+
fixedOverall[k].recall) *
360+
100
361+
362+
console.log(` k=${k}:`)
363+
console.log(
364+
` nDCG: ${ndcgImprovement >= 0 ? '+' : ''}${ndcgImprovement.toFixed(1)}%`,
365+
)
366+
console.log(
367+
` Precision: ${precImprovement >= 0 ? '+' : ''}${precImprovement.toFixed(1)}%`,
368+
)
369+
console.log(
370+
` Recall: ${recallImprovement >= 0 ? '+' : ''}${recallImprovement.toFixed(1)}%`,
371+
)
372+
}
341373

342374
// Step 5: Save results
343375
const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
@@ -351,11 +383,6 @@ async function main() {
351383
overall: {
352384
ast: astOverall,
353385
fixed: fixedOverall,
354-
improvement: {
355-
ndcg: ndcgImprovement,
356-
precision: precImprovement,
357-
recall: recallImprovement,
358-
},
359386
},
360387
perRepo: Object.fromEntries(
361388
repos.map((repo) => [
@@ -366,7 +393,7 @@ async function main() {
366393
},
367394
]),
368395
),
369-
config: { k: K, maxChunkSize: MAX_CHUNK_SIZE },
396+
config: { kValues: K_VALUES, maxChunkSize: MAX_CHUNK_SIZE },
370397
timestamp: new Date().toISOString(),
371398
},
372399
null,

src/chunking/index.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import {
44
getRelevantImports,
55
getScopeForRange,
66
} from '../context'
7+
import { formatChunkWithContext } from '../context/format'
78
import { getSiblings } from '../context/siblings'
89
import type {
910
ASTWindow,
@@ -301,8 +302,12 @@ export const chunk = (
301302
? { scope: [], entities: [], siblings: [], imports: [] }
302303
: buildContext(text, scopeTree, opts, filepath, language)
303304

305+
// Build contextualized text for embeddings
306+
const contextualizedText = formatChunkWithContext(text.text, context)
307+
304308
return {
305309
text: text.text,
310+
contextualizedText,
306311
byteRange: text.byteRange,
307312
lineRange: text.lineRange,
308313
context,
@@ -376,8 +381,12 @@ export async function* streamChunks(
376381
? { scope: [], entities: [], siblings: [], imports: [] }
377382
: buildContext(text, scopeTree, opts, filepath, language)
378383

384+
// Build contextualized text for embeddings
385+
const contextualizedText = formatChunkWithContext(text.text, context)
386+
379387
yield {
380388
text: text.text,
389+
contextualizedText,
381390
byteRange: text.byteRange,
382391
lineRange: text.lineRange,
383392
context,

src/context/format.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/**
2+
* Format chunks with semantic context for embedding
3+
*
4+
* Prepends scope chain, entity signatures, and import context
5+
* to improve embedding similarity for semantic search.
6+
*/
7+
8+
import type { ChunkContext } from '../types'
9+
10+
/**
11+
* Format chunk text with semantic context prepended
12+
*
13+
* Creates a contextualized version of the chunk text that includes:
14+
* - File path (last 3 segments)
15+
* - Scope chain (e.g., "MyClass > process")
16+
* - Entity signatures defined in this chunk
17+
* - Import dependencies
18+
* - Sibling context for continuity
19+
*
20+
* This format is optimized for embedding models to capture
21+
* semantic relationships between code chunks.
22+
*
23+
* @param text - The raw chunk text
24+
* @param context - The chunk's semantic context
25+
* @returns Formatted text with context prepended
26+
*/
27+
export function formatChunkWithContext(
28+
text: string,
29+
context: ChunkContext,
30+
): string {
31+
const parts: string[] = []
32+
33+
// Add file path for context (last 3 segments)
34+
if (context.filepath) {
35+
const relPath = context.filepath.split('/').slice(-3).join('/')
36+
parts.push(`# ${relPath}`)
37+
}
38+
39+
// Add scope chain (e.g., "Scope: MyClass > process")
40+
if (context.scope.length > 0) {
41+
const scopePath = context.scope
42+
.map((s) => s.name)
43+
.reverse()
44+
.join(' > ')
45+
parts.push(`# Scope: ${scopePath}`)
46+
}
47+
48+
// Add entity signatures in this chunk
49+
const signatures = context.entities
50+
.filter((e) => e.signature && e.type !== 'import')
51+
.map((e) => e.signature)
52+
if (signatures.length > 0) {
53+
parts.push(`# Defines: ${signatures.join(', ')}`)
54+
}
55+
56+
// Add imports context (what this code depends on)
57+
if (context.imports.length > 0) {
58+
const importNames = context.imports
59+
.slice(0, 10) // Limit to avoid noise
60+
.map((i) => i.name)
61+
.join(', ')
62+
parts.push(`# Uses: ${importNames}`)
63+
}
64+
65+
// Add sibling context for continuity
66+
const beforeSiblings = context.siblings
67+
.filter((s) => s.position === 'before')
68+
.map((s) => s.name)
69+
const afterSiblings = context.siblings
70+
.filter((s) => s.position === 'after')
71+
.map((s) => s.name)
72+
73+
if (beforeSiblings.length > 0) {
74+
parts.push(`# After: ${beforeSiblings.join(', ')}`)
75+
}
76+
if (afterSiblings.length > 0) {
77+
parts.push(`# Before: ${afterSiblings.join(', ')}`)
78+
}
79+
80+
// Add separator and actual code
81+
if (parts.length > 0) {
82+
parts.push('')
83+
}
84+
parts.push(text)
85+
86+
return parts.join('\n')
87+
}

src/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ export { createChunker } from './chunker'
2323
// Re-export language utilities for advanced usage
2424
export { detectLanguage, LANGUAGE_EXTENSIONS } from './parser/languages'
2525

26+
// Context formatting utility for custom embedding text generation
27+
export { formatChunkWithContext } from './context/format'
28+
2629
// All public types
2730
export type {
2831
ASTWindow,

0 commit comments

Comments
 (0)