Skip to content

Commit 2871950

Browse files
catlog22 authored and claude committed
fix: 修复向量索引进度显示过早完成的问题

问题:FTS 索引完成后立即显示 100%,但嵌入生成仍在后台运行

修复:
- codex-lens.ts: 将 "Indexed X files" 阶段从 complete 改为 fts_complete (60%)
- codex-lens.ts: 添加嵌入批次和 Finalizing index 阶段解析
- embedding_manager.py: 使用 bulk_insert() 模式延迟 ANN 索引构建
- embedding_manager.py: 添加 "Finalizing index" 进度回调

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 5849f75 commit 2871950

File tree

2 files changed

+122
-90
lines changed

2 files changed

+122
-90
lines changed

ccw/src/tools/codex-lens.ts

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -414,17 +414,42 @@ function parseProgressLine(line: string): ProgressInfo | null {
414414
return { stage: 'complete', message: 'Finalizing...', percent: 95 };
415415
}
416416

417-
// Parse indexed count: "Indexed X files"
417+
// Parse indexed count: "Indexed X files" - FTS complete, but embeddings may follow
418418
const indexedMatch = line.match(/Indexed (\d+) files/i);
419419
if (indexedMatch) {
420420
return {
421-
stage: 'complete',
422-
message: `Indexed ${indexedMatch[1]} files`,
423-
percent: 100,
421+
stage: 'fts_complete', // Not 'complete' - embeddings generation may still be pending
422+
message: `Indexed ${indexedMatch[1]} files, generating embeddings...`,
423+
percent: 60, // FTS done, embeddings starting
424424
filesProcessed: parseInt(indexedMatch[1], 10),
425425
};
426426
}
427427

428+
// Parse embedding batch progress: "Batch X: N files, M chunks"
429+
const batchMatch = line.match(/Batch (\d+):\s*(\d+) files,\s*(\d+) chunks/i);
430+
if (batchMatch) {
431+
return {
432+
stage: 'embeddings',
433+
message: `Embedding batch ${batchMatch[1]}: ${batchMatch[3]} chunks`,
434+
percent: 70, // Stay at 70% during embedding batches
435+
};
436+
}
437+
438+
// Parse embedding progress with file count
439+
const embedProgressMatch = line.match(/Processing (\d+) files/i);
440+
if (embedProgressMatch && line.toLowerCase().includes('embed')) {
441+
return {
442+
stage: 'embeddings',
443+
message: `Processing ${embedProgressMatch[1]} files for embeddings`,
444+
percent: 75,
445+
};
446+
}
447+
448+
// Parse finalizing ANN index
449+
if (line.includes('Finalizing index') || line.includes('Building ANN')) {
450+
return { stage: 'finalizing', message: 'Finalizing vector index...', percent: 90 };
451+
}
452+
428453
return null;
429454
}
430455

codex-lens/src/codexlens/cli/embedding_manager.py

Lines changed: 93 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -222,100 +222,107 @@ def generate_embeddings(
222222

223223
try:
224224
with VectorStore(index_path) as vector_store:
225-
with sqlite3.connect(index_path) as conn:
226-
conn.row_factory = sqlite3.Row
227-
path_column = _get_path_column(conn)
228-
229-
# Get total file count for progress reporting
230-
total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
231-
if total_files == 0:
232-
return {"success": False, "error": "No files found in index"}
233-
234-
if progress_callback:
235-
progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
225+
# Use bulk insert mode for efficient batch ANN index building
226+
# This defers ANN updates until end_bulk_insert() is called
227+
with vector_store.bulk_insert():
228+
with sqlite3.connect(index_path) as conn:
229+
conn.row_factory = sqlite3.Row
230+
path_column = _get_path_column(conn)
231+
232+
# Get total file count for progress reporting
233+
total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
234+
if total_files == 0:
235+
return {"success": False, "error": "No files found in index"}
236236

237-
cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
238-
batch_number = 0
239-
240-
while True:
241-
# Fetch a batch of files (streaming, not fetchall)
242-
file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
243-
if not file_batch:
244-
break
245-
246-
batch_number += 1
247-
batch_chunks_with_paths = []
248-
files_in_batch_with_chunks = set()
249-
250-
# Periodic embedder recreation to prevent memory accumulation
251-
if batch_number % EMBEDDER_RECREATION_INTERVAL == 0:
237+
if progress_callback:
238+
progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
239+
240+
cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
241+
batch_number = 0
242+
243+
while True:
244+
# Fetch a batch of files (streaming, not fetchall)
245+
file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
246+
if not file_batch:
247+
break
248+
249+
batch_number += 1
250+
batch_chunks_with_paths = []
251+
files_in_batch_with_chunks = set()
252+
253+
# Periodic embedder recreation to prevent memory accumulation
254+
if batch_number % EMBEDDER_RECREATION_INTERVAL == 0:
255+
if progress_callback:
256+
progress_callback(f" [Memory optimization] Recreating embedder at batch {batch_number}")
257+
clear_embedder_cache()
258+
embedder = get_embedder(profile=model_profile)
259+
gc.collect()
260+
261+
# Step 1: Chunking for the current file batch
262+
for file_row in file_batch:
263+
file_path = file_row[path_column]
264+
content = file_row["content"]
265+
language = file_row["language"] or "python"
266+
267+
try:
268+
chunks = chunker.chunk_sliding_window(
269+
content,
270+
file_path=file_path,
271+
language=language
272+
)
273+
if chunks:
274+
for chunk in chunks:
275+
batch_chunks_with_paths.append((chunk, file_path))
276+
files_in_batch_with_chunks.add(file_path)
277+
except Exception as e:
278+
logger.error(f"Failed to chunk {file_path}: {e}")
279+
failed_files.append((file_path, str(e)))
280+
281+
if not batch_chunks_with_paths:
282+
continue
283+
284+
batch_chunk_count = len(batch_chunks_with_paths)
252285
if progress_callback:
253-
progress_callback(f" [Memory optimization] Recreating embedder at batch {batch_number}")
254-
clear_embedder_cache()
255-
embedder = get_embedder(profile=model_profile)
256-
gc.collect()
286+
progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
287+
288+
# Step 2: Generate embeddings for this batch (use memory-efficient numpy method)
289+
batch_embeddings = []
290+
try:
291+
for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
292+
batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
293+
batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
294+
# Use embed_to_numpy() to avoid unnecessary list conversion
295+
embeddings_numpy = embedder.embed_to_numpy(batch_contents)
296+
# Convert to list only for storage (VectorStore expects list format)
297+
embeddings = [emb.tolist() for emb in embeddings_numpy]
298+
batch_embeddings.extend(embeddings)
299+
# Explicit cleanup of intermediate data
300+
del batch_contents, embeddings_numpy
301+
except Exception as e:
302+
logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
303+
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
304+
continue
257305

258-
# Step 1: Chunking for the current file batch
259-
for file_row in file_batch:
260-
file_path = file_row[path_column]
261-
content = file_row["content"]
262-
language = file_row["language"] or "python"
306+
# Step 3: Assign embeddings to chunks
307+
for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
308+
chunk.embedding = embedding
263309

310+
# Step 4: Store this batch to database (ANN update deferred in bulk_insert mode)
264311
try:
265-
chunks = chunker.chunk_sliding_window(
266-
content,
267-
file_path=file_path,
268-
language=language
269-
)
270-
if chunks:
271-
for chunk in chunks:
272-
batch_chunks_with_paths.append((chunk, file_path))
273-
files_in_batch_with_chunks.add(file_path)
312+
vector_store.add_chunks_batch(batch_chunks_with_paths)
313+
total_chunks_created += batch_chunk_count
314+
total_files_processed += len(files_in_batch_with_chunks)
274315
except Exception as e:
275-
logger.error(f"Failed to chunk {file_path}: {e}")
276-
failed_files.append((file_path, str(e)))
316+
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
317+
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
277318

278-
if not batch_chunks_with_paths:
279-
continue
319+
# Explicit memory cleanup after each batch
320+
del batch_chunks_with_paths, batch_embeddings
321+
gc.collect()
280322

281-
batch_chunk_count = len(batch_chunks_with_paths)
282-
if progress_callback:
283-
progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
284-
285-
# Step 2: Generate embeddings for this batch (use memory-efficient numpy method)
286-
batch_embeddings = []
287-
try:
288-
for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
289-
batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
290-
batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
291-
# Use embed_to_numpy() to avoid unnecessary list conversion
292-
embeddings_numpy = embedder.embed_to_numpy(batch_contents)
293-
# Convert to list only for storage (VectorStore expects list format)
294-
embeddings = [emb.tolist() for emb in embeddings_numpy]
295-
batch_embeddings.extend(embeddings)
296-
# Explicit cleanup of intermediate data
297-
del batch_contents, embeddings_numpy
298-
except Exception as e:
299-
logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
300-
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
301-
continue
302-
303-
# Step 3: Assign embeddings to chunks
304-
for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
305-
chunk.embedding = embedding
306-
307-
# Step 4: Store this batch to database immediately (releases memory)
308-
try:
309-
vector_store.add_chunks_batch(batch_chunks_with_paths)
310-
total_chunks_created += batch_chunk_count
311-
total_files_processed += len(files_in_batch_with_chunks)
312-
except Exception as e:
313-
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
314-
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
315-
316-
# Explicit memory cleanup after each batch
317-
del batch_chunks_with_paths, batch_embeddings
318-
gc.collect()
323+
# Notify before ANN index finalization (happens when bulk_insert context exits)
324+
if progress_callback:
325+
progress_callback(f"Finalizing index... Building ANN index for {total_chunks_created} chunks")
319326

320327
except Exception as e:
321328
return {"success": False, "error": f"Failed to read or process files: {str(e)}"}

0 commit comments

Comments (0)