Skip to content

Commit fa64e11

Browse files
author
catlog22
committed
refactor: 优化嵌入生成过程,调整批处理大小和内存管理策略
1 parent 210f0f1 commit fa64e11

File tree

2 files changed

+15
-21
lines changed

2 files changed

+15
-21
lines changed

ccw/src/tools/codex-lens.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,9 +410,8 @@ function parseProgressLine(line: string): ProgressInfo | null {
410410
if (line.includes('Generating embeddings') || line.includes('Creating embeddings')) {
411411
return { stage: 'embeddings', message: 'Generating embeddings...', percent: 70 };
412412
}
413-
if (line.includes('Finalizing') || line.includes('Complete')) {
414-
return { stage: 'complete', message: 'Finalizing...', percent: 95 };
415-
}
413+
// Note: "Finalizing index" and "Building ANN" are handled separately below
414+
// Only match generic "Complete" here (not "Finalizing" which has specific handlers)
416415

417416
// Parse indexed count: "Indexed X files" - FTS complete, but embeddings may follow
418417
const indexedMatch = line.match(/Indexed (\d+) files/i);
@@ -460,6 +459,11 @@ function parseProgressLine(line: string): ProgressInfo | null {
460459
};
461460
}
462461

462+
// Parse generic completion (but not "Embeddings complete" which is handled above)
463+
if (line.includes('Complete') && !line.toLowerCase().includes('embeddings complete')) {
464+
return { stage: 'complete', message: 'Complete', percent: 98 };
465+
}
466+
463467
return null;
464468
}
465469

codex-lens/src/codexlens/cli/embedding_manager.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@
1818

1919
logger = logging.getLogger(__name__)
2020

21-
# Periodic embedder recreation interval to prevent memory accumulation
22-
EMBEDDER_RECREATION_INTERVAL = 10 # Recreate embedder every N batches
21+
# Embedding batch size - larger values improve throughput on modern hardware
22+
# Default 64 balances memory usage and GPU/CPU utilization
23+
EMBEDDING_BATCH_SIZE = 64 # Increased from 8 for better performance
2324

2425

2526
def _get_path_column(conn: sqlite3.Connection) -> str:
@@ -196,29 +197,27 @@ def generate_embeddings(
196197

197198
# Initialize components
198199
try:
199-
# Initialize embedder (will be periodically recreated to prevent memory leaks)
200+
# Initialize embedder (singleton, reused throughout the function)
200201
embedder = get_embedder(profile=model_profile)
201202
chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
202203

203204
if progress_callback:
204205
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
205-
progress_callback(f"Memory optimization: Embedder will be recreated every {EMBEDDER_RECREATION_INTERVAL} batches")
206206

207207
except Exception as e:
208208
return {
209209
"success": False,
210210
"error": f"Failed to initialize components: {str(e)}",
211211
}
212212

213-
# --- MEMORY-OPTIMIZED STREAMING PROCESSING ---
214-
# Process files in small batches to control memory usage
215-
# This keeps peak memory under 2GB regardless of project size
213+
# --- STREAMING PROCESSING ---
214+
# Process files in batches to control memory usage
216215
start_time = time.time()
217216
failed_files = []
218217
total_chunks_created = 0
219218
total_files_processed = 0
220219
FILE_BATCH_SIZE = 100 # Process 100 files at a time
221-
EMBEDDING_BATCH_SIZE = 8 # jina-embeddings-v2-base-code needs small batches
220+
# EMBEDDING_BATCH_SIZE is defined at module level (default: 64)
222221

223222
try:
224223
with VectorStore(index_path) as vector_store:
@@ -251,14 +250,6 @@ def generate_embeddings(
251250
batch_chunks_with_paths = []
252251
files_in_batch_with_chunks = set()
253252

254-
# Periodic embedder recreation to prevent memory accumulation
255-
if batch_number % EMBEDDER_RECREATION_INTERVAL == 0:
256-
if progress_callback:
257-
progress_callback(f" [Memory optimization] Recreating embedder at batch {batch_number}")
258-
clear_embedder_cache()
259-
embedder = get_embedder(profile=model_profile)
260-
gc.collect()
261-
262253
# Step 1: Chunking for the current file batch
263254
for file_row in file_batch:
264255
file_path = file_row[path_column]
@@ -317,9 +308,8 @@ def generate_embeddings(
317308
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
318309
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
319310

320-
# Explicit memory cleanup after each batch
311+
# Release batch references (let Python GC handle cleanup naturally)
321312
del batch_chunks_with_paths, batch_embeddings
322-
gc.collect()
323313

324314
# Notify before ANN index finalization (happens when bulk_insert context exits)
325315
if progress_callback:

0 commit comments

Comments (0)