|
18 | 18 |
|
19 | 19 | logger = logging.getLogger(__name__) |
20 | 20 |
|
21 | | -# Periodic embedder recreation interval to prevent memory accumulation |
22 | | -EMBEDDER_RECREATION_INTERVAL = 10 # Recreate embedder every N batches |
| 21 | +# Embedding batch size - larger values improve throughput on modern hardware |
| 22 | +# Default 64 balances memory usage and GPU/CPU utilization |
| 23 | +EMBEDDING_BATCH_SIZE = 64 # Increased from 8 for better performance |
23 | 24 |
|
24 | 25 |
|
25 | 26 | def _get_path_column(conn: sqlite3.Connection) -> str: |
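
This hunk hoists the embedding batch size to a module-level constant and raises it from 8 to 64. As a rough sketch of how such a constant typically bounds the encode step (the `embed_texts` helper and `embedder.encode` method are placeholders for illustration, not this repo's actual API):

```python
# Sketch only: how a module-level batch size usually bounds the per-call load.
# `embedder.encode` is a placeholder; the real embedder API may differ.
EMBEDDING_BATCH_SIZE = 64

def embed_texts(embedder, texts):
    """Embed texts in fixed-size sub-batches; larger batches mean fewer calls."""
    vectors = []
    for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
        vectors.extend(embedder.encode(texts[start:start + EMBEDDING_BATCH_SIZE]))
    return vectors
```

With a batch size of 64 instead of 8, a 10,000-chunk project drops from roughly 1,250 encode calls to about 157, at the cost of holding 64 texts' worth of model input in memory at once.
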
@@ -196,29 +197,27 @@ def generate_embeddings( |
196 | 197 |
|
197 | 198 | # Initialize components |
198 | 199 | try: |
199 | | - # Initialize embedder (will be periodically recreated to prevent memory leaks) |
| 200 | + # Initialize embedder (singleton, reused throughout the function) |
200 | 201 | embedder = get_embedder(profile=model_profile) |
201 | 202 | chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size)) |
202 | 203 |
|
203 | 204 | if progress_callback: |
204 | 205 | progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)") |
205 | | - progress_callback(f"Memory optimization: Embedder will be recreated every {EMBEDDER_RECREATION_INTERVAL} batches") |
206 | 206 |
|
207 | 207 | except Exception as e: |
208 | 208 | return { |
209 | 209 | "success": False, |
210 | 210 | "error": f"Failed to initialize components: {str(e)}", |
211 | 211 | } |
212 | 212 |
|
213 | | - # --- MEMORY-OPTIMIZED STREAMING PROCESSING --- |
214 | | - # Process files in small batches to control memory usage |
215 | | - # This keeps peak memory under 2GB regardless of project size |
| 213 | + # --- STREAMING PROCESSING --- |
| 214 | + # Process files in batches to control memory usage |
216 | 215 | start_time = time.time() |
217 | 216 | failed_files = [] |
218 | 217 | total_chunks_created = 0 |
219 | 218 | total_files_processed = 0 |
220 | 219 | FILE_BATCH_SIZE = 100 # Process 100 files at a time |
221 | | - EMBEDDING_BATCH_SIZE = 8 # jina-embeddings-v2-base-code needs small batches |
| 220 | + # EMBEDDING_BATCH_SIZE is defined at module level (default: 64) |
222 | 221 |
|
223 | 222 | try: |
224 | 223 | with VectorStore(index_path) as vector_store: |
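
The embedder is now created once via `get_embedder` and reused for the whole run. A minimal sketch of the profile-keyed cache such a singleton typically implies (the stub class and internals below are assumptions; the repo's `get_embedder` / `clear_embedder_cache` may be implemented differently):

```python
# Sketch only: a plausible shape for a profile-keyed embedder cache.
from typing import Dict

class _StubEmbedder:
    """Placeholder standing in for the real embedder class (assumption)."""
    def __init__(self, profile: str) -> None:
        self.profile = profile

_EMBEDDER_CACHE: Dict[str, _StubEmbedder] = {}

def get_embedder(profile: str = "default") -> _StubEmbedder:
    """Return the cached embedder for `profile`, creating it on first use."""
    if profile not in _EMBEDDER_CACHE:
        _EMBEDDER_CACHE[profile] = _StubEmbedder(profile)
    return _EMBEDDER_CACHE[profile]

def clear_embedder_cache() -> None:
    """Drop cached embedders so they become eligible for garbage collection."""
    _EMBEDDER_CACHE.clear()
```
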
@@ -251,14 +250,6 @@ def generate_embeddings(
251 | 250 | batch_chunks_with_paths = [] |
252 | 251 | files_in_batch_with_chunks = set() |
253 | 252 |
|
254 | | - # Periodic embedder recreation to prevent memory accumulation |
255 | | - if batch_number % EMBEDDER_RECREATION_INTERVAL == 0: |
256 | | - if progress_callback: |
257 | | - progress_callback(f" [Memory optimization] Recreating embedder at batch {batch_number}") |
258 | | - clear_embedder_cache() |
259 | | - embedder = get_embedder(profile=model_profile) |
260 | | - gc.collect() |
261 | | - |
262 | 253 | # Step 1: Chunking for the current file batch |
263 | 254 | for file_row in file_batch: |
264 | 255 | file_path = file_row[path_column] |
@@ -317,9 +308,8 @@ def generate_embeddings( |
317 | 308 | logger.error(f"Failed to store batch {batch_number}: {str(e)}") |
318 | 309 | failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch]) |
319 | 310 |
|
320 | | - # Explicit memory cleanup after each batch |
| 311 | + # Release batch references (let Python GC handle cleanup naturally) |
321 | 312 | del batch_chunks_with_paths, batch_embeddings |
322 | | - gc.collect() |
323 | 313 |
|
324 | 314 | # Notify before ANN index finalization (happens when bulk_insert context exits) |
325 | 315 | if progress_callback: |
|
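
Taken together, the per-batch flow after this change looks roughly like the sketch below; `chunk_file`, `embedder.encode`, and `store.add` are placeholders, not the repo's actual helpers, and VectorStore's real bulk-insert API may differ.

```python
# Simplified sketch of the streaming flow after this change; every helper
# name here is a placeholder for the repo's actual functions.
FILE_BATCH_SIZE = 100
EMBEDDING_BATCH_SIZE = 64

def process_files(files, embedder, store, chunk_file):
    for i in range(0, len(files), FILE_BATCH_SIZE):
        file_batch = files[i:i + FILE_BATCH_SIZE]

        # Step 1: chunk every file in the current batch.
        chunks = [chunk for path in file_batch for chunk in chunk_file(path)]

        # Step 2: embed in fixed-size sub-batches; the same embedder instance
        # is reused for the whole run instead of being periodically recreated.
        embeddings = []
        for start in range(0, len(chunks), EMBEDDING_BATCH_SIZE):
            embeddings.extend(embedder.encode(chunks[start:start + EMBEDDING_BATCH_SIZE]))

        # Step 3: persist, then drop the batch references and let normal
        # garbage collection reclaim memory (no forced gc.collect()).
        store.add(chunks, embeddings)
        del chunks, embeddings
```
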