@@ -222,100 +222,107 @@ def generate_embeddings(
 
     try:
         with VectorStore(index_path) as vector_store:
-            with sqlite3.connect(index_path) as conn:
-                conn.row_factory = sqlite3.Row
-                path_column = _get_path_column(conn)
-
-                # Get total file count for progress reporting
-                total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
-                if total_files == 0:
-                    return {"success": False, "error": "No files found in index"}
-
-                if progress_callback:
-                    progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
+            # Use bulk insert mode for efficient batch ANN index building
+            # This defers ANN updates until end_bulk_insert() is called
+            with vector_store.bulk_insert():
+                with sqlite3.connect(index_path) as conn:
+                    conn.row_factory = sqlite3.Row
+                    path_column = _get_path_column(conn)
+
+                    # Get total file count for progress reporting
+                    total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
+                    if total_files == 0:
+                        return {"success": False, "error": "No files found in index"}
 
-                cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
-                batch_number = 0
-
-                while True:
-                    # Fetch a batch of files (streaming, not fetchall)
-                    file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
-                    if not file_batch:
-                        break
-
-                    batch_number += 1
-                    batch_chunks_with_paths = []
-                    files_in_batch_with_chunks = set()
-
-                    # Periodic embedder recreation to prevent memory accumulation
-                    if batch_number % EMBEDDER_RECREATION_INTERVAL == 0:
+                    if progress_callback:
+                        progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
+
+                    cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
+                    batch_number = 0
+
+                    while True:
+                        # Fetch a batch of files (streaming, not fetchall)
+                        file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
+                        if not file_batch:
+                            break
+
+                        batch_number += 1
+                        batch_chunks_with_paths = []
+                        files_in_batch_with_chunks = set()
+
+                        # Periodic embedder recreation to prevent memory accumulation
+                        if batch_number % EMBEDDER_RECREATION_INTERVAL == 0:
+                            if progress_callback:
+                                progress_callback(f"  [Memory optimization] Recreating embedder at batch {batch_number}")
+                            clear_embedder_cache()
+                            embedder = get_embedder(profile=model_profile)
+                            gc.collect()
+
+                        # Step 1: Chunking for the current file batch
+                        for file_row in file_batch:
+                            file_path = file_row[path_column]
+                            content = file_row["content"]
+                            language = file_row["language"] or "python"
+
+                            try:
+                                chunks = chunker.chunk_sliding_window(
+                                    content,
+                                    file_path=file_path,
+                                    language=language
+                                )
+                                if chunks:
+                                    for chunk in chunks:
+                                        batch_chunks_with_paths.append((chunk, file_path))
+                                    files_in_batch_with_chunks.add(file_path)
+                            except Exception as e:
+                                logger.error(f"Failed to chunk {file_path}: {e}")
+                                failed_files.append((file_path, str(e)))
+
+                        if not batch_chunks_with_paths:
+                            continue
+
+                        batch_chunk_count = len(batch_chunks_with_paths)
                         if progress_callback:
-                            progress_callback(f"  [Memory optimization] Recreating embedder at batch {batch_number}")
-                        clear_embedder_cache()
-                        embedder = get_embedder(profile=model_profile)
-                        gc.collect()
+                            progress_callback(f"  Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
+
+                        # Step 2: Generate embeddings for this batch (use memory-efficient numpy method)
+                        batch_embeddings = []
+                        try:
+                            for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
+                                batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
+                                batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
+                                # Use embed_to_numpy() to avoid unnecessary list conversion
+                                embeddings_numpy = embedder.embed_to_numpy(batch_contents)
+                                # Convert to list only for storage (VectorStore expects list format)
+                                embeddings = [emb.tolist() for emb in embeddings_numpy]
+                                batch_embeddings.extend(embeddings)
+                                # Explicit cleanup of intermediate data
+                                del batch_contents, embeddings_numpy
+                        except Exception as e:
+                            logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
+                            failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
+                            continue
 
-                    # Step 1: Chunking for the current file batch
-                    for file_row in file_batch:
-                        file_path = file_row[path_column]
-                        content = file_row["content"]
-                        language = file_row["language"] or "python"
+                        # Step 3: Assign embeddings to chunks
+                        for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
+                            chunk.embedding = embedding
 
+                        # Step 4: Store this batch to database (ANN update deferred in bulk_insert mode)
                         try:
-                            chunks = chunker.chunk_sliding_window(
-                                content,
-                                file_path=file_path,
-                                language=language
-                            )
-                            if chunks:
-                                for chunk in chunks:
-                                    batch_chunks_with_paths.append((chunk, file_path))
-                                files_in_batch_with_chunks.add(file_path)
+                            vector_store.add_chunks_batch(batch_chunks_with_paths)
+                            total_chunks_created += batch_chunk_count
+                            total_files_processed += len(files_in_batch_with_chunks)
                         except Exception as e:
-                            logger.error(f"Failed to chunk {file_path}: {e}")
-                            failed_files.append((file_path, str(e)))
+                            logger.error(f"Failed to store batch {batch_number}: {str(e)}")
+                            failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
 
-                    if not batch_chunks_with_paths:
-                        continue
+                        # Explicit memory cleanup after each batch
+                        del batch_chunks_with_paths, batch_embeddings
+                        gc.collect()
 
-                    batch_chunk_count = len(batch_chunks_with_paths)
-                    if progress_callback:
-                        progress_callback(f"  Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
-
-                    # Step 2: Generate embeddings for this batch (use memory-efficient numpy method)
-                    batch_embeddings = []
-                    try:
-                        for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
-                            batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
-                            batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
-                            # Use embed_to_numpy() to avoid unnecessary list conversion
-                            embeddings_numpy = embedder.embed_to_numpy(batch_contents)
-                            # Convert to list only for storage (VectorStore expects list format)
-                            embeddings = [emb.tolist() for emb in embeddings_numpy]
-                            batch_embeddings.extend(embeddings)
-                            # Explicit cleanup of intermediate data
-                            del batch_contents, embeddings_numpy
-                    except Exception as e:
-                        logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
-                        failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
-                        continue
-
-                    # Step 3: Assign embeddings to chunks
-                    for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
-                        chunk.embedding = embedding
-
-                    # Step 4: Store this batch to database immediately (releases memory)
-                    try:
-                        vector_store.add_chunks_batch(batch_chunks_with_paths)
-                        total_chunks_created += batch_chunk_count
-                        total_files_processed += len(files_in_batch_with_chunks)
-                    except Exception as e:
-                        logger.error(f"Failed to store batch {batch_number}: {str(e)}")
-                        failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
-
-                    # Explicit memory cleanup after each batch
-                    del batch_chunks_with_paths, batch_embeddings
-                    gc.collect()
+                # Notify before ANN index finalization (happens when bulk_insert context exits)
+                if progress_callback:
+                    progress_callback(f"Finalizing index... Building ANN index for {total_chunks_created} chunks")
 
     except Exception as e:
         return {"success": False, "error": f"Failed to read or process files: {str(e)}"}
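
The heart of this change is the `vector_store.bulk_insert()` wrapper: while it is active, each `add_chunks_batch()` call only writes rows, and the ANN index is rebuilt once when the context exits. A minimal sketch of how such a context manager could look; `begin_bulk_insert()` and the `_bulk` flag are assumptions, only `end_bulk_insert()` is named in the diff's comments:

```python
from contextlib import contextmanager

class VectorStore:
    """Sketch of bulk-insert mode, not the project's actual class."""

    def __init__(self) -> None:
        self._bulk = False  # assumed flag consulted by add_chunks_batch()

    def begin_bulk_insert(self) -> None:
        # Assumed counterpart to end_bulk_insert(): switch to raw row
        # inserts and skip the per-batch ANN update.
        self._bulk = True

    def end_bulk_insert(self) -> None:
        # Named in the diff's comment: rebuild the ANN index in one
        # pass over everything inserted while bulk mode was active.
        self._bulk = False

    @contextmanager
    def bulk_insert(self):
        self.begin_bulk_insert()
        try:
            yield self
        finally:
            self.end_bulk_insert()
```

Running the finalization in a `finally` block keeps the store consistent even if an exception escapes mid-batch, which matters here because the surrounding `try` returns an error dict instead of re-raising.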
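The periodic `clear_embedder_cache()` / `get_embedder()` cycle only frees memory if the cache holds the last strong reference to the model, so that dropping it makes the old embedder collectable by the subsequent `gc.collect()`. A minimal sketch of the cache pattern this implies; the `Embedder` wrapper class and cache layout are assumptions, not the project's actual code:

```python
class Embedder:
    """Placeholder for the real embedding-model wrapper (assumed)."""
    def __init__(self, profile: str) -> None:
        self.profile = profile  # e.g. which model checkpoint to load

_EMBEDDER_CACHE: dict[str, Embedder] = {}

def get_embedder(profile: str = "default") -> Embedder:
    # Return a cached embedder, loading the model only on first use.
    if profile not in _EMBEDDER_CACHE:
        _EMBEDDER_CACHE[profile] = Embedder(profile)
    return _EMBEDDER_CACHE[profile]

def clear_embedder_cache() -> None:
    # Drop every cached embedder so its weights lose their last
    # reference; the caller then runs gc.collect() to reclaim them.
    _EMBEDDER_CACHE.clear()
```

Recreating the embedder every `EMBEDDER_RECREATION_INTERVAL` batches presumably works around gradual memory growth inside the model runtime rather than fixing a Python-level leak.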