@@ -352,17 +352,55 @@ def run_embedding_step(
     args: argparse.Namespace, paths: Dict[str, Path], logger
 ) -> bool:
     """Run the embedding generation step."""
-    # Start with the base chunks directory (e.g., cache/chunks/4.18)
-    chunks_dir = paths["chunks"]
+    base_chunks_dir = paths["chunks"]
     output_dir = Path(args.output_dir)
-
-    # If a specific document was processed, look for chunks inside its dedicated subdirectory.
-    if args.specific_doc:
-        chunks_dir = chunks_dir / args.specific_doc
+    nodes = []
 
     try:
-        logger.info("Loading chunks...")
-        nodes = load_chunks_as_nodes(chunks_dir, logger)
+        if args.specific_doc:
+            logger.info("Loading chunks for specific document and runbooks.")
+            # 1. Load chunks from the specific document's directory.
+            doc_chunks_dir = base_chunks_dir / args.specific_doc
+            if doc_chunks_dir.exists():
+                logger.info("Loading from specific doc directory: %s", doc_chunks_dir)
+                nodes.extend(load_chunks_as_nodes(doc_chunks_dir, logger))
+            else:
+                logger.warning(
+                    "Chunk directory for specific doc not found: %s", doc_chunks_dir
+                )
+
+            # 2. Load runbook chunks (which are in the base directory).
+            if not args.skip_runbooks:
+                # Find JSON files directly in base_chunks_dir, not subdirectories.
+                runbook_files = [
+                    f for f in base_chunks_dir.glob("*.json") if f.is_file()
+                ]
+                runbook_files = [
+                    f for f in runbook_files if not f.name.endswith("_summary.json")
+                ]
+
+                logger.info(
+                    "Found %s potential runbook chunk files to load from %s",
+                    len(runbook_files),
+                    base_chunks_dir,
+                )
+
+                for chunk_file in runbook_files:
+                    try:
+                        with open(chunk_file, "r", encoding="utf-8") as f:
+                            chunk_data = json.load(f)
+                        node = TextNode(
+                            text=chunk_data["content"],
+                            metadata=chunk_data.get("metadata", {}),
+                            id_=chunk_data.get("id", str(chunk_file.stem)),
+                        )
+                        nodes.append(node)
+                    except Exception as e:
+                        logger.warning("Failed to load chunk %s: %s", chunk_file, e)
+        else:
+            # No specific doc, so load everything from the base directory recursively.
+            logger.info("Loading all chunks recursively from %s", base_chunks_dir)
+            nodes = load_chunks_as_nodes(base_chunks_dir, logger)
 
         if not nodes:
             logger.error("No chunks found to embed")
@@ -374,7 +412,7 @@ def run_embedding_step(
         vector_store = FaissVectorStore(faiss_index=faiss_index)
         storage_context = StorageContext.from_defaults(vector_store=vector_store)
 
-        logger.info("Generating embeddings...")
+        logger.info("Generating embeddings for %s nodes...", len(nodes))
         index = VectorStoreIndex(nodes, storage_context=storage_context)
         index.set_index_id(args.index)
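For context, the runbook branch of this diff assumes each chunk file sitting directly in the base chunks directory is a flat JSON object with a required "content" field plus optional "metadata" and "id" fields. Below is a minimal sketch of that assumed schema and the per-file loading logic pulled out into a standalone helper; the llama-index import path and the helper name load_runbook_chunk are assumptions for illustration, and the repo's real load_chunks_as_nodes helper (defined elsewhere) may differ.

# Sketch only: the chunk-file schema implied by the loading loop in the diff.
# Example chunk file (hypothetical path and contents):
#   {"id": "runbook-001", "content": "...", "metadata": {"source": "runbook"}}
import json
from pathlib import Path

from llama_index.core.schema import TextNode  # import path is an assumption


def load_runbook_chunk(chunk_file: Path) -> TextNode:
    """Turn one flat JSON chunk file into a TextNode (hypothetical helper)."""
    with open(chunk_file, "r", encoding="utf-8") as f:
        chunk_data = json.load(f)
    return TextNode(
        text=chunk_data["content"],                 # required field
        metadata=chunk_data.get("metadata", {}),    # optional
        id_=chunk_data.get("id", chunk_file.stem),  # fall back to filename stem
    )

Keeping the per-file try/except in the diff (rather than failing the whole step) means one malformed runbook JSON only costs a warning, which matches the step's existing "log and continue" behavior.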