Commit cac6582
Fix runbook processing
1 parent 3b85dfd commit cac6582

File tree: 1 file changed (+47, -9)

1 file changed

+47
-9
lines changed

scripts/html_embeddings/generate_embeddings.py

Lines changed: 47 additions & 9 deletions
```diff
@@ -352,17 +352,55 @@ def run_embedding_step(
     args: argparse.Namespace, paths: Dict[str, Path], logger
 ) -> bool:
     """Run the embedding generation step."""
-    # Start with the base chunks directory (e.g., cache/chunks/4.18)
-    chunks_dir = paths["chunks"]
+    base_chunks_dir = paths["chunks"]
     output_dir = Path(args.output_dir)
-
-    # If a specific document was processed, look for chunks inside its dedicated subdirectory.
-    if args.specific_doc:
-        chunks_dir = chunks_dir / args.specific_doc
+    nodes = []
 
     try:
-        logger.info("Loading chunks...")
-        nodes = load_chunks_as_nodes(chunks_dir, logger)
+        if args.specific_doc:
+            logger.info("Loading chunks for specific document and runbooks.")
+            # 1. Load chunks from the specific document's directory.
+            doc_chunks_dir = base_chunks_dir / args.specific_doc
+            if doc_chunks_dir.exists():
+                logger.info("Loading from specific doc directory: %s", doc_chunks_dir)
+                nodes.extend(load_chunks_as_nodes(doc_chunks_dir, logger))
+            else:
+                logger.warning(
+                    "Chunk directory for specific doc not found: %s", doc_chunks_dir
+                )
+
+            # 2. Load runbook chunks (which are in the base directory).
+            if not args.skip_runbooks:
+                # Find JSON files directly in base_chunks_dir, not subdirectories.
+                runbook_files = [
+                    f for f in base_chunks_dir.glob("*.json") if f.is_file()
+                ]
+                runbook_files = [
+                    f for f in runbook_files if not f.name.endswith("_summary.json")
+                ]
+
+                logger.info(
+                    "Found %s potential runbook chunk files to load from %s",
+                    len(runbook_files),
+                    base_chunks_dir,
+                )
+
+                for chunk_file in runbook_files:
+                    try:
+                        with open(chunk_file, "r", encoding="utf-8") as f:
+                            chunk_data = json.load(f)
+                        node = TextNode(
+                            text=chunk_data["content"],
+                            metadata=chunk_data.get("metadata", {}),
+                            id_=chunk_data.get("id", str(chunk_file.stem)),
+                        )
+                        nodes.append(node)
+                    except Exception as e:
+                        logger.warning("Failed to load chunk %s: %s", chunk_file, e)
+        else:
+            # No specific doc, so load everything from the base directory recursively.
+            logger.info("Loading all chunks recursively from %s", base_chunks_dir)
+            nodes = load_chunks_as_nodes(base_chunks_dir, logger)
 
         if not nodes:
             logger.error("No chunks found to embed")
@@ -374,7 +412,7 @@ def run_embedding_step(
         vector_store = FaissVectorStore(faiss_index=faiss_index)
         storage_context = StorageContext.from_defaults(vector_store=vector_store)
 
-        logger.info("Generating embeddings...")
+        logger.info("Generating embeddings for %s nodes...", len(nodes))
         index = VectorStoreIndex(nodes, storage_context=storage_context)
         index.set_index_id(args.index)
```
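In short: before this commit, running the embedding step with a specific document selected (`args.specific_doc`) pointed chunk loading at that document's subdirectory only, so runbook chunks sitting at the top level of the chunks cache were silently skipped. The step now collects nodes from both locations unless `args.skip_runbooks` is set. The new loop assumes each runbook chunk file is a small JSON document; the sketch below illustrates that assumed shape. The schema (keys `content`, `metadata`, `id`) is inferred from the diff, and the file path and `TextNode` import path are assumptions, not part of this repo:

```python
# Minimal sketch of the chunk-file shape the new runbook loop expects.
# Keys ("content", "metadata", "id") inferred from the diff; paths hypothetical.
import json
from pathlib import Path

from llama_index.core.schema import TextNode  # older releases: llama_index.schema

chunk_file = Path("cache/chunks/4.18/runbook_example_0001.json")  # hypothetical
chunk_file.parent.mkdir(parents=True, exist_ok=True)
chunk_file.write_text(
    json.dumps(
        {
            "id": "runbook_example_0001",
            "content": "If the alert fires, check ...",
            "metadata": {"source": "runbooks/example.md", "doc_type": "runbook"},
        }
    ),
    encoding="utf-8",
)

# Same recovery behavior as the commit: fall back to the file stem if "id" is absent.
chunk_data = json.loads(chunk_file.read_text(encoding="utf-8"))
node = TextNode(
    text=chunk_data["content"],
    metadata=chunk_data.get("metadata", {}),
    id_=chunk_data.get("id", chunk_file.stem),
)
print(node.id_)  # -> runbook_example_0001
```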

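One detail worth noting is the selection rule: only JSON files directly in the base chunks directory are treated as runbook chunks, and `*_summary.json` files are excluded. A standalone sketch of just that filter, using a throwaway directory layout (all names hypothetical):

```python
import tempfile
from pathlib import Path

# Throwaway layout mirroring the assumed chunk cache (names are hypothetical).
base = Path(tempfile.mkdtemp(), "chunks", "4.18")
(base / "monitoring").mkdir(parents=True)
(base / "runbook_a.json").touch()                   # top-level runbook chunk: kept
(base / "runbook_a_summary.json").touch()           # summary file: filtered out
(base / "monitoring" / "doc_chunk_0.json").touch()  # per-doc subdirectory: ignored

# The same two-step filter as the commit.
runbook_files = [f for f in base.glob("*.json") if f.is_file()]
runbook_files = [f for f in runbook_files if not f.name.endswith("_summary.json")]

assert [f.name for f in runbook_files] == ["runbook_a.json"]
```

Because `glob("*.json")` does not recurse, chunks inside per-document subdirectories cannot be double-loaded by this branch; they are only picked up via `load_chunks_as_nodes` on the specific document's directory.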