|
2 | 2 | # See: https://github.com/pytorch/pytorch/issues/37377 and https://openmp.llvm.org |
3 | 3 | import sys |
4 | 4 | import pathlib |
| 5 | +import uuid |
5 | 6 |
|
6 | 7 | # Ensure the root directory (where pyproject.toml lives) is on sys.path |
7 | 8 | ROOT = pathlib.Path(__file__).resolve().parents[1] |
|
15 | 16 | import logging # Added for ask command |
16 | 17 |
|
17 | 18 | import typer # type: ignore |
18 | | -import json # Added import |
19 | | -import csv # Added import |
| 19 | +import json, csv |
20 | 20 | from collections import defaultdict |
21 | 21 | from pathlib import Path |
22 | 22 |
|
|
30 | 30 | from scripts.retrieval.retrieval_manager import RetrievalManager |
31 | 31 | from scripts.prompting.prompt_builder import PromptBuilder # Added for ask command |
32 | 32 | from scripts.api_clients.openai.completer import OpenAICompleter # Added for ask command |
33 | | - |
| 33 | +from scripts.agents.image_insight_agent import ImageInsightAgent # Added for index_images command |
34 | 34 | app = typer.Typer() |
35 | 35 |
|
36 | 36 | # Setup basic logging for the CLI |
@@ -157,82 +157,61 @@ def ingest( |
def _run_image_pipeline(project_dir: Path) -> list[str]:
    """
    Run the `enrich-images` and `index-images` commands for each doc type.

    Every step is launched as a child process of this same script with the
    interpreter that is currently running. Arguments are passed as a list
    (no shell), so a project path containing spaces or shell metacharacters
    is forwarded verbatim as a single argument.

    Returns:
        Descriptions of the steps that exited with a non-zero status; the
        list is empty when every step succeeded.
    """
    import shlex
    import subprocess

    doc_types = ["pptx", "pdf", "docx"]  # You can extend this as needed

    # Use this file's absolute path instead of a cwd-relative "cli.py" so the
    # pipeline works regardless of the directory the CLI was started from.
    # NOTE(review): assumes this file is the CLI entry point that registers
    # both sub-commands - confirm if the commands ever move to another module.
    script_path = str(Path(__file__).resolve())

    failed_steps: list[str] = []
    for doc_type in doc_types:
        # Order matters: enrichment runs before indexing for each doc type.
        for command in ("enrich-images", "index-images"):
            cmd = [
                sys.executable,
                script_path,
                command,
                str(project_dir),
                "--doc-type",
                doc_type,
            ]
            cli_logger.info(f"Running: {shlex.join(cmd)}")
            return_code = subprocess.call(cmd)
            if return_code != 0:
                # A non-zero exit is reported but does not abort the loop, so
                # the remaining steps still run as they did before.
                cli_logger.warning(
                    f"{command} for doc type '{doc_type}' exited with code {return_code}"
                )
                failed_steps.append(f"{command} ({doc_type})")

    return failed_steps


@app.command()
def embed(
    project_dir: Path,
    use_async: bool = typer.Option(False, "--a-b", "--async-batch", help="Use OpenAI async batch embedding"),
    with_image_index: bool = typer.Option(False, "--with-image-index", help="Run image enrichment and indexing after embedding")
) -> None:
    """
    Generate embeddings for chunks in the specified project directory.
    Optionally run image enrichment + indexing after embedding.
    """
    # Exits with status 1 (typer.Exit) when project_dir does not exist.
    cli_logger.info("\n" + "=" * 120)
    cli_logger.info("DEBUG: CLI embed() command STARTING")
    cli_logger.info("=" * 120)

    cli_logger.info("DEBUG: CLI Arguments received:")
    cli_logger.info(f" - project_dir: {project_dir}")
    cli_logger.info(f" - use_async: {use_async}")
    cli_logger.info(f" - with_image_index: {with_image_index}")

    if not project_dir.exists():
        typer.echo(f"❌ Project directory does not exist: {project_dir}")
        raise typer.Exit(1)

    project = ProjectManager(project_dir)
    # Work on a deep copy so the CLI override never mutates project.config.
    runtime_config = copy_module.deepcopy(project.config)

    if use_async:
        runtime_config.setdefault("embedding", {})["use_async_batch"] = True

    embedder = UnifiedEmbedder(project, runtime_config=runtime_config)
    embedder.run_from_folder()

    cli_logger.info("✅ Embedding complete.")

    # Optional post-processing: image enrichment and indexing
    if with_image_index:
        cli_logger.info("🧠 Starting image enrichment + indexing...")
        failed_steps = _run_image_pipeline(project_dir)
        if failed_steps:
            cli_logger.warning(
                f"⚠️ Image indexing finished with {len(failed_steps)} failed step(s): "
                + ", ".join(failed_steps)
            )
        else:
            cli_logger.info("✅ Image indexing complete.")

    cli_logger.info("=" * 120)
    cli_logger.info("DEBUG: CLI embed() command COMPLETE")
    cli_logger.info("=" * 120)
235 | 213 |
|
| 214 | + |
236 | 215 | @app.command() |
237 | 216 | def retrieve( |
238 | 217 | project_path: str = typer.Argument(..., help="Path to the RAG project directory"), |
@@ -401,10 +380,6 @@ def enrich_images( |
401 | 380 | """ |
402 | 381 | Enrich chunks with image summaries using the ImageInsightAgent. |
403 | 382 | """ |
404 | | - from scripts.agents.image_insight_agent import ImageInsightAgent |
405 | | - from scripts.chunking.models import Chunk |
406 | | - import csv |
407 | | - import json |
408 | 383 |
|
409 | 384 | project = ProjectManager(project_path) |
410 | 385 | agent = ImageInsightAgent(project) |
@@ -452,6 +427,60 @@ def enrich_images( |
452 | 427 |
|
453 | 428 |
|
454 | 429 |
|
def _read_image_summaries(enriched_path: Path) -> list[tuple[str, dict, dict]]:
    """
    Collect the image summaries stored in an enriched chunks TSV.

    The first row is treated as a header and skipped. For every later row
    with at least five columns, column 5 is parsed as a JSON object and its
    "image_summaries" list is read. Rows or summaries that cannot be used
    (invalid JSON, non-object metadata, a summary without "description" or
    "image_path") are skipped with a warning instead of aborting the run.

    Returns:
        A list of (source_chunk_id, chunk_meta, summary) tuples, where
        source_chunk_id is the row's first column.
    """
    records: list[tuple[str, dict, dict]] = []

    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(enriched_path, encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        next(reader, None)  # Skip the header; an empty file yields no rows.
        for row in reader:
            if len(row) < 5:
                continue

            try:
                meta = json.loads(row[4])
            except json.JSONDecodeError as exc:
                cli_logger.warning(
                    f"Skipping line {reader.line_num} of {enriched_path}: invalid metadata JSON ({exc})"
                )
                continue
            if not isinstance(meta, dict):
                cli_logger.warning(
                    f"Skipping line {reader.line_num} of {enriched_path}: metadata is not a JSON object"
                )
                continue

            # `or []` also covers an explicit null "image_summaries" value.
            for summary in meta.get("image_summaries") or []:
                is_usable = (
                    isinstance(summary, dict)
                    and "description" in summary
                    and "image_path" in summary
                )
                if not is_usable:
                    cli_logger.warning(
                        f"Skipping image summary on line {reader.line_num} of {enriched_path}: "
                        "missing 'description' or 'image_path'"
                    )
                    continue
                records.append((row[0], meta, summary))

    return records


@app.command()
def index_images(
    project_path: Path = typer.Argument(..., help="Path to the RAG project directory."),
    doc_type: str = typer.Option("pptx", help="Document type to read enriched chunks from")
) -> None:
    """
    Index enriched image summaries (ImageChunks) into image_index.faiss and image_metadata.jsonl.
    """
    # Exits with status 1 (typer.Exit) when the enriched TSV for doc_type is missing.
    from scripts.chunking.models import ImageChunk
    from scripts.core.project_manager import ProjectManager
    from scripts.embeddings.image_indexer import ImageIndexer

    project = ProjectManager(project_path)
    indexer = ImageIndexer(project)

    enriched_path = project_path / "input" / "enriched" / f"chunks_{doc_type}.tsv"
    if not enriched_path.exists():
        typer.echo(f"❌ Enriched TSV not found: {enriched_path}")
        raise typer.Exit(1)

    # One ImageChunk per image summary, each with a fresh random UUID and a
    # back-reference to the text chunk it was extracted from.
    image_chunks: list[ImageChunk] = [
        ImageChunk(
            id=str(uuid.uuid4()),
            description=summary["description"],
            meta={
                "image_path": summary["image_path"],
                "source_chunk_id": source_chunk_id,
                "doc_type": meta.get("doc_type"),
                "source_filepath": meta.get("source_filepath"),
                "page_number": meta.get("page_number"),
            },
        )
        for source_chunk_id, meta, summary in _read_image_summaries(enriched_path)
    ]

    indexer.run(image_chunks)
    typer.echo(f"✅ Indexed {len(image_chunks)} image chunks into FAISS and metadata JSONL.")
| 479 | + |
| 480 | + |
| 481 | + |
| 482 | + |
| 483 | + |
455 | 484 | if __name__ == "__main__": |
456 | 485 | # Configure root logger for CLI output if needed, or rely on LoggerManager |
457 | 486 | # For example, to see INFO messages from modules if not configured by LoggerManager: |
|
0 commit comments