hagaybar
diff --git a/‎README.md‎
Lines changed: 26 additions & 0 deletions b/‎README.md‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎app/cli.py‎
Lines changed: 91 additions & 62 deletions b/‎app/cli.py‎
Lines changed: 91 additions & 62 deletions
diff --git a/‎docs/new_images_index_plan.docx‎
24.3 KB b/‎docs/new_images_index_plan.docx‎
24.3 KB
diff --git a/‎poetry.lock‎
Lines changed: 19 additions & 1 deletion b/‎poetry.lock‎
Lines changed: 19 additions & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎scripts/agents/image_insight_agent.py‎
Lines changed: 46 additions & 38 deletions b/‎scripts/agents/image_insight_agent.py‎
Lines changed: 46 additions & 38 deletions
@@ -41,6 +41,32 @@ poetry run streamlit run app/ui_streamlit.py   # default browser opens
 
 ---
 
+## 💻 UI Usage
+
+The Streamlit UI provides a user-friendly interface for managing RAG-GP projects.
+
+### Creating a New Project
+
+1.  Navigate to the "Projects" section in the sidebar.
+2.  Fill out the "Create New Project" form:
+    *   **Project Name:** A unique name for your project.
+    *   **Project Description:** An optional description of your project.
+    *   **Language:** The primary language of your documents.
+    *   **Enable Image Enrichment:** Check this box to enable image analysis features.
+    *   **Embedding Model:** Select the embedding model to use for your project.
+3.  Click the "Create Project" button.
+
+### Managing a Project
+
+Once you have created a project, you can manage it from the "Projects" section.
+
+*   **Select a Project:** Choose a project from the dropdown menu to view its details.
+*   **Configuration Editor:** The `config.yml` file for the selected project is displayed in a text editor. You can make changes to the configuration and save them by clicking the "Save Config" button.
+*   **Upload Raw Data:** You can upload raw data files (e.g., .pdf, .docx, .txt) to your project using the file uploader. The files will be saved to the appropriate subdirectory under `data/projects/<project_name>/input/raw/`.
+*   **Raw File Repository:** The "Raw File Repository" section displays a list of all the raw data files in your project, grouped by file type.
+
+---
+
 ## 🗂️ Folder Structure (excerpt)
 
 ```text
 
@@ -2,6 +2,7 @@
 # See: https://github.com/pytorch/pytorch/issues/37377 and https://openmp.llvm.org
 import sys
 import pathlib
+import uuid
 
 # Ensure the root directory (where pyproject.toml lives) is on sys.path
 ROOT = pathlib.Path(__file__).resolve().parents[1]
@@ -15,8 +16,7 @@
 import logging # Added for ask command
 
 import typer  # type: ignore
-import json  # Added import
-import csv  # Added import
+import json, csv 
 from collections import defaultdict
 from pathlib import Path
 
@@ -30,7 +30,7 @@
 from scripts.retrieval.retrieval_manager import RetrievalManager
 from scripts.prompting.prompt_builder import PromptBuilder # Added for ask command
 from scripts.api_clients.openai.completer import OpenAICompleter # Added for ask command
-
+from scripts.agents.image_insight_agent import ImageInsightAgent # Added for index_images command
 app = typer.Typer()
 
 # Setup basic logging for the CLI
@@ -157,82 +157,61 @@ def ingest(
 @app.command()
 def embed(
     project_dir: Path,
-    use_async: bool = typer.Option(False, "--a-b", "--async-batch", help="Use OpenAI async batch embedding")
+    use_async: bool = typer.Option(False, "--a-b", "--async-batch", help="Use OpenAI async batch embedding"),
+    with_image_index: bool = typer.Option(False, "--with-image-index", help="Run image enrichment and indexing after embedding")
 ) -> None:
     """
     Generate embeddings for chunks in the specified project directory.
+    Optionally run image enrichment + indexing after embedding.
+
+    Args:
+        project_dir: Root directory of the project; the command exits with
+            code 1 if it does not exist.
+        use_async: When set, forces ``embedding.use_async_batch = True`` in a
+            deep copy of the project config (the project's own config is
+            left untouched).
+        with_image_index: When set, runs the ``enrich-images`` and
+            ``index-images`` sub-commands for each supported document type
+            once embedding has finished.
     """
     cli_logger.info("\n" + "=" * 120)
     cli_logger.info("DEBUG: CLI embed() command STARTING")
     cli_logger.info("=" * 120)
 
     cli_logger.info(f"DEBUG: CLI Arguments received:")
-    cli_logger.info(f"DEBUG:   - project_dir: {project_dir}")
-    cli_logger.info(f"DEBUG:   - use_async: {use_async}")
-    cli_logger.info(f"DEBUG:   - use_async type: {type(use_async)}")
-    
+    cli_logger.info(f"  - project_dir: {project_dir}")
+    cli_logger.info(f"  - use_async: {use_async}")
+    cli_logger.info(f"  - with_image_index: {with_image_index}")
+
     if not project_dir.exists():
-        error_msg = f"Project directory does not exist: {project_dir}"
-        cli_logger.error(f"ERROR: {error_msg}")
-        typer.echo(f"Error: {error_msg}")
+        typer.echo(f"❌ Project directory does not exist: {project_dir}")
         raise typer.Exit(1)
-    
-    # logger = LoggerManager.get_logger("cli") # Already have cli_logger
-    
-    # Initialize project manager
-    cli_logger.info("DEBUG: Creating ProjectManager...")
+
     project = ProjectManager(project_dir)
-    cli_logger.info(f"DEBUG: ProjectManager created for: {project_dir}")
     runtime_config = copy_module.deepcopy(project.config)
-    cli_logger.info(f"DEBUG: Project config loaded: {runtime_config}")
 
-    # Override config if async flag is provided
     if use_async:
-        cli_logger.info("DEBUG: CLI use_async is TRUE - Overriding config")
-        cli_logger.info("Embedding mode override: use_async_batch=True")
-        
-        # Set the runtime config directly (config is a plain dict)
-        if 'embedding' not in runtime_config:
-            runtime_config['embedding'] = {}
-        
-        runtime_config['embedding']['use_async_batch'] = True
-        cli_logger.info("DEBUG: Set runtime_config['embedding']['use_async_batch'] = True")
-        cli_logger.info(f"DEBUG: Updated runtime config: {runtime_config}")
-        cli_logger.info(f"DEBUG: Original project.config unchanged: {project.config}")
-        
-        # Verify the setting in runtime_config (since config is a plain dict)
-        if 'embedding' in runtime_config and 'use_async_batch' in runtime_config['embedding']:
-            async_batch_value = runtime_config['embedding']['use_async_batch']
-            cli_logger.info(f"DEBUG: Verification - runtime_config['embedding']['use_async_batch'] = {async_batch_value}")
-        else:
-            cli_logger.info("DEBUG: use_async_batch not found in runtime_config")
-        
-    else:
-        cli_logger.info("DEBUG: CLI use_async is FALSE - Using default config")
-        cli_logger.info("Embedding mode: using default configuration")
-    
-    cli_logger.info("DEBUG: About to create UnifiedEmbedder...")
-    embedder = UnifiedEmbedder(project, runtime_config=runtime_config)
-    
-    cli_logger.info(f"DEBUG: UnifiedEmbedder created:")
-    cli_logger.info(f"DEBUG:   - embedder.use_async_batch: {embedder.use_async_batch}")
-    cli_logger.info(f"DEBUG:   - Expected: {use_async}")
-    
-    if use_async and not embedder.use_async_batch:
-        cli_logger.error("ERROR: CLI flag --async was True but embedder.use_async_batch is False!")
-        cli_logger.error("ERROR: Configuration override failed!")
-    elif use_async and embedder.use_async_batch:
-        cli_logger.info("SUCCESS: CLI flag --async correctly set embedder.use_async_batch = True")
+        runtime_config.setdefault("embedding", {})["use_async_batch"] = True
 
-    cli_logger.info(f"CLI: Created embedder with use_async_batch={embedder.use_async_batch}")
-    
-    cli_logger.info("DEBUG: About to call embedder.run_from_folder()...")
+    embedder = UnifiedEmbedder(project, runtime_config=runtime_config)
     embedder.run_from_folder()
-    
+
+    cli_logger.info("✅ Embedding complete.")
+
+    # Optional post-processing: image enrichment and indexing
+    if with_image_index:
+        cli_logger.info("🧠 Starting image enrichment + indexing...")
+
+        import subprocess
+
+        # Re-invoke this very script with the running interpreter so the
+        # sub-commands resolve regardless of the caller's working directory
+        # or which "python" happens to be first on PATH.
+        cli_script = str(pathlib.Path(__file__).resolve())
+        doc_types = ["pptx", "pdf", "docx"]  # You can extend this as needed
+
+        for doc_type in doc_types:
+            for sub_command in ("enrich-images", "index-images"):
+                # Argument list (no shell): a project path containing spaces
+                # or shell metacharacters is passed through verbatim.
+                cmd = [
+                    sys.executable,
+                    cli_script,
+                    sub_command,
+                    str(project_dir),
+                    "--doc-type",
+                    doc_type,
+                ]
+                cli_logger.info(f"Running: {' '.join(cmd)}")
+                result = subprocess.run(cmd, check=False)
+                # A doc type with no data makes the sub-command exit non-zero;
+                # report it and carry on with the remaining doc types.
+                if result.returncode != 0:
+                    cli_logger.warning(
+                        f"{sub_command} exited with code {result.returncode} for doc type '{doc_type}'"
+                    )
+
+        cli_logger.info("✅ Image indexing complete.")
+
     cli_logger.info("=" * 120)
     cli_logger.info("DEBUG: CLI embed() command COMPLETE")
     cli_logger.info("=" * 120)
 
+
 @app.command()
 def retrieve(
     project_path: str = typer.Argument(..., help="Path to the RAG project directory"),
@@ -401,10 +380,6 @@ def enrich_images(
     """
     Enrich chunks with image summaries using the ImageInsightAgent.
     """
-    from scripts.agents.image_insight_agent import ImageInsightAgent
-    from scripts.chunking.models import Chunk
-    import csv
-    import json
 
     project = ProjectManager(project_path)
     agent = ImageInsightAgent(project)
@@ -452,6 +427,60 @@ def enrich_images(
 
 
 
+@app.command()
+def index_images(
+    project_path: Path = typer.Argument(..., help="Path to the RAG project directory."),
+    doc_type: str = typer.Option("pptx", help="Document type to read enriched chunks from")
+) -> None:
+    """
+    Index enriched image summaries (ImageChunks) into image_index.faiss and image_metadata.jsonl.
+
+    Reads ``<project_path>/input/enriched/chunks_<doc_type>.tsv``, parses the
+    JSON metadata held in the fifth column of each data row, and converts every
+    entry of its ``image_summaries`` list into an ImageChunk that is passed to
+    ImageIndexer. Rows that are too short, rows whose metadata is not a valid
+    JSON object, and summaries lacking ``description`` or ``image_path`` are
+    skipped.
+
+    Args:
+        project_path: Root directory of the project.
+        doc_type: Selects which enriched TSV file is read.
+
+    Raises:
+        typer.Exit: With code 1 when the enriched TSV file does not exist.
+    """
+    from scripts.chunking.models import ImageChunk
+    from scripts.core.project_manager import ProjectManager
+    from scripts.embeddings.image_indexer import ImageIndexer
+
+    meta_column = 4  # zero-based position of the JSON metadata cell in a row
+
+    project = ProjectManager(project_path)
+    indexer = ImageIndexer(project)
+
+    enriched_path = project_path / "input" / "enriched" / f"chunks_{doc_type}.tsv"
+    if not enriched_path.exists():
+        typer.echo(f"❌ Enriched TSV not found: {enriched_path}")
+        raise typer.Exit(1)
+
+    image_chunks: list[ImageChunk] = []
+
+    # newline="" leaves line-ending handling to the csv module, so newlines
+    # embedded in quoted fields are read back correctly.
+    with open(enriched_path, encoding="utf-8", newline="") as f:
+        reader = csv.reader(f, delimiter="\t")
+        next(reader, None)  # skip the header row; an empty file yields no rows
+        for row in reader:
+            if len(row) <= meta_column:
+                continue
+            try:
+                meta = json.loads(row[meta_column])
+            except json.JSONDecodeError as e:
+                # One corrupt cell should not abort indexing of the whole file.
+                cli_logger.warning(
+                    f"Skipping line {reader.line_num} of {enriched_path}: invalid metadata JSON ({e})"
+                )
+                continue
+            if not isinstance(meta, dict):
+                continue
+            for s in meta.get("image_summaries") or []:
+                if not isinstance(s, dict) or "description" not in s or "image_path" not in s:
+                    cli_logger.warning(
+                        f"Skipping incomplete image summary on line {reader.line_num} of {enriched_path}"
+                    )
+                    continue
+                image_chunks.append(
+                    ImageChunk(
+                        id=str(uuid.uuid4()),
+                        description=s["description"],
+                        meta={
+                            "image_path": s["image_path"],
+                            "source_chunk_id": row[0],
+                            "doc_type": meta.get("doc_type"),
+                            "source_filepath": meta.get("source_filepath"),
+                            "page_number": meta.get("page_number"),
+                        },
+                    )
+                )
+
+    if not image_chunks:
+        # Nothing to embed; do not hand the indexer an empty batch.
+        typer.echo(f"⚠️ No image summaries found in {enriched_path}; nothing to index.")
+        return
+
+    indexer.run(image_chunks)
+    typer.echo(f"✅ Indexed {len(image_chunks)} image chunks into FAISS and metadata JSONL.")
+
+
+
+
+
 if __name__ == "__main__":
     # Configure root logger for CLI output if needed, or rely on LoggerManager
     # For example, to see INFO messages from modules if not configured by LoggerManager:
 
@@ -19,7 +19,8 @@ dependencies = [
     "sentence-transformers (>=4.1.0,<5.0.0)",
     "litellm[proxy] (>=1.73.0,<2.0.0)",
     "pandas (>=2.3.0,<3.0.0)",
-    "streamlit (>=1.46.1,<2.0.0)"
+    "streamlit (>=1.46.1,<2.0.0)",
+    "werkzeug (>=3.1.3,<4.0.0)"
     ]
 
 
 
@@ -3,7 +3,7 @@
 import uuid
 
 from scripts.agents.base import AgentProtocol
-from scripts.chunking.models import Chunk
+from scripts.chunking.models import Chunk, ImageChunk
 from scripts.core.project_manager import ProjectManager
 from scripts.api_clients.openai.completer import OpenAICompleter
 from scripts.utils.logger import LoggerManager
@@ -30,53 +30,61 @@ def __init__(self, project: ProjectManager):
         self.logger = LoggerManager.get_logger(__name__)
 
 
-    def run(self, chunk: Chunk, project: ProjectManager) -> list[Chunk]:
-        image_path = chunk.meta.get("image_path")
-        if not image_path:
-            return [chunk]
-
-        full_path = Path(project.root_dir) / image_path
-        if not full_path.exists():
-            self.logger.warning(f"ImageInsightAgent: file not found {full_path}")
+    def run(self, chunk: Chunk, project: ProjectManager) -> list:
+        """
+        Generate a description for every image referenced by ``chunk``.
+
+        Image paths are read from ``chunk.meta["image_paths"]`` and resolved
+        against ``project.input_dir``. Each image is base64-encoded and sent,
+        together with a prompt built from the first 500 characters of
+        ``chunk.text``, to the multimodal completer. Images that are missing
+        on disk or whose enrichment raises are logged and skipped.
+
+        Returns:
+            ``[chunk]`` followed by one ImageChunk per described image when
+            ``self.output_mode == "separate_chunk"``. Otherwise ``[chunk]``
+            alone, with ``chunk.meta["image_summaries"]`` set to a list of
+            ``{"image_path", "description"}`` dicts. When the chunk references
+            no images it is returned unchanged.
+        """
+        image_paths = chunk.meta.get("image_paths", [])
+        if not image_paths:
             return [chunk]
 
+        image_chunks: list[ImageChunk] = []
         context = chunk.text[:500]
-        encoded_image = self.encode_image(full_path)
         prompt = self.prompt_template.replace("{{ context }}", context)
 
-        try:
-            completer = OpenAICompleter(model_name=self.model_name)
-            insight = completer.get_multimodal_completion(
-                prompt=prompt, image_b64=encoded_image
-            )
-        except Exception as e:
-            self.logger.error(f"Image insight generation failed: {e}")
-            chunk.meta["image_summary_error"] = str(e)
-            return [chunk]
-
-        # Determine output behavior based on project config
-        # cfg = project.config.get("agents", {}).get("image_insight", {})
-        # mode = cfg.get("output_mode", "append_to_chunk").lower()
-
-        if self.output_mode == "separate_chunk":
-            image_chunk = Chunk(
+        for image_path in image_paths:
+            # Paths are resolved against project.input_dir (the previous
+            # implementation resolved them against project.root_dir).
+            full_path = project.input_dir / image_path
+
+            if not full_path.exists():
+                self.logger.warning(f"Image file not found: {full_path}")
+                continue
+
+            try:
+                encoded_image = self.encode_image(full_path)
+                completer = OpenAICompleter(model_name=self.model_name)
+                insight = completer.get_multimodal_completion(prompt=prompt, image_b64=encoded_image)
+            except Exception as e:
+                # Best effort: one failing image must not drop the others.
+                self.logger.error(f"Failed to enrich {image_path}: {e}")
+                continue
+
+            image_meta = {
+                "image_path": image_path,
+                "image_name": Path(image_path).name,
+                "source_chunk_id": chunk.id,
+                "doc_type": chunk.meta.get("doc_type"),
+                "page_number": chunk.meta.get("page_number"),
+                "source_filepath": chunk.meta.get("source_filepath"),
+            }
+
+            image_chunk = ImageChunk(
                 id=str(uuid.uuid4()),
-                doc_id=chunk.doc_id,
-                text=insight,
-                token_count=len(insight.split()),
-                meta={
-                    "chunk_type": "image_insight",
-                    "source_filepath": chunk.meta.get("source_filepath"),
-                    "image_path": image_path,
-                    "parent_chunk_id": chunk.id,
-                }
+                description=insight,
+                meta=image_meta
             )
-            return [chunk, image_chunk]
+            image_chunks.append(image_chunk)
 
-        # Default behavior: append to meta
-        chunk.meta["image_summary"] = insight
+        if self.output_mode == "separate_chunk":
+            return [chunk] + image_chunks
+
+        # Default: append summaries to chunk.meta
+        chunk.meta["image_summaries"] = [
+            {
+                "image_path": ic.meta["image_path"],
+                "description": ic.description
+            }
+            for ic in image_chunks
+        ]
         return [chunk]
 
+
     def encode_image(self, path: Path) -> str:
+        """Read the file at ``path`` in binary mode and return its bytes base64-encoded as a string."""
         with open(path, "rb") as f:
             return base64.b64encode(f.read()).decode("utf-8")
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,8 @@ dependencies = [`
`19`	`19`	`"sentence-transformers (>=4.1.0,<5.0.0)",`
`20`	`20`	`"litellm[proxy] (>=1.73.0,<2.0.0)",`
`21`	`21`	`"pandas (>=2.3.0,<3.0.0)",`
`22`		`- "streamlit (>=1.46.1,<2.0.0)"`
	`22`	`+ "streamlit (>=1.46.1,<2.0.0)",`
	`23`	`+ "werkzeug (>=3.1.3,<4.0.0)"`
`23`	`24`	`]`
`24`	`25`
`25`	`26`