Skip to content

Commit 23de4c3

Browse files
authored
Merge pull request #70 from hagaybar/feature/project-management-ui
Feature/project management UI
2 parents 6ceda2c + dbe82b5 commit 23de4c3

26 files changed

+1713
-221
lines changed

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,32 @@ poetry run streamlit run app/ui_streamlit.py # default browser opens
4141
4242
---
4343

44+
## 💻 UI Usage
45+
46+
The Streamlit UI provides a user-friendly interface for managing RAG-GP projects.
47+
48+
### Creating a New Project
49+
50+
1. Navigate to the "Projects" section in the sidebar.
51+
2. Fill out the "Create New Project" form:
52+
* **Project Name:** A unique name for your project.
53+
* **Project Description:** An optional description of your project.
54+
* **Language:** The primary language of your documents.
55+
* **Enable Image Enrichment:** Check this box to enable image analysis features.
56+
* **Embedding Model:** Select the embedding model to use for your project.
57+
3. Click the "Create Project" button.
58+
59+
### Managing a Project
60+
61+
Once you have created a project, you can manage it from the "Projects" section.
62+
63+
* **Select a Project:** Choose a project from the dropdown menu to view its details.
64+
* **Configuration Editor:** The `config.yml` file for the selected project is displayed in a text editor. You can make changes to the configuration and save them by clicking the "Save Config" button.
65+
* **Upload Raw Data:** You can upload raw data files (e.g., .pdf, .docx, .txt) to your project using the file uploader. The files will be saved to the appropriate subdirectory under `data/projects/<project_name>/input/raw/`.
66+
* **Raw File Repository:** The "Raw File Repository" section displays a list of all the raw data files in your project, grouped by file type.
67+
68+
---
69+
4470
## 🗂️ Folder Structure (excerpt)
4571

4672
```text

app/cli.py

Lines changed: 91 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# See: https://github.com/pytorch/pytorch/issues/37377 and https://openmp.llvm.org
33
import sys
44
import pathlib
5+
import uuid
56

67
# Ensure the root directory (where pyproject.toml lives) is on sys.path
78
ROOT = pathlib.Path(__file__).resolve().parents[1]
@@ -15,8 +16,7 @@
1516
import logging # Added for ask command
1617

1718
import typer # type: ignore
18-
import json # Added import
19-
import csv # Added import
19+
import json, csv
2020
from collections import defaultdict
2121
from pathlib import Path
2222

@@ -30,7 +30,7 @@
3030
from scripts.retrieval.retrieval_manager import RetrievalManager
3131
from scripts.prompting.prompt_builder import PromptBuilder # Added for ask command
3232
from scripts.api_clients.openai.completer import OpenAICompleter # Added for ask command
33-
33+
from scripts.agents.image_insight_agent import ImageInsightAgent # Added for index_images command
3434
app = typer.Typer()
3535

3636
# Setup basic logging for the CLI
@@ -157,82 +157,61 @@ def ingest(
157157
@app.command()
def embed(
    project_dir: Path,
    use_async: bool = typer.Option(False, "--a-b", "--async-batch", help="Use OpenAI async batch embedding"),
    with_image_index: bool = typer.Option(False, "--with-image-index", help="Run image enrichment and indexing after embedding")
) -> None:
    """
    Generate embeddings for chunks in the specified project directory.
    Optionally run image enrichment + indexing after embedding.

    Args:
        project_dir: Root directory of the RAG project.
        use_async: When True, force the async-batch embedding path by
            overriding ``embedding.use_async_batch`` in a copy of the config.
        with_image_index: When True, run ``enrich-images`` + ``index-images``
            for each supported doc type after embedding completes.

    Raises:
        typer.Exit: If ``project_dir`` does not exist.
    """
    cli_logger.info("\n" + "=" * 120)
    cli_logger.info("DEBUG: CLI embed() command STARTING")
    cli_logger.info("=" * 120)

    cli_logger.info("DEBUG: CLI Arguments received:")
    cli_logger.info(f" - project_dir: {project_dir}")
    cli_logger.info(f" - use_async: {use_async}")
    cli_logger.info(f" - with_image_index: {with_image_index}")

    if not project_dir.exists():
        typer.echo(f"❌ Project directory does not exist: {project_dir}")
        raise typer.Exit(1)

    project = ProjectManager(project_dir)
    # Deep-copy so CLI flags never mutate the project's persistent config.
    runtime_config = copy_module.deepcopy(project.config)

    if use_async:
        runtime_config.setdefault("embedding", {})["use_async_batch"] = True

    embedder = UnifiedEmbedder(project, runtime_config=runtime_config)
    embedder.run_from_folder()

    cli_logger.info("✅ Embedding complete.")

    # Optional post-processing: image enrichment and indexing
    if with_image_index:
        cli_logger.info("🧠 Starting image enrichment + indexing...")

        import subprocess

        # Invoke this same CLI module regardless of the caller's cwd, with the
        # same interpreter/virtualenv as the parent process. Argument lists
        # (shell=False) avoid shell injection and handle paths with spaces —
        # the previous f-string + shell=True form broke on both.
        cli_script = str(pathlib.Path(__file__).resolve())

        doc_types = ["pptx", "pdf", "docx"]  # You can extend this as needed

        for doc_type in doc_types:
            enrich_cmd = [sys.executable, cli_script, "enrich-images", str(project_dir), "--doc-type", doc_type]
            index_cmd = [sys.executable, cli_script, "index-images", str(project_dir), "--doc-type", doc_type]

            cli_logger.info(f"Running: {' '.join(enrich_cmd)}")
            subprocess.run(enrich_cmd, check=False)  # best-effort, like subprocess.call

            cli_logger.info(f"Running: {' '.join(index_cmd)}")
            subprocess.run(index_cmd, check=False)

        cli_logger.info("✅ Image indexing complete.")

    cli_logger.info("=" * 120)
    cli_logger.info("DEBUG: CLI embed() command COMPLETE")
    cli_logger.info("=" * 120)

214+
236215
@app.command()
237216
def retrieve(
238217
project_path: str = typer.Argument(..., help="Path to the RAG project directory"),
@@ -401,10 +380,6 @@ def enrich_images(
401380
"""
402381
Enrich chunks with image summaries using the ImageInsightAgent.
403382
"""
404-
from scripts.agents.image_insight_agent import ImageInsightAgent
405-
from scripts.chunking.models import Chunk
406-
import csv
407-
import json
408383

409384
project = ProjectManager(project_path)
410385
agent = ImageInsightAgent(project)
@@ -452,6 +427,60 @@ def enrich_images(
452427

453428

454429

430+
@app.command()
def index_images(
    project_path: Path = typer.Argument(..., help="Path to the RAG project directory."),
    doc_type: str = typer.Option("pptx", help="Document type to read enriched chunks from")
):
    """
    Index enriched image summaries (ImageChunks) into image_index.faiss and image_metadata.jsonl.

    Reads ``input/enriched/chunks_<doc_type>.tsv``, collects every entry of the
    ``image_summaries`` list stored in each row's JSON metadata column (column 5),
    and hands the resulting ImageChunks to the ImageIndexer.

    Raises:
        typer.Exit: If the enriched TSV for ``doc_type`` does not exist.
    """
    import csv
    import json
    from scripts.chunking.models import ImageChunk
    from scripts.core.project_manager import ProjectManager
    from scripts.embeddings.image_indexer import ImageIndexer

    project = ProjectManager(project_path)
    indexer = ImageIndexer(project)

    enriched_path = project_path / "input" / "enriched" / f"chunks_{doc_type}.tsv"
    if not enriched_path.exists():
        typer.echo(f"❌ Enriched TSV not found: {enriched_path}")
        raise typer.Exit(1)

    image_chunks: list[ImageChunk] = []

    with open(enriched_path, encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="\t")
        # next(reader, None) instead of next(reader): a completely empty TSV
        # must yield zero chunks, not crash with StopIteration.
        next(reader, None)  # skip header row
        for row in reader:
            if len(row) < 5:
                # Malformed/short rows carry no metadata column — skip.
                continue
            meta = json.loads(row[4])
            for s in meta.get("image_summaries", []):
                image_chunks.append(
                    ImageChunk(
                        id=str(uuid.uuid4()),
                        description=s["description"],
                        meta={
                            "image_path": s["image_path"],
                            "source_chunk_id": row[0],
                            "doc_type": meta.get("doc_type"),
                            "source_filepath": meta.get("source_filepath"),
                            "page_number": meta.get("page_number"),
                        },
                    )
                )

    indexer.run(image_chunks)
    typer.echo(f"✅ Indexed {len(image_chunks)} image chunks into FAISS and metadata JSONL.")
479+
480+
481+
482+
483+
455484
if __name__ == "__main__":
456485
# Configure root logger for CLI output if needed, or rely on LoggerManager
457486
# For example, to see INFO messages from modules if not configured by LoggerManager:

docs/new_images_index_plan.docx

24.3 KB
Binary file not shown.

poetry.lock

Lines changed: 19 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ dependencies = [
1919
"sentence-transformers (>=4.1.0,<5.0.0)",
2020
"litellm[proxy] (>=1.73.0,<2.0.0)",
2121
"pandas (>=2.3.0,<3.0.0)",
22-
"streamlit (>=1.46.1,<2.0.0)"
22+
"streamlit (>=1.46.1,<2.0.0)",
23+
"werkzeug (>=3.1.3,<4.0.0)"
2324
]
2425

2526

scripts/agents/image_insight_agent.py

Lines changed: 46 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import uuid
44

55
from scripts.agents.base import AgentProtocol
6-
from scripts.chunking.models import Chunk
6+
from scripts.chunking.models import Chunk, ImageChunk
77
from scripts.core.project_manager import ProjectManager
88
from scripts.api_clients.openai.completer import OpenAICompleter
99
from scripts.utils.logger import LoggerManager
@@ -30,53 +30,61 @@ def __init__(self, project: ProjectManager):
3030
self.logger = LoggerManager.get_logger(__name__)
3131

3232

33-
def run(self, chunk: Chunk, project: ProjectManager) -> list:
    """
    Enrich a chunk's images with model-generated descriptions.

    For every relative path in ``chunk.meta["image_paths"]``, encode the image,
    request a multimodal insight, and wrap the result in an ImageChunk.
    Depending on ``self.output_mode`` the ImageChunks are either returned
    alongside the original chunk ("separate_chunk") or condensed into
    ``chunk.meta["image_summaries"]``.

    Missing files and per-image API failures are logged and skipped so that
    one bad image never aborts the whole chunk.

    Args:
        chunk: The source chunk whose meta may reference image paths.
        project: Project context used to resolve image paths.

    Returns:
        ``[chunk]`` (possibly with enriched meta), or ``[chunk] + image_chunks``
        when output_mode is "separate_chunk".
    """
    image_paths = chunk.meta.get("image_paths", [])
    if not image_paths:
        return [chunk]

    image_chunks = []
    context = chunk.text[:500]
    prompt = self.prompt_template.replace("{{ context }}", context)

    # The completer is loop-invariant: construct it once rather than once per
    # image (the original rebuilt it on every iteration).
    completer = OpenAICompleter(model_name=self.model_name)

    for image_path in image_paths:
        # Image paths are stored relative to the project's input directory.
        full_path = project.input_dir / image_path

        if not full_path.exists():
            self.logger.warning(f"Image file not found: {full_path}")
            continue

        try:
            encoded_image = self.encode_image(full_path)
            insight = completer.get_multimodal_completion(prompt=prompt, image_b64=encoded_image)
        except Exception as e:
            self.logger.error(f"Failed to enrich {image_path}: {e}")
            continue

        image_meta = {
            "image_path": image_path,
            "image_name": Path(image_path).name,
            "source_chunk_id": chunk.id,
            "doc_type": chunk.meta.get("doc_type"),
            "page_number": chunk.meta.get("page_number"),
            "source_filepath": chunk.meta.get("source_filepath"),
        }

        image_chunk = ImageChunk(
            id=str(uuid.uuid4()),
            description=insight,
            meta=image_meta
        )
        image_chunks.append(image_chunk)

    if self.output_mode == "separate_chunk":
        return [chunk] + image_chunks

    # Default: append summaries to chunk.meta
    chunk.meta["image_summaries"] = [
        {
            "image_path": ic.meta["image_path"],
            "description": ic.description
        }
        for ic in image_chunks
    ]
    return [chunk]

87+
8088
def encode_image(self, path: Path) -> str:
    """Return the contents of the file at *path* as a base64 string."""
    raw_bytes = path.read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")

0 commit comments

Comments
 (0)