
Commit 6bcc0ec

Merge pull request #325 from cloudera/main
Release 1.30.0
2 parents: 313f761 + 613e807


52 files changed: +1416 / -261 lines

.env.example

Lines changed: 25 additions & 1 deletion
```diff
@@ -1,7 +1,19 @@
 AWS_DEFAULT_REGION=us-west-2
 
+# H2 or PostgreSQL (RDS) (H2 is default)
+DB_TYPE=H2
+
+# H2
 DB_URL=jdbc:h2:../databases/rag
 
+# RDS
+# DB_URL= "jdbc:postgresql://<host>:<port>/<database>"
+DB_USERNAME=
+DB_PASSWORD=
+
+# Model Provider
+MODEL_PROVIDER=Bedrock
+
 # CAII
 CAII_DOMAIN=
 
@@ -10,7 +22,7 @@ AZURE_OPENAI_API_KEY=
 AZURE_OPENAI_ENDPOINT=
 OPENAI_API_VERSION=
 
-# QDRANT or OPENSEARCH
+# QDRANT or OPENSEARCH or CHROMADB
 VECTOR_DB_PROVIDER=QDRANT
 
 # OpenSearch
@@ -19,6 +31,18 @@ OPENSEARCH_USERNAME=
 OPENSEARCH_PASSWORD=
 OPENSEARCH_NAMESPACE=
 
+# ChromaDB
+CHROMADB_HOST=http://localhost
+CHROMADB_PORT=8000
+CHROMADB_TOKEN=
+# Tenant and database defaults to the Chroma default values
+CHROMADB_TENANT=
+CHROMADB_DATABASE=
+# If CHROMADB_HOST starts with "https://" and your server uses a private CA,
+# set it to the path of your PEM bundle so Python can verify TLS connections to ChromaDB:
+CHROMADB_SERVER_SSL_CERT_PATH=/absolute/path/to/ca-bundle.pem
+CHROMADB_ENABLE_ANONYMIZED_TELEMETRY=false
+
 # AWS
 AWS_ACCESS_KEY_ID=
 AWS_SECRET_ACCESS_KEY=
```
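As a loose illustration of how these settings might be consumed, the sketch below reads the database, model-provider, and vector-DB variables and validates the vector DB choice. The function name and defaults are hypothetical; this is not the AMP's actual configuration code.

```python
import os

# The three vector stores named in .env.example.
SUPPORTED_VECTOR_DBS = {"QDRANT", "OPENSEARCH", "CHROMADB"}


def read_provider_config() -> dict:
    """Collect the provider settings introduced in .env.example (illustrative only)."""
    vector_db = os.environ.get("VECTOR_DB_PROVIDER", "QDRANT")
    if vector_db not in SUPPORTED_VECTOR_DBS:
        raise ValueError(f"Unsupported VECTOR_DB_PROVIDER: {vector_db}")

    return {
        # H2 is the default; switch DB_TYPE and DB_URL for PostgreSQL (RDS).
        "db_type": os.environ.get("DB_TYPE", "H2"),
        "db_url": os.environ.get("DB_URL", "jdbc:h2:../databases/rag"),
        "model_provider": os.environ.get("MODEL_PROVIDER", "Bedrock"),
        "vector_db_provider": vector_db,
    }


if __name__ == "__main__":
    print(read_provider_config())
```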

.github/workflows/publish_release.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -122,4 +122,4 @@ jobs:
           echo "No changes to commit"
         fi
       env:
-        GITHUB_TOKEN: ${{ github.token }}
+        GITHUB_TOKEN: ${{ github.token }}
```

.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,6 +1,7 @@
 .env
 .idea/*
 .vscode/*
+.cursor/*
 !.idea/copyright/
 !.idea/prettier.xml
 !.idea/google-java-format.xml
```

README.md

Lines changed: 47 additions & 9 deletions
````diff
@@ -52,6 +52,32 @@ RAG Studio can utilize the local file system or an S3 bucket for storing documen
 
 S3 will also require providing the AWS credentials for the bucket.
 
+### Vector Database Options
+
+RAG Studio supports Qdrant (default), OpenSearch (Cloudera Semantic Search), and ChromaDB.
+
+- To choose the vector DB, set `VECTOR_DB_PROVIDER` to one of `QDRANT`, `OPENSEARCH`, or `CHROMADB` in your `.env`.
+
+#### ChromaDB Setup
+
+If you select ChromaDB, configure the following environment variables in `.env`:
+
+- `CHROMADB_HOST` - Hostname or URL for ChromaDB. Use `localhost` for local Docker.
+- `CHROMADB_PORT` - Port for ChromaDB (default `8000`). Not required if `CHROMADB_HOST` starts with `https://` and the server infers the port.
+- `CHROMADB_TENANT` - Optional. Defaults to the Chroma default tenant.
+- `CHROMADB_DATABASE` - Optional. Defaults to the Chroma default database.
+- `CHROMADB_TOKEN` - Optional. Include if your Chroma server requires an auth token.
+- `CHROMADB_SERVER_SSL_CERT_PATH` - Optional. Path to a PEM bundle for TLS verification when using HTTPS with a private CA.
+- `CHROMADB_ENABLE_ANONYMIZED_TELEMETRY` - Optional. Enables anonymized telemetry in the ChromaDB client; defaults to `false`.
+
+Notes:
+
+- The local-dev script will automatically start a ChromaDB Docker container when `VECTOR_DB_PROVIDER=CHROMADB` and `CHROMADB_HOST=localhost`, using `CHROMADB_PORT=8000`.
+- ChromaDB collections are automatically namespaced using the tenant and database values to avoid conflicts between different RAG Studio instances.
+- For production deployments, consider using a dedicated ChromaDB server with authentication enabled via `CHROMADB_TOKEN`.
+- When using HTTPS endpoints, ensure your certificate chain is properly configured or provide the CA bundle path via `CHROMADB_SERVER_SSL_CERT_PATH`.
+- Anonymized telemetry is disabled by default. You can enable it by setting `CHROMADB_ENABLE_ANONYMIZED_TELEMETRY=true`.
+
 ### Enhanced Parsing Options:
 
 RAG Studio can optionally enable enhanced parsing by providing the `USE_ENHANCED_PDF_PROCESSING` environment variable. Enabling this will allow RAG Studio to parse images and tables from PDFs. When enabling this feature, we strongly recommend using this with a GPU and at least 16GB of memory.
@@ -82,7 +108,7 @@ This variable can be set from the project settings for the AMP in CML.
 ## Air-gapped Environments
 
 If you are using an air-gapped environment, you will need to whitelist at the minimum the following domains in order to use the AMP.
-There may be other domains that need to be whitelisted depending on your environment and the model service provider you select.
+There may be other domains that need to be whitelisted depending on your environment and the model service provider you select.
 
 - `https://github.com`
 - `https://raw.githubusercontent.com`
@@ -150,17 +176,29 @@ the Node service locally, you can do so by following these steps:
 docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/databases/qdrant_storage:/qdrant/storage:z qdrant/qdrant
 ```
 
+#### To run ChromaDB locally
+
+```
+docker run --name chromadb_dev --rm -d -p 8000:8000 -v $(pwd)/databases/chromadb_storage:/data chromadb/chroma
+```
+
+#### Use ChromaDB with local-dev.sh
+
+- Copy `.env.example` to `.env`.
+- Set `VECTOR_DB_PROVIDER=CHROMADB` in `.env` (defaults assume `CHROMADB_HOST=localhost` and `CHROMADB_PORT=8000`).
+- Run `./local-dev.sh` from the repo root. When `CHROMADB_HOST=localhost`, the script will auto-start a ChromaDB Docker container.
+
 #### Modifying UI in CML
 
-* This is an unsupported workflow, but it is possible to modify the UI code in CML.
+- This is an unsupported workflow, but it is possible to modify the UI code in CML.
 
-- Start a CML Session from a CML Project that has the RAG Studio AMP installed.
-- Open the terminal in the CML Session and navigate to the `ui` directory.
-- Run `source ~/.bashrc` to ensure the Node environment variables are loaded.
-- Install PNPM using `npm install -g pnpm`. Docs on PNPM can be found here: https://pnpm.io/installation#using-npm
-- Run `pnpm install` to install the dependencies.
-- Make your changes to the UI code in the `ui` directory.
-- Run `pnpm build` to build the new UI bundle.
+* Start a CML Session from a CML Project that has the RAG Studio AMP installed.
+* Open the terminal in the CML Session and navigate to the `ui` directory.
+* Run `source ~/.bashrc` to ensure the Node environment variables are loaded.
+* Install PNPM using `npm install -g pnpm`. Docs on PNPM can be found here: https://pnpm.io/installation#using-npm
+* Run `pnpm install` to install the dependencies.
+* Make your changes to the UI code in the `ui` directory.
+* Run `pnpm build` to build the new UI bundle.
 
 ## The Fine Print
 
````
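To make the ChromaDB settings above concrete, here is a minimal, hypothetical Python sketch of connecting to a Chroma server with the documented variables. It uses the public `chromadb.HttpClient` API; the bearer-token header and the telemetry flag are assumptions about a typical deployment, not the AMP's actual client code, and handling of `CHROMADB_SERVER_SSL_CERT_PATH` is omitted.

```python
import os

import chromadb
from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT, Settings

host = os.environ.get("CHROMADB_HOST", "localhost")
port = int(os.environ.get("CHROMADB_PORT", "8000"))
token = os.environ.get("CHROMADB_TOKEN", "")

client = chromadb.HttpClient(
    host=host,
    port=port,
    ssl=host.startswith("https://"),
    # Assumption: the server accepts a bearer token; auth setups vary by Chroma version.
    headers={"Authorization": f"Bearer {token}"} if token else None,
    tenant=os.environ.get("CHROMADB_TENANT") or DEFAULT_TENANT,
    database=os.environ.get("CHROMADB_DATABASE") or DEFAULT_DATABASE,
    settings=Settings(anonymized_telemetry=False),
)

print(client.heartbeat())  # simple connectivity check
```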

docs/chat_flow.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -14,7 +14,7 @@ sequenceDiagram
     participant MLflow as MLflow
 
     User->>UI: Enters query
-    UI->>API: POST /sessions/{session_id}/chat
+    UI->>API: POST /sessions/{session_id}/stream-completion
     Note over UI,API: Request includes query and configuration
 
     API->>MetadataApi: GET session metadata
```
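The diagram change reflects the UI calling a streaming completion endpoint. As a hedged illustration only (the base URL, request body, and response framing here are assumptions, not taken from this diff), a client might consume it like this:

```python
import httpx

session_id = 42  # hypothetical session id
base_url = "http://localhost:8081"  # hypothetical API address

# Assumed request body; the actual schema is defined by the llm-service API.
payload = {"query": "What is in my documents?", "configuration": {}}

with httpx.stream(
    "POST",
    f"{base_url}/sessions/{session_id}/stream-completion",
    json=payload,
    timeout=None,
) as response:
    response.raise_for_status()
    for line in response.iter_lines():
        if line:
            print(line)  # each line carries a chunk of the streamed completion
```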

llm-service/app/ai/indexing/base.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -1,9 +1,12 @@
+import json
 import logging
 import os
 from abc import abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, Type, Optional
+from typing import Dict, Type, Optional, TypeVar
+
+from llama_index.core.schema import BaseNode
 
 from .readers.base_reader import BaseReader, ReaderConfig
 from .readers.csv import CSVReader
@@ -26,7 +29,6 @@
     ".docx": DocxReader,
     ".pptx": PptxReader,
     ".pptm": PptxReader,
-    ".ppt": PptxReader,
     ".csv": CSVReader,
     ".json": JSONReader,
     ".jpg": ImagesReader,
@@ -40,6 +42,9 @@
 }
 
 
+TNode = TypeVar("TNode", bound=BaseNode)
+
+
 @dataclass
 class NotSupportedFileExtensionError(Exception):
     file_extension: str
@@ -54,6 +59,13 @@ def __init__(
         self.data_source_id = data_source_id
         self.reader_config = reader_config
 
+    @staticmethod
+    def _flatten_metadata(chunk: TNode) -> TNode:
+        for key, value in chunk.metadata.items():
+            if isinstance(value, list) or isinstance(value, dict):
+                chunk.metadata[key] = json.dumps(value)
+        return chunk
+
     @abstractmethod
     def index_file(self, file_path: Path, doc_id: str) -> None:
         pass
```
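The new `_flatten_metadata` helper JSON-encodes list- and dict-valued metadata so vector stores that only accept flat metadata can ingest the nodes. Below is a small standalone sketch of the same transformation applied to a llama-index `TextNode`; it mirrors the helper rather than importing the indexer class.

```python
import json

from llama_index.core.schema import TextNode

node = TextNode(
    text="example chunk",
    metadata={"page_labels": ["1", "2"], "source": {"file": "report.pdf"}},
)

# Same rule as the helper above: serialize non-scalar metadata values to JSON strings.
for key, value in node.metadata.items():
    if isinstance(value, (list, dict)):
        node.metadata[key] = json.dumps(value)

print(node.metadata)
# {'page_labels': '["1", "2"]', 'source': '{"file": "report.pdf"}'}
```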

llm-service/app/ai/indexing/embedding_indexer.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -108,6 +108,12 @@ def index_file(self, file_path: Path, document_id: str) -> None:
         # we're capturing "text".
         converted_chunks: List[BaseNode] = [chunk for chunk in chunk_batch]
 
+        # flatten metadata if vector store has self.flat_metadata
+        if self.chunks_vector_store.flat_metadata:
+            converted_chunks = [
+                self._flatten_metadata(chunk) for chunk in converted_chunks
+            ]
+
         chunks_vector_store = self.chunks_vector_store.llama_vector_store()
         chunks_vector_store.add(converted_chunks)
 
@@ -130,6 +136,12 @@ def _compute_embeddings(
         logger.debug(f"Waiting for {len(futures)} futures")
         for future in as_completed(futures):
             i, batch_embeddings = future.result()
-            for chunk, embedding in zip(batched_chunks[i], batch_embeddings):
+            batch_chunks = batched_chunks[i]
+            if len(batch_chunks) != len(batch_embeddings):
+                raise ValueError(
+                    f"Expected {len(batch_chunks)} embedding vectors for this batch of chunks,"
+                    + f" but got {len(batch_embeddings)} from {self.embedding_model.model_name}"
+                )
+            for chunk, embedding in zip(batch_chunks, batch_embeddings):
                 chunk.embedding = embedding
-            yield batched_chunks[i]
+            yield batch_chunks
```
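The added length check guards against `zip` silently truncating when an embedding provider returns fewer vectors than chunks. A tiny standalone illustration of the failure mode the check prevents (not the service's code):

```python
chunks = ["chunk-a", "chunk-b", "chunk-c"]
embeddings = [[0.1, 0.2], [0.3, 0.4]]  # provider returned one vector too few

# Without a check, zip drops the last chunk silently and it is never embedded.
paired = list(zip(chunks, embeddings))
print(len(paired))  # 2, not 3

# The equivalent of the new guard: fail loudly instead.
if len(chunks) != len(embeddings):
    raise ValueError(f"Expected {len(chunks)} embedding vectors, got {len(embeddings)}")
```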

llm-service/app/ai/indexing/readers/docx.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -52,7 +52,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
     def load_chunks(self, file_path: Path) -> ChunksResult:
         documents = self.inner.load_data(file_path)
         assert len(documents) == 1
-        document = documents[0]
+        document = documents[0]  # single document contains all pages' contents
         document.id_ = self.document_id
 
         document_text = document.text
```

llm-service/app/ai/indexing/readers/pptx.py

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
from pathlib import Path
4040
from typing import Any
4141

42-
from llama_index.core import Document
4342
from llama_index.readers.file import PptxReader as LlamaIndexPptxReader
4443

4544
from .base_reader import BaseReader, ChunksResult
@@ -51,27 +50,25 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
5150
self.inner = LlamaIndexPptxReader()
5251

5352
def load_chunks(self, file_path: Path) -> ChunksResult:
53+
# TODO: This loop makes a lot of function calls;
54+
# if it's slow, we should try .pdf.PageTracker which consolidates contents to avoid that
55+
ret = ChunksResult()
56+
for i, document in enumerate(self.inner.load_data(file_path)):
57+
document.id_ = self.document_id
5458

55-
documents = self.inner.load_data(file_path)
56-
assert len(documents) == 1
57-
document: Document = documents[0]
58-
document.id_ = self.document_id
59-
60-
document_text = document.text
61-
62-
secrets = self._block_secrets([document_text])
63-
if secrets is not None:
64-
return ChunksResult(secret_types=secrets)
59+
document_text = document.text
6560

66-
ret = ChunksResult()
61+
secrets = self._block_secrets([document_text])
62+
if secrets is not None:
63+
return ChunksResult(secret_types=secrets)
6764

68-
anonymized_text = self._anonymize_pii(document_text)
69-
if anonymized_text is not None:
70-
ret.pii_found = True
71-
document_text = anonymized_text
65+
anonymized_text = self._anonymize_pii(document_text)
66+
if anonymized_text is not None:
67+
ret.pii_found = True
68+
document_text = anonymized_text
7269

73-
document.set_content(document_text)
70+
document.set_content(document_text)
7471

75-
self._add_document_metadata(document, file_path)
76-
ret.chunks = self._chunks_in_document(document)
72+
self._add_document_metadata(document, file_path)
73+
ret.chunks.extend(self._chunks_in_document(document))
7774
return ret

llm-service/app/ai/indexing/summary_indexer.py

Lines changed: 14 additions & 11 deletions
```diff
@@ -70,13 +70,13 @@
 from qdrant_client.http.exceptions import UnexpectedResponse
 
 from app.services import models
+from app.ai.vector_stores.vector_store import VectorStore
 from .base import BaseTextIndexer
 from .readers.base_reader import ReaderConfig, ChunksResult
 from ..vector_stores.vector_store_factory import VectorStoreFactory
-from ...config import settings
+from ...config import settings, ModelSource
 from ...services.metadata_apis import data_sources_metadata_api
-from ...services.models.providers import ModelProvider
-from ...services.models import ModelSource
+from ...services.models.providers import get_provider_class
 
 logger = logging.getLogger(__name__)
 
@@ -102,6 +102,7 @@ def __init__(
         self.splitter = splitter
         self.llm = llm
         self.embedding_model = embedding_model
+        self.summary_vector_store = VectorStoreFactory.for_summaries(data_source_id)
 
     @staticmethod
     def __database_dir(data_source_id: int) -> str:
@@ -133,9 +134,7 @@ def __index_configuration(
         embed_summaries: bool = True,
     ) -> Dict[str, Any]:
         prompt_helper: Optional[PromptHelper] = None
-        model_source: ModelSource = (
-            ModelProvider.get_provider_class().get_model_source()
-        )
+        model_source: ModelSource = get_provider_class().get_model_source()
         if model_source == "CAII":
             # if we're using CAII, let's be conservative and use a small context window to account for mistral's small context
             prompt_helper = PromptHelper(context_window=3000)
@@ -180,19 +179,20 @@ def __summary_indexer(
             return SummaryIndexer.__summary_indexer_with_config(
                 persist_dir=persist_dir,
                 index_configuration=self.__index_kwargs(embed_summaries),
+                summary_vector_store=self.summary_vector_store,
             )
         except (ValueError, FileNotFoundError):
             doc_summary_index = self.__init_summary_store(persist_dir)
         return doc_summary_index
 
     @staticmethod
     def __summary_indexer_with_config(
-        persist_dir: str, index_configuration: Dict[str, Any]
+        persist_dir: str, index_configuration: Dict[str, Any],
+        summary_vector_store: VectorStore,
     ) -> DocumentSummaryIndex:
-        data_source_id: int = index_configuration.get("data_source_id")
         storage_context = SummaryIndexer.create_storage_context(
             persist_dir,
-            VectorStoreFactory.for_summaries(data_source_id).llama_vector_store(),
+            summary_vector_store.llama_vector_store(),
         )
         doc_summary_index: DocumentSummaryIndex = cast(
             DocumentSummaryIndex,
@@ -296,6 +296,8 @@ def index_file(self, file_path: Path, document_id: str) -> None:
         with _write_lock:
             persist_dir = self.__persist_dir()
             summary_store: DocumentSummaryIndex = self.__summary_indexer(persist_dir)
+            if self.summary_vector_store.flat_metadata:
+                nodes = [self._flatten_metadata(node) for node in nodes]
             summary_store.insert_nodes(nodes)
             summary_store.storage_context.persist(persist_dir=persist_dir)
 
@@ -314,7 +316,7 @@ def __update_global_summary_store(
         # and re-index it with the addition/removal.
         global_persist_dir = self.__persist_root_dir()
         global_summary_store = self.__summary_indexer(
-            global_persist_dir, embed_summaries=False
+            global_persist_dir, embed_summaries=False,
         )
         data_source_node = Document(doc_id=str(self.data_source_id))
 
@@ -496,7 +498,8 @@ def delete_data_source_by_id(data_source_id: int) -> None:
                 embed_summaries=False,
             )
             global_summary_store = SummaryIndexer.__summary_indexer_with_config(
-                global_persist_dir, configuration
+                global_persist_dir, configuration,
+                summary_vector_store=vector_store,
             )
         except FileNotFoundError:
             ## global summary store doesn't exist, nothing to do
```
