diff --git a/README.md b/README.md
index 37bdcdf8..59085355 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,7 @@ The following environment variables are required to run the application:
 - `AWS_DEFAULT_REGION`: (Optional) defaults to `us-east-1`
 - `AWS_ACCESS_KEY_ID`: (Optional) needed for bedrock embeddings
 - `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings
+- `PINECONE_API_KEY`: (Optional) needed for the Pinecone vector database
 
 Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables.
 
@@ -97,7 +98,7 @@ COLLECTION_NAME=
 ATLAS_SEARCH_INDEX=
 ```
 
-The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. In addition, create a vector search index for collection above (remember to assign `$ATLAS_SEARCH_INDEX`) with the following json: 
+The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. In addition, create a vector search index for the collection above (remember to assign `$ATLAS_SEARCH_INDEX`) with the following json:
 
 ```json
 {
@@ -118,6 +119,18 @@ The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by Lib
 
 Follow one of the [four documented methods](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure) to create the vector index.
 
+### Use Pinecone as Vector Database
+
+Another option for a vector database is [Pinecone](https://www.pinecone.io/). To use it, create a Pinecone account to obtain an API key, then set the following environment variables.
+
+```env
+VECTOR_DB_TYPE=pinecone
+COLLECTION_NAME=
+PINECONE_API_KEY=
+AWS_DEFAULT_REGION=
+```
+
+A new index with name `COLLECTION_NAME` will be created automatically if one does not already exist in your Pinecone vector database.
 
 ### Cloud Installation Settings:
 
diff --git a/config.py b/config.py
index 7b94a976..86db965a 100644
--- a/config.py
+++ b/config.py
@@ -15,6 +15,7 @@ class VectorDBType(Enum):
 
     PGVECTOR = "pgvector"
     ATLAS_MONGO = "atlas-mongo"
+    PINECONE = "pinecone"
 
 
 class EmbeddingsProvider(Enum):
@@ -169,6 +170,7 @@ async def dispatch(self, request, call_next):
 OLLAMA_BASE_URL = get_env_variable("OLLAMA_BASE_URL", "http://ollama:11434")
 AWS_ACCESS_KEY_ID = get_env_variable("AWS_ACCESS_KEY_ID", "")
 AWS_SECRET_ACCESS_KEY = get_env_variable("AWS_SECRET_ACCESS_KEY", "")
+PINECONE_API_KEY = get_env_variable("PINECONE_API_KEY", "")
 
 
 ## Embeddings
@@ -276,6 +278,17 @@ def init_embeddings(provider, model):
         mode="atlas-mongo",
         search_index=ATLAS_SEARCH_INDEX,
     )
+elif VECTOR_DB_TYPE == VectorDBType.PINECONE:
+    AWS_DEFAULT_REGION = get_env_variable("AWS_DEFAULT_REGION", "us-east-1")
+    vector_store = get_vector_store(
+        # Pinecone has no connection string; the serverless AWS region is
+        # passed through this parameter (see store_factory.get_vector_store).
+        connection_string=AWS_DEFAULT_REGION,
+        embeddings=embeddings,
+        collection_name=COLLECTION_NAME,
+        mode="pinecone",
+        api_key=PINECONE_API_KEY,
+    )
 else:
     raise ValueError(f"Unsupported vector store type: {VECTOR_DB_TYPE}")
 
diff --git a/requirements.lite.txt b/requirements.lite.txt
index 3f554bbb..ed28f385 100644
--- a/requirements.lite.txt
+++ b/requirements.lite.txt
@@ -24,6 +24,7 @@ rapidocr-onnxruntime==1.3.24
 opencv-python-headless==4.9.0.80
 pymongo==4.6.3
 langchain-mongodb==0.2.0
+langchain-pinecone==0.2.0
 cryptography==42.0.7
 python-magic==0.4.27
python-pptx==0.6.23
diff --git a/requirements.txt b/requirements.txt
index 28f18a0a..84c877c5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,6 +30,7 @@ langchain-mongodb==0.2.0
 langchain-ollama==0.2.0
 langchain-openai==0.2.0
 langchain-huggingface==0.1.0
+langchain-pinecone==0.2.0
 cryptography==42.0.7
 python-magic==0.4.27
 python-pptx==0.6.23
diff --git a/store.py b/store.py
index 92694ef9..890269d5 100644
--- a/store.py
+++ b/store.py
@@ -4,7 +4,7 @@
 from langchain_core.documents import Document
 from langchain_core.runnables.config import run_in_executor
 from sqlalchemy.orm import Session
-
+from langchain_pinecone import PineconeVectorStore
 from langchain_mongodb import MongoDBAtlasVectorSearch
 from langchain_core.embeddings import Embeddings
 from typing import (
@@ -150,3 +150,94 @@ def delete(self, ids: Optional[list[str]] = None) -> None:
         # implement the deletion of documents by file_id in self._collection
         if ids is not None:
             self._collection.delete_many({"file_id": {"$in": ids}})
+
+
+class ExtendedPCVector(PineconeVectorStore):
+    """Pinecone-backed vector store whose vector ids are namespaced per file
+    as "{file_id}_{chunk_index}", so all chunks of one file can be listed,
+    fetched, and deleted by id prefix."""
+
+    @property
+    def embedding_function(self) -> Embeddings:
+        # Expose the same accessor name the other stores in this file use.
+        return self.embeddings
+
+    def add_documents(self, docs: list[Document], ids: list[str]):
+        """Add documents, rewriting ids to "{file_id}_{chunk_index}"."""
+        if not docs:
+            # Nothing to embed; avoid IndexError on docs[0] below.
+            return []
+        # All chunks of one upload share the file_id of the first document.
+        file_id = docs[0].metadata["file_id"]
+        f_ids = [f"{file_id}_{idx}" for idx in range(len(ids))]
+        return super().add_documents(docs, ids=f_ids)
+
+    def get_ids_prefix(self, file_id: str) -> List[str]:
+        """Return every vector id belonging to *file_id* via its id prefix."""
+        prefix = file_id + "_"
+        ids: List[str] = []
+        for results in self._index.list(prefix=prefix, namespace=self._namespace):
+            ids.extend(results)
+        return ids
+
+    def get_all_ids(self) -> List[str]:
+        """Return the distinct file_ids stored in the index, recovered by
+        stripping the trailing "_{chunk_index}" from every vector id."""
+        base_ids = set()
+        for ids in self._index.list(namespace=self._namespace):
+            for vec_id in ids:
+                base_ids.add(vec_id.rsplit("_", 1)[0])
+        return list(base_ids)
+
+    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
+        """Fetch all chunk documents for the given file_ids."""
+        id_list: List[str] = []
+        for file_id in ids:
+            id_list.extend(self.get_ids_prefix(file_id))
+        if not id_list:
+            return []
+        # fetch() must be scoped to the same namespace used for writes,
+        # consistent with list/delete/query in this class.
+        results = self._index.fetch(id_list, namespace=self._namespace)
+        documents = []
+        for vector_id in results["vectors"]:
+            metadata = results["vectors"][vector_id]["metadata"]
+            doc_metadata = {
+                "file_id": metadata["file_id"],
+                "digest": metadata["digest"],
+                "source": metadata["source"],
+                "user_id": metadata["user_id"],
+            }
+            documents.append(
+                Document(page_content=metadata["text"], metadata=doc_metadata)
+            )
+        return documents
+
+    def delete(self, ids: Optional[List[str]] = None) -> None:
+        """Delete every vector belonging to each given file_id.
+
+        Deletes by explicit id list rather than Pinecone's prefix delete,
+        which could also remove vectors of other files whose ids overlap.
+        """
+        if ids is None:
+            # Match the None-tolerant contract of the other stores' delete().
+            return
+        for file_id in ids:
+            id_list = self.get_ids_prefix(file_id)
+            if id_list:
+                self._index.delete(ids=id_list, namespace=self._namespace)
+
+    def similarity_search_with_score_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Perform a similarity search with scores using an embedding vector."""
+        query_results = self._index.query(
+            vector=embedding,
+            top_k=k,
+            include_metadata=True,
+            filter=filter,
+            namespace=self._namespace,
+            **kwargs,
+        )
+        processed_documents = []
+        for match in query_results["matches"]:
+            metadata = match["metadata"]
+            # Strip internal "_id" fields if they leaked into nested metadata.
+            if "metadata" in metadata and "_id" in metadata["metadata"]:
+                del metadata["metadata"]["_id"]
+            text = metadata.pop("text")
+            processed_documents.append(
+                (Document(page_content=text, metadata=metadata), match["score"])
+            )
+        return processed_documents
diff --git a/store_factory.py b/store_factory.py
index 6549e294..371f3070 100644
--- a/store_factory.py
+++ b/store_factory.py
@@ -1,16 +1,19 @@
 from typing import Optional
 from langchain_core.embeddings import Embeddings
-from store import AsyncPgVector, ExtendedPgVector
+from store import AsyncPgVector, ExtendedPCVector, ExtendedPgVector
 from store import AtlasMongoVector
 from pymongo import MongoClient
-
+from pinecone import Pinecone
+from pinecone import ServerlessSpec
+import time
 
 
 def get_vector_store(
     connection_string: str,
     embeddings: Embeddings,
     collection_name: str,
     mode: str = "sync",
-    search_index: Optional[str] = None
+    search_index: Optional[str] = None,
+    api_key: Optional[str] = None,
 ):
     if mode == "sync":
         return ExtendedPgVector(
@@ -30,7 +33,26 @@ def get_vector_store(
         return AtlasMongoVector(
             collection=mong_collection, embedding=embeddings, index_name=search_index
         )
-
+    elif mode == "pinecone":
+        # For Pinecone there is no connection string: `connection_string`
+        # carries the AWS region for the serverless spec, and
+        # `collection_name` doubles as the Pinecone index name.
+        region = connection_string
+        index_name = collection_name
+        pc = Pinecone(api_key=api_key)
+        existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
+        if index_name not in existing_indexes:
+            pc.create_index(
+                name=index_name,
+                dimension=get_dimension_size(embeddings),
+                metric="cosine",
+                spec=ServerlessSpec(cloud="aws", region=region),
+            )
+        # Block until the index is ready to serve requests.
+        while not pc.describe_index(index_name).status["ready"]:
+            time.sleep(1)
+        host = pc.describe_index(index_name).host
+        index = pc.Index(host=host)
+        return ExtendedPCVector(index=index, embedding=embeddings)
     else:
         raise ValueError("Invalid mode specified. Choose 'sync' or 'async'.")
 
@@ -61,3 +83,8 @@ async def create_index_if_not_exists(conn, table_name: str, column_name: str):
         print(f"Index {index_name} created on {table_name}.{column_name}")
     else:
         print(f"Index {index_name} already exists on {table_name}.{column_name}")
+
+
+def get_dimension_size(embeddings: Embeddings) -> int:
+    """Probe the embedding model once to discover its output dimensionality."""
+    return len(embeddings.embed_query("Dimensions"))