🚀 refactor: Langchain v0.3 (#74)

FinnConnor · web-flow · commit edd8a0c097d3 · 2024-09-27T11:04:18.000-04:00
* ↪️refactor: moved packages out of langchain community and fixed embeddings abstraction

* ↪️refactor: update requirements.txt

* ✓chore: removed long list of requirements.txt

* fix: MongoDB query and embed/upload

* refactor: change env var name

* docs: Atlas MongoDB update env var setting

* docs: MongoDB Atlas direction

* docs: update atlas mongoDB directions

* docs: add deprecated env variable

* refactor: add backwards compatibility for the deprecated MongoDB env variable MONGO_VECTOR_COLLECTION
diff --git a/README.md b/README.md
@@ -77,6 +77,8 @@ The following environment variables are required to run the application:
     - Note: `AZURE_OPENAI_ENDPOINT` will work but `RAG_AZURE_OPENAI_ENDPOINT` will override it in order to not conflict with LibreChat setting.
 - `HF_TOKEN`: (Optional) if needed for `huggingface` option.
 - `OLLAMA_BASE_URL`: (Optional) defaults to `http://ollama:11434`.
+- `ATLAS_SEARCH_INDEX`: (Optional) the name of the vector search index if using Atlas MongoDB, defaults to `vector_index`
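+- `MONGO_VECTOR_COLLECTION`: (Deprecated) for Atlas MongoDB, use `COLLECTION_NAME` and `ATLAS_SEARCH_INDEX` instead. If it is still set, its value is used for both the collection name and the search index name.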
 
 Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables.
 
@@ -87,10 +89,11 @@ Instead of using the default pgvector, we could use [Atlas MongoDB](https://www.
 ```env
 VECTOR_DB_TYPE=atlas-mongo
 ATLAS_MONGO_DB_URI=<mongodb+srv://...>
-MONGO_VECTOR_COLLECTION=<collection name>
+COLLECTION_NAME=<vector collection>
+ATLAS_SEARCH_INDEX=<vector search index>
 ```
 
-The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$MONGO_VECTOR_COLLECTION` collection needs to be a completely new one, separate from all collections used by LibreChat. In additional,  create a vector search index for  `$MONGO_VECTOR_COLLECTION`  with the following json:
+The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. In addition, create a vector search index on that collection, using the name you set in `$ATLAS_SEARCH_INDEX`, with the following JSON:
 
 ```json
 {
diff --git a/config.py b/config.py
@@ -5,11 +5,8 @@
 from enum import Enum
 from datetime import datetime
 from dotenv import find_dotenv, load_dotenv
-from langchain_community.embeddings import (
-    HuggingFaceEmbeddings,
-    HuggingFaceHubEmbeddings,
-    OllamaEmbeddings,
-)
+from langchain_ollama import OllamaEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpointEmbeddings
 from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
 from starlette.middleware.base import BaseHTTPMiddleware
 from store_factory import get_vector_store
@@ -60,10 +57,10 @@ def get_env_variable(
 ATLAS_MONGO_DB_URI = get_env_variable(
     "ATLAS_MONGO_DB_URI", "mongodb://127.0.0.1:27018/LibreChat"
 )
+ATLAS_SEARCH_INDEX = get_env_variable("ATLAS_SEARCH_INDEX", "vector_index")
 MONGO_VECTOR_COLLECTION = get_env_variable(
-    "MONGO_VECTOR_COLLECTION", "vector_collection"
-)
-
+    "MONGO_VECTOR_COLLECTION", None
+)  # Deprecated; kept for backwards compatibility
 CHUNK_SIZE = int(get_env_variable("CHUNK_SIZE", "1500"))
 CHUNK_OVERLAP = int(get_env_variable("CHUNK_OVERLAP", "100"))
 
@@ -195,7 +192,7 @@ def init_embeddings(provider, model):
             model_name=model, encode_kwargs={"normalize_embeddings": True}
         )
     elif provider == EmbeddingsProvider.HUGGINGFACETEI:
-        return HuggingFaceHubEmbeddings(model=model)
+        return HuggingFaceEndpointEmbeddings(model=model)
     elif provider == EmbeddingsProvider.OLLAMA:
         return OllamaEmbeddings(model=model, base_url=OLLAMA_BASE_URL)
     else:
@@ -236,12 +233,17 @@ def init_embeddings(provider, model):
         mode="async",
     )
 elif VECTOR_DB_TYPE == VectorDBType.ATLAS_MONGO:
-    logger.warning("Using Atlas MongoDB as vector store is not fully supported yet.")
+    # Backward compatibility check
+    if MONGO_VECTOR_COLLECTION:
+        logger.info(f"DEPRECATED: Please remove env var MONGO_VECTOR_COLLECTION and instead use COLLECTION_NAME and ATLAS_SEARCH_INDEX. You can set both as same, but not neccessary. See README for more information.")
+        ATLAS_SEARCH_INDEX = MONGO_VECTOR_COLLECTION
+        COLLECTION_NAME = MONGO_VECTOR_COLLECTION
     vector_store = get_vector_store(
         connection_string=ATLAS_MONGO_DB_URI,
         embeddings=embeddings,
-        collection_name=MONGO_VECTOR_COLLECTION,
+        collection_name=COLLECTION_NAME,
         mode="atlas-mongo",
+        search_index=ATLAS_SEARCH_INDEX,
     )
 else:
     raise ValueError(f"Unsupported vector store type: {VECTOR_DB_TYPE}")
diff --git a/requirements.txt b/requirements.txt
@@ -1,15 +1,15 @@
-langchain==0.1.12
-langchain_community==0.0.34
-langchain_openai==0.0.8
-langchain_core==0.1.45
+langchain==0.3
+langchain_community==0.3
+langchain_openai==0.2.0
+langchain_core==0.3.5
 sqlalchemy==2.0.28
 python-dotenv==1.0.1
 fastapi==0.110.0
 psycopg2-binary==2.9.9
 pgvector==0.2.5
 uvicorn==0.28.0
 pypdf==4.1.0
-unstructured==0.12.6
+unstructured==0.15.13
 markdown==3.6
 networkx==3.2.1
 pandas==2.2.1
@@ -19,12 +19,15 @@ pypandoc==1.13
 PyJWT==2.8.0
 asyncpg==0.29.0
 python-multipart==0.0.9
-sentence_transformers==2.5.1
+sentence_transformers==3.1.1
 aiofiles==23.2.1
-rapidocr-onnxruntime==1.3.17
+rapidocr-onnxruntime==1.3.24
 opencv-python-headless==4.9.0.80
 pymongo==4.6.3
-langchain-mongodb==0.1.3
+langchain-mongodb==0.2.0
+langchain-ollama==0.2.0
+langchain-openai==0.2.0
+langchain-huggingface==0.1.0
 cryptography==42.0.7
 python-magic==0.4.27
 python-pptx==0.6.23
diff --git a/store.py b/store.py
@@ -80,6 +80,14 @@ class AtlasMongoVector(MongoDBAtlasVectorSearch):
     @property
     def embedding_function(self) -> Embeddings:
         return self.embeddings
+    
+    def add_documents(self, docs: list[Document], ids: list[str]):
+        #{file_id}_{idx}
+        new_ids = [id for id in range(len(ids))]
+        file_id = docs[0].metadata['file_id']
+        f_ids = [f'{file_id}_{id}' for id in new_ids]
+        return super().add_documents(docs, f_ids)
+
 
     def similarity_search_with_score_by_vector(
         self,
diff --git a/store_factory.py b/store_factory.py
@@ -1,14 +1,16 @@
-from langchain_community.embeddings import OpenAIEmbeddings
-
+from typing import Optional
+from langchain_core.embeddings import Embeddings
 from store import AsyncPgVector, ExtendedPgVector
 from store import AtlasMongoVector
 from pymongo import MongoClient
 
+
 def get_vector_store(
     connection_string: str,
-    embeddings: OpenAIEmbeddings,
+    embeddings: Embeddings,
     collection_name: str,
     mode: str = "sync",
+    search_index: Optional[str] = None 
 ):
     if mode == "sync":
         return ExtendedPgVector(
@@ -25,7 +27,9 @@ def get_vector_store(
     elif mode == "atlas-mongo":
         mongo_db = MongoClient(connection_string).get_database()
         mong_collection = mongo_db[collection_name]
-        return AtlasMongoVector(collection=mong_collection, embedding=embeddings, index_name=collection_name)
+        return AtlasMongoVector(
+            collection=mong_collection, embedding=embeddings, index_name=search_index
+        )
 
     else:
         raise ValueError("Invalid mode specified. Choose 'sync' or 'async'.")
@@ -35,20 +39,25 @@ async def create_index_if_not_exists(conn, table_name: str, column_name: str):
     # Construct index name conventionally
     index_name = f"idx_{table_name}_{column_name}"
     # Check if index exists
-    exists = await conn.fetchval(f"""
+    exists = await conn.fetchval(
+        f"""
         SELECT EXISTS (
             SELECT FROM pg_class c
             JOIN pg_namespace n ON n.oid = c.relnamespace
             WHERE c.relname = $1
             AND n.nspname = 'public'  -- Or specify your schema if different
         );
-    """, index_name)
+    """,
+        index_name,
+    )
     # Create the index if it does not exist
     if not exists:
-        await conn.execute(f"""
+        await conn.execute(
+            f"""
             CREATE INDEX CONCURRENTLY IF NOT EXISTS {index_name}
             ON public.{table_name} ({column_name});
-        """)
+        """
+        )
         print(f"Index {index_name} created on {table_name}.{column_name}")
     else:
-        print(f"Index {index_name} already exists on {table_name}.{column_name}")
+        print(f"Index {index_name} already exists on {table_name}.{column_name}")