Skip to content

Commit edd8a0c

Browse files
authored
🚀 refactor: Langchain v0.3 (#74)
* ↪️refactor: moved packages out of langchain community and fixed embeddings abstraction * ↪️refactor: update requirements.txt * ✓chore: removed long list of requirements.txt * fix: mongo db query and embed/upload * refactor: change env var name * docs: Atlas MongoDB update env var setting * docs: MongoDB Atlas direction * docs: update atlas mongoDB directions * docs: add deprecated env variable * refactor: add backwards compatibility for mongo db env variable MONGO_VECTOR_COLLECTION
1 parent 9c65628 commit edd8a0c

File tree

5 files changed

+55
-30
lines changed

5 files changed

+55
-30
lines changed

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ The following environment variables are required to run the application:
7777
- Note: `AZURE_OPENAI_ENDPOINT` will work but `RAG_AZURE_OPENAI_ENDPOINT` will override it in order to not conflict with LibreChat setting.
7878
- `HF_TOKEN`: (Optional) if needed for `huggingface` option.
7979
- `OLLAMA_BASE_URL`: (Optional) defaults to `http://ollama:11434`.
80+
- `ATLAS_SEARCH_INDEX`: (Optional) the name of the vector search index if using Atlas MongoDB, defaults to `vector_index`
81+
- `MONGO_VECTOR_COLLECTION`: Deprecated for MongoDB; please use `ATLAS_SEARCH_INDEX` and `COLLECTION_NAME` instead.
8082

8183
Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables.
8284

@@ -87,10 +89,11 @@ Instead of using the default pgvector, we could use [Atlas MongoDB](https://www.
8789
```env
8890
VECTOR_DB_TYPE=atlas-mongo
8991
ATLAS_MONGO_DB_URI=<mongodb+srv://...>
90-
MONGO_VECTOR_COLLECTION=<collection name>
92+
COLLECTION_NAME=<vector collection>
93+
ATLAS_SEARCH_INDEX=<vector search index>
9194
```
9295

93-
The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$MONGO_VECTOR_COLLECTION` collection needs to be a completely new one, separate from all collections used by LibreChat. In additional, create a vector search index for `$MONGO_VECTOR_COLLECTION` with the following json:
96+
The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. In addition, create a vector search index for that collection (and set `$ATLAS_SEARCH_INDEX` to the name of that index) with the following JSON:
9497

9598
```json
9699
{

config.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55
from enum import Enum
66
from datetime import datetime
77
from dotenv import find_dotenv, load_dotenv
8-
from langchain_community.embeddings import (
9-
HuggingFaceEmbeddings,
10-
HuggingFaceHubEmbeddings,
11-
OllamaEmbeddings,
12-
)
8+
from langchain_ollama import OllamaEmbeddings
9+
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpointEmbeddings
1310
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
1411
from starlette.middleware.base import BaseHTTPMiddleware
1512
from store_factory import get_vector_store
@@ -60,10 +57,10 @@ def get_env_variable(
6057
ATLAS_MONGO_DB_URI = get_env_variable(
6158
"ATLAS_MONGO_DB_URI", "mongodb://127.0.0.1:27018/LibreChat"
6259
)
60+
ATLAS_SEARCH_INDEX = get_env_variable("ATLAS_SEARCH_INDEX", "vector_index")
6361
MONGO_VECTOR_COLLECTION = get_env_variable(
64-
"MONGO_VECTOR_COLLECTION", "vector_collection"
65-
)
66-
62+
"MONGO_VECTOR_COLLECTION", None
63+
) # Deprecated, backwards compatibility
6764
CHUNK_SIZE = int(get_env_variable("CHUNK_SIZE", "1500"))
6865
CHUNK_OVERLAP = int(get_env_variable("CHUNK_OVERLAP", "100"))
6966

@@ -195,7 +192,7 @@ def init_embeddings(provider, model):
195192
model_name=model, encode_kwargs={"normalize_embeddings": True}
196193
)
197194
elif provider == EmbeddingsProvider.HUGGINGFACETEI:
198-
return HuggingFaceHubEmbeddings(model=model)
195+
return HuggingFaceEndpointEmbeddings(model=model)
199196
elif provider == EmbeddingsProvider.OLLAMA:
200197
return OllamaEmbeddings(model=model, base_url=OLLAMA_BASE_URL)
201198
else:
@@ -236,12 +233,17 @@ def init_embeddings(provider, model):
236233
mode="async",
237234
)
238235
elif VECTOR_DB_TYPE == VectorDBType.ATLAS_MONGO:
239-
logger.warning("Using Atlas MongoDB as vector store is not fully supported yet.")
236+
# Backward compatibility check
237+
if MONGO_VECTOR_COLLECTION:
238+
logger.info(f"DEPRECATED: Please remove env var MONGO_VECTOR_COLLECTION and instead use COLLECTION_NAME and ATLAS_SEARCH_INDEX. You can set both as same, but not neccessary. See README for more information.")
239+
ATLAS_SEARCH_INDEX = MONGO_VECTOR_COLLECTION
240+
COLLECTION_NAME = MONGO_VECTOR_COLLECTION
240241
vector_store = get_vector_store(
241242
connection_string=ATLAS_MONGO_DB_URI,
242243
embeddings=embeddings,
243-
collection_name=MONGO_VECTOR_COLLECTION,
244+
collection_name=COLLECTION_NAME,
244245
mode="atlas-mongo",
246+
search_index=ATLAS_SEARCH_INDEX,
245247
)
246248
else:
247249
raise ValueError(f"Unsupported vector store type: {VECTOR_DB_TYPE}")

requirements.txt

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
langchain==0.1.12
2-
langchain_community==0.0.34
3-
langchain_openai==0.0.8
4-
langchain_core==0.1.45
1+
langchain==0.3
2+
langchain_community==0.3
3+
langchain_openai==0.2.0
4+
langchain_core==0.3.5
55
sqlalchemy==2.0.28
66
python-dotenv==1.0.1
77
fastapi==0.110.0
88
psycopg2-binary==2.9.9
99
pgvector==0.2.5
1010
uvicorn==0.28.0
1111
pypdf==4.1.0
12-
unstructured==0.12.6
12+
unstructured==0.15.13
1313
markdown==3.6
1414
networkx==3.2.1
1515
pandas==2.2.1
@@ -19,12 +19,15 @@ pypandoc==1.13
1919
PyJWT==2.8.0
2020
asyncpg==0.29.0
2121
python-multipart==0.0.9
22-
sentence_transformers==2.5.1
22+
sentence_transformers==3.1.1
2323
aiofiles==23.2.1
24-
rapidocr-onnxruntime==1.3.17
24+
rapidocr-onnxruntime==1.3.24
2525
opencv-python-headless==4.9.0.80
2626
pymongo==4.6.3
27-
langchain-mongodb==0.1.3
27+
langchain-mongodb==0.2.0
28+
langchain-ollama==0.2.0
29+
langchain-openai==0.2.0
30+
langchain-huggingface==0.1.0
2831
cryptography==42.0.7
2932
python-magic==0.4.27
3033
python-pptx==0.6.23

store.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,14 @@ class AtlasMongoVector(MongoDBAtlasVectorSearch):
8080
@property
8181
def embedding_function(self) -> Embeddings:
8282
return self.embeddings
83+
84+
def add_documents(self, docs: list[Document], ids: list[str]):
    """Store ``docs`` under deterministic ``{file_id}_{idx}`` identifiers.

    The values in ``ids`` are not used; only ``len(ids)`` matters, because it
    decides how many identifiers are generated. Each generated identifier is
    the ``file_id`` metadata of the first document followed by a zero-based
    index.

    Args:
        docs: Documents to insert. The first one must carry a ``file_id``
            entry in its ``metadata``.
            NOTE(review): presumably every document in one call belongs to
            the same file -- confirm against the caller.
        ids: Caller-supplied identifiers; only the count is consulted.

    Returns:
        Whatever the parent ``add_documents`` returns for the generated
        identifiers, or an empty list when ``docs`` is empty.
    """
    # Nothing to insert: avoid the IndexError that ``docs[0]`` would raise.
    if not docs:
        return []

    file_id = docs[0].metadata["file_id"]
    # Identifier scheme: {file_id}_{idx}
    chunk_ids = [f"{file_id}_{idx}" for idx in range(len(ids))]
    return super().add_documents(docs, chunk_ids)
90+
8391

8492
def similarity_search_with_score_by_vector(
8593
self,

store_factory.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1-
from langchain_community.embeddings import OpenAIEmbeddings
2-
1+
from typing import Optional
2+
from langchain_core.embeddings import Embeddings
33
from store import AsyncPgVector, ExtendedPgVector
44
from store import AtlasMongoVector
55
from pymongo import MongoClient
66

7+
78
def get_vector_store(
89
connection_string: str,
9-
embeddings: OpenAIEmbeddings,
10+
embeddings: Embeddings,
1011
collection_name: str,
1112
mode: str = "sync",
13+
search_index: Optional[str] = None
1214
):
1315
if mode == "sync":
1416
return ExtendedPgVector(
@@ -25,7 +27,9 @@ def get_vector_store(
2527
elif mode == "atlas-mongo":
2628
mongo_db = MongoClient(connection_string).get_database()
2729
mong_collection = mongo_db[collection_name]
28-
return AtlasMongoVector(collection=mong_collection, embedding=embeddings, index_name=collection_name)
30+
return AtlasMongoVector(
31+
collection=mong_collection, embedding=embeddings, index_name=search_index
32+
)
2933

3034
else:
3135
raise ValueError("Invalid mode specified. Choose 'sync' or 'async'.")
@@ -35,20 +39,25 @@ async def create_index_if_not_exists(conn, table_name: str, column_name: str):
3539
# Construct index name conventionally
3640
index_name = f"idx_{table_name}_{column_name}"
3741
# Check if index exists
38-
exists = await conn.fetchval(f"""
42+
exists = await conn.fetchval(
43+
f"""
3944
SELECT EXISTS (
4045
SELECT FROM pg_class c
4146
JOIN pg_namespace n ON n.oid = c.relnamespace
4247
WHERE c.relname = $1
4348
AND n.nspname = 'public' -- Or specify your schema if different
4449
);
45-
""", index_name)
50+
""",
51+
index_name,
52+
)
4653
# Create the index if it does not exist
4754
if not exists:
48-
await conn.execute(f"""
55+
await conn.execute(
56+
f"""
4957
CREATE INDEX CONCURRENTLY IF NOT EXISTS {index_name}
5058
ON public.{table_name} ({column_name});
51-
""")
59+
"""
60+
)
5261
print(f"Index {index_name} created on {table_name}.{column_name}")
5362
else:
54-
print(f"Index {index_name} already exists on {table_name}.{column_name}")
63+
print(f"Index {index_name} already exists on {table_name}.{column_name}")

0 commit comments

Comments
 (0)