Skip to content

Commit 5701090

Browse files
committed
enh: bypass embedding and retrieval
1 parent 1c2e36f commit 5701090

File tree

10 files changed

+468
-352
lines changed

10 files changed

+468
-352
lines changed

backend/open_webui/config.py

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1502,13 +1502,16 @@ class BannerModel(BaseModel):
15021502
# Chroma
15031503
if VECTOR_DB == "chroma":
15041504
import chromadb
1505+
15051506
CHROMA_DATA_PATH = f"{DATA_DIR}/vector_db"
15061507
CHROMA_TENANT = os.environ.get("CHROMA_TENANT", chromadb.DEFAULT_TENANT)
15071508
CHROMA_DATABASE = os.environ.get("CHROMA_DATABASE", chromadb.DEFAULT_DATABASE)
15081509
CHROMA_HTTP_HOST = os.environ.get("CHROMA_HTTP_HOST", "")
15091510
CHROMA_HTTP_PORT = int(os.environ.get("CHROMA_HTTP_PORT", "8000"))
15101511
CHROMA_CLIENT_AUTH_PROVIDER = os.environ.get("CHROMA_CLIENT_AUTH_PROVIDER", "")
1511-
CHROMA_CLIENT_AUTH_CREDENTIALS = os.environ.get("CHROMA_CLIENT_AUTH_CREDENTIALS", "")
1512+
CHROMA_CLIENT_AUTH_CREDENTIALS = os.environ.get(
1513+
"CHROMA_CLIENT_AUTH_CREDENTIALS", ""
1514+
)
15121515
# Comma-separated list of header=value pairs
15131516
CHROMA_HTTP_HEADERS = os.environ.get("CHROMA_HTTP_HEADERS", "")
15141517
if CHROMA_HTTP_HEADERS:
@@ -1608,6 +1611,14 @@ class BannerModel(BaseModel):
16081611
os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""),
16091612
)
16101613

1614+
1615+
BYPASS_EMBEDDING_AND_RETRIEVAL = PersistentConfig(
1616+
"BYPASS_EMBEDDING_AND_RETRIEVAL",
1617+
"rag.bypass_embedding_and_retrieval",
1618+
os.environ.get("BYPASS_EMBEDDING_AND_RETRIEVAL", "False").lower() == "true",
1619+
)
1620+
1621+
16111622
RAG_TOP_K = PersistentConfig(
16121623
"RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3"))
16131624
)
@@ -1824,10 +1835,10 @@ class BannerModel(BaseModel):
18241835
os.getenv("RAG_WEB_SEARCH_ENGINE", ""),
18251836
)
18261837

1827-
RAG_WEB_SEARCH_FULL_CONTEXT = PersistentConfig(
1828-
"RAG_WEB_SEARCH_FULL_CONTEXT",
1829-
"rag.web.search.full_context",
1830-
os.getenv("RAG_WEB_SEARCH_FULL_CONTEXT", "False").lower() == "true",
1838+
BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = PersistentConfig(
1839+
"BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL",
1840+
"rag.web.search.bypass_embedding_and_retrieval",
1841+
os.getenv("BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL", "False").lower() == "true",
18311842
)
18321843

18331844
# You can provide a list of your own websites to filter after performing a web search.

backend/open_webui/main.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -162,6 +162,7 @@
162162
RAG_TEMPLATE,
163163
DEFAULT_RAG_TEMPLATE,
164164
RAG_FULL_CONTEXT,
165+
BYPASS_EMBEDDING_AND_RETRIEVAL,
165166
RAG_EMBEDDING_MODEL,
166167
RAG_EMBEDDING_MODEL_AUTO_UPDATE,
167168
RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
@@ -191,7 +192,7 @@
191192
YOUTUBE_LOADER_PROXY_URL,
192193
# Retrieval (Web Search)
193194
RAG_WEB_SEARCH_ENGINE,
194-
RAG_WEB_SEARCH_FULL_CONTEXT,
195+
BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
195196
RAG_WEB_SEARCH_RESULT_COUNT,
196197
RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
197198
RAG_WEB_SEARCH_TRUST_ENV,
@@ -531,6 +532,7 @@ async def lifespan(app: FastAPI):
531532

532533

533534
app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT
535+
app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = BYPASS_EMBEDDING_AND_RETRIEVAL
534536
app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
535537
app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
536538
ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
@@ -567,7 +569,9 @@ async def lifespan(app: FastAPI):
567569

568570
app.state.config.ENABLE_RAG_WEB_SEARCH = ENABLE_RAG_WEB_SEARCH
569571
app.state.config.RAG_WEB_SEARCH_ENGINE = RAG_WEB_SEARCH_ENGINE
570-
app.state.config.RAG_WEB_SEARCH_FULL_CONTEXT = RAG_WEB_SEARCH_FULL_CONTEXT
572+
app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = (
573+
BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
574+
)
571575
app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = RAG_WEB_SEARCH_DOMAIN_FILTER_LIST
572576

573577
app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = ENABLE_GOOGLE_DRIVE_INTEGRATION

backend/open_webui/retrieval/utils.py

Lines changed: 50 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
from open_webui.utils.misc import get_last_user_message, calculate_sha256_string
1818

1919
from open_webui.models.users import UserModel
20+
from open_webui.models.files import Files
2021

2122
from open_webui.env import (
2223
SRC_LOG_LEVELS,
@@ -342,6 +343,7 @@ def generate_multiple(query, user, func):
342343

343344

344345
def get_sources_from_files(
346+
request,
345347
files,
346348
queries,
347349
embedding_function,
@@ -359,19 +361,64 @@ def get_sources_from_files(
359361
relevant_contexts = []
360362

361363
for file in files:
364+
365+
context = None
362366
if file.get("docs"):
367+
# BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
363368
context = {
364369
"documents": [[doc.get("content") for doc in file.get("docs")]],
365370
"metadatas": [[doc.get("metadata") for doc in file.get("docs")]],
366371
}
367372
elif file.get("context") == "full":
373+
# Manual Full Mode Toggle
368374
context = {
369375
"documents": [[file.get("file").get("data", {}).get("content")]],
370376
"metadatas": [[{"file_id": file.get("id"), "name": file.get("name")}]],
371377
}
372-
else:
373-
context = None
378+
elif (
379+
file.get("type") != "web_search"
380+
and request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
381+
):
382+
# BYPASS_EMBEDDING_AND_RETRIEVAL
383+
if file.get("type") == "collection":
384+
file_ids = file.get("data", {}).get("file_ids", [])
385+
386+
documents = []
387+
metadatas = []
388+
for file_id in file_ids:
389+
file_object = Files.get_file_by_id(file_id)
390+
391+
if file_object:
392+
documents.append(file_object.data.get("content", ""))
393+
metadatas.append(
394+
{
395+
"file_id": file_id,
396+
"name": file_object.filename,
397+
"source": file_object.filename,
398+
}
399+
)
400+
401+
context = {
402+
"documents": [documents],
403+
"metadatas": [metadatas],
404+
}
374405

406+
elif file.get("id"):
407+
file_object = Files.get_file_by_id(file.get("id"))
408+
if file_object:
409+
context = {
410+
"documents": [[file_object.data.get("content", "")]],
411+
"metadatas": [
412+
[
413+
{
414+
"file_id": file.get("id"),
415+
"name": file_object.filename,
416+
"source": file_object.filename,
417+
}
418+
]
419+
],
420+
}
421+
else:
375422
collection_names = []
376423
if file.get("type") == "collection":
377424
if file.get("legacy"):
@@ -434,6 +481,7 @@ def get_sources_from_files(
434481
if context:
435482
if "data" in file:
436483
del file["data"]
484+
437485
relevant_contexts.append({**context, "file": file})
438486

439487
sources = []

backend/open_webui/retrieval/vector/dbs/chroma.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -107,8 +107,7 @@ def query(
107107
}
108108
)
109109
return None
110-
except Exception as e:
111-
log.exception(f"{e}")
110+
except:
112111
return None
113112

114113
def get(self, collection_name: str) -> Optional[GetResult]:

backend/open_webui/routers/retrieval.py

Lines changed: 53 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -352,6 +352,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
352352
"status": True,
353353
"pdf_extract_images": request.app.state.config.PDF_EXTRACT_IMAGES,
354354
"RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
355+
"BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL,
355356
"enable_google_drive_integration": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION,
356357
"enable_onedrive_integration": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION,
357358
"content_extraction": {
@@ -378,7 +379,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
378379
},
379380
"web": {
380381
"ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
381-
"RAG_WEB_SEARCH_FULL_CONTEXT": request.app.state.config.RAG_WEB_SEARCH_FULL_CONTEXT,
382+
"BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
382383
"search": {
383384
"enabled": request.app.state.config.ENABLE_RAG_WEB_SEARCH,
384385
"drive": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION,
@@ -473,11 +474,12 @@ class WebSearchConfig(BaseModel):
473474
class WebConfig(BaseModel):
474475
search: WebSearchConfig
475476
ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION: Optional[bool] = None
476-
RAG_WEB_SEARCH_FULL_CONTEXT: Optional[bool] = None
477+
BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
477478

478479

479480
class ConfigUpdateForm(BaseModel):
480481
RAG_FULL_CONTEXT: Optional[bool] = None
482+
BYPASS_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
481483
pdf_extract_images: Optional[bool] = None
482484
enable_google_drive_integration: Optional[bool] = None
483485
enable_onedrive_integration: Optional[bool] = None
@@ -504,6 +506,12 @@ async def update_rag_config(
504506
else request.app.state.config.RAG_FULL_CONTEXT
505507
)
506508

509+
request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = (
510+
form_data.BYPASS_EMBEDDING_AND_RETRIEVAL
511+
if form_data.BYPASS_EMBEDDING_AND_RETRIEVAL is not None
512+
else request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
513+
)
514+
507515
request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = (
508516
form_data.enable_google_drive_integration
509517
if form_data.enable_google_drive_integration is not None
@@ -557,8 +565,8 @@ async def update_rag_config(
557565
request.app.state.config.ENABLE_RAG_WEB_SEARCH = form_data.web.search.enabled
558566
request.app.state.config.RAG_WEB_SEARCH_ENGINE = form_data.web.search.engine
559567

560-
request.app.state.config.RAG_WEB_SEARCH_FULL_CONTEXT = (
561-
form_data.web.RAG_WEB_SEARCH_FULL_CONTEXT
568+
request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = (
569+
form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
562570
)
563571

564572
request.app.state.config.SEARXNG_QUERY_URL = (
@@ -626,6 +634,7 @@ async def update_rag_config(
626634
"status": True,
627635
"pdf_extract_images": request.app.state.config.PDF_EXTRACT_IMAGES,
628636
"RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
637+
"BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL,
629638
"file": {
630639
"max_size": request.app.state.config.FILE_MAX_SIZE,
631640
"max_count": request.app.state.config.FILE_MAX_COUNT,
@@ -650,7 +659,7 @@ async def update_rag_config(
650659
},
651660
"web": {
652661
"ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
653-
"RAG_WEB_SEARCH_FULL_CONTEXT": request.app.state.config.RAG_WEB_SEARCH_FULL_CONTEXT,
662+
"BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
654663
"search": {
655664
"enabled": request.app.state.config.ENABLE_RAG_WEB_SEARCH,
656665
"engine": request.app.state.config.RAG_WEB_SEARCH_ENGINE,
@@ -1019,36 +1028,45 @@ def process_file(
10191028
hash = calculate_sha256_string(text_content)
10201029
Files.update_file_hash_by_id(file.id, hash)
10211030

1022-
try:
1023-
result = save_docs_to_vector_db(
1024-
request,
1025-
docs=docs,
1026-
collection_name=collection_name,
1027-
metadata={
1028-
"file_id": file.id,
1029-
"name": file.filename,
1030-
"hash": hash,
1031-
},
1032-
add=(True if form_data.collection_name else False),
1033-
user=user,
1034-
)
1035-
1036-
if result:
1037-
Files.update_file_metadata_by_id(
1038-
file.id,
1039-
{
1040-
"collection_name": collection_name,
1031+
if not request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL:
1032+
try:
1033+
result = save_docs_to_vector_db(
1034+
request,
1035+
docs=docs,
1036+
collection_name=collection_name,
1037+
metadata={
1038+
"file_id": file.id,
1039+
"name": file.filename,
1040+
"hash": hash,
10411041
},
1042+
add=(True if form_data.collection_name else False),
1043+
user=user,
10421044
)
10431045

1044-
return {
1045-
"status": True,
1046-
"collection_name": collection_name,
1047-
"filename": file.filename,
1048-
"content": text_content,
1049-
}
1050-
except Exception as e:
1051-
raise e
1046+
if result:
1047+
Files.update_file_metadata_by_id(
1048+
file.id,
1049+
{
1050+
"collection_name": collection_name,
1051+
},
1052+
)
1053+
1054+
return {
1055+
"status": True,
1056+
"collection_name": collection_name,
1057+
"filename": file.filename,
1058+
"content": text_content,
1059+
}
1060+
except Exception as e:
1061+
raise e
1062+
else:
1063+
return {
1064+
"status": True,
1065+
"collection_name": None,
1066+
"filename": file.filename,
1067+
"content": text_content,
1068+
}
1069+
10521070
except Exception as e:
10531071
log.exception(e)
10541072
if "No pandoc was found" in str(e):
@@ -1408,17 +1426,18 @@ async def process_web_search(
14081426
)
14091427
docs = await loader.aload()
14101428

1411-
if request.app.state.config.RAG_WEB_SEARCH_FULL_CONTEXT:
1429+
if request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
14121430
return {
14131431
"status": True,
1432+
"collection_name": None,
1433+
"filenames": urls,
14141434
"docs": [
14151435
{
14161436
"content": doc.page_content,
14171437
"metadata": doc.metadata,
14181438
}
14191439
for doc in docs
14201440
],
1421-
"filenames": urls,
14221441
"loaded_count": len(docs),
14231442
}
14241443
else:

0 commit comments

Comments
 (0)