Skip to content

Commit cd0a1b4

Browse files
committed
fix: fix for text file handling with docling
1 parent 04799f1 commit cd0a1b4

File tree

1 file changed

+15
-11
lines changed
  • backend/open_webui/retrieval/loaders

1 file changed

+15
-11
lines changed

backend/open_webui/retrieval/loaders/main.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -181,13 +181,16 @@ def load(
181181
for doc in docs
182182
]
183183

184+
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
    """Return True when the file should be loaded as plain text.

    A file counts as text when its extension is listed in
    ``known_source_ext`` or when its content type contains ``"text/"``.

    Args:
        file_ext: File extension to look up in ``known_source_ext``.
        file_content_type: Content type of the file. A falsy value
            (``None`` or an empty string) is treated as "not text".

    Returns:
        Always a real ``bool``, never the falsy content type itself.
    """
    if file_ext in known_source_ext:
        return True
    # Coerce explicitly: a bare `and` would hand back None or "" for a
    # missing content type, contradicting the annotated return type.
    return bool(file_content_type) and "text/" in file_content_type
188+
184189
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
185190
file_ext = filename.split(".")[-1].lower()
186191

187192
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
188-
if file_ext in known_source_ext or (
189-
file_content_type and file_content_type.find("text/") >= 0
190-
):
193+
if self._is_text_file(file_ext, file_content_type):
191194
loader = TextLoader(file_path, autodetect_encoding=True)
192195
else:
193196
loader = TikaLoader(
@@ -196,11 +199,14 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
196199
mime_type=file_content_type,
197200
)
198201
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
199-
loader = DoclingLoader(
200-
url=self.kwargs.get("DOCLING_SERVER_URL"),
201-
file_path=file_path,
202-
mime_type=file_content_type,
203-
)
202+
if self._is_text_file(file_ext, file_content_type):
203+
loader = TextLoader(file_path, autodetect_encoding=True)
204+
else:
205+
loader = DoclingLoader(
206+
url=self.kwargs.get("DOCLING_SERVER_URL"),
207+
file_path=file_path,
208+
mime_type=file_content_type,
209+
)
204210
elif (
205211
self.engine == "document_intelligence"
206212
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
@@ -257,9 +263,7 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
257263
loader = UnstructuredPowerPointLoader(file_path)
258264
elif file_ext == "msg":
259265
loader = OutlookMessageLoader(file_path)
260-
elif file_ext in known_source_ext or (
261-
file_content_type and file_content_type.find("text/") >= 0
262-
):
266+
elif self._is_text_file(file_ext, file_content_type):
263267
loader = TextLoader(file_path, autodetect_encoding=True)
264268
else:
265269
loader = TextLoader(file_path, autodetect_encoding=True)

0 commit comments

Comments
 (0)