Skip to content

Commit ef787e4

Browse files
authored
Merge pull request open-webui#12486 from FabioPolito24/text-file-handling-docling
fix: text file handling with docling
2 parents 48d690c + cd0a1b4 commit ef787e4

File tree

1 file changed

+15
-11
lines changed
  • backend/open_webui/retrieval/loaders

1 file changed

+15
-11
lines changed

backend/open_webui/retrieval/loaders/main.py

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -184,13 +184,16 @@ def load(
184184
for doc in docs
185185
]
186186

187+
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
    """Return True when the file should be loaded as plain text.

    A file counts as text when its extension is listed in
    ``known_source_ext`` or when its content type contains ``"text/"``.

    Args:
        file_ext: File extension to look up in ``known_source_ext``.
        file_content_type: Content type reported for the file. A falsy
            value (``None`` or ``""``) is accepted and treated as
            "not a text content type".

    Returns:
        ``True`` if either check matches, ``False`` otherwise. The result
        is always a real ``bool``.
    """
    if file_ext in known_source_ext:
        return True
    # Coerce explicitly: a bare ``x and ...`` would leak ``None`` or ``""``
    # to the caller when the content type is missing, contradicting the
    # declared ``bool`` return type.
    return bool(file_content_type) and "text/" in file_content_type
191+
187192
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
188193
file_ext = filename.split(".")[-1].lower()
189194

190195
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
191-
if file_ext in known_source_ext or (
192-
file_content_type and file_content_type.find("text/") >= 0
193-
):
196+
if self._is_text_file(file_ext, file_content_type):
194197
loader = TextLoader(file_path, autodetect_encoding=True)
195198
else:
196199
loader = TikaLoader(
@@ -199,11 +202,14 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
199202
mime_type=file_content_type,
200203
)
201204
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
202-
loader = DoclingLoader(
203-
url=self.kwargs.get("DOCLING_SERVER_URL"),
204-
file_path=file_path,
205-
mime_type=file_content_type,
206-
)
205+
if self._is_text_file(file_ext, file_content_type):
206+
loader = TextLoader(file_path, autodetect_encoding=True)
207+
else:
208+
loader = DoclingLoader(
209+
url=self.kwargs.get("DOCLING_SERVER_URL"),
210+
file_path=file_path,
211+
mime_type=file_content_type,
212+
)
207213
elif (
208214
self.engine == "document_intelligence"
209215
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
@@ -269,9 +275,7 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
269275
loader = UnstructuredPowerPointLoader(file_path)
270276
elif file_ext == "msg":
271277
loader = OutlookMessageLoader(file_path)
272-
elif file_ext in known_source_ext or (
273-
file_content_type and file_content_type.find("text/") >= 0
274-
):
278+
elif self._is_text_file(file_ext, file_content_type):
275279
loader = TextLoader(file_path, autodetect_encoding=True)
276280
else:
277281
loader = TextLoader(file_path, autodetect_encoding=True)

0 commit comments

Comments
 (0)