@@ -184,13 +184,16 @@ def load(
184184 for doc in docs
185185 ]
186186
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
    """Return True when the file should be loaded as plain text.

    A file counts as text when its extension is listed in
    ``known_source_ext`` or when its content type contains ``"text/"``.

    Args:
        file_ext: File extension to look up in ``known_source_ext``.
        file_content_type: Content type of the file; an empty or ``None``
            value is tolerated and treated as "not text".

    Returns:
        ``True`` or ``False`` -- always a real ``bool``.
    """
    if file_ext in known_source_ext:
        return True
    # bool() keeps a missing content type from leaking None / "" to the
    # caller, which the ``-> bool`` annotation promises will not happen.
    return bool(file_content_type) and "text/" in file_content_type
191+
187192 def _get_loader (self , filename : str , file_content_type : str , file_path : str ):
188193 file_ext = filename .split ("." )[- 1 ].lower ()
189194
190195 if self .engine == "tika" and self .kwargs .get ("TIKA_SERVER_URL" ):
191- if file_ext in known_source_ext or (
192- file_content_type and file_content_type .find ("text/" ) >= 0
193- ):
196+ if self ._is_text_file (file_ext , file_content_type ):
194197 loader = TextLoader (file_path , autodetect_encoding = True )
195198 else :
196199 loader = TikaLoader (
@@ -199,11 +202,14 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
199202 mime_type = file_content_type ,
200203 )
201204 elif self .engine == "docling" and self .kwargs .get ("DOCLING_SERVER_URL" ):
202- loader = DoclingLoader (
203- url = self .kwargs .get ("DOCLING_SERVER_URL" ),
204- file_path = file_path ,
205- mime_type = file_content_type ,
206- )
205+ if self ._is_text_file (file_ext , file_content_type ):
206+ loader = TextLoader (file_path , autodetect_encoding = True )
207+ else :
208+ loader = DoclingLoader (
209+ url = self .kwargs .get ("DOCLING_SERVER_URL" ),
210+ file_path = file_path ,
211+ mime_type = file_content_type ,
212+ )
207213 elif (
208214 self .engine == "document_intelligence"
209215 and self .kwargs .get ("DOCUMENT_INTELLIGENCE_ENDPOINT" ) != ""
@@ -269,9 +275,7 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
269275 loader = UnstructuredPowerPointLoader (file_path )
270276 elif file_ext == "msg" :
271277 loader = OutlookMessageLoader (file_path )
272- elif file_ext in known_source_ext or (
273- file_content_type and file_content_type .find ("text/" ) >= 0
274- ):
278+ elif self ._is_text_file (file_ext , file_content_type ):
275279 loader = TextLoader (file_path , autodetect_encoding = True )
276280 else :
277281 loader = TextLoader (file_path , autodetect_encoding = True )
0 commit comments