@@ -181,13 +181,16 @@ def load(
181181 for doc in docs
182182 ]
183183
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
    """Report whether a file should be loaded as plain text.

    A file counts as text when its extension is listed in
    ``known_source_ext`` or when its content type contains ``"text/"``.

    Args:
        file_ext: File extension to look up in ``known_source_ext``.
        file_content_type: Content type reported for the file. A falsy
            value (``None`` or ``""``) is treated as "no text content type".

    Returns:
        ``True`` if either check matches, ``False`` otherwise.
    """
    if file_ext in known_source_ext:
        return True
    # Coerce to a real bool: a bare ``x and ...`` expression would hand the
    # falsy operand itself (``None`` or ``""``) back to the caller, which
    # contradicts the declared ``-> bool`` return type.
    return bool(file_content_type) and "text/" in file_content_type
188+
184189 def _get_loader (self , filename : str , file_content_type : str , file_path : str ):
185190 file_ext = filename .split ("." )[- 1 ].lower ()
186191
187192 if self .engine == "tika" and self .kwargs .get ("TIKA_SERVER_URL" ):
188- if file_ext in known_source_ext or (
189- file_content_type and file_content_type .find ("text/" ) >= 0
190- ):
193+ if self ._is_text_file (file_ext , file_content_type ):
191194 loader = TextLoader (file_path , autodetect_encoding = True )
192195 else :
193196 loader = TikaLoader (
@@ -196,11 +199,14 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
196199 mime_type = file_content_type ,
197200 )
198201 elif self .engine == "docling" and self .kwargs .get ("DOCLING_SERVER_URL" ):
199- loader = DoclingLoader (
200- url = self .kwargs .get ("DOCLING_SERVER_URL" ),
201- file_path = file_path ,
202- mime_type = file_content_type ,
203- )
202+ if self ._is_text_file (file_ext , file_content_type ):
203+ loader = TextLoader (file_path , autodetect_encoding = True )
204+ else :
205+ loader = DoclingLoader (
206+ url = self .kwargs .get ("DOCLING_SERVER_URL" ),
207+ file_path = file_path ,
208+ mime_type = file_content_type ,
209+ )
204210 elif (
205211 self .engine == "document_intelligence"
206212 and self .kwargs .get ("DOCUMENT_INTELLIGENCE_ENDPOINT" ) != ""
@@ -257,9 +263,7 @@ def _get_loader(self, filename: str, file_content_type: str, file_path: str):
257263 loader = UnstructuredPowerPointLoader (file_path )
258264 elif file_ext == "msg" :
259265 loader = OutlookMessageLoader (file_path )
260- elif file_ext in known_source_ext or (
261- file_content_type and file_content_type .find ("text/" ) >= 0
262- ):
266+ elif self ._is_text_file (file_ext , file_content_type ):
263267 loader = TextLoader (file_path , autodetect_encoding = True )
264268 else :
265269 loader = TextLoader (file_path , autodetect_encoding = True )
0 commit comments