@@ -156,10 +156,9 @@ def setup_file_processors(
156
156
local_html_parser : bool = False ,
157
157
search_images : bool = False ,
158
158
):
159
- html_parser : Parser
160
- pdf_parser : Parser
161
- doc_int_parser : DocumentAnalysisParser = None
159
+ sentence_text_splitter = SentenceTextSplitter (has_image_embeddings = search_images )
162
160
161
+ doc_int_parser : Optional [DocumentAnalysisParser ] = None
163
162
# check whether an Azure Document Intelligence service is configured
164
163
if document_intelligence_service is not None :
165
164
documentintelligence_creds : Union [AsyncTokenCredential , AzureKeyCredential ] = (
@@ -169,25 +168,34 @@ def setup_file_processors(
169
168
endpoint = f"https://{ document_intelligence_service } .cognitiveservices.azure.com/" ,
170
169
credential = documentintelligence_creds ,
171
170
)
171
+
172
+ pdf_parser : Optional [Parser ] = None
172
173
if local_pdf_parser or document_intelligence_service is None :
173
174
pdf_parser = LocalPdfParser ()
174
- else :
175
+ elif document_intelligence_service is not None :
175
176
pdf_parser = doc_int_parser
177
+ else :
178
+ logger .warning ("No PDF parser available" )
179
+
180
+ html_parser : Optional [Parser ] = None
176
181
if local_html_parser or document_intelligence_service is None :
177
182
html_parser = LocalHTMLParser ()
178
- else :
183
+ elif document_intelligence_service is not None :
179
184
html_parser = doc_int_parser
180
- sentence_text_splitter = SentenceTextSplitter (has_image_embeddings = search_images )
185
+ else :
186
+ logger .warning ("No HTML parser available" )
181
187
182
- # These file formats can always be parsed, thanks to local packages
188
+ # These file formats can always be parsed:
183
189
file_processors = {
184
- ".pdf" : FileProcessor (pdf_parser , sentence_text_splitter ),
185
- ".html" : FileProcessor (html_parser , sentence_text_splitter ),
186
190
".json" : FileProcessor (JsonParser (), SimpleTextSplitter ()),
187
191
".md" : FileProcessor (TextParser (), sentence_text_splitter ),
188
192
".txt" : FileProcessor (TextParser (), sentence_text_splitter ),
189
193
}
190
-
194
+ # These require either a Python package or Document Intelligence
195
+ if pdf_parser is not None :
196
+ file_processors .update ({".pdf" : FileProcessor (pdf_parser , sentence_text_splitter )})
197
+ if html_parser is not None :
198
+ file_processors .update ({".html" : FileProcessor (html_parser , sentence_text_splitter )})
191
199
# These file formats require Document Intelligence
192
200
if doc_int_parser is not None :
193
201
file_processors .update (
0 commit comments