Skip to content

Commit b378727

Browse files
committed
Make prepdocs mypy happy
1 parent 1980845 commit b378727

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

app/backend/prepdocs.py

Lines changed: 18 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -156,10 +156,9 @@ def setup_file_processors(
156156
local_html_parser: bool = False,
157157
search_images: bool = False,
158158
):
159-
html_parser: Parser
160-
pdf_parser: Parser
161-
doc_int_parser: DocumentAnalysisParser = None
159+
sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
162160

161+
doc_int_parser: Optional[DocumentAnalysisParser] = None
163162
# check if Azure Document Intelligence credentials are provided
164163
if document_intelligence_service is not None:
165164
documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = (
@@ -169,25 +168,34 @@ def setup_file_processors(
169168
endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
170169
credential=documentintelligence_creds,
171170
)
171+
172+
pdf_parser: Optional[Parser] = None
172173
if local_pdf_parser or document_intelligence_service is None:
173174
pdf_parser = LocalPdfParser()
174-
else:
175+
elif document_intelligence_service is not None:
175176
pdf_parser = doc_int_parser
177+
else:
178+
logger.warning("No PDF parser available")
179+
180+
html_parser: Optional[Parser] = None
176181
if local_html_parser or document_intelligence_service is None:
177182
html_parser = LocalHTMLParser()
178-
else:
183+
elif document_intelligence_service is not None:
179184
html_parser = doc_int_parser
180-
sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
185+
else:
186+
logger.warning("No HTML parser available")
181187

182-
# These file formats can always be parsed, thanks to local packages
188+
# These file formats can always be parsed:
183189
file_processors = {
184-
".pdf": FileProcessor(pdf_parser, sentence_text_splitter),
185-
".html": FileProcessor(html_parser, sentence_text_splitter),
186190
".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
187191
".md": FileProcessor(TextParser(), sentence_text_splitter),
188192
".txt": FileProcessor(TextParser(), sentence_text_splitter),
189193
}
190-
194+
# These require either a Python package or Document Intelligence
195+
if pdf_parser is not None:
196+
file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)})
197+
if html_parser is not None:
198+
file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)})
191199
# These file formats require Document Intelligence
192200
if doc_int_parser is not None:
193201
file_processors.update(

0 commit comments

Comments
 (0)