Skip to content

Commit 4fef884

Browse files
committed
Adding more tests for prepdocs
1 parent 0ab84da commit 4fef884

File tree

2 files changed

+151
-11
lines changed

2 files changed

+151
-11
lines changed

app/backend/prepdocs.py

Lines changed: 21 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -158,7 +158,7 @@ def setup_file_processors(
158158
):
159159
html_parser: Parser
160160
pdf_parser: Parser
161-
doc_int_parser: DocumentAnalysisParser
161+
doc_int_parser: DocumentAnalysisParser = None
162162

163163
# check if Azure Document Intelligence credentials are provided
164164
if document_intelligence_service is not None:
@@ -178,23 +178,33 @@ def setup_file_processors(
178178
else:
179179
html_parser = doc_int_parser
180180
sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
181-
return {
181+
182+
# These file formats can always be parsed, thanks to local packages
183+
file_processors = {
182184
".pdf": FileProcessor(pdf_parser, sentence_text_splitter),
183185
".html": FileProcessor(html_parser, sentence_text_splitter),
184186
".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
185-
".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
186-
".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
187-
".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
188-
".png": FileProcessor(doc_int_parser, sentence_text_splitter),
189-
".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
190-
".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),
191-
".tiff": FileProcessor(doc_int_parser, sentence_text_splitter),
192-
".bmp": FileProcessor(doc_int_parser, sentence_text_splitter),
193-
".heic": FileProcessor(doc_int_parser, sentence_text_splitter),
194187
".md": FileProcessor(TextParser(), sentence_text_splitter),
195188
".txt": FileProcessor(TextParser(), sentence_text_splitter),
196189
}
197190

191+
# These file formats require Document Intelligence
192+
if doc_int_parser is not None:
193+
file_processors.update(
194+
{
195+
".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
196+
".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
197+
".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
198+
".png": FileProcessor(doc_int_parser, sentence_text_splitter),
199+
".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
200+
".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),
201+
".tiff": FileProcessor(doc_int_parser, sentence_text_splitter),
202+
".bmp": FileProcessor(doc_int_parser, sentence_text_splitter),
203+
".heic": FileProcessor(doc_int_parser, sentence_text_splitter),
204+
}
205+
)
206+
return file_processors
207+
198208

199209
def setup_image_embeddings_service(
200210
azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], search_images: bool

tests/test_app_config.py

Lines changed: 130 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
from unittest import mock
33

44
import pytest
5+
import quart
56

67
import app
78

@@ -52,6 +53,65 @@ async def test_app_azure_custom_identity(monkeypatch, minimal_env):
5253
assert quart_app.config[app.CONFIG_OPENAI_CLIENT].base_url == "http://azureapi.com/api/v1/openai/"
5354

5455

56+
@pytest.mark.asyncio
57+
async def test_app_user_upload_processors(monkeypatch, minimal_env):
58+
monkeypatch.setenv("AZURE_USERSTORAGE_ACCOUNT", "test-user-storage-account")
59+
monkeypatch.setenv("AZURE_USERSTORAGE_CONTAINER", "test-user-storage-container")
60+
monkeypatch.setenv("USE_USER_UPLOAD", "true")
61+
62+
quart_app = app.create_app()
63+
async with quart_app.test_app():
64+
ingester = quart_app.config[app.CONFIG_INGESTER]
65+
assert ingester is not None
66+
assert len(ingester.file_processors.keys()) == 5
67+
68+
69+
@pytest.mark.asyncio
70+
async def test_app_user_upload_processors_docint(monkeypatch, minimal_env):
71+
monkeypatch.setenv("AZURE_USERSTORAGE_ACCOUNT", "test-user-storage-account")
72+
monkeypatch.setenv("AZURE_USERSTORAGE_CONTAINER", "test-user-storage-container")
73+
monkeypatch.setenv("USE_USER_UPLOAD", "true")
74+
monkeypatch.setenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE", "test-docint-service")
75+
76+
quart_app = app.create_app()
77+
async with quart_app.test_app():
78+
ingester = quart_app.config[app.CONFIG_INGESTER]
79+
assert ingester is not None
80+
assert len(ingester.file_processors.keys()) == 14
81+
82+
83+
@pytest.mark.asyncio
84+
async def test_app_user_upload_processors_docint_localpdf(monkeypatch, minimal_env):
85+
monkeypatch.setenv("AZURE_USERSTORAGE_ACCOUNT", "test-user-storage-account")
86+
monkeypatch.setenv("AZURE_USERSTORAGE_CONTAINER", "test-user-storage-container")
87+
monkeypatch.setenv("USE_USER_UPLOAD", "true")
88+
monkeypatch.setenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE", "test-docint-service")
89+
monkeypatch.setenv("USE_LOCAL_PDF_PARSER", "true")
90+
91+
quart_app = app.create_app()
92+
async with quart_app.test_app():
93+
ingester = quart_app.config[app.CONFIG_INGESTER]
94+
assert ingester is not None
95+
assert len(ingester.file_processors.keys()) == 14
96+
assert ingester.file_processors[".pdf"] is not ingester.file_processors[".pptx"]
97+
98+
99+
@pytest.mark.asyncio
100+
async def test_app_user_upload_processors_docint_localhtml(monkeypatch, minimal_env):
101+
monkeypatch.setenv("AZURE_USERSTORAGE_ACCOUNT", "test-user-storage-account")
102+
monkeypatch.setenv("AZURE_USERSTORAGE_CONTAINER", "test-user-storage-container")
103+
monkeypatch.setenv("USE_USER_UPLOAD", "true")
104+
monkeypatch.setenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE", "test-docint-service")
105+
monkeypatch.setenv("USE_LOCAL_HTML_PARSER", "true")
106+
107+
quart_app = app.create_app()
108+
async with quart_app.test_app():
109+
ingester = quart_app.config[app.CONFIG_INGESTER]
110+
assert ingester is not None
111+
assert len(ingester.file_processors.keys()) == 14
112+
assert ingester.file_processors[".html"] is not ingester.file_processors[".pptx"]
113+
114+
55115
@pytest.mark.asyncio
56116
async def test_app_config_default(monkeypatch, minimal_env):
57117
quart_app = app.create_app()
@@ -105,6 +165,7 @@ async def test_app_config_semanticranker_free(monkeypatch, minimal_env):
105165
assert result["showGPT4VOptions"] is False
106166
assert result["showSemanticRankerOption"] is True
107167
assert result["showVectorOption"] is True
168+
assert result["showUserUpload"] is False
108169

109170

110171
@pytest.mark.asyncio
@@ -119,6 +180,75 @@ async def test_app_config_semanticranker_disabled(monkeypatch, minimal_env):
119180
assert result["showGPT4VOptions"] is False
120181
assert result["showSemanticRankerOption"] is False
121182
assert result["showVectorOption"] is True
183+
assert result["showUserUpload"] is False
184+
185+
186+
@pytest.mark.asyncio
187+
async def test_app_config_user_upload(monkeypatch, minimal_env):
188+
monkeypatch.setenv("AZURE_USERSTORAGE_ACCOUNT", "test-user-storage-account")
189+
monkeypatch.setenv("AZURE_USERSTORAGE_CONTAINER", "test-user-storage-container")
190+
monkeypatch.setenv("USE_USER_UPLOAD", "true")
191+
quart_app = app.create_app()
192+
async with quart_app.test_app() as test_app:
193+
client = test_app.test_client()
194+
response = await client.get("/config")
195+
assert response.status_code == 200
196+
result = await response.get_json()
197+
assert result["showGPT4VOptions"] is False
198+
assert result["showSemanticRankerOption"] is True
199+
assert result["showVectorOption"] is True
200+
assert result["showUserUpload"] is True
201+
202+
203+
@pytest.mark.asyncio
204+
async def test_app_config_user_upload_novectors(monkeypatch, minimal_env):
205+
"""Check that this combo works correctly with prepdocs.py embedding service."""
206+
monkeypatch.setenv("AZURE_USERSTORAGE_ACCOUNT", "test-user-storage-account")
207+
monkeypatch.setenv("AZURE_USERSTORAGE_CONTAINER", "test-user-storage-container")
208+
monkeypatch.setenv("USE_USER_UPLOAD", "true")
209+
monkeypatch.setenv("USE_VECTORS", "false")
210+
quart_app = app.create_app()
211+
async with quart_app.test_app() as test_app:
212+
client = test_app.test_client()
213+
response = await client.get("/config")
214+
assert response.status_code == 200
215+
result = await response.get_json()
216+
assert result["showGPT4VOptions"] is False
217+
assert result["showSemanticRankerOption"] is True
218+
assert result["showVectorOption"] is False
219+
assert result["showUserUpload"] is True
220+
221+
222+
@pytest.mark.asyncio
223+
async def test_app_config_user_upload_bad_openai_config(monkeypatch, minimal_env):
224+
"""Check that app startup fails with a clear error when OPENAI_HOST is openai but no OpenAI key is set."""
225+
monkeypatch.setenv("AZURE_USERSTORAGE_ACCOUNT", "test-user-storage-account")
226+
monkeypatch.setenv("AZURE_USERSTORAGE_CONTAINER", "test-user-storage-container")
227+
monkeypatch.setenv("USE_USER_UPLOAD", "true")
228+
monkeypatch.setenv("OPENAI_HOST", "openai")
229+
quart_app = app.create_app()
230+
with pytest.raises(
231+
quart.testing.app.LifespanError, match="OpenAI key is required when using the non-Azure OpenAI API"
232+
):
233+
async with quart_app.test_app() as test_app:
234+
test_app.test_client()
235+
236+
237+
@pytest.mark.asyncio
238+
async def test_app_config_user_upload_openaicom(monkeypatch, minimal_env):
239+
"""Check that this combo works correctly with prepdocs.py embedding service."""
240+
monkeypatch.setenv("AZURE_USERSTORAGE_ACCOUNT", "test-user-storage-account")
241+
monkeypatch.setenv("AZURE_USERSTORAGE_CONTAINER", "test-user-storage-container")
242+
monkeypatch.setenv("USE_USER_UPLOAD", "true")
243+
monkeypatch.setenv("OPENAI_HOST", "openai")
244+
monkeypatch.setenv("OPENAI_API_KEY", "pretendkey")
245+
quart_app = app.create_app()
246+
async with quart_app.test_app() as test_app:
247+
client = test_app.test_client()
248+
response = await client.get("/config")
249+
assert response.status_code == 200
250+
result = await response.get_json()
251+
assert result["showUserUpload"] is True
122252

123253

124254
@pytest.mark.asyncio

0 commit comments

Comments
 (0)