Skip to content

Commit 9664143

Browse files
Priyanka-Microsoft, Roopan-Microsoft, Harmanpreet Kaur
authored
fix: Resolve chunking issue during deployment when enabling advanced image processing (#1633)
Co-authored-by: Roopan-Microsoft <[email protected]>
Co-authored-by: Harmanpreet Kaur <[email protected]>
1 parent dc763c2 commit 9664143

File tree

2 files changed

+31
-43
lines changed

2 files changed

+31
-43
lines changed

code/backend/batch/utilities/helpers/config/config_helper.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -309,10 +309,10 @@ def clear_config():
309309
@staticmethod
310310
def _append_advanced_image_processors():
311311
image_file_types = ["jpeg", "jpg", "png", "tiff", "bmp"]
312-
ConfigHelper._remove_processors_for_file_types(image_file_types)
312+
# ConfigHelper._remove_processors_for_file_types(image_file_types)
313313
ConfigHelper._default_config["document_processors"].extend(
314314
[
315-
{"document_type": file_type, "use_advanced_image_processing": True}
315+
{"document_type": file_type, "chunking" : ConfigHelper._default_config["document_processors"][0]["chunking"], "loading" : ConfigHelper._default_config["document_processors"][0]["loading"], "use_advanced_image_processing": True}
316316
for file_type in image_file_types
317317
]
318318
)

code/tests/utilities/helpers/test_config_helper.py

Lines changed: 29 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -178,63 +178,51 @@ def test_default_config_is_cached():
178178
assert default_config_one is default_config_two
179179

180180

181+
@patch("backend.batch.utilities.helpers.config.config_helper.EnvHelper")
181182
def test_default_config_when_use_advanced_image_processing(env_helper_mock):
182183
# given
184+
ConfigHelper._default_config = None
183185
env_helper_mock.return_value.USE_ADVANCED_IMAGE_PROCESSING = True
184186

185187
# when
186188
config = ConfigHelper.get_default_config()
187189

188190
# then
189191
expected_chunking = {"strategy": "layout", "size": 500, "overlap": 100}
190-
assert config["document_processors"] == [
191-
{
192-
"document_type": "pdf",
193-
"chunking": expected_chunking,
194-
"loading": {"strategy": "layout"},
195-
},
196-
{
197-
"document_type": "txt",
198-
"chunking": expected_chunking,
199-
"loading": {"strategy": "web"},
200-
},
201-
{
202-
"document_type": "url",
203-
"chunking": expected_chunking,
204-
"loading": {"strategy": "web"},
205-
},
206-
{
207-
"document_type": "md",
208-
"chunking": expected_chunking,
209-
"loading": {"strategy": "web"},
210-
},
211-
{
212-
"document_type": "html",
213-
"chunking": expected_chunking,
214-
"loading": {"strategy": "web"},
215-
},
216-
{
217-
"document_type": "htm",
218-
"chunking": expected_chunking,
219-
"loading": {"strategy": "web"},
220-
},
221-
{
222-
"document_type": "docx",
223-
"chunking": expected_chunking,
224-
"loading": {"strategy": "docx"},
225-
},
192+
expected_loading = {"strategy": "layout"}
193+
expected_image_processor = {
194+
"chunking": expected_chunking,
195+
"loading": expected_loading,
196+
"use_advanced_image_processing": True,
197+
}
198+
199+
actual_processors = config["document_processors"]
200+
201+
expected_processors = [
202+
{"document_type": "pdf", "chunking": expected_chunking, "loading": expected_loading},
203+
{"document_type": "txt", "chunking": expected_chunking, "loading": {"strategy": "web"}},
204+
{"document_type": "url", "chunking": expected_chunking, "loading": {"strategy": "web"}},
205+
{"document_type": "md", "chunking": expected_chunking, "loading": {"strategy": "web"}},
206+
{"document_type": "html", "chunking": expected_chunking, "loading": {"strategy": "web"}},
207+
{"document_type": "htm", "chunking": expected_chunking, "loading": {"strategy": "web"}},
208+
{"document_type": "docx", "chunking": expected_chunking, "loading": {"strategy": "docx"}},
226209
{
227210
"document_type": "json",
228211
"chunking": {"strategy": "json", "size": 500, "overlap": 100},
229212
"loading": {"strategy": "web"},
230213
},
231-
{"document_type": "jpeg", "use_advanced_image_processing": True},
232-
{"document_type": "jpg", "use_advanced_image_processing": True},
233-
{"document_type": "png", "use_advanced_image_processing": True},
234-
{"document_type": "tiff", "use_advanced_image_processing": True},
235-
{"document_type": "bmp", "use_advanced_image_processing": True},
214+
{"document_type": "jpg", "chunking": expected_chunking, "loading": expected_loading},
215+
{"document_type": "jpeg", "chunking": expected_chunking, "loading": expected_loading},
216+
{"document_type": "png", "chunking": expected_chunking, "loading": expected_loading},
217+
{"document_type": "jpeg", **expected_image_processor},
218+
{"document_type": "jpg", **expected_image_processor},
219+
{"document_type": "png", **expected_image_processor},
220+
{"document_type": "tiff", **expected_image_processor},
221+
{"document_type": "bmp", **expected_image_processor},
236222
]
237223

224+
assert actual_processors == expected_processors
225+
238226

239227
def test_get_config_from_azure(
240228
AzureBlobStorageClientMock: MagicMock,

0 commit comments

Comments
 (0)