Skip to content

Commit 37f3a6b

Browse files
committed
Update mappings
1 parent 967ccfb commit 37f3a6b

File tree

2 files changed

+144
-63
lines changed

2 files changed

+144
-63
lines changed

deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py

Lines changed: 79 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
196196

197197
return data_source_connection
198198

199-
def get_mark_up_cleaner_skill(self, context, source) -> WebApiSkill:
199+
def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
200200
"""Get the custom skill for data cleanup.
201201
202202
Args:
@@ -215,29 +215,47 @@ def get_mark_up_cleaner_skill(self, context, source) -> WebApiSkill:
215215
batch_size = 16
216216
degree_of_parallelism = 16
217217

218-
mark_up_cleaner_skill_inputs = [
219-
InputFieldMappingEntry(name="chunk", source=source),
220-
InputFieldMappingEntry(
221-
name="figure_storage_prefix", source="/document/metadata_storage_path"
222-
),
223-
]
218+
if chunk_by_page:
219+
mark_up_cleaner_context = "/document/page_wise_layout/*"
220+
inputs = [
221+
InputFieldMappingEntry(
222+
name="chunk", source="/document/page_wise_layout/*/merged_content"
223+
),
224+
InputFieldMappingEntry(
225+
name="figures",
226+
source="/document/page_wise_layout/*/figures/*/updated_figure",
227+
),
228+
]
229+
else:
230+
mark_up_cleaner_context = "/document/chunk_mark_ups/*"
231+
inputs = [
232+
InputFieldMappingEntry(
233+
name="chunk", source="/document/chunk_mark_ups/*"
234+
),
235+
InputFieldMappingEntry(
236+
name="figures", source="/document/layout/figures/*/updated_figure"
237+
),
238+
]
224239

225240
mark_up_cleaner_skill_outputs = [
226-
OutputFieldMappingEntry(name="cleaned_chunk", target_name="cleaned_chunk"),
227-
OutputFieldMappingEntry(name="chunk", target_name="chunk"),
228-
OutputFieldMappingEntry(name="sections", target_name="sections"),
241+
OutputFieldMappingEntry(name="chunk_cleaned", target_name="chunk_cleaned"),
242+
OutputFieldMappingEntry(
243+
name="chunk_sections", target_name="chunk_sections"
244+
),
245+
OutputFieldMappingEntry(name="chunk_mark_up", target_name="chunk_mark_up"),
246+
OutputFieldMappingEntry(name="chunk_figures", target_name="chunk_figures"),
229247
]
230248

231249
mark_up_cleaner_skill = WebApiSkill(
232250
name="Mark Up Cleaner Skill",
233251
description="Skill to clean the data before sending to embedding",
234-
context=context,
252+
context=mark_up_cleaner_context,
235253
uri=self.environment.get_custom_skill_function_url("mark_up_cleaner"),
236254
timeout="PT230S",
237255
batch_size=batch_size,
238256
degree_of_parallelism=degree_of_parallelism,
239257
http_method="POST",
240-
inputs=mark_up_cleaner_skill_inputs,
258+
inputs=inputs,
241259
outputs=mark_up_cleaner_skill_outputs,
242260
)
243261

@@ -255,8 +273,6 @@ def get_mark_up_cleaner_skill(self, context, source) -> WebApiSkill:
255273

256274
def get_semantic_chunker_skill(
257275
self,
258-
context,
259-
source,
260276
num_surrounding_sentences: int = 1,
261277
similarity_threshold: float = 0.8,
262278
max_chunk_tokens: int = 200,
@@ -284,17 +300,17 @@ def get_semantic_chunker_skill(
284300
degree_of_parallelism = 16
285301

286302
semantic_text_chunker_skill_inputs = [
287-
InputFieldMappingEntry(name="content", source=source)
303+
InputFieldMappingEntry(name="content", source="/document/merged_content")
288304
]
289305

290306
semantic_text_chunker_skill_outputs = [
291-
OutputFieldMappingEntry(name="chunks", target_name="chunks"),
307+
OutputFieldMappingEntry(name="chunks", target_name="chunk_mark_ups"),
292308
]
293309

294310
semantic_text_chunker_skill = WebApiSkill(
295311
name="Semantic Chunker Skill",
296312
description="Skill to clean the data before sending to embedding",
297-
context=context,
313+
context="/document",
298314
uri=self.environment.get_custom_skill_function_url("semantic_text_chunker"),
299315
timeout="PT230S",
300316
batch_size=batch_size,
@@ -345,7 +361,9 @@ def get_layout_analysis_skill(
345361

346362
if chunk_by_page:
347363
output = [
348-
OutputFieldMappingEntry(name="layout", target_name="page_wise_layout")
364+
OutputFieldMappingEntry(
365+
name="page_wise_layout", target_name="page_wise_layout"
366+
)
349367
]
350368
else:
351369
output = [OutputFieldMappingEntry(name="layout", target_name="layout")]
@@ -383,7 +401,7 @@ def get_layout_analysis_skill(
383401

384402
return layout_analysis_skill
385403

386-
def get_figure_analysis_skill(self, figure_source) -> WebApiSkill:
404+
def get_figure_analysis_skill(self, chunk_by_page=False) -> WebApiSkill:
387405
"""Get the custom skill for figure analysis.
388406
389407
Args:
@@ -406,16 +424,32 @@ def get_figure_analysis_skill(self, figure_source) -> WebApiSkill:
406424
OutputFieldMappingEntry(name="updated_figure", target_name="updated_figure")
407425
]
408426

427+
if chunk_by_page:
428+
figure_context = "/document/page_wise_layout/*"
429+
inputs = [
430+
InputFieldMappingEntry(
431+
name="figure", source="/document/page_wise_layout/*/figures/*"
432+
)
433+
]
434+
else:
435+
figure_context = "/document/layout/figures/*"
436+
437+
inputs = [
438+
InputFieldMappingEntry(
439+
name="figure", source="/document/layout/figures/*"
440+
)
441+
]
442+
409443
figure_analysis_skill = WebApiSkill(
410444
name="Figure Analysis Skill",
411445
description="Skill to generate figure analysis",
412-
context=figure_source,
446+
context=figure_context,
413447
uri=self.environment.get_custom_skill_function_url("figure_analysis"),
414448
timeout="PT230S",
415449
batch_size=batch_size,
416450
degree_of_parallelism=degree_of_parallelism,
417451
http_method="POST",
418-
inputs=[InputFieldMappingEntry(name="figure", source=figure_source)],
452+
inputs=inputs,
419453
outputs=output,
420454
)
421455

@@ -431,7 +465,7 @@ def get_figure_analysis_skill(self, figure_source) -> WebApiSkill:
431465

432466
return figure_analysis_skill
433467

434-
def get_layout_and_figure_merger_skill(self, figure_source) -> WebApiSkill:
468+
def get_layout_and_figure_merger_skill(self, chunk_by_page=False) -> WebApiSkill:
435469
"""Get the custom skill for layout and figure merger.
436470
437471
Args:
@@ -450,22 +484,40 @@ def get_layout_and_figure_merger_skill(self, figure_source) -> WebApiSkill:
450484
batch_size = 1
451485
degree_of_parallelism = 8
452486

453-
output = [
454-
OutputFieldMappingEntry(name="updated_figure", target_name="updated_figure")
455-
]
487+
output = [OutputFieldMappingEntry(name="content", target_name="merged_content")]
488+
if chunk_by_page:
489+
merger_context = "/document/page_wise_layout/*"
490+
inputs = [
491+
InputFieldMappingEntry(
492+
name="layout", source="/document/page_wise_layout/*"
493+
),
494+
InputFieldMappingEntry(
495+
name="figures",
496+
source="/document/page_wise_layout/*/figures/*/updated_figure",
497+
),
498+
]
499+
else:
500+
merger_context = "/document/layout"
501+
502+
inputs = [
503+
InputFieldMappingEntry(name="layout", source="/document/layout"),
504+
InputFieldMappingEntry(
505+
name="figures", source="/document/layout/figures/*/updated_figure"
506+
),
507+
]
456508

457509
figure_analysis_skill = WebApiSkill(
458510
name="Layout and Figure Merger Skill",
459511
description="Skill to merge layout and figure analysis",
460-
context=figure_source,
512+
context=merger_context,
461513
uri=self.environment.get_custom_skill_function_url(
462514
"layout_and_figure_merger"
463515
),
464516
timeout="PT230S",
465517
batch_size=batch_size,
466518
degree_of_parallelism=degree_of_parallelism,
467519
http_method="POST",
468-
inputs=[InputFieldMappingEntry(name="figure", source=figure_source)],
520+
inputs=inputs,
469521
outputs=output,
470522
)
471523

deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py

Lines changed: 65 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -181,21 +181,27 @@ def get_skills(self) -> list:
181181

182182
layout_skill = self.get_layout_analysis_skill(self.enable_page_by_chunking)
183183

184-
figure_skill = self.get_figure_analysis_skill()
184+
figure_skill = self.get_figure_analysis_skill(self.enable_page_by_chunking)
185185

186-
merger_skill = self.get_layout_and_figure_merger_skill()
187-
188-
text_split_skill = self.get_semantic_chunker_skill(
189-
"/document", "/document/extracted_content/content"
186+
merger_skill = self.get_layout_and_figure_merger_skill(
187+
self.enable_page_by_chunking
190188
)
191189

190+
text_split_skill = self.get_semantic_chunker_skill(self.enable_page_by_chunking)
191+
192192
mark_up_cleaner_skill = self.get_mark_up_cleaner_skill(
193-
"/document/chunks/*", "/document/chunks/*/content"
193+
self.enable_page_by_chunking
194194
)
195195

196-
embedding_skill = self.get_vector_skill(
197-
"/document/chunks/*", "/document/chunks/*/cleaned_chunk"
198-
)
196+
if self.enable_page_by_chunking:
197+
embedding_skill = self.get_vector_skill(
198+
"/document/page_wise_layout/*",
199+
"/document/page_wise_layout/*/chunk_cleaned",
200+
)
201+
else:
202+
embedding_skill = self.get_vector_skill(
203+
"/document/chunk_mark_ups/*", "/document/chunk_mark_ups/*/chunk_cleaned"
204+
)
199205

200206
if self.enable_page_by_chunking:
201207
skills = [
@@ -219,41 +225,64 @@ def get_skills(self) -> list:
219225

220226
def get_index_projections(self) -> SearchIndexerIndexProjection:
221227
"""This function returns the index projections for rag document."""
222-
mappings = [
223-
InputFieldMappingEntry(name="Chunk", source="/document/chunks/*/chunk"),
224-
InputFieldMappingEntry(
225-
name="ChunkEmbedding",
226-
source="/document/chunks/*/vector",
227-
),
228-
InputFieldMappingEntry(name="Title", source="/document/Title"),
229-
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
230-
InputFieldMappingEntry(
231-
name="Sections", source="/document/chunks/*/sections"
232-
),
233-
InputFieldMappingEntry(
234-
name="Figures",
235-
source_context="/document/chunks/*/chunk_figures/*",
236-
),
237-
InputFieldMappingEntry(
238-
name="DateLastModified", source="/document/DateLastModified"
239-
),
240-
]
241228

242229
if self.enable_page_by_chunking:
243-
mappings.extend(
244-
[
245-
InputFieldMappingEntry(
246-
name="PageNumber", source="/document/chunks/*/pageNumber"
247-
)
248-
]
249-
)
230+
source_context = "/document/page_wise_layout/*"
231+
mappings = [
232+
InputFieldMappingEntry(
233+
name="Chunk", source="/document/page_wise_layout/*/chunk_mark_up"
234+
),
235+
InputFieldMappingEntry(
236+
name="ChunkEmbedding",
237+
source="/document/page_wise_layout/*/vector",
238+
),
239+
InputFieldMappingEntry(name="Title", source="/document/Title"),
240+
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
241+
InputFieldMappingEntry(
242+
name="Sections",
243+
source="/document/page_wise_layout/*/chunk_sections",
244+
),
245+
InputFieldMappingEntry(
246+
name="Figures",
247+
source_context="/document/page_wise_layout/*/chunk_figures/*",
248+
),
249+
InputFieldMappingEntry(
250+
name="DateLastModified", source="/document/DateLastModified"
251+
),
252+
InputFieldMappingEntry(
253+
name="PageNumber", source="/document/page_wise_layout/*/#"
254+
),
255+
]
256+
else:
257+
source_context = "/document/chunks/*"
258+
mappings = [
259+
InputFieldMappingEntry(
260+
name="Chunk", source="/document/chunk_mark_ups/*/chunk_mark_up"
261+
),
262+
InputFieldMappingEntry(
263+
name="ChunkEmbedding",
264+
source="/document/chunk_mark_ups/*/vector",
265+
),
266+
InputFieldMappingEntry(name="Title", source="/document/Title"),
267+
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
268+
InputFieldMappingEntry(
269+
name="Sections", source="/document/chunk_mark_ups/*/chunk_sections"
270+
),
271+
InputFieldMappingEntry(
272+
name="Figures",
273+
source_context="/document/chunk_mark_ups/*/chunk_figures/*",
274+
),
275+
InputFieldMappingEntry(
276+
name="DateLastModified", source="/document/DateLastModified"
277+
),
278+
]
250279

251280
index_projections = SearchIndexerIndexProjection(
252281
selectors=[
253282
SearchIndexerIndexProjectionSelector(
254283
target_index_name=self.index_name,
255284
parent_key_field_name="Id",
256-
source_context="/document/chunks/*",
285+
source_context=source_context,
257286
mappings=mappings,
258287
),
259288
],

0 commit comments

Comments
 (0)