@@ -196,7 +196,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
196196
197197 return data_source_connection
198198
199- def get_mark_up_cleaner_skill (self , context , source ) -> WebApiSkill :
199+ def get_mark_up_cleaner_skill (self , chunk_by_page : bool = False ) -> WebApiSkill :
200200 """Get the custom skill for data cleanup.
201201
202202 Args:
@@ -215,29 +215,47 @@ def get_mark_up_cleaner_skill(self, context, source) -> WebApiSkill:
215215 batch_size = 16
216216 degree_of_parallelism = 16
217217
218- mark_up_cleaner_skill_inputs = [
219- InputFieldMappingEntry (name = "chunk" , source = source ),
220- InputFieldMappingEntry (
221- name = "figure_storage_prefix" , source = "/document/metadata_storage_path"
222- ),
223- ]
218+ if chunk_by_page :
219+ mark_up_cleaner_context = "/document/page_wise_layout/*"
220+ inputs = [
221+ InputFieldMappingEntry (
222+ name = "chunk" , source = "/document/page_wise_layout/*/merged_content"
223+ ),
224+ InputFieldMappingEntry (
225+ name = "figures" ,
226+ source = "/document/page_wise_layout/*/figures/*/updated_figure" ,
227+ ),
228+ ]
229+ else :
230+ mark_up_cleaner_context = "/document/chunk_mark_ups/*"
231+ inputs = [
232+ InputFieldMappingEntry (
233+ name = "chunk" , source = "/document/chunk_mark_ups/*"
234+ ),
235+ InputFieldMappingEntry (
236+ name = "figures" , source = "/document/layout/figures/*/updated_figure"
237+ ),
238+ ]
224239
225240 mark_up_cleaner_skill_outputs = [
226- OutputFieldMappingEntry (name = "cleaned_chunk" , target_name = "cleaned_chunk" ),
227- OutputFieldMappingEntry (name = "chunk" , target_name = "chunk" ),
228- OutputFieldMappingEntry (name = "sections" , target_name = "sections" ),
241+ OutputFieldMappingEntry (name = "chunk_cleaned" , target_name = "chunk_cleaned" ),
242+ OutputFieldMappingEntry (
243+ name = "chunk_sections" , target_name = "chunk_sections"
244+ ),
245+ OutputFieldMappingEntry (name = "chunk_mark_up" , target_name = "chunk_mark_up" ),
246+ OutputFieldMappingEntry (name = "chunk_figures" , target_name = "chunk_figures" ),
229247 ]
230248
231249 mark_up_cleaner_skill = WebApiSkill (
232250 name = "Mark Up Cleaner Skill" ,
233251 description = "Skill to clean the data before sending to embedding" ,
234- context = context ,
252+ context = mark_up_cleaner_context ,
235253 uri = self .environment .get_custom_skill_function_url ("mark_up_cleaner" ),
236254 timeout = "PT230S" ,
237255 batch_size = batch_size ,
238256 degree_of_parallelism = degree_of_parallelism ,
239257 http_method = "POST" ,
240- inputs = mark_up_cleaner_skill_inputs ,
258+ inputs = inputs ,
241259 outputs = mark_up_cleaner_skill_outputs ,
242260 )
243261
@@ -255,8 +273,6 @@ def get_mark_up_cleaner_skill(self, context, source) -> WebApiSkill:
255273
256274 def get_semantic_chunker_skill (
257275 self ,
258- context ,
259- source ,
260276 num_surrounding_sentences : int = 1 ,
261277 similarity_threshold : float = 0.8 ,
262278 max_chunk_tokens : int = 200 ,
@@ -284,17 +300,17 @@ def get_semantic_chunker_skill(
284300 degree_of_parallelism = 16
285301
286302 semantic_text_chunker_skill_inputs = [
287- InputFieldMappingEntry (name = "content" , source = source )
303+ InputFieldMappingEntry (name = "content" , source = "/document/merged_content" )
288304 ]
289305
290306 semantic_text_chunker_skill_outputs = [
291- OutputFieldMappingEntry (name = "chunks" , target_name = "chunks " ),
307+ OutputFieldMappingEntry (name = "chunks" , target_name = "chunk_mark_ups" ),
292308 ]
293309
294310 semantic_text_chunker_skill = WebApiSkill (
295311 name = "Semantic Chunker Skill" ,
296312 description = "Skill to clean the data before sending to embedding" ,
297- context = context ,
313+ context = "/document" ,
298314 uri = self .environment .get_custom_skill_function_url ("semantic_text_chunker" ),
299315 timeout = "PT230S" ,
300316 batch_size = batch_size ,
@@ -345,7 +361,9 @@ def get_layout_analysis_skill(
345361
346362 if chunk_by_page :
347363 output = [
348- OutputFieldMappingEntry (name = "layout" , target_name = "page_wise_layout" )
364+ OutputFieldMappingEntry (
365+ name = "page_wise_layout" , target_name = "page_wise_layout"
366+ )
349367 ]
350368 else :
351369 output = [OutputFieldMappingEntry (name = "layout" , target_name = "layout" )]
@@ -383,7 +401,7 @@ def get_layout_analysis_skill(
383401
384402 return layout_analysis_skill
385403
386- def get_figure_analysis_skill (self , figure_source ) -> WebApiSkill :
404+ def get_figure_analysis_skill (self , chunk_by_page = False ) -> WebApiSkill :
387405 """Get the custom skill for figure analysis.
388406
389407 Args:
@@ -406,16 +424,32 @@ def get_figure_analysis_skill(self, figure_source) -> WebApiSkill:
406424 OutputFieldMappingEntry (name = "updated_figure" , target_name = "updated_figure" )
407425 ]
408426
427+ if chunk_by_page :
428+ figure_context = "/document/page_wise_layout/*"
429+ inputs = [
430+ InputFieldMappingEntry (
431+ name = "figure" , source = "/document/page_wise_layout/*/figures/*"
432+ )
433+ ]
434+ else :
435+ figure_context = "/document/layout/figures/*"
436+
437+ inputs = [
438+ InputFieldMappingEntry (
439+ name = "figure" , source = "/document/layout/figures/*"
440+ )
441+ ]
442+
409443 figure_analysis_skill = WebApiSkill (
410444 name = "Figure Analysis Skill" ,
411445 description = "Skill to generate figure analysis" ,
412- context = figure_source ,
446+ context = figure_context ,
413447 uri = self .environment .get_custom_skill_function_url ("figure_analysis" ),
414448 timeout = "PT230S" ,
415449 batch_size = batch_size ,
416450 degree_of_parallelism = degree_of_parallelism ,
417451 http_method = "POST" ,
418- inputs = [ InputFieldMappingEntry ( name = "figure" , source = figure_source )] ,
452+ inputs = inputs ,
419453 outputs = output ,
420454 )
421455
@@ -431,7 +465,7 @@ def get_figure_analysis_skill(self, figure_source) -> WebApiSkill:
431465
432466 return figure_analysis_skill
433467
434- def get_layout_and_figure_merger_skill (self , figure_source ) -> WebApiSkill :
468+ def get_layout_and_figure_merger_skill (self , chunk_by_page = False ) -> WebApiSkill :
435469 """Get the custom skill for layout and figure merger.
436470
437471 Args:
@@ -450,22 +484,40 @@ def get_layout_and_figure_merger_skill(self, figure_source) -> WebApiSkill:
450484 batch_size = 1
451485 degree_of_parallelism = 8
452486
453- output = [
454- OutputFieldMappingEntry (name = "updated_figure" , target_name = "updated_figure" )
455- ]
487+ output = [OutputFieldMappingEntry (name = "content" , target_name = "merged_content" )]
488+ if chunk_by_page :
489+ merger_context = "/document/page_wise_layout/*"
490+ inputs = [
491+ InputFieldMappingEntry (
492+ name = "layout" , source = "/document/page_wise_layout/*"
493+ ),
494+ InputFieldMappingEntry (
495+ name = "figures" ,
496+ source = "/document/page_wise_layout/*/figures/*/updated_figure" ,
497+ ),
498+ ]
499+ else :
500+ merger_context = "/document/layout"
501+
502+ inputs = [
503+ InputFieldMappingEntry (name = "layout" , source = "/document/layout" ),
504+ InputFieldMappingEntry (
505+ name = "figures" , source = "/document/layout/figures/*/updated_figure"
506+ ),
507+ ]
456508
457509 figure_analysis_skill = WebApiSkill (
458510 name = "Layout and Figure Merger Skill" ,
459511 description = "Skill to merge layout and figure analysis" ,
460- context = figure_source ,
512+ context = merger_context ,
461513 uri = self .environment .get_custom_skill_function_url (
462514 "layout_and_figure_merger"
463515 ),
464516 timeout = "PT230S" ,
465517 batch_size = batch_size ,
466518 degree_of_parallelism = degree_of_parallelism ,
467519 http_method = "POST" ,
468- inputs = [ InputFieldMappingEntry ( name = "figure" , source = figure_source )] ,
520+ inputs = inputs ,
469521 outputs = output ,
470522 )
471523
0 commit comments