44import logging
55import re
66from layout_holders import FigureHolder , LayoutHolder
7+ from typing import List
78
89
910class LayoutAndFigureMerger :
@@ -18,37 +19,48 @@ def insert_figure_description(
1819 figure_holder (FigureHolder): The figure to be updated.
1920
2021 Returns:
21- str : The updated Markdown content with the new figure description.
22+ int : The change in length of the Markdown content after updating the figure description.
2223 """
23-
2424 # Calculate the end index of the content to be replaced
2525 end_index = figure_holder .offset + figure_holder .length
2626
27- # Ensure that the end_index does not exceed the length of the Markdown content
27+ # Ensure the offset is valid
28+ if figure_holder .offset < 0 or figure_holder .offset > len (
29+ layout_holder .content
30+ ):
31+ logging .error ("Figure offset is out of bounds." )
32+ raise ValueError ("Figure offset is out of bounds." )
33+
34+ # Ensure the end index does not exceed the length of the Markdown content
2835 if end_index > len (layout_holder .content ):
2936 logging .info (
30- "End index exceeds the length of the content. Adjusting the end index to the length of the content."
37+ "End index exceeds the length of the content. Adjusting to the length of the content."
3138 )
3239 end_index = len (layout_holder .content )
3340
41+ logging .info (f"Figure Markdown Content: { figure_holder .markdown } " )
42+
3443 # Replace the old string with the new string
3544 layout_holder .content = (
3645 layout_holder .content [: figure_holder .offset ]
3746 + figure_holder .markdown
3847 + layout_holder .content [end_index :]
3948 )
4049
41- return len (figure_holder .markdown ) - figure_holder .length
50+ inserted_length = len (figure_holder .markdown ) - figure_holder .length
51+ logging .info (f"Inserted Length: { inserted_length } " )
52+
53+ return layout_holder , inserted_length
4254
4355 async def merge_figures_into_layout (
44- self , layout : LayoutHolder , figures : list [FigureHolder ]
56+ self , layout_holder : LayoutHolder , figures : List [FigureHolder ]
4557 ) -> LayoutHolder :
4658 """
4759 Merges the figures into the layout.
4860
4961 Args:
50- layout (LayoutHolder): The layout text.
51- figures (list ): The list of figures.
62+ layout_holder (LayoutHolder): The layout text.
63+ figures (List[FigureHolder] ): The list of figures.
5264
5365 Returns:
5466 LayoutHolder: The updated layout text with the figures.
@@ -59,30 +71,51 @@ async def merge_figures_into_layout(
5971 # Iterate over the figures
6072 for figure in figures :
6173 logging .info (f"Inserting Figure: { figure .figure_id } " )
74+ logging .info (f"Figure Description: { figure .description } " )
6275 # Update the figure description in the layout
6376 figure .offset += running_offset
64- length = self .insert_figure_description (layout , figure )
77+ layout_holder , inserted_length = self .insert_figure_description (
78+ layout_holder , figure
79+ )
6580
6681 # Update the offset
67- running_offset += length
82+ running_offset += inserted_length
83+
84+ logging .info ("Merged figures into layout." )
85+ logging .info ("Updated Layout with Figures: %s" , layout_holder .content )
86+ # Precompile regex patterns
87+ irrelevant_figure_pattern = re .compile (
88+ r"<figure[^>]*>\s*(Irrelevant Image|\'Irrelevant Image\')\s*</figure>" ,
89+ re .DOTALL ,
90+ )
91+ empty_or_whitespace_figure_pattern = re .compile (
92+ r"<figure[^>]*>\s*</figure>" , re .DOTALL
93+ )
94+ html_comments_pattern = re .compile (r"<!--.*?-->" , re .DOTALL )
6895
6996 # Remove irrelevant figures
70- irrelevant_figure_pattern = r"<figure[^>]*>.*?Irrelevant Image.*?</figure>"
71- layout .content = re .sub (
72- irrelevant_figure_pattern , "" , layout .content , flags = re .DOTALL
97+ layout_holder .content = irrelevant_figure_pattern .sub ("" , layout_holder .content )
98+ logging .info ("Removed irrelevant figures from layout." )
99+ logging .info (
100+ "Updated Layout without Irrelevant Figures: %s" , layout_holder .content
73101 )
74102
75- empty_or_whitespace_figure_pattern = r"<figure[^>]*>\s*</figure>"
76- layout .content = re .sub (
77- empty_or_whitespace_figure_pattern , "" , layout .content , flags = re . DOTALL
103+ # Remove empty or whitespace figures
104+ layout_holder .content = empty_or_whitespace_figure_pattern .sub (
105+ "" , layout_holder .content
78106 )
79-
80- html_comments_pattern = r"<!--.*?-->"
81- layout . content = re . sub (
82- html_comments_pattern , "" , layout .content , flags = re . DOTALL
107+ logging . info ( "Removed empty or whitespace figures from layout." )
108+ logging . info (
109+ "Updated Layout without Empty or Whitespace Figures: %s" ,
110+ layout_holder .content ,
83111 )
84112
85- return layout
113+ # Remove HTML comments
114+ layout_holder .content = html_comments_pattern .sub ("" , layout_holder .content )
115+ logging .info ("Removed HTML comments from layout." )
116+ logging .info ("Updated Layout without HTML Comments: %s" , layout_holder .content )
117+
118+ return layout_holder
86119
87120 async def merge (self , record : dict ) -> dict :
88121 """
@@ -94,19 +127,21 @@ async def merge(self, record: dict) -> dict:
94127 Returns:
95128 - record (dict): The record containing the image, its caption, and the generated description.
96129 """
97- layout = LayoutHolder (** record ["data" ]["layout" ])
130+ layout_holder = LayoutHolder (** record ["data" ]["layout" ])
98131
99132 figures = [FigureHolder (** figure ) for figure in record ["data" ]["figures" ]]
100133
101134 try :
102- logging .info (f"Input Data: { layout } " )
103- updated_layout = await self .merge_figures_into_layout (layout , figures )
104- logging .info (f"Updated Data: { updated_layout } " )
135+ logging .info (f"Input Data: { layout_holder } " )
136+ updated_layout = await self .merge_figures_into_layout (
137+ layout_holder , figures
138+ )
139+ logging .info (f"Updated Layout Data: { updated_layout } " )
105140 except Exception as e :
106141 logging .error (f"Failed to merge figures into layout. Error: { e } " )
107142 return {
108143 "recordId" : record ["recordId" ],
109- "data" : {} ,
144+ "data" : None ,
110145 "errors" : [
111146 {
112147 "message" : "Failed to merge figures into layout." ,
0 commit comments