33import logging
44import json
55import regex as re
6+ from layout_holders import FigureHolder
67
78
8- class MarkDownCleaner :
9- def get_sections (self , text : str ) -> list :
9+ class MarkUpCleaner :
10+ def get_sections (self , text ) -> list :
1011 """
1112 Returns the section details from the content.
1213
@@ -21,14 +22,15 @@ def get_sections(self, text: str) -> list:
2122 doc_metadata = re .findall (combined_pattern , text , re .DOTALL )
2223 return self .clean_sections (doc_metadata )
2324
24- def clean_sections (self , sections : list ) -> list :
25- """
26- Cleans the sections by removing special characters and extra white spaces.
25+ def get_figure_ids (self , text : str ) -> list :
2726 """
28- cleaned_sections = [re .sub (r"[=#]" , "" , match ).strip () for match in sections ]
29- return cleaned_sections
27+ Get the FigureIds from the text.
3028
31- def get_figures (self , text : str ) -> list :
29+ Args:
30+ text: The input text.
31+
32+ Returns:
33+ list: The list of FigureIds."""
3234 # Regex pattern to extract FigureIds
3335 pattern = r"FigureId='([^']+)'"
3436
@@ -37,6 +39,13 @@ def get_figures(self, text: str) -> list:
3739
3840 return figure_ids
3941
42+ def clean_sections (self , sections : list ) -> list :
43+ """
44+ Cleans the sections by removing '=' and '#' characters and surrounding white space.
45+ """
46+ cleaned_sections = [re .sub (r"[=#]" , "" , match ).strip () for match in sections ]
47+ return cleaned_sections
48+
4049 def remove_markdown_tags (self , text : str , tag_patterns : dict ) -> str :
4150 """
4251 Remove specified Markdown tags from the text, keeping the contents of the tags.
@@ -64,26 +73,37 @@ def remove_markdown_tags(self, text: str, tag_patterns: dict) -> str:
6473 logging .error (f"An error occurred in remove_markdown_tags: { e } " )
6574 return text
6675
67- def clean_text_and_extract_metadata (self , src_text : str ) -> tuple [str , str ]:
76+ def clean_text_and_extract_metadata (
77+ self , text : str , figures : list [FigureHolder ]
78+ ) -> tuple [str , str ]:
6879 """This function performs the following cleanup activities on the text: remove all unicode characters,
6980 remove line spacing, remove stop words, normalize characters.
7081
7182 Args:
72- src_text (str): The text to cleanup.
83+ text (str): The input text to clean.
84+ figures (list): The list of figures.
7385
7486 Returns:
7587 str: The clean text."""
7688
7789 return_record = {}
7890
7991 try :
80- logging .info (f"Input text: { src_text } " )
81- if len (src_text ) == 0 :
92+ logging .info (f"Input text: { text } " )
93+ if len (text ) == 0 :
8294 logging .error ("Input text is empty" )
8395 raise ValueError ("Input text is empty" )
8496
85- return_record ["marked_up_chunk" ] = src_text
86- return_record ["sections" ] = self .get_sections (src_text )
97+ return_record ["marked_up_chunk" ] = text
98+
99+ figure_ids = self .get_figure_ids (text )
100+
101+ return_record ["sections" ] = self .get_sections (text )
102+ return_record ["figures" ] = [
103+ figure .model_dump (by_alias = True )
104+ for figure in figures
105+ if figure .figure_id in figure_ids
106+ ]
87107
88108 logging .info (f"Sections: { return_record ['sections' ]} " )
89109
@@ -95,7 +115,7 @@ def clean_text_and_extract_metadata(self, src_text: str) -> tuple[str, str]:
95115 "figcaption" : r"<figcaption>(.*?)</figcaption>" ,
96116 "header" : r"^\s*(#{1,6})\s*(.*?)\s*$" ,
97117 }
98- cleaned_text = self .remove_markdown_tags (src_text , tag_patterns )
118+ cleaned_text = self .remove_markdown_tags (text , tag_patterns )
99119
100120 logging .info (f"Removed markdown tags: { cleaned_text } " )
101121
@@ -114,7 +134,7 @@ def clean_text_and_extract_metadata(self, src_text: str) -> tuple[str, str]:
114134 return ""
115135 return return_record
116136
117- async def process_mark_up_cleaner (self , record : dict ) -> dict :
137+ async def clean (self , record : dict ) -> dict :
118138 """Clean up the data using standard Python libraries.
119139
120140 Args:
@@ -135,15 +155,17 @@ async def process_mark_up_cleaner(self, record: dict) -> dict:
135155 "warnings" : None ,
136156 }
137157
158+ figures = [FigureHolder (** figure ) for figure in record ["data" ]["figures" ]]
159+
138160 cleaned_record ["data" ] = self .clean_text_and_extract_metadata (
139- record ["data" ]["chunk" ]
161+ record ["data" ]["chunk" ], figures
140162 )
141163
142164 except Exception as e :
143165 logging .error ("string cleanup Error: %s" , e )
144166 return {
145167 "recordId" : record ["recordId" ],
146- "data" : {} ,
168+ "data" : None ,
147169 "errors" : [
148170 {
149171 "message" : "Failed to cleanup data. Check function app logs for more details of exact failure."
0 commit comments