Skip to content

Commit d6386e9

Browse files
committed
Update return
1 parent c357ddd commit d6386e9

File tree

1 file changed

+40
-18
lines changed

1 file changed

+40
-18
lines changed

image_processing/src/image_processing/mark_up_cleaner.py

Lines changed: 40 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -3,10 +3,11 @@
33
import logging
44
import json
55
import regex as re
6+
from layout_holders import FigureHolder
67

78

8-
class MarkDownCleaner:
9-
def get_sections(self, text: str) -> list:
9+
class MarkUpCleaner:
10+
def get_sections(self, text) -> list:
1011
"""
1112
Returns the section details from the content.
1213
@@ -21,14 +22,15 @@ def get_sections(self, text: str) -> list:
2122
doc_metadata = re.findall(combined_pattern, text, re.DOTALL)
2223
return self.clean_sections(doc_metadata)
2324

24-
def clean_sections(self, sections: list) -> list:
25-
"""
26-
Cleans the sections by removing special characters and extra white spaces.
25+
def get_figure_ids(self, text: str) -> list:
2726
"""
28-
cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]
29-
return cleaned_sections
27+
Get the FigureIds from the text.
3028
31-
def get_figures(self, text: str) -> list:
29+
Args:
30+
text: The input text.
31+
32+
Returns:
33+
list: The list of FigureIds."""
3234
# Regex pattern to extract FigureIds
3335
pattern = r"FigureId='([^']+)'"
3436

@@ -37,6 +39,13 @@ def get_figures(self, text: str) -> list:
3739

3840
return figure_ids
3941

42+
def clean_sections(self, sections: list) -> list:
43+
"""
44+
Cleans the sections by removing special characters and extra white spaces.
45+
"""
46+
cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]
47+
return cleaned_sections
48+
4049
def remove_markdown_tags(self, text: str, tag_patterns: dict) -> str:
4150
"""
4251
Remove specified Markdown tags from the text, keeping the contents of the tags.
@@ -64,26 +73,37 @@ def remove_markdown_tags(self, text: str, tag_patterns: dict) -> str:
6473
logging.error(f"An error occurred in remove_markdown_tags: {e}")
6574
return text
6675

67-
def clean_text_and_extract_metadata(self, src_text: str) -> tuple[str, str]:
76+
def clean_text_and_extract_metadata(
77+
self, text: str, figures: list[FigureHolder]
78+
) -> tuple[str, str]:
6879
"""This function performs following cleanup activities on the text, remove all unicode characters
6980
remove line spacing,remove stop words, normalize characters
7081
7182
Args:
72-
src_text (str): The text to cleanup.
83+
text (str): The input text to clean.
84+
figures (list): The list of figures.
7385
7486
Returns:
7587
str: The clean text."""
7688

7789
return_record = {}
7890

7991
try:
80-
logging.info(f"Input text: {src_text}")
81-
if len(src_text) == 0:
92+
logging.info(f"Input text: {text}")
93+
if len(text) == 0:
8294
logging.error("Input text is empty")
8395
raise ValueError("Input text is empty")
8496

85-
return_record["marked_up_chunk"] = src_text
86-
return_record["sections"] = self.get_sections(src_text)
97+
return_record["marked_up_chunk"] = text
98+
99+
figure_ids = self.get_figure_ids(text)
100+
101+
return_record["sections"] = self.get_sections(text)
102+
return_record["figures"] = [
103+
figure.model_dump(by_alias=True)
104+
for figure in figures
105+
if figure.figure_id in figure_ids
106+
]
87107

88108
logging.info(f"Sections: {return_record['sections']}")
89109

@@ -95,7 +115,7 @@ def clean_text_and_extract_metadata(self, src_text: str) -> tuple[str, str]:
95115
"figcaption": r"<figcaption>(.*?)</figcaption>",
96116
"header": r"^\s*(#{1,6})\s*(.*?)\s*$",
97117
}
98-
cleaned_text = self.remove_markdown_tags(src_text, tag_patterns)
118+
cleaned_text = self.remove_markdown_tags(text, tag_patterns)
99119

100120
logging.info(f"Removed markdown tags: {cleaned_text}")
101121

@@ -114,7 +134,7 @@ def clean_text_and_extract_metadata(self, src_text: str) -> tuple[str, str]:
114134
return ""
115135
return return_record
116136

117-
async def process_mark_up_cleaner(self, record: dict) -> dict:
137+
async def clean(self, record: dict) -> dict:
118138
"""Cleanup the data using standard python libraries.
119139
120140
Args:
@@ -135,15 +155,17 @@ async def process_mark_up_cleaner(self, record: dict) -> dict:
135155
"warnings": None,
136156
}
137157

158+
figures = [FigureHolder(**figure) for figure in record["data"]["figures"]]
159+
138160
cleaned_record["data"] = self.clean_text_and_extract_metadata(
139-
record["data"]["chunk"]
161+
record["data"]["chunk"], figures
140162
)
141163

142164
except Exception as e:
143165
logging.error("string cleanup Error: %s", e)
144166
return {
145167
"recordId": record["recordId"],
146-
"data": {},
168+
"data": None,
147169
"errors": [
148170
{
149171
"message": "Failed to cleanup data. Check function app logs for more details of exact failure."

0 commit comments

Comments
 (0)