
Commit f0912ad

Update cleaner
1 parent c920377 commit f0912ad

File tree

3 files changed: +158 -150 lines changed


image_processing/src/image_processing/layout_holders.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def markdown(self) -> str:
         --------
             str: The Markdown string representation of the figure."""

-        return f"<figure FigureId='{self.figure_id}' Uri='{self.uri}'>{self.description}</figure>"
+        return f"<figure FigureId='{self.figure_id}'>{self.description}</figure>"


 class LayoutHolder(BaseModel):
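
For context, the only change here is that the Uri attribute is no longer emitted in the rendered figure tag. Below is a minimal sketch of the new behaviour, assuming a pydantic model with the figure_id and description fields the f-string references; the class name FigureHolder and the field defaults are illustrative, since the class definition itself is not part of this diff.

from pydantic import BaseModel


class FigureHolder(BaseModel):
    # Field names assumed from the attributes referenced in the f-string above.
    figure_id: str
    description: str = ""

    def markdown(self) -> str:
        # After this commit the Uri attribute is no longer included in the tag.
        return f"<figure FigureId='{self.figure_id}'>{self.description}</figure>"


fig = FigureHolder(figure_id="fig-1", description="A bar chart of revenue")
print(fig.markdown())  # <figure FigureId='fig-1'>A bar chart of revenue</figure>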
Lines changed: 157 additions & 0 deletions (new file)
@@ -0,0 +1,157 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import logging
import json
import regex as re


class MarkDownCleaner:
    def get_sections(self, text: str) -> list:
        """
        Returns the section details from the content.

        Args:
            text: The input text

        Returns:
            list: The sections related to text
        """
        # Updated regex pattern to capture markdown headers like ### Header
        combined_pattern = r"(?<=\n|^)[#]+\s*(.*?)(?=\n)"
        doc_metadata = re.findall(combined_pattern, text, re.DOTALL)
        return self.clean_sections(doc_metadata)

    def clean_sections(self, sections: list) -> list:
        """
        Cleans the sections by removing special characters and extra white spaces.
        """
        cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]
        return cleaned_sections

    def get_figures(self, text: str) -> list:
        # Regex pattern to extract FigureIds
        pattern = r"FigureId='([^']+)'"

        # Extract FigureIds using findall
        figure_ids = re.findall(pattern, text)

        return figure_ids

    def remove_markdown_tags(self, text: str, tag_patterns: dict) -> str:
        """
        Remove specified Markdown tags from the text, keeping the contents of the tags.

        Args:
            text: The input text containing Markdown tags.
            tag_patterns: A dictionary where keys are tags and values are their specific patterns.

        Returns:
            str: The text with specified tags removed.
        """
        try:
            for tag, pattern in tag_patterns.items():
                try:
                    # Replace the tags using the specific pattern, keeping the content inside the tags
                    if tag == "header":
                        text = re.sub(
                            pattern, r"\2", text, flags=re.DOTALL | re.MULTILINE
                        )
                    else:
                        text = re.sub(pattern, r"\1", text, flags=re.DOTALL)
                except re.error as e:
                    logging.error(f"Regex error for tag '{tag}': {e}")
        except Exception as e:
            logging.error(f"An error occurred in remove_markdown_tags: {e}")
        return text

    def clean_text_and_extract_metadata(self, src_text: str) -> tuple[str, str]:
        """This function performs following cleanup activities on the text, remove all unicode characters
        remove line spacing,remove stop words, normalize characters

        Args:
            src_text (str): The text to cleanup.

        Returns:
            str: The clean text."""

        return_record = {}

        try:
            logging.info(f"Input text: {src_text}")
            if len(src_text) == 0:
                logging.error("Input text is empty")
                raise ValueError("Input text is empty")

            return_record["marked_up_chunk"] = src_text
            return_record["sections"] = self.get_sections(src_text)

            logging.info(f"Sections: {return_record['sections']}")

            # Define specific patterns for each tag
            tag_patterns = {
                "figurecontent": r"<!-- FigureContent=(.*?)-->",
                "figure": r"<figure(?:\s+FigureId=\"[^\"]*\")?>(.*?)</figure>",
                "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)",
                "figcaption": r"<figcaption>(.*?)</figcaption>",
                "header": r"^\s*(#{1,6})\s*(.*?)\s*$",
            }
            cleaned_text = self.remove_markdown_tags(src_text, tag_patterns)

            logging.info(f"Removed markdown tags: {cleaned_text}")

            # Updated regex to keep Unicode letters, punctuation, whitespace, currency symbols, and percentage signs,
            # while also removing non-printable characters
            cleaned_text = re.sub(r"[^\p{L}\p{P}\s\p{Sc}%\x20-\x7E]", "", cleaned_text)

            logging.info(f"Cleaned text: {cleaned_text}")
            if len(cleaned_text) == 0:
                logging.error("Cleaned text is empty")
                raise ValueError("Cleaned text is empty")
            else:
                return_record["cleaned_chunk"] = cleaned_text
        except Exception as e:
            logging.error(f"An error occurred in clean_text_and_extract_metadata: {e}")
            return ""
        return return_record

    async def process_mark_up_cleaner(self, record: dict) -> dict:
        """Cleanup the data using standard python libraries.

        Args:
            record (dict): The record to cleanup.

        Returns:
            dict: The clean record."""

        try:
            json_str = json.dumps(record, indent=4)

            logging.info(f"embedding cleaner Input: {json_str}")

            cleaned_record = {
                "recordId": record["recordId"],
                "data": {},
                "errors": None,
                "warnings": None,
            }

            cleaned_record["data"] = self.clean_text_and_extract_metadata(
                record["data"]["chunk"]
            )

        except Exception as e:
            logging.error("string cleanup Error: %s", e)
            return {
                "recordId": record["recordId"],
                "data": {},
                "errors": [
                    {
                        "message": "Failed to cleanup data. Check function app logs for more details of exact failure."
                    }
                ],
                "warnings": None,
            }
        json_str = json.dumps(cleaned_record, indent=4)

        logging.info(f"embedding cleaner output: {json_str}")
        return cleaned_record
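
For orientation, a minimal, hypothetical usage sketch of the new class follows. The record shape (recordId plus data.chunk) mirrors the fields the code reads; the chunk text is illustrative only, and the import path is left as a placeholder because the new module's location is not shown in this diff.

import asyncio

# from ... import MarkDownCleaner  # module path not shown in this diff

sample_record = {
    "recordId": "1",
    "data": {
        # Illustrative chunk: a markdown header plus a figure tag in the
        # double-quoted FigureId form that the "figure" pattern above matches.
        "chunk": '# Introduction\nSome text.\n<figure FigureId="fig-1">A bar chart</figure>\n'
    },
}

cleaner = MarkDownCleaner()
result = asyncio.run(cleaner.process_mark_up_cleaner(sample_record))

print(result["data"]["sections"])       # ['Introduction']
print(result["data"]["cleaned_chunk"])  # 'Introduction\nSome text.\nA bar chart\n'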

image_processing/src/image_processing/mark_up_cleaner.py

Lines changed: 0 additions & 149 deletions
This file was deleted.
