Skip to content

Commit 8b4d716

Browse files
committed
Update tests
1 parent 057d54f commit 8b4d716

File tree

3 files changed: 25 additions (+25) and 9 deletions (-9).

3 files changed: 25 additions (+25) and 9 deletions (-9).

image_processing/src/image_processing/layout_holders.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77

88
class FigureHolder(BaseModel):
9-
109
"""A class to hold the figure extracted from the document."""
1110

1211
figure_id: str = Field(..., alias="FigureId")

image_processing/src/image_processing/mark_up_cleaner.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def get_sections(self, text) -> list:
1818
list: The sections related to text
1919
"""
2020
# Updated regex pattern to capture markdown headers like ### Header
21-
combined_pattern = r"^[#]+\s*(.*?)(?=\n|$)"
21+
combined_pattern = r"^\s*[#]+\s*(.*?)(?=\n|$)"
2222
doc_metadata = re.findall(combined_pattern, text, re.MULTILINE)
2323
return self.clean_sections(doc_metadata)
2424

@@ -61,12 +61,14 @@ def remove_markdown_tags(self, text: str, tag_patterns: dict) -> str:
6161
for tag, pattern in tag_patterns.items():
6262
try:
6363
# Replace the tags using the specific pattern, keeping the content inside the tags
64-
if tag == "header":
64+
if tag in ["header", "figure"]:
6565
text = re.sub(
6666
pattern, r"\2", text, flags=re.DOTALL | re.MULTILINE
6767
)
6868
else:
69-
text = re.sub(pattern, r"\1", text, flags=re.DOTALL)
69+
text = re.sub(
70+
pattern, r"\1", text, flags=re.DOTALL | re.MULTILINE
71+
)
7072
except re.error as e:
7173
logging.error(f"Regex error for tag '{tag}': {e}")
7274
except Exception as e:
@@ -110,7 +112,7 @@ def clean_text_and_extract_metadata(
110112
# Define specific patterns for each tag
111113
tag_patterns = {
112114
"figurecontent": r"<!-- FigureContent=(.*?)-->",
113-
"figure": r"<figure(?:\s+FigureId=\"[^\"]*\")?>(.*?)</figure>",
115+
"figure": r"<figure(?:\s+FigureId=(\"[^\"]*\"|'[^']*'))?>(.*?)</figure>",
114116
"figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)",
115117
"figcaption": r"<figcaption>(.*?)</figcaption>",
116118
"header": r"^\s*(#{1,6})\s*(.*?)\s*$",

image_processing/tests/image_processing/test_mark_up_cleaner.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def test_remove_markdown_tags(cleaner):
6161
"""
6262
tag_patterns = {
6363
"figurecontent": r"<!-- FigureContent=(.*?)-->",
64-
"figure": r"<figure(?:\s+FigureId=\"[^\"]*\")?>(.*?)</figure>",
64+
"figure": r"<figure(?:\s+FigureId=(\"[^\"]*\"|'[^']*'))?>(.*?)</figure>",
6565
}
6666
cleaned_text = cleaner.remove_markdown_tags(text, tag_patterns)
6767
assert "Some figure" in cleaned_text
@@ -77,12 +77,18 @@ def test_clean_text_and_extract_metadata(cleaner, sample_text, figures):
7777
assert result["chunk_sections"] == ["Header 1", "Header 2"]
7878
assert result["chunk_figures"] == [
7979
{
80-
"figure_id": "12345",
81-
"uri": "https://example.com/12345.png",
82-
"description": "Figure 1",
80+
"FigureId": "12345",
81+
"Caption": None,
82+
"offset": 0,
83+
"length": 8,
84+
"PageNumber": None,
85+
"Uri": "https://example.com/12345.png",
86+
"Description": "Figure 1",
87+
"Data": None,
8388
}
8489
]
8590
assert "chunk_cleaned" in result
91+
print(result["chunk_cleaned"])
8692
assert "FigureId='12345'" not in result["chunk_cleaned"]
8793

8894

@@ -97,11 +103,15 @@ async def test_clean(cleaner, sample_text, figures):
97103
"figure_id": "12345",
98104
"uri": "https://example.com/12345.png",
99105
"description": "Figure 1",
106+
"offset": 0,
107+
"length": 8,
100108
},
101109
{
102110
"figure_id": "123456789",
103111
"uri": "https://example.com/123456789.png",
104112
"description": "Figure 2",
113+
"offset": 10,
114+
"length": 8,
105115
},
106116
],
107117
},
@@ -112,3 +122,8 @@ async def test_clean(cleaner, sample_text, figures):
112122
assert result["data"] is not None
113123
assert result["data"]["chunk_cleaned"]
114124
assert "errors" not in result or result["errors"] is None
125+
assert "chunk_mark_up" in result["data"]
126+
assert "chunk_sections" in result["data"]
127+
assert "chunk_figures" in result["data"]
128+
assert len(result["data"]["chunk_figures"]) == 1
129+
assert result["data"]["chunk_figures"][0]["FigureId"] == "12345"

0 commit comments

Comments
 (0)