@@ -61,7 +61,7 @@ def test_remove_markdown_tags(cleaner):
6161 """
6262 tag_patterns = {
6363 "figurecontent" : r"<!-- FigureContent=(.*?)-->" ,
64- "figure" : r"<figure(?:\s+FigureId=\"[^\"]*\")?>(.*?)</figure>" ,
64+ "figure" : r"<figure(?:\s+FigureId=( \"[^\"]*\"|'[^']*') )?>(.*?)</figure>" ,
6565 }
6666 cleaned_text = cleaner .remove_markdown_tags (text , tag_patterns )
6767 assert "Some figure" in cleaned_text
@@ -77,12 +77,18 @@ def test_clean_text_and_extract_metadata(cleaner, sample_text, figures):
7777 assert result ["chunk_sections" ] == ["Header 1" , "Header 2" ]
7878 assert result ["chunk_figures" ] == [
7979 {
80- "figure_id" : "12345" ,
81- "uri" : "https://example.com/12345.png" ,
82- "description" : "Figure 1" ,
80+ "FigureId" : "12345" ,
81+ "Caption" : None ,
82+ "offset" : 0 ,
83+ "length" : 8 ,
84+ "PageNumber" : None ,
85+ "Uri" : "https://example.com/12345.png" ,
86+ "Description" : "Figure 1" ,
87+ "Data" : None ,
8388 }
8489 ]
8590 assert "chunk_cleaned" in result
91+ print (result ["chunk_cleaned" ])
8692 assert "FigureId='12345'" not in result ["chunk_cleaned" ]
8793
8894
@@ -97,11 +103,15 @@ async def test_clean(cleaner, sample_text, figures):
97103 "figure_id" : "12345" ,
98104 "uri" : "https://example.com/12345.png" ,
99105 "description" : "Figure 1" ,
106+ "offset" : 0 ,
107+ "length" : 8 ,
100108 },
101109 {
102110 "figure_id" : "123456789" ,
103111 "uri" : "https://example.com/123456789.png" ,
104112 "description" : "Figure 2" ,
113+ "offset" : 10 ,
114+ "length" : 8 ,
105115 },
106116 ],
107117 },
@@ -112,3 +122,8 @@ async def test_clean(cleaner, sample_text, figures):
112122 assert result ["data" ] is not None
113123 assert result ["data" ]["chunk_cleaned" ]
114124 assert "errors" not in result or result ["errors" ] is None
125+ assert "chunk_mark_up" in result ["data" ]
126+ assert "chunk_sections" in result ["data" ]
127+ assert "chunk_figures" in result ["data" ]
128+ assert len (result ["data" ]["chunk_figures" ]) == 1
129+ assert result ["data" ]["chunk_figures" ][0 ]["FigureId" ] == "12345"
0 commit comments