1+ #!/usr/bin/env python
2+ """测试Markdown公式提取功能"""
3+
4+ import unittest
5+ import sys
6+ import os
7+
# Put the parent of this file's directory (the project root) at the front of
# the import search path so the package import below can be resolved.
_tests_dir = os.path.dirname(__file__)
_project_root = os.path.join(_tests_dir, '..')
sys.path.insert(0, _project_root)
10+
11+ from webmainbench .metrics .base import BaseMetric , MetricResult
12+
13+
class TestFormulaExtractionMetric(BaseMetric):
    """Minimal concrete ``BaseMetric`` subclass used as a test fixture.

    It exists only so the tests can instantiate a metric and call the
    Markdown extraction helper on it; scoring is stubbed out with a constant
    result.
    """

    # The class name starts with "Test", so pytest would try to collect it as
    # a test class and warn because it is built with constructor arguments.
    # Opting out keeps collection clean without renaming the class.
    __test__ = False

    def _setup(self) -> None:
        """Do nothing: this stub needs no setup."""

    def _calculate_score(self, predicted: str, groundtruth: str, **kwargs) -> MetricResult:
        """Return a fixed result regardless of the inputs.

        Args:
            predicted: Ignored.
            groundtruth: Ignored.
            **kwargs: Ignored.

        Returns:
            A ``MetricResult`` named after this metric with ``score`` 1.0 and
            ``details`` set to ``{"test": True}``.
        """
        return MetricResult(
            metric_name=self.name,
            score=1.0,
            details={"test": True},
        )
26+
27+
class TestFormulaExtraction(unittest.TestCase):
    """Tests for extracting formulas from Markdown.

    Each test passes a Markdown snippet to the metric's
    ``_extract_from_markdown`` and checks the ``'formula'``, ``'text'`` and
    (for the table case) ``'table'`` entries of the returned mapping.

    LaTeX snippets are written as raw strings so that backslashes reach the
    extractor exactly as typed.
    """

    def setUp(self):
        """Create a fresh stub metric for every test."""
        self.metric = TestFormulaExtractionMetric("test_formula_metric")

    def test_inline_formula_extraction(self):
        """An inline ``$...$`` formula is extracted and removed from the text."""
        text = """这是行内公式示例: $E = mc^2$,这是普通文本。"""

        result = self.metric._extract_from_markdown(text)

        # The formula body is reported.
        self.assertIn('E = mc^2', result['formula'])

        # The delimited formula no longer appears in the plain text.
        self.assertNotIn('$E = mc^2$', result['text'])
        self.assertIn('这是行内公式示例: ,这是普通文本。', result['text'])

    def test_block_formula_extraction(self):
        """A multi-line ``$$...$$`` block is extracted and removed from the text."""
        text = r"""这是行间公式:
$$
\int_{-\infty}^{\infty} e^{-x^2} dx = \sqrt{\pi}
$$
公式结束"""

        result = self.metric._extract_from_markdown(text)

        # The formula body is reported.
        self.assertIn(
            r'\int_{-\infty}^{\infty} e^{-x^2} dx = \sqrt{\pi}',
            result['formula'],
        )

        # The surrounding lines are checked separately so that any number of
        # blank lines left behind by the removed block is accepted.
        self.assertIn('这是行间公式:', result['text'])
        self.assertIn('公式结束', result['text'])
        # No block delimiter may remain where the formula used to be.
        self.assertNotIn('$$', result['text'])

    def test_escaped_dollar_signs(self):
        r"""An escaped dollar sign (``\$``) is not treated as a formula delimiter."""
        text = r"""
这是转义的美元符号: \$100,不会被识别为公式。
而这个是公式: $a + b = c$
"""

        result = self.metric._extract_from_markdown(text)

        # The amount after the escaped dollar must not leak into the formulas.
        self.assertNotIn('100', result['formula'])
        # The genuine formula is still extracted.
        self.assertIn('a + b = c', result['formula'])
        # The escaped dollar sign stays in the plain text unchanged.
        self.assertIn(r'\$100', result['text'])

    def test_multiple_formulas(self):
        """Several inline and block formulas in one document are all extracted."""
        text = r"""公式1: $a = b + c$
公式2: $$x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}$$
公式3: $E_k = \frac{1}{2}mv^2$"""

        result = self.metric._extract_from_markdown(text)

        # Every formula is reported.
        self.assertIn('a = b + c', result['formula'])
        self.assertIn(r'x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}', result['formula'])
        self.assertIn(r'E_k = \frac{1}{2}mv^2', result['formula'])

        # The formulas are separated by line breaks.
        self.assertIn('\n', result['formula'])

    def test_formula_with_special_characters(self):
        """Formulas containing LaTeX commands, braces and carets are extracted intact."""
        text = r"""复杂公式: $\sum_{i=1}^n i = \frac{n(n+1)}{2}$
带希腊字母: $$\alpha + \beta = \gamma$$"""

        result = self.metric._extract_from_markdown(text)

        self.assertIn(r'\sum_{i=1}^n i = \frac{n(n+1)}{2}', result['formula'])
        self.assertIn(r'\alpha + \beta = \gamma', result['formula'])

    def test_formula_within_text(self):
        """Formulas embedded in running prose are extracted and the prose is kept."""
        text = r"""根据相对论 $E = mc^2$,能量和质量可以互相转换。
更复杂的情况如 $$\nabla \cdot \mathbf{E} = \frac{\rho}{\epsilon_0}$$ 所示。"""

        result = self.metric._extract_from_markdown(text)

        # Both formulas are reported.
        self.assertIn('E = mc^2', result['formula'])
        self.assertIn(
            r'\nabla \cdot \mathbf{E} = \frac{\rho}{\epsilon_0}',
            result['formula'],
        )

        # Any amount of whitespace may remain where a formula was removed, so
        # match the surrounding prose with a whitespace-tolerant pattern.
        self.assertRegex(result['text'], r'根据相对论\s*,能量和质量可以互相转换。')
        self.assertRegex(result['text'], r'更复杂的情况如\s*所示。')

    def test_empty_formulas(self):
        """Empty inline and block formulas contribute no formula content."""
        text = """空行内公式: $ $
空行间公式: $$ $$"""

        result = self.metric._extract_from_markdown(text)

        # Nothing but whitespace may be reported as formula content.
        self.assertEqual(result['formula'].strip(), '')

        # The empty delimiters are removed from the plain text.
        self.assertNotIn('$ $', result['text'])
        self.assertNotIn('$$ $$', result['text'])

    def test_formula_at_document_edges(self):
        """Formulas on the first or last line of the document are extracted."""
        # Each case runs as a sub-test so one failure does not hide the other.
        with self.subTest(position='start'):
            leading = """$start = 0$
后续文本"""
            leading_result = self.metric._extract_from_markdown(leading)
            self.assertIn('start = 0', leading_result['formula'])

        with self.subTest(position='end'):
            trailing = """前置文本
$$end = 1$$"""
            trailing_result = self.metric._extract_from_markdown(trailing)
            self.assertIn('end = 1', trailing_result['formula'])

    def test_formula_within_table(self):
        """Formulas inside table cells are extracted and the table is still reported."""
        text = r"""| 公式类型 | 示例 |
|----------|------|
| 行内公式 | $a + b = c$ |
| 行间公式 | $$\int_0^1 x dx = 0.5$$ |"""

        result = self.metric._extract_from_markdown(text)

        # Formulas from table cells are reported.
        self.assertIn('a + b = c', result['formula'])
        self.assertIn(r'\int_0^1 x dx = 0.5', result['formula'])

        # The table header row is still present in the extracted table.
        self.assertIn('| 公式类型 | 示例 |', result['table'])
167+
# Run the test suite only when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
0 commit comments