Skip to content

Commit 8b0182d

Browse files
authored
Merge pull request #98 from koevas1226/fix-reg-warnings
fix: python>=3.12正则产生的warnings
2 parents 807c63b + d00c15e commit 8b0182d

File tree

2 files changed

+77
-14
lines changed

2 files changed

+77
-14
lines changed

rapid_table/table_matcher/utils.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,20 @@ def deal_isolate_span(thead_part):
2828
"""
2929
# 1. find out isolate span tokens.
3030
isolate_pattern = (
31-
'<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
32-
'<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
33-
'<td></td> rowspan="(\d)+"></b></td>|'
34-
'<td></td> colspan="(\d)+"></b></td>'
31+
r'<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
32+
r'<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
33+
r'<td></td> rowspan="(\d)+"></b></td>|'
34+
r'<td></td> colspan="(\d)+"></b></td>'
3535
)
3636
isolate_iter = re.finditer(isolate_pattern, thead_part)
3737
isolate_list = [i.group() for i in isolate_iter]
3838

3939
# 2. find out span number, by step 1 results.
4040
span_pattern = (
41-
' rowspan="(\d)+" colspan="(\d)+"|'
42-
' colspan="(\d)+" rowspan="(\d)+"|'
43-
' rowspan="(\d)+"|'
44-
' colspan="(\d)+"'
41+
r' rowspan="(\d)+" colspan="(\d)+"|'
42+
r' colspan="(\d)+" rowspan="(\d)+"|'
43+
r' rowspan="(\d)+"|'
44+
r' colspan="(\d)+"'
4545
)
4646
corrected_list = []
4747
for isolate_item in isolate_list:
@@ -72,11 +72,11 @@ def deal_duplicate_bb(thead_part):
7272
"""
7373
# 1. find out <td></td> in <thead></thead>.
7474
td_pattern = (
75-
'<td rowspan="(\d)+" colspan="(\d)+">(.+?)</td>|'
76-
'<td colspan="(\d)+" rowspan="(\d)+">(.+?)</td>|'
77-
'<td rowspan="(\d)+">(.+?)</td>|'
78-
'<td colspan="(\d)+">(.+?)</td>|'
79-
"<td>(.*?)</td>"
75+
r'<td rowspan="(\d)+" colspan="(\d)+">(.+?)</td>|'
76+
r'<td colspan="(\d)+" rowspan="(\d)+">(.+?)</td>|'
77+
r'<td rowspan="(\d)+">(.+?)</td>|'
78+
r'<td colspan="(\d)+">(.+?)</td>|'
79+
r'<td>(.*?)</td>'
8080
)
8181
td_iter = re.finditer(td_pattern, thead_part)
8282
td_list = [t.group() for t in td_iter]
@@ -115,7 +115,7 @@ def deal_bb(result_token):
115115
origin_thead_part = copy.deepcopy(thead_part)
116116

117117
# check "rowspan" or "colspan" occur in <thead></thead> parts or not .
118-
span_pattern = '<td rowspan="(\d)+" colspan="(\d)+">|<td colspan="(\d)+" rowspan="(\d)+">|<td rowspan="(\d)+">|<td colspan="(\d)+">'
118+
span_pattern = r'<td rowspan="(\d)+" colspan="(\d)+">|<td colspan="(\d)+" rowspan="(\d)+">|<td rowspan="(\d)+">|<td colspan="(\d)+">'
119119
span_iter = re.finditer(span_pattern, thead_part)
120120
span_list = [s.group() for s in span_iter]
121121
has_span_in_head = True if len(span_list) > 0 else False

tests/table_matcher/utils.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import unittest
2+
import warnings
3+
4+
5+
class TestRegexWarning(unittest.TestCase):
    """Check that non-raw regex literals warn when compiled and raw ones do not.

    The sample sources below are stored in raw triple-quoted strings so that
    this test module does not itself contain invalid escape sequences; the
    text handed to ``compile()`` is exactly the same as with a non-raw literal.
    """

    # Python 3.12+ reports an invalid escape sequence as SyntaxWarning, while
    # Python 3.6-3.11 report the same problem as DeprecationWarning.
    _ESCAPE_WARNING_CATEGORIES = (SyntaxWarning, DeprecationWarning)

    # Source whose pattern literals are NOT raw strings: compiling it warns.
    _NON_RAW_PATTERN_SOURCE = r"""
import re
thead_part = '<td></td> rowspan="2"></b></td>'
isolate_pattern = (
'<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
'<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
'<td></td> rowspan="(\d)+"></b></td>|'
'<td></td> colspan="(\d)+"></b></td>'
)
re.finditer(isolate_pattern, thead_part)
"""

    # Same code with raw-string pattern literals: compiling it stays silent.
    _RAW_PATTERN_SOURCE = r"""
import re
thead_part = '<td></td> rowspan="2"></b></td>'
isolate_pattern_raw = (
r'<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
r'<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
r'<td></td> rowspan="(\d)+"></b></td>|'
r'<td></td> colspan="(\d)+"></b></td>'
)
re.finditer(isolate_pattern_raw, thead_part)
"""

    @staticmethod
    def _compile_and_record(source):
        """Compile *source* (without executing it) and return every warning raised."""
        with warnings.catch_warnings(record=True) as w:
            # "always" defeats the once-per-location filter so reruns still record.
            warnings.simplefilter("always")
            compile(source, "<string>", "exec")
        return list(w)

    def _escape_warnings(self, recorded):
        """Return the recorded warnings whose category marks an invalid escape."""
        return [
            warn
            for warn in recorded
            if issubclass(warn.category, self._ESCAPE_WARNING_CATEGORIES)
        ]

    def test_regex_syntax_warning(self):
        """Compiling non-raw pattern literals must report invalid escape sequences."""
        w = self._compile_and_record(self._NON_RAW_PATTERN_SOURCE)

        syntax_warnings = self._escape_warnings(w)
        self.assertTrue(
            len(syntax_warnings) > 0, f"未捕获到 SyntaxWarning: {[str(warn.message) for warn in w]}"
        )
        # Every captured warning should be about the invalid escape sequence.
        for warning in syntax_warnings:
            self.assertIn("invalid escape sequence", str(warning.message))

    def test_correct_regex_pattern(self):
        """Compiling raw-string pattern literals must not report any escape warning."""
        w = self._compile_and_record(self._RAW_PATTERN_SOURCE)

        syntax_warnings = self._escape_warnings(w)
        self.assertTrue(
            len(syntax_warnings) == 0, f"正常写法捕获到 SyntaxWarning: {[str(warn.message) for warn in w]}"
        )
60+
61+
62+
if __name__ == "__main__":
63+
unittest.main()

0 commit comments

Comments
 (0)