Merge pull request #98 from koevas1226/fix-reg-warnings

SWHL · web-flow · commit 8b0182dfa483 · 2025-06-21T08:10:25.000+08:00
fix: python>=3.12正则产生的warnings
diff --git a/rapid_table/table_matcher/utils.py b/rapid_table/table_matcher/utils.py
@@ -28,20 +28,20 @@ def deal_isolate_span(thead_part):
     """
     # 1. find out isolate span tokens.
     isolate_pattern = (
-        '<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
-        '<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
-        '<td></td> rowspan="(\d)+"></b></td>|'
-        '<td></td> colspan="(\d)+"></b></td>'
+        r'<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
+        r'<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
+        r'<td></td> rowspan="(\d)+"></b></td>|'
+        r'<td></td> colspan="(\d)+"></b></td>'
     )
     isolate_iter = re.finditer(isolate_pattern, thead_part)
     isolate_list = [i.group() for i in isolate_iter]
 
     # 2. find out span number, by step 1 results.
     span_pattern = (
-        ' rowspan="(\d)+" colspan="(\d)+"|'
-        ' colspan="(\d)+" rowspan="(\d)+"|'
-        ' rowspan="(\d)+"|'
-        ' colspan="(\d)+"'
+        r' rowspan="(\d)+" colspan="(\d)+"|'
+        r' colspan="(\d)+" rowspan="(\d)+"|'
+        r' rowspan="(\d)+"|'
+        r' colspan="(\d)+"'
     )
     corrected_list = []
     for isolate_item in isolate_list:
@@ -72,11 +72,11 @@ def deal_duplicate_bb(thead_part):
     """
     # 1. find out <td></td> in <thead></thead>.
     td_pattern = (
-        '<td rowspan="(\d)+" colspan="(\d)+">(.+?)</td>|'
-        '<td colspan="(\d)+" rowspan="(\d)+">(.+?)</td>|'
-        '<td rowspan="(\d)+">(.+?)</td>|'
-        '<td colspan="(\d)+">(.+?)</td>|'
-        "<td>(.*?)</td>"
+        r'<td rowspan="(\d)+" colspan="(\d)+">(.+?)</td>|'
+        r'<td colspan="(\d)+" rowspan="(\d)+">(.+?)</td>|'
+        r'<td rowspan="(\d)+">(.+?)</td>|'
+        r'<td colspan="(\d)+">(.+?)</td>|'
+        r'<td>(.*?)</td>'
     )
     td_iter = re.finditer(td_pattern, thead_part)
     td_list = [t.group() for t in td_iter]
@@ -115,7 +115,7 @@ def deal_bb(result_token):
     origin_thead_part = copy.deepcopy(thead_part)
 
     # check "rowspan" or "colspan" occur in <thead></thead> parts or not .
-    span_pattern = '<td rowspan="(\d)+" colspan="(\d)+">|<td colspan="(\d)+" rowspan="(\d)+">|<td rowspan="(\d)+">|<td colspan="(\d)+">'
+    span_pattern = r'<td rowspan="(\d)+" colspan="(\d)+">|<td colspan="(\d)+" rowspan="(\d)+">|<td rowspan="(\d)+">|<td colspan="(\d)+">'
     span_iter = re.finditer(span_pattern, thead_part)
     span_list = [s.group() for s in span_iter]
     has_span_in_head = True if len(span_list) > 0 else False
diff --git a/tests/table_matcher/utils.py b/tests/table_matcher/utils.py
@@ -0,0 +1,63 @@
+import unittest
+import warnings
+
+
+class TestRegexWarning(unittest.TestCase):
+    """Compile small regex snippets and check for invalid-escape warnings."""
+
+    # Invalid escapes warn as SyntaxWarning on Python >= 3.12 and DeprecationWarning before.
+    escape_warning_types = (SyntaxWarning, DeprecationWarning)
+
+    def test_regex_syntax_warning(self):
+        """Non-raw pattern literals must trigger an invalid-escape warning."""
+        # The outer literal is raw so that this test module itself stays warning-free.
+        code_with_invalid_escape = r"""
+import re
+thead_part = '<td></td> rowspan="2"></b></td>'
+isolate_pattern = (
+    '<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
+    '<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
+    '<td></td> rowspan="(\d)+"></b></td>|'
+    '<td></td> colspan="(\d)+"></b></td>'
+)
+re.finditer(isolate_pattern, thead_part)
+"""
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            # Compiling the snippet (not running it) is what emits the warning.
+            compile(code_with_invalid_escape, "<string>", "exec")
+
+        escape_warnings = [warn for warn in w if issubclass(warn.category, self.escape_warning_types)]
+        self.assertTrue(
+            len(escape_warnings) > 0, f"未捕获到 SyntaxWarning: {[str(warn.message) for warn in w]}"
+        )
+        # Every captured warning should be about the invalid escape sequence.
+        for warning in escape_warnings:
+            self.assertIn("invalid escape sequence", str(warning.message))
+
+    def test_correct_regex_pattern(self):
+        """Raw-string pattern literals must compile without any escape warning."""
+        code_with_raw_strings = r"""
+import re
+thead_part = '<td></td> rowspan="2"></b></td>'
+isolate_pattern_raw = (
+    r'<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
+    r'<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
+    r'<td></td> rowspan="(\d)+"></b></td>|'
+    r'<td></td> colspan="(\d)+"></b></td>'
+)
+re.finditer(isolate_pattern_raw, thead_part)
+"""
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            # The inner literals are raw, so the compiler has no escape to complain about.
+            compile(code_with_raw_strings, "<string>", "exec")
+
+        escape_warnings = [warn for warn in w if issubclass(warn.category, self.escape_warning_types)]
+        self.assertTrue(
+            len(escape_warnings) == 0, f"正常写法捕获到 SyntaxWarning: {[str(warn.message) for warn in w]}"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()