feat: upgrade the CommonMark spec to 0.31.2 (#198)

frostming · web-flow · commit b65bc852b407 · 2024-06-21T11:20:54.000+08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## v2.1.2(2024-06-21)
+
+### Changed
+
+- Update the GFM spec to the latest master branch.
+- Update the CommonMark spec to 0.31.2.
+
 ## v2.1.1(2024-06-19)
 
 ### Fixed
diff --git a/README.md b/README.md
@@ -5,13 +5,13 @@
 [![PyPI](https://img.shields.io/pypi/v/marko.svg?logo=python&logoColor=white)](https://pypi.org/project/marko/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/marko.svg?logo=python&logoColor=white)](https://pypi.org/project/marko/)
 [![Documentation Status](https://img.shields.io/readthedocs/marko-py.svg?logo=readthedocs)](https://marko-py.readthedocs.io/en/latest/?badge=latest)
-[![CommonMark Spec](https://img.shields.io/badge/CommonMark-0.30-blue.svg)][spec]
+[![CommonMark Spec](https://img.shields.io/badge/CommonMark-0.31.2-blue.svg)][spec]
 
 ![Build Status](https://github.com/frostming/marko/workflows/Tests/badge.svg)
 [![codecov](https://codecov.io/gh/frostming/marko/branch/master/graph/badge.svg)](https://codecov.io/gh/frostming/marko)
 [![Codacy Badge](https://api.codacy.com/project/badge/Grade/b785f5b3fa7c4d93a02372d31b3f73b1)](https://www.codacy.com/app/frostming/marko?utm_source=github.com&utm_medium=referral&utm_content=frostming/marko&utm_campaign=Badge_Grade)
 
-Marko is a pure Python markdown parser that adheres to the specifications of [CommonMark's spec v0.30][spec]. It has been designed with high extensibility in mind, as detailed in the [Extensions](#extensions) section.
+Marko is a pure Python markdown parser that adheres to the specifications of [CommonMark's spec v0.31.2][spec]. It has been designed with high extensibility in mind, as detailed in the [Extensions](#extensions) section.
 
 Marko requires Python 3.8 or higher.
 
@@ -21,7 +21,7 @@ Of all the Python markdown parsers available, a common issue is the difficulty f
 
 Marko's compliance with the complex CommonMark specification can impact its performance. However, using a parser that does not adhere to this spec may result in unexpected rendering outcomes. According to benchmark results, Marko is three times slower than Python-Markdown but slightly faster than Commonmark-py and significantly slower than mistune. If prioritizing performance over spec compliance is crucial for you, it would be best to opt for another parser.
 
-[spec]: https://spec.commonmark.org/0.30/
+[spec]: https://spec.commonmark.org/0.31.2/
 [pymd]: https://github.com/waylan/Python-Markdown
 [mistune]: https://github.com/lepture/mistune
 [cmpy]: https://github.com/rtfd/CommonMark-py
diff --git a/marko/ext/gfm/__init__.py b/marko/ext/gfm/__init__.py
@@ -23,7 +23,6 @@
 GFM = MarkoExtension(
     elements=[
         elements.Paragraph,
-        elements.InlineHTML,
         elements.Strikethrough,
         elements.Url,
         elements.Table,
diff --git a/marko/ext/gfm/elements.py b/marko/ext/gfm/elements.py
@@ -8,7 +8,7 @@
 import re
 from typing import Any, cast
 
-from marko import block, inline, patterns
+from marko import block, inline
 from marko.source import Source
 
 
@@ -24,18 +24,6 @@ def __init__(self, lines):
             self.inline_body = self.inline_body[m.end(1) :]
 
 
-class InlineHTML(inline.InlineHTML):
-    pattern = re.compile(
-        r"(<%s(?:%s)* */?>"  # open tag
-        r"|</%s *>"  # closing tag
-        r"|<!--(?:>|->|[\s\S]*?-->)"  # HTML comment
-        r"|<\?[\s\S]*?\?>"  # processing instruction
-        r"|<![A-Z]+ +[\s\S]*?>"  # declaration
-        r"|<!\[CDATA\[[\s\S]*?\]\]>)"  # CDATA section
-        % (patterns.tag_name, patterns.attribute, patterns.tag_name)
-    )
-
-
 class Strikethrough(inline.InlineElement):
     pattern = re.compile(r"(?<!~)(~|~~)([^~]+)\1(?!~)")
     priority = 5
diff --git a/marko/inline.py b/marko/inline.py
@@ -92,7 +92,7 @@ class InlineHTML(InlineElement):
     pattern = re.compile(
         r"(<%s(?:%s)* */?>"  # open tag
         r"|</%s *>"  # closing tag
-        r"|<!--(?!>|->|[\s\S]*?--[\s\S]*?-->)[\s\S]*?(?<!-)-->"  # HTML comment
+        r"|<!--(?:>|->|[\s\S]*?-->)"  # HTML comment
         r"|<\?[\s\S]*?\?>"  # processing instruction
         r"|<![A-Z]+ +[\s\S]*?>"  # declaration
         r"|<!\[CDATA\[[\s\S]*?\]\]>)"  # CDATA section
diff --git a/marko/inline_parser.py b/marko/inline_parser.py
@@ -515,16 +515,12 @@ def is_right_flanking(self) -> bool:
         )
 
     def followed_by_punc(self) -> bool:
-        return (
-            self.end < len(self.text)
-            and patterns.punctuation.match(self.text, self.end) is not None
+        return self.end < len(self.text) and patterns.is_punctuation(
+            self.text[self.end]
         )
 
     def preceded_by_punc(self) -> bool:
-        return (
-            self.start > 0
-            and patterns.punctuation.match(self.text[self.start - 1]) is not None
-        )
+        return self.start > 0 and patterns.is_punctuation(self.text[self.start - 1])
 
     def closed_by(self, other: Delimiter) -> bool:
         return not (
diff --git a/marko/patterns.py b/marko/patterns.py
@@ -2,7 +2,10 @@
 Some regex patterns
 """
 
+import functools
 import re
+import string
+import unicodedata
 
 tags = [
     "address",
@@ -86,30 +89,11 @@
     r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9]"
     r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*"
 )
-punctuation = re.compile(
-    r'[!"#$%&\'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB'
-    r"\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3"
-    r"\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F"
-    r"\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E"
-    r"\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12"
-    r"\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB"
-    r"\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736"
-    r"\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-"
-    r"\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F"
-    r"\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E"
-    r"\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5"
-    r"\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC"
-    r"\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011"
-    r"\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673"
-    r"\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E"
-    r"\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0"
-    r"\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63"
-    r"\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B"
-    r"\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-"
-    r"\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58"
-    r"\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD804[\uDC47-\uDC4D"
-    r"\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC9\uDDCD"
-    r"\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDCC6\uDDC1-\uDDD7"
-    r"\uDE41-\uDE43\uDF3C-\uDF3E]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F"
-    r"\uDEF5\uDF37-\uDF3B\uDF44]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]"
-)
+
+
+@functools.lru_cache(maxsize=128)
+def is_punctuation(ch: str) -> bool:
+    if ch in string.punctuation:
+        return True
+    category = unicodedata.category(ch)
+    return category.startswith("P") or category.startswith("S")
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,63 +0,0 @@
-# -*- coding: utf-8 -*-
-import codecs
-import os
-import re
-
-from tests.normalize import normalize_html
-
-TEST_ROOT = os.path.dirname(__file__)
-EXAMPLE_PATTERN = re.compile(
-    r"^`{32} example\b.*?\n([\s\S]*?)^\.\n([\s\S]*?)^`{32}$|^#{1,6} *(.*)$",
-    flags=re.M,
-)
-
-
-def parse_examples(text):
-    data = EXAMPLE_PATTERN.findall(text)
-
-    section = None
-    count = 0
-    for md, html, title in data:
-        if title:
-            count = 0
-            section = title.lower().split("(")[0].replace(" ", "_")
-
-        if md and html:
-            count += 1
-            name = "%s_%03d" % (section, count)
-            md = md.replace("→", "\t")
-            html = html.replace("→", "\t")
-            yield name, md, html
-
-
-class SpecTestSuite:
-    @classmethod
-    def load_spec(cls, spec_name):
-        def attach_case(n, md, html):
-            def method(self):
-                self.assert_case(md, html)
-
-            name = "test_{}".format(n)
-            method.__name__ = name
-            method.__doc__ = "Run spec {} - {}".format(spec_name, n)
-            setattr(cls, name, method)
-
-        spec_file = os.path.join(TEST_ROOT, "spec/{}.txt".format(spec_name))
-        with codecs.open(spec_file, encoding="utf-8") as f:
-            for name, md, html in parse_examples(f.read()):
-                if not cls.ignore_case(name):
-                    attach_case(name, md, html)
-
-    @classmethod
-    def ignore_case(cls, n):
-        return False
-
-    def assert_case(self, text, html):
-        result = self.markdown(text)
-        assert normalize_html(result) == normalize_html(html), repr(result)
-
-    # Extra cases that are not included
-    def test_mixed_tab_space_in_list_item(self):
-        text = "* foo\n\t* foo.bar"
-        html = "<ul><li>foo<ul><li>foo.bar</li></ul></li></ul>"
-        self.assert_case(text, html)
diff --git a/tests/spec/commonmark.txt b/tests/spec/commonmark.txt
diff --git a/tests/spec/gfm.txt b/tests/spec/gfm.txt
diff --git a/tests/test_spec.py b/tests/test_spec.py
diff --git a/tests/update_spec.sh b/tests/update_spec.sh