Python enable markdown acceptance tests (#64)

temyers · web-flow · commit 8f2bc620282d · 2025-06-30T11:26:06.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ This document is formatted according to the principles of [Keep A CHANGELOG](htt
 - [cpp] Actually allow comment inside descriptions ([#414](https://github.com/cucumber/gherkin/pull/414))
 - [cpp] Add missing translations for Rule ([#415](https://github.com/cucumber/gherkin/pull/415))
 - [cpp] Prefer the longest step keyword ([#416](https://github.com/cucumber/gherkin/pull/416))
+- [Python] Fix acceptance tests ([#64](https://github.com/cucumber/gherkin/pull/64))
 
 ## [32.1.2] - 2025-05-25
 ### Fixed
diff --git a/python/Makefile b/python/Makefile
@@ -8,8 +8,8 @@ SOURCE_FILES = $(shell find . -name "*.py" | grep -v $(GHERKIN_PARSER))
 GHERKIN_GENERATE_EVENTS = python -m scripts.generate_events
 GHERKIN_GENERATE_TOKENS = python -m scripts.generate_tokens
 
-GOOD_FEATURE_FILES = $(shell find ../testdata/good -name "*.feature")
-BAD_FEATURE_FILES  = $(shell find ../testdata/bad -name "*.feature")
+GOOD_FEATURE_FILES = $(shell find ../testdata/good -name "*.feature" -o -name "*.feature.md")
+BAD_FEATURE_FILES  = $(shell find ../testdata/bad -name "*.feature" -o -name "*.feature.md")
 
 TOKENS       = $(patsubst ../testdata/%,acceptance/testdata/%.tokens,$(GOOD_FEATURE_FILES))
 ASTS         = $(patsubst ../testdata/%,acceptance/testdata/%.ast.ndjson,$(GOOD_FEATURE_FILES))
diff --git a/python/gherkin/stream/gherkin_events.py b/python/gherkin/stream/gherkin_events.py
@@ -11,6 +11,8 @@
 from gherkin.stream.id_generator import IdGenerator
 from gherkin.stream.source_events import Event
 from gherkin.token import Location
+from gherkin.token_matcher import TokenMatcher
+from gherkin.token_matcher_markdown import GherkinInMarkdownTokenMatcher
 
 
 class Source(TypedDict):
@@ -65,7 +67,12 @@ def enum(
         source = source_event["source"]["data"]
 
         try:
-            gherkin_document = self.parser.parse(source)
+            matcher=None
+            if source_event["source"]["mediaType"] == 'text/x.cucumber.gherkin+plain':
+                matcher = TokenMatcher()
+            elif source_event["source"]["mediaType"] == 'text/x.cucumber.gherkin+markdown':
+                matcher = GherkinInMarkdownTokenMatcher()
+            gherkin_document = self.parser.parse(source, matcher)
             gherkin_document_with_uri: GherkinDocumentWithURI = {
                 **gherkin_document,
                 "uri": uri,
diff --git a/python/gherkin/stream/source_events.py b/python/gherkin/stream/source_events.py
@@ -14,12 +14,18 @@ class Event(TypedDict):
     source: Source
 
 
+def _media_type(path: str) -> str:
+    """Map *path* to its media type; anything but ``.feature.md`` is plain Gherkin."""
+    if path.endswith(".feature.md"):
+        return "text/x.cucumber.gherkin+markdown"
+    return "text/x.cucumber.gherkin+plain"
+
 def source_event(path: str) -> Event:
     event: Event = {
         "source": {
             "uri": path,
             "data": open(path, encoding="utf8", newline="").read(),
-            "mediaType": "text/x.cucumber.gherkin+plain",
+            "mediaType": _media_type(path),
         }
     }
     return event
diff --git a/python/gherkin/token_matcher_markdown.py b/python/gherkin/token_matcher_markdown.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
 import re
+from collections import defaultdict
 from collections.abc import Iterable
+from typing import TypedDict
 
 from .gherkin_line import Cell
 from .token import Token
 from .token_matcher import TokenMatcher, MatchedItems
+from .dialect import Dialect
 
 KEYWORD_PREFIX_BULLET = "^(\\s*[*+-]\\s*)"
 KEYWORD_PREFIX_HEADER = "^(#{1,6}\\s)"
@@ -23,7 +26,7 @@ def reset(self) -> None:
     def match_FeatureLine(self, token: Token) -> bool:
 
         if self.matched_feature_line:
-            self._set_token_matched(token, None)
+            return False
 
         # We first try to match "# Feature: blah"
         result = self._match_title_line(
@@ -39,8 +42,8 @@ def match_FeatureLine(self, token: Token) -> bool:
 
         if not result:
             self._set_token_matched(token, "FeatureLine", token.line.get_line_text())
-        self.matched_feature_line = result
-        return result
+        self.matched_feature_line = True
+        return True
 
     def match_RuleLine(self, token: Token) -> bool:
         return self._match_title_line(
@@ -103,26 +106,19 @@ def _is_gfm_table_separator(self, table_cells: list[Cell]) -> bool:
         return len(separator_values) > 0
 
     def match_StepLine(self, token: Token) -> bool:
-        nonStarStepKeywords = (
-            self.dialect.given_keywords
-            + self.dialect.when_keywords
-            + self.dialect.then_keywords
-            + self.dialect.and_keywords
-            + self.dialect.but_keywords
-        )
         return self._match_title_line(
-            KEYWORD_PREFIX_BULLET, nonStarStepKeywords, "", token, "StepLine"
+            KEYWORD_PREFIX_BULLET, self._sorted_step_keywords, "", token, "StepLine"
         )
 
     def match_Comment(self, token: Token) -> bool:
         if token.line.startswith("|"):
             table_cells = token.line.table_cells
             if self._is_gfm_table_separator(table_cells):
+                self._set_token_matched(token, "Empty", indent=0)
                 return True
-        return self._set_token_matched(token, None, False)
+        return False
 
     def match_Empty(self, token: Token) -> bool:
-
         result = False
         if token.line.is_empty():
             result = True
@@ -199,18 +195,35 @@ def _match_title_line(
         token: Token,
         token_type: str,
     ) -> bool:
-        keywords_or_list = "|".join(map(lambda x: re.escape(x), keywords))
-        match = re.search(
-            f"{prefix}({keywords_or_list}){keywordSuffix}(.*)",
-            token.line.get_line_text(),
-        )
-        indent = token.line.indent
-
-        if match:
-            matchedKeyword = match.group(2)
-            indent += len(match.group(1))
-            self._set_token_matched(
-                token, token_type, match.group(3).strip(), matchedKeyword, indent=indent
+        text = token.line.get_line_text()
+        for keyword in keywords:
+            match = re.search(
+                    f"{prefix}({re.escape(keyword)}){keywordSuffix}(.*)",
+                    text
             )
-            return True
+            if match:
+                indent = token.line.indent + len(match.group(1))
+                matchedKeyword = match.group(2)
+                # only set the keyword type if this is a step keyword
+                if( matchedKeyword in self.keyword_types ):
+                    matchedKeywordType = self.keyword_types[matchedKeyword][0]
+                else:
+                    matchedKeywordType = None
+                self._set_token_matched(
+                    token,
+                    token_type,
+                    match.group(3).strip(),
+                    matchedKeyword,
+                    keyword_type=matchedKeywordType,
+                    indent=indent
+                )
+                return True
+
         return False
+
+    def _change_dialect(self, dialect_name, location=None) -> None:
+        # Switch dialect as usual, then take "* " out of the step keywords.
+        super()._change_dialect(dialect_name, location)
+        self._sorted_step_keywords = [
+            keyword for keyword in self._sorted_step_keywords if keyword != '* '
+        ]
diff --git a/python/scripts/generate_tokens.py b/python/scripts/generate_tokens.py
@@ -1,16 +1,20 @@
 import sys
-
 from gherkin.token_scanner import TokenScanner
 from gherkin.token_formatter_builder import TokenFormatterBuilder
 from gherkin.parser import Parser
+from gherkin.token_matcher_markdown import GherkinInMarkdownTokenMatcher
 
 
 def main() -> None:
     files = sys.argv[1:]
     parser = Parser(TokenFormatterBuilder())
     for file in files:
         scanner = TokenScanner(file)
-        print(parser.parse(scanner))
+        # Markdown sources need the Markdown-aware token matcher.
+        if file.endswith('.md'):
+            print(parser.parse(scanner, GherkinInMarkdownTokenMatcher()))
+        else:
+            print(parser.parse(scanner))
 
 
 if __name__ == "__main__":
diff --git a/python/test/gherkin_in_markdown_token_matcher_test.py b/python/test/gherkin_in_markdown_token_matcher_test.py
@@ -5,9 +5,18 @@
 location = {"line": 1, "column": 1}
 
 
-def test_it_matches_FeatureLine():
-    tm = GherkinInMarkdownTokenMatcher("en")
-    line = GherkinLine("""## Feature: hello""", location["line"])
+def test_it_matches_FeatureLineH1():
+    matcher = GherkinInMarkdownTokenMatcher('en')
+    header = GherkinLine('''# Feature: hello''', location['line'])  # level-1 header
+    token = Token(gherkin_line=header, location=location)
+    assert matcher.match_FeatureLine(token)
+    assert token.matched_type == 'FeatureLine'
+    assert token.matched_keyword == 'Feature'
+    assert token.matched_text == 'hello'
+
+def test_it_matches_FeatureLineH2():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''## Feature: hello''',location['line'])
     token = Token(gherkin_line=line, location=location)
     assert tm.match_FeatureLine(token)
     assert token.matched_type == "FeatureLine"
@@ -25,6 +34,15 @@ def test_it_matches_FeatureLine_in_French():
     assert token.matched_text == "hello"
 
 
+def test_it_matches_FeatureLine_without_the_Feature_keyword():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''# hello''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_FeatureLine(token)
+    assert token.matched_type == 'FeatureLine'
+    assert token.matched_keyword is None  # no Feature keyword on the line
+    assert token.matched_text == '# hello'  # the full line text is kept
+
 def test_it_matches_bullet_Step():
     tm = GherkinInMarkdownTokenMatcher("en")
     line = GherkinLine("""  *  Given I have 3 cukes""", location["line"])
@@ -41,21 +59,33 @@ def test_it_matches_plus_Step():
     line = GherkinLine("""  +  Given I have 3 cukes""", location["line"])
     token = Token(gherkin_line=line, location=location)
     assert tm.match_StepLine(token)
-    assert token.matched_type == "StepLine"
-    assert token.matched_keyword == "Given "
-    assert token.matched_text == "I have 3 cukes"
-    assert token.location["column"] == 6
-
+    assert token.matched_type == 'StepLine'
+    assert token.matched_keyword == 'Given '
+    assert token.matched_keyword_type == 'Context'
+    assert token.matched_text == 'I have 3 cukes'
+    assert token.location['column'] == 6
 
 def test_it_matches_hyphen_Step():
     tm = GherkinInMarkdownTokenMatcher("en")
     line = GherkinLine("""  -  Given I have 3 cukes""", location["line"])
     token = Token(gherkin_line=line, location=location)
     assert tm.match_StepLine(token)
-    assert token.matched_type == "StepLine"
-    assert token.matched_keyword == "Given "
-    assert token.matched_text == "I have 3 cukes"
-    assert token.location["column"] == 6
+    assert token.matched_type == 'StepLine'
+    assert token.matched_keyword == 'Given '
+    assert token.matched_keyword_type == 'Context'
+    assert token.matched_text == 'I have 3 cukes'
+    assert token.location['column'] == 6
+
+def test_it_matches_a_when_Step():
+    matcher = GherkinInMarkdownTokenMatcher('en')
+    bullet = GherkinLine('''  -  When I do something''', location['line'])
+    token = Token(gherkin_line=bullet, location=location)
+    assert matcher.match_StepLine(token)
+    assert token.matched_type == 'StepLine'
+    assert token.matched_keyword == 'When '
+    assert token.matched_keyword_type == 'Action'
+    assert token.matched_text == 'I do something'
+    assert token.location['column'] == 6  # "When" starts at the 6th character
 
 
 def test_it_matches_arbitrary_text_as_Other():
@@ -156,19 +186,6 @@ def test_it_does_not_match_table_row_indented_6_space():
     assert not tm.match_TableRow(token)
 
 
-def test_it_matches_table_separator_row_as_comment():
-    tm = GherkinInMarkdownTokenMatcher("en")
-
-    l1 = GherkinLine("  | h1 | h2 |", location["line"])
-    t1 = Token(l1, location)
-    assert tm.match_TableRow(t1)
-
-    l2 = GherkinLine("  | --- | --- |", location["line"])
-    t2 = Token(l2, location)
-    assert not tm.match_TableRow(t2)
-    assert tm.match_Comment(t2)
-
-
 def test_it_matches_indented_tags():
     tm = GherkinInMarkdownTokenMatcher("en")
 
@@ -238,6 +255,34 @@ def test_it_matches_ExamplesLine():
     line = GherkinLine("""## Examples: """, location["line"])
     token = Token(gherkin_line=line, location=location)
     assert tm.match_ExamplesLine(token)
-    assert token.matched_type == "ExamplesLine"
-    assert token.matched_keyword == "Examples"
-    assert token.matched_text == ""
+    assert token.matched_type == 'ExamplesLine'
+    assert token.matched_keyword == 'Examples'
+    assert token.matched_text == ''
+
+def test_it_matches_Empty():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('', location['line'])  # an empty source line
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_Empty(token)
+    assert token.matched_type == 'Empty'
+    assert token.matched_keyword is None
+    assert token.matched_text is None
+
+def test_it_matches_arbitrary_text_as_Empty_after_the_FeatureLine_has_already_been_matched():
+    # White-box test: exercises an implementation detail of the matcher.
+    # Given the FeatureLine has already been matched
+    tm = GherkinInMarkdownTokenMatcher('en')
+
+    line = GherkinLine('''# something arbitrary''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_FeatureLine(token)
+
+    # When a line of free-form text follows, it is expected to match as Empty
+    line = GherkinLine('''arbitrary text''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+
+    assert tm.match_Empty(token)
+    assert token.matched_type == 'Empty'
+    assert token.matched_items == []
+    assert token.matched_keyword is None
+    assert token.matched_text is None
diff --git a/python/test/gherkin_test.py b/python/test/gherkin_test.py
diff --git a/python/test/source_events_test.py b/python/test/source_events_test.py