[Perl] add MarkdownTokenMatcher skeleton

fperrad · fperrad · commit f1e3ad34fc19 · 2025-05-14T17:02:17.000+02:00
diff --git a/perl/bin/gherkin-generate-tokens b/perl/bin/gherkin-generate-tokens
@@ -6,16 +6,23 @@ use lib 'Gherkin-latest/lib';
 
 use Gherkin::Parser;
 use Gherkin::TokenFormatterBuilder;
+use Gherkin::TokenMatcher;
+use Gherkin::MarkdownTokenMatcher;
 
 package App::GherkinGenerateTokens;
 
 sub run {
     my ( $class, $fh, @file_list ) = @_;
 
-    my $parser
-        = Gherkin::Parser->new( Gherkin::TokenFormatterBuilder->new() );
-
-    print $fh join "\n", @{ $parser->parse($_) } for @file_list;
+    print $fh join "\n",
+      @{ Gherkin::Parser->new(
+            Gherkin::TokenFormatterBuilder->new(),
+            /\.md$/
+            ? Gherkin::MarkdownTokenMatcher->new()
+            : Gherkin::TokenMatcher->new()
+        )->parse($_)
+      }
+      for @file_list;
     print $fh "\n";
 
 }
diff --git a/perl/lib/Gherkin.pm b/perl/lib/Gherkin.pm
@@ -10,6 +10,8 @@ use Cucumber::Messages;
 use Gherkin::AstBuilder;
 use Gherkin::Parser;
 use Gherkin::Pickles::Compiler;
+use Gherkin::TokenMatcher;
+use Gherkin::MarkdownTokenMatcher;
 
 
 use Class::XSAccessor accessors =>
@@ -53,8 +55,10 @@ sub from_paths {
                 source => Cucumber::Messages::Source->new(
                     uri        => $path,
                     data       => $content,
-                    media_type => Cucumber::Messages::Source::MEDIATYPE_TEXT_X_CUCUMBER_GHERKIN_PLAIN,
-                    )
+                    media_type => $path =~ m/\.md$/
+                    ? Cucumber::Messages::Source::MEDIATYPE_TEXT_X_CUCUMBER_GHERKIN_MARKDOWN
+                    : Cucumber::Messages::Source::MEDIATYPE_TEXT_X_CUCUMBER_GHERKIN_PLAIN,
+                )
             ),
             $id_generator,
             $sink);
@@ -111,8 +115,11 @@ sub from_source {
     if ($self->include_ast or $self->include_pickles) {
         my $source = $envelope->source;
         my $parser = Gherkin::Parser->new(
-            Gherkin::AstBuilder->new($id_generator)
-            );
+            Gherkin::AstBuilder->new($id_generator),
+            $source->media_type eq Cucumber::Messages::Source::MEDIATYPE_TEXT_X_CUCUMBER_GHERKIN_MARKDOWN
+            ? Gherkin::MarkdownTokenMatcher->new()
+            : Gherkin::TokenMatcher->new()
+        );
         my $data = $source->data;
 
         local $@;
diff --git a/perl/lib/Gherkin/Dialect.pm b/perl/lib/Gherkin/Dialect.pm
@@ -94,7 +94,8 @@ one to be used for keyword translation lookup. Out of the box, Gherkin comes
 with actual translations, such as C<Afrikaans> as well as 'slang-like'
 translations such as "Pirate English".
 
-This module is used by the L<token matcher|Gherkin::TokenMatcher> to identify
+This module is used by the L<token matcher|Gherkin::TokenMatcher> and
+the L<Markdown token matcher|Gherkin::MarkdownTokenMatcher> to identify
 the type of token (input line) passed to the scanner.
 
 =head1 METHODS
diff --git a/perl/lib/Gherkin/MarkdownTokenMatcher.pm b/perl/lib/Gherkin/MarkdownTokenMatcher.pm
@@ -0,0 +1,87 @@
+package Gherkin::MarkdownTokenMatcher;
+
+use strict;
+use warnings;
+
+use base 'Gherkin::TokenMatcher';
+
+1;
+
+__END__
+
+
+=head1 NAME
+
+Gherkin::MarkdownTokenMatcher - Line token matching for Markdown with Gherkin (MDG)
+
+=head1 SYNOPSIS
+
+  use Gherkin::MarkdownTokenMatcher;
+  use Gherkin::Dialect;
+
+  # Instantiate a token matcher with the default language 'Emoji'
+  my $matcher = Gherkin::MarkdownTokenMatcher->new( {
+        dialect => Gherkin::Dialect->new( { dialect => 'em'} )
+  } );
+
+=head1 DESCRIPTION
+
+This is an alternate token matcher for Markdown with Gherkin (MDG), currently a skeleton that inherits all behavior from L<Gherkin::TokenMatcher>.
+
+The Gherkin language has a line-based structure. The parser knows about state,
+but defers identifying the type of line tokens to the token matcher. The
+matcher knows how to identify line tokens based on the grammar's keywords.
+Although the matcher knows how to identify line tokens based on the keywords,
+it depends on L<Gherkin::Dialect> to provide the actual keyword texts.
+
+=head1 METHODS
+
+=head2 new( [$options] )
+
+Constructor.
+
+C<$options> is a hashref with the following keys:
+
+=over
+
+=item C<dialect>
+
+An instance of L<Gherkin::Dialect> to provide the keyword texts used to identify
+the type of line-token being matched.
+
+=back
+
+=head2 dialect_name
+
+Returns the name of the current dialect selected from the C<dialect> instance.
+
+=head2 change_dialect
+
+Changes the selected dialect on the C<dialect> instance. Dialects are groups of
+keywords belonging together; this is how keyword translations are handled.
+
+=head2 reset
+
+Changes the token scanner's state back to its initial state; used to restart
+scanning a document. Multiple documents may be parsed using a single token
+scanner with a C<reset> call in-between.
+
+=head1 SEE ALSO
+
+=over 8
+
+=item * L<Gherkin>
+
+=item * L<Gherkin::Dialect>
+
+=item * L<Gherkin::Parser>
+
+=item * L<Gherkin::TokenMatcher>
+
+=back
+
+=head1 LICENSE
+
+See L<Gherkin>.
+
+=cut
diff --git a/perl/lib/Gherkin/Parser.pm b/perl/lib/Gherkin/Parser.pm
@@ -75,6 +75,8 @@ The C<$uri> parameter is expected to be passed in all but the third case.
 
 =item * L<Gherkin::Dialect>
 
+=item * L<Gherkin::MarkdownTokenMatcher>
+
 =item * L<Gherkin::TokenMatcher>
 
 =item * L<Gherkin::TokenScanner>
diff --git a/perl/lib/Gherkin/TokenMatcher.pm b/perl/lib/Gherkin/TokenMatcher.pm
@@ -344,6 +344,8 @@ scanner with a C<reset> call in-between.
 
 =item * L<Gherkin::Dialect>
 
+=item * L<Gherkin::MarkdownTokenMatcher>
+
 =item * L<Gherkin::Parser>
 
 =back