Skip to content

Commit d0059d2

Browse files
committed
[Perl] implements MarkdownTokenMatcher
1 parent 0e845dd commit d0059d2

File tree

2 files changed

+262
-3
lines changed

2 files changed

+262
-3
lines changed

perl/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ SOURCE_FILES = $(shell find lib -name "*.pm" | grep -v $(GHERKIN_PARSER) | grep
88
GHERKIN = bin/gherkin
99
GHERKIN_GENERATE_TOKENS = bin/gherkin-generate-tokens
1010

11-
GOOD_FEATURE_FILES = $(shell find ../testdata/good -name "*.feature")
12-
BAD_FEATURE_FILES = $(shell find ../testdata/bad -name "*.feature")
11+
GOOD_FEATURE_FILES = $(shell find ../testdata/good -name "*.feature" -o -name "*.feature.md")
12+
BAD_FEATURE_FILES = $(shell find ../testdata/bad -name "*.feature" -o -name "*.feature.md")
1313

1414
TOKENS = $(patsubst ../testdata/%,acceptance/testdata/%.tokens,$(GOOD_FEATURE_FILES))
1515
ASTS = $(patsubst ../testdata/%,acceptance/testdata/%.ast.ndjson,$(GOOD_FEATURE_FILES))

perl/lib/Gherkin/MarkdownTokenMatcher.pm

Lines changed: 260 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,266 @@ package Gherkin::MarkdownTokenMatcher;
33
use strict;
44
use warnings;
55

6-
use base 'Gherkin::TokenMatcher';
6+
# Regex source strings (not compiled patterns); they are interpolated into
# qr// at match time. Inside q//, each "\\" collapses to a single backslash.
#
# Opening fence of a DocString: three or more backticks at the start of the
# line (capture 1), followed by the remainder of the line (capture 2).
my $DEFAULT_DOC_STRING_SEPARATOR = q/^(```[`]*)(.*)/;
7+
# Prefix of a step line: optional whitespace, one bullet character
# (*, + or -), then optional whitespace.
my $KEYWORD_PREFIX_BULLET = q/^(\\s*[*+-]\\s*)/;
8+
# Prefix of a title line: one to six '#' characters followed by a single
# whitespace character.
my $KEYWORD_PREFIX_HEADER = q/^(#{1,6}\\s)/;
9+
10+
use Class::XSAccessor accessors => [
11+
qw/dialect _default_dialect_name _indent_to_remove _active_doc_string_separator _keyword_types
12+
_matched_FeatureLine _non_star_step_keywords/,
13+
];
14+
15+
use Cucumber::Messages;
16+
use Gherkin::Dialect;
17+
18+
# Constructor.
#
# $options is an optional hash reference; it is blessed directly into the
# object. When no 'dialect' entry is supplied, an English Gherkin::Dialect
# is created. The dialect name active at construction time is remembered as
# the default, the step keywords other than the '* ' bullet are cached, and
# the matcher is reset before being returned.
sub new {
    my ( $class, $options ) = @_;

    $options->{'dialect'} ||= Gherkin::Dialect->new( { dialect => 'en' } );
    my $self = bless $options, $class;
    $self->_default_dialect_name( $self->dialect_name );

    # Collect every step keyword except the generic '* ' one.
    my @step_keywords;
    for my $step_kind (qw/Given When Then And But/) {
        push @step_keywords,
            grep { $_ ne '* ' } @{ $self->dialect->$step_kind };
    }
    $self->_non_star_step_keywords( \@step_keywords );

    $self->reset();
    return $self;
}
31+
32+
# Plain helper function (not a method).
#
# Appends $type to the list stored under each keyword of @$keywords in the
# %$keyword_types hash, creating the list on first use. A keyword that is
# registered more than once therefore ends up with several types.
sub _add_keyword_type_mappings {
    my ( $keyword_types, $keywords, $type ) = @_;

    foreach my $word ( @{$keywords} ) {
        $keyword_types->{$word} = [] unless exists $keyword_types->{$word};
        push @{ $keyword_types->{$word} }, $type;
    }
    return;
}
42+
43+
# Returns the name of the dialect currently held by this matcher, as
# reported by the dialect object's own 'dialect' accessor.
sub dialect_name {
    my ($self) = @_;
    return $self->dialect->dialect;
}
44+
45+
# Switches the underlying dialect (all arguments are forwarded to the
# dialect object's change_dialect) and rebuilds the keyword => [types]
# lookup used to classify step keywords.
sub change_dialect {
    my $self = shift;
    $self->dialect->change_dialect(@_);

    my $dialect = $self->dialect;

    # Each entry pairs a keyword list with the step keyword type it maps to.
    # 'And' and 'But' share the conjunction type.
    my @type_for_keywords = (
        [ $dialect->Given, Cucumber::Messages::Step::KEYWORDTYPE_CONTEXT ],
        [ $dialect->When,  Cucumber::Messages::Step::KEYWORDTYPE_ACTION ],
        [ $dialect->Then,  Cucumber::Messages::Step::KEYWORDTYPE_OUTCOME ],
        [
            [ @{ $dialect->And }, @{ $dialect->But } ],
            Cucumber::Messages::Step::KEYWORDTYPE_CONJUNCTION
        ],
    );

    my %keyword_types;
    for my $group (@type_for_keywords) {
        my ( $keywords, $type ) = @{$group};
        _add_keyword_type_mappings( \%keyword_types, $keywords, $type );
    }
    return $self->_keyword_types( \%keyword_types );
}
60+
61+
# Restores the matcher to its initial state so that it can be used for a
# new document:
#   * the dialect goes back to the default recorded by the constructor,
#   * DocString indentation stripping is cleared,
#   * the "feature line already seen" flag is cleared, so the first
#     eligible line of the next document can match as a FeatureLine again
#     (without this, a reused matcher would never match a second
#     document's feature line),
#   * the DocString separator goes back to the default opening-fence
#     pattern.
sub reset {
    my $self = shift;
    $self->change_dialect( $self->_default_dialect_name );
    $self->_indent_to_remove(0);
    $self->_matched_FeatureLine(0);
    $self->_active_doc_string_separator($DEFAULT_DOC_STRING_SEPARATOR);
}
67+
68+
# Matches the feature line. Only one line can match between resets: once
# the flag is set, every later call returns nothing.
sub match_FeatureLine {
    my ( $self, $token ) = @_;
    return if $self->_matched_FeatureLine;

    # Preferred form: a Markdown header such as "# Feature: blah".
    my $has_header = $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        FeatureLine => $self->dialect->Feature );

    # Otherwise the line is still accepted as the FeatureLine, using its
    # trimmed text: users may not want to be constrained by having a
    # "# Feature:" header as their first line.
    if ( !$has_header ) {
        $self->_set_token_matched( $token,
            FeatureLine => { text => $token->line->_trimmed_line_text } );
    }

    return $self->_matched_FeatureLine(1);
}
84+
85+
# Matches a Markdown header whose keyword is one of the dialect's Rule
# keywords followed by ':'. Returns whatever _match_title_line returns.
sub match_RuleLine {
    my ( $self, $token ) = @_;
    my $rule_keywords = $self->dialect->Rule;
    return $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        'RuleLine', $rule_keywords );
}
90+
91+
# Matches a Markdown header introducing a scenario. The dialect's Scenario
# keywords are tried first; the ScenarioOutline keywords are tried only when
# those do not match. Both produce a ScenarioLine token.
sub match_ScenarioLine {
    my ( $self, $token ) = @_;

    my $matched = $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        'ScenarioLine', $self->dialect->Scenario );
    return $matched if $matched;

    return $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        'ScenarioLine', $self->dialect->ScenarioOutline );
}
98+
99+
# Matches a Markdown header whose keyword is one of the dialect's Background
# keywords followed by ':'. Returns whatever _match_title_line returns.
sub match_BackgroundLine {
    my ( $self, $token ) = @_;
    my $background_keywords = $self->dialect->Background;
    return $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        'BackgroundLine', $background_keywords );
}
104+
105+
# Matches a Markdown header whose keyword is one of the dialect's Examples
# keywords followed by ':'. Returns whatever _match_title_line returns.
sub match_ExamplesLine {
    my ( $self, $token ) = @_;
    my $examples_keywords = $self->dialect->Examples;
    return $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        'ExamplesLine', $examples_keywords );
}
110+
111+
# Never matches: always returns the empty string.
#
# `# language: [ISO 639-1]` headers (or similar) are deliberately not
# supported in Markdown. Users should specify a language globally.
sub match_Language {
    my ( $self, $token ) = @_;
    return q{};
}
117+
118+
# Matches a line containing one or more tags written as inline code spans,
# e.g. `@smoke`. Every span whose content starts with '@' becomes one item
# with:
#   text   - the tag including its leading '@'
#   column - the 1-based column of the '@' within the untrimmed line
#
# Returns nothing when the line holds no tag; otherwise marks the token as a
# TagLine and returns the result of _set_token_matched.
sub match_TagLine {
    my ( $self, $token ) = @_;

    # Work on a lexical copy: a scalar-context m//g loop keeps its position
    # (pos) on the scalar it matches against, so matching directly against
    # an accessor's return value only terminates if the accessor hands back
    # the very same scalar on every call.
    my $line_text = $token->line->line_text;

    my @tags;
    while ( $line_text =~ m/`(@[^`]+)`/g ) {
        push @tags, {
            # $-[0] is the offset of the opening backtick; +1 skips the
            # backtick and +1 converts to a 1-based column. This avoids $`,
            # which slows down every regex on older perls.
            column => 2 + $-[0],
            text   => $1,
        };
    }
    return unless @tags;

    return $self->_set_token_matched( $token, TagLine => { items => \@tags } );
}
133+
134+
# Shared matcher for "<prefix><keyword><suffix> <text>" lines.
#
# Parameters:
#   $prefix         - regex source for the line prefix; must contain exactly
#                     one capture group (its length is added to the indent)
#   $keyword_suffix - regex source that must follow the keyword (':' or '')
#   $token          - token whose line's trimmed text is examined
#   $token_type     - token type to record on a match
#   $keywords       - array reference of literal keywords to accept
#
# On a match the token is updated through _set_token_matched (indent,
# keyword, text with trailing whitespace removed and, for step keywords
# only, the keyword type) and the result of that call is returned. Without
# a match - including when $keywords is empty - nothing is returned.
sub _match_title_line {
    my ( $self, $prefix, $keyword_suffix, $token, $token_type, $keywords ) = @_;

    # An empty alternation would match the empty string and accept lines
    # that carry no keyword at all.
    return unless @{$keywords};

    # Keywords are literal text, not patterns: escape regex metacharacters.
    my $alternatives = join '|', map { quotemeta } @{$keywords};
    my $regex = $prefix . '(' . $alternatives . ')' . $keyword_suffix . '\s*(.*)';
    return unless $token->line->_trimmed_line_text =~ qr/$regex/;

    # Copy the captures immediately, before any other regex can clobber them.
    my ( $matched_prefix, $keyword, $text ) = ( $1, $2, $3 );
    my $indent = $token->line->indent + ( length($matched_prefix) || 0 );
    $text =~ s/\s+$//;

    my $keyword_type;
    if ( exists $self->_keyword_types->{$keyword} ) {
        # Only step keywords have a type; a keyword registered under more
        # than one type is ambiguous and reported as unknown.
        my $types = $self->_keyword_types->{$keyword};
        $keyword_type =
            ( scalar( @{$types} ) > 1 )
            ? Cucumber::Messages::Step::KEYWORDTYPE_UNKNOWN()
            : $types->[0];
    }

    return $self->_set_token_matched(
        $token,
        $token_type,
        {
            indent       => $indent,
            keyword      => $keyword,
            text         => $text,
            keyword_type => $keyword_type,
        }
    );
}
154+
155+
# Records the outcome of a successful match on $token.
#
# $options is an optional hash reference with any of the keys: text,
# keyword, keyword_type, indent and items. 'items' defaults to an empty
# list; 'indent' defaults to the indent of the token's line (0 when the
# token has no line). The location column is set to the matched indent
# plus one, and the current dialect name is stored on the token.
#
# Returns the value returned by the token's matched_gherkin_dialect call.
sub _set_token_matched {
    my ( $self, $token, $matched_type, $options ) = @_;

    $options->{'items'} ||= [];
    $token->matched_type($matched_type);

    if ( defined $options->{'text'} ) {
        chomp( $options->{'text'} );
        $token->matched_text( $options->{'text'} );
    }

    if ( defined $options->{'keyword'} ) {
        $token->matched_keyword( $options->{'keyword'} );
    }
    if ( defined $options->{'keyword_type'} ) {
        $token->matched_keyword_type( $options->{'keyword_type'} );
    }

    my $indent;
    if ( defined $options->{'indent'} ) {
        $indent = $options->{'indent'};
    }
    elsif ( $token->line ) {
        $indent = $token->line->indent;
    }
    else {
        $indent = 0;
    }
    $token->matched_indent($indent);

    # 'items' was defaulted above, so it is always defined at this point.
    $token->matched_items( $options->{'items'} );

    $token->location->{'column'} = $token->matched_indent + 1;
    return $token->matched_gherkin_dialect( $self->dialect_name );
}
182+
183+
# Marks the token as EOF when the token reports end of input; otherwise
# returns nothing and leaves the token untouched.
sub match_EOF {
    my ( $self, $token ) = @_;
    if ( $token->is_eof ) {
        return $self->_set_token_matched( $token, 'EOF' );
    }
    return;
}
188+
189+
# Matches a line as Empty when it is blank, or when none of the other
# matchers accepts it.
#
# The other matchers are tried in a fixed order and the search stops at the
# first one that succeeds. They are called for real, so a successful one
# has already updated the token (and any matcher state) by the time this
# method returns without marking the line Empty.
sub match_Empty {
    my ( $self, $token ) = @_;

    my $is_empty = $token->line->is_empty;
    if ( !$is_empty ) {
        my @other_matchers = qw/
            match_TagLine
            match_FeatureLine
            match_ScenarioLine
            match_BackgroundLine
            match_ExamplesLine
            match_RuleLine
            match_TableRow
            match_Comment
            match_Language
            match_DocStringSeparator
            match_EOF
            match_StepLine
        /;

        my $claimed = 0;
        for my $matcher (@other_matchers) {
            if ( $self->$matcher($token) ) {
                $claimed = 1;
                last;
            }
        }
        $is_empty = !$claimed;
    }

    if ($is_empty) {
        return $self->_set_token_matched( $token, Empty => { indent => 0 } );
    }
    return;
}
210+
211+
# Matches the separator row of a GFM table (a line starting with '|' whose
# cells include a separator such as '---') and records it as a Comment
# carrying the full line text at indent 0.
sub match_Comment {
    my ( $self, $token ) = @_;

    my $line = $token->line;
    return unless $line->startswith('|');
    return unless $self->_is_gfm_table_separator( $line->table_cells );

    return $self->_set_token_matched( $token,
        Comment => { text => $line->line_text, indent => 0 } );
}
220+
221+
# Records the line as Other at indent 0. The text is obtained from the
# line's get_line_text, which is passed the DocString indent currently
# being stripped.
sub match_Other {
    my ( $self, $token ) = @_;

    my $indent_to_strip = $self->_indent_to_remove;
    my $content         = $token->line->get_line_text($indent_to_strip);

    return $self->_set_token_matched( $token, 'Other',
        { indent => 0, text => $content } );
}
228+
229+
# Matches a bullet-list item that starts with one of the cached step
# keywords (the '* ' keyword is excluded from that list by the
# constructor). No suffix is required after the keyword.
sub match_StepLine {
    my ( $self, $token ) = @_;
    my $step_keywords = $self->_non_star_step_keywords;
    return $self->_match_title_line( $KEYWORD_PREFIX_BULLET, '', $token,
        'StepLine', $step_keywords );
}
234+
235+
# Matches a DocString fence against the currently active separator pattern.
#
# While the default pattern is active, a match opens a DocString: the
# active pattern becomes "exactly the fence that was captured" and the
# line's indent is remembered for stripping. While a closing pattern is
# active, a match closes the DocString and restores the default pattern.
# In both cases the token is recorded as a DocStringSeparator whose keyword
# is the captured fence and whose text is empty.
sub match_DocStringSeparator {
    my ( $self, $token ) = @_;

    my $pattern = $self->_active_doc_string_separator;
    return unless $token->line->line_text =~ qr/$pattern/;
    my $fence = $1;

    my $is_opening = $self->_active_doc_string_separator eq $DEFAULT_DOC_STRING_SEPARATOR;
    if ($is_opening) {
        $self->_active_doc_string_separator( '^(' . $fence . ')$' );
        $self->_indent_to_remove( $token->line->indent );
    }
    else {
        $self->_active_doc_string_separator($DEFAULT_DOC_STRING_SEPARATOR);
    }

    return $self->_set_token_matched( $token,
        DocStringSeparator => { keyword => $fence, text => '' } );
}
249+
250+
# Matches a Gherkin table row.
#
# Gherkin tables must be indented by 2-5 whitespace characters so that they
# can be distinguished from non-Gherkin tables. A GFM separator row is not
# a table row and is rejected here.
sub match_TableRow {
    my ( $self, $token ) = @_;

    return unless $token->line->line_text =~ m/^\s\s\s?\s?\s?\|/;

    my $cells = $token->line->table_cells;
    return if $self->_is_gfm_table_separator($cells);

    return $self->_set_token_matched( $token, TableRow => { items => $cells } );
}
260+
261+
# Returns true when at least one cell's text is a GFM table separator:
# one or more dashes, optionally with a leading and/or trailing colon
# (e.g. '---', ':--', ':-:'). $table_cells is an array reference of hash
# references that each carry a 'text' entry.
sub _is_gfm_table_separator {
    my ( $self, $table_cells ) = @_;

    my $separator_count = 0;
    for my $cell ( @{$table_cells} ) {
        $separator_count++ if $cell->{'text'} =~ m/^:?-+:?$/;
    }
    return $separator_count > 0;
}
7266

8267
1;
9268

0 commit comments

Comments
 (0)