Skip to content

Commit d134db8

Browse files
committed
[Perl] implements MarkdownTokenMatcher
1 parent 79d37f1 commit d134db8

File tree

2 files changed

+294
-3
lines changed

2 files changed

+294
-3
lines changed

perl/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ SOURCE_FILES = $(shell find lib -name "*.pm" | grep -v $(GHERKIN_PARSER) | grep
88
GHERKIN = bin/gherkin
99
GHERKIN_GENERATE_TOKENS = bin/gherkin-generate-tokens
1010

11-
GOOD_FEATURE_FILES = $(shell find ../testdata/good -name "*.feature")
12-
BAD_FEATURE_FILES = $(shell find ../testdata/bad -name "*.feature")
11+
GOOD_FEATURE_FILES = $(shell find ../testdata/good -name "*.feature" -o -name "*.feature.md")
12+
BAD_FEATURE_FILES = $(shell find ../testdata/bad -name "*.feature" -o -name "*.feature.md")
1313

1414
TOKENS = $(patsubst ../testdata/%,acceptance/testdata/%.tokens,$(GOOD_FEATURE_FILES))
1515
ASTS = $(patsubst ../testdata/%,acceptance/testdata/%.ast.ndjson,$(GOOD_FEATURE_FILES))

perl/lib/Gherkin/MarkdownTokenMatcher.pm

Lines changed: 292 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,298 @@ package Gherkin::MarkdownTokenMatcher;
33
use strict;
44
use warnings;
55

6-
use base 'Gherkin::TokenMatcher';
6+
# Regex source for a Markdown code fence: capture 1 is a run of three or more
# backticks, capture 2 is whatever follows the fence on the same line.
our $DEFAULT_DOC_STRING_SEPARATOR = '^(```[`]*)(.*)';

# Regex source for a list bullet ("*", "+" or "-") with optional whitespace on
# either side; the whole prefix is capture 1.
our $KEYWORD_PREFIX_BULLET = '^(\s*[*+-]\s*)';

# Regex source for a Markdown header prefix: one to six "#" characters followed
# by a single whitespace character; the whole prefix is capture 1.
our $KEYWORD_PREFIX_HEADER = '^(#{1,6}\s)';
9+
10+
use Class::XSAccessor accessors => [
11+
qw/dialect _default_dialect_name _indent_to_remove _active_doc_string_separator _keyword_types
12+
_matched_FeatureLine _non_star_step_keywords/,
13+
];
14+
15+
use Gherkin::Dialect;
16+
17+
sub new {
18+
my ( $class, $options ) = @_;
19+
$options->{'dialect'} ||= Gherkin::Dialect->new( { dialect => 'en' } );
20+
my $self = bless $options, $class;
21+
$self->_default_dialect_name( $self->dialect_name );
22+
my @non_star_step_keywords = map {
23+
grep { $_ ne '* ' }
24+
@{ $self->dialect->$_ }
25+
} qw/Given When Then And But/;
26+
$self->_non_star_step_keywords( \@non_star_step_keywords );
27+
$self->reset();
28+
return $self;
29+
}
30+
31+
# Plain function (not a method): appends $type to the list stored under each
# keyword in the $keyword_types hash reference.
#
#   $keyword_types - hash ref mapping keyword => array ref of types (modified)
#   $keywords      - array ref of keywords to register
#   $type          - value appended to each keyword's list
#
# Returns nothing.
sub _add_keyword_type_mappings {
    my ( $keyword_types, $keywords, $type ) = @_;

    # push creates the array reference on first use of a keyword.
    push @{ $keyword_types->{$_} }, $type for @{$keywords};
    return;
}
42+
43+
# Returns the value of the `dialect` accessor of the dialect object held by
# this matcher.
sub dialect_name {
    my ($self) = @_;
    my $dialect = $self->dialect;
    return $dialect->dialect;
}
44+
45+
# Switches the dialect object to another dialect and rebuilds the
# keyword => [keyword types] lookup stored in _keyword_types.
#
# All arguments after $self are forwarded unchanged to the dialect object's
# change_dialect method. Returns nothing.
#
# The Cucumber::Messages::Step constants are invoked with explicit parentheses
# so that this file compiles under "strict subs" even when the package that
# defines them has not been loaded yet at compile time (this file does not
# load it itself). NOTE(review): the constants must still be defined by the
# time this method runs -- confirm the caller loads Cucumber::Messages.
sub change_dialect {
    my $self = shift;
    $self->dialect->change_dialect(@_);

    my $dialect       = $self->dialect;
    my $keyword_types = {};
    _add_keyword_type_mappings( $keyword_types, $dialect->Given,
        Cucumber::Messages::Step::KEYWORDTYPE_CONTEXT() );
    _add_keyword_type_mappings( $keyword_types, $dialect->When,
        Cucumber::Messages::Step::KEYWORDTYPE_ACTION() );
    _add_keyword_type_mappings( $keyword_types, $dialect->Then,
        Cucumber::Messages::Step::KEYWORDTYPE_OUTCOME() );

    # "And" and "But" keywords share the conjunction type.
    _add_keyword_type_mappings( $keyword_types,
        [ @{ $dialect->And }, @{ $dialect->But } ],
        Cucumber::Messages::Step::KEYWORDTYPE_CONJUNCTION() );

    $self->_keyword_types($keyword_types);
    return;
}
61+
62+
# Restores the matcher to its initial state: the default dialect is selected
# again, no doc-string indent is pending, the doc-string separator pattern is
# the default fence pattern, and the "feature line already matched" flag is
# cleared.
#
# The flag is set by match_FeatureLine and was previously never cleared, so a
# matcher that had been reset could not match a feature line a second time.
# Returns nothing.
sub reset {
    my $self = shift;
    $self->change_dialect( $self->_default_dialect_name );
    $self->_indent_to_remove(0);
    $self->_active_doc_string_separator($DEFAULT_DOC_STRING_SEPARATOR);
    $self->_matched_FeatureLine(0);
    return;
}
69+
70+
# Matches the feature line of a Markdown document.
#
# Returns nothing when the token has no line or when a feature line has
# already been matched. Otherwise always returns 1: either the line is a
# header of the form "# Feature: blah" (using the dialect's Feature keywords),
# or the line is taken as the feature line as-is.
sub match_FeatureLine {
    my ( $self, $token ) = @_;

    # Same guard as the other matchers: the fallback below dereferences
    # $token->line, so a token without a line must not reach it.
    return unless $token->line;
    return if $self->_matched_FeatureLine;

    # We first try to match "# Feature: blah"
    my $result = $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':',
        $token, FeatureLine => $self->dialect->Feature );

    # If we didn't match "# Feature: blah", we still match this line
    # as a FeatureLine.
    # The reason for this is that users may not want to be constrained by
    # having this as their first line.
    unless ($result) {
        $self->_set_token_matched( $token,
            FeatureLine => { text => $token->line->_trimmed_line_text } );
    }
    $self->_matched_FeatureLine(1);
    return 1;
}
87+
88+
# Matches a Markdown header whose keyword is one of the dialect's Rule
# keywords followed by ':'. Returns whatever _match_title_line returns.
sub match_RuleLine {
    my ( $self, $token ) = @_;
    my $rule_keywords = $self->dialect->Rule;
    return $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        'RuleLine', $rule_keywords );
}
93+
94+
# Matches a Markdown header whose keyword is one of the dialect's Scenario
# keywords or, failing that, one of its ScenarioOutline keywords. Both are
# reported with the token type 'ScenarioLine'.
sub match_ScenarioLine {
    my ( $self, $token ) = @_;

    my $as_scenario = $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':',
        $token, 'ScenarioLine', $self->dialect->Scenario );
    return $as_scenario if $as_scenario;

    # Only consulted when the plain Scenario keywords did not match.
    return $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':',
        $token, 'ScenarioLine', $self->dialect->ScenarioOutline );
}
101+
102+
# Matches a Markdown header whose keyword is one of the dialect's Background
# keywords followed by ':'. Returns whatever _match_title_line returns.
sub match_BackgroundLine {
    my ( $self, $token ) = @_;
    my $background_keywords = $self->dialect->Background;
    return $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        'BackgroundLine', $background_keywords );
}
107+
108+
# Matches a Markdown header whose keyword is one of the dialect's Examples
# keywords followed by ':'. Returns whatever _match_title_line returns.
sub match_ExamplesLine {
    my ( $self, $token ) = @_;
    my $examples_keywords = $self->dialect->Examples;
    return $self->_match_title_line( $KEYWORD_PREFIX_HEADER, ':', $token,
        'ExamplesLine', $examples_keywords );
}
113+
114+
# Never matches.
#
# `# language: [ISO 639-1]` headers (or similar) are deliberately not
# supported in Markdown; users should specify a language globally.
sub match_Language {
    my ( $self, $token ) = @_;
    return;
}
120+
121+
# Matches a line that contains one or more tags written as inline code,
# i.e. `@tag` between backticks.
#
# Returns nothing when the token has no line or the line contains no tags.
# Otherwise marks the token as a TagLine whose items are hashes with
#   text   - the tag including its leading '@'
#   column - 1-based column of the '@' within the untrimmed line text
# and returns 1.
sub match_TagLine {
    my ( $self, $token ) = @_;
    return unless $token->line;

    # Scan a lexical copy: a //g match in a loop keeps its position on the
    # scalar being matched, so matching directly against a method's return
    # value only terminates if the method hands back the very same scalar
    # every time.
    my $line_text = $token->line->line_text;

    my @tags;
    while ( $line_text =~ m/`(@[^`]+)`/g ) {

        # $-[1] is the 0-based offset where capture 1 (the tag) starts.
        push @tags, { column => $-[1] + 1, text => $1 };
    }
    return unless @tags;

    $self->_set_token_matched( $token, TagLine => { items => \@tags } );
    return 1;
}
138+
139+
# Shared matcher for "<prefix><keyword><suffix> <text>" lines.
#
#   $prefix         - regex source with exactly one capture group (the prefix)
#   $keyword_suffix - regex source expected right after the keyword
#   $token          - token to inspect and, on success, to mark as matched
#   $token_type     - matched type to record on the token
#   $keywords       - array ref of literal keywords to try
#
# The pattern is applied to the line's trimmed text. On success the token is
# marked via _set_token_matched with the keyword, the remaining text (trailing
# whitespace removed), an indent equal to the line indent plus the length of
# the matched prefix, and -- for keywords present in _keyword_types -- a
# keyword type. Returns 1 on success and nothing otherwise.
sub _match_title_line {
    my ( $self, $prefix, $keyword_suffix, $token, $token_type, $keywords ) = @_;
    return unless $token->line;

    # Keywords are literal text, so escape them: regex metacharacters inside a
    # keyword must not change what the pattern accepts.
    my $alternatives = join '|', map { quotemeta } @{$keywords};
    my $regex = $prefix . '(' . $alternatives . ')' . $keyword_suffix . '\s*(.*)';

    if ( $token->line->_trimmed_line_text =~ qr/$regex/ ) {
        my $indent  = $token->line->indent + ( length $1 || 0 );
        my $keyword = $2;
        my $text    = $3;
        $text =~ s/\s+$//;

        my $keyword_type;
        if ( exists $self->_keyword_types->{$keyword} ) {

            # Only step keywords have an entry; a keyword registered under
            # more than one type is reported as unknown. The constant is
            # called with parentheses so this compiles under "strict subs"
            # regardless of whether its package was loaded at compile time.
            $keyword_type =
                ( scalar @{ $self->_keyword_types->{$keyword} } > 1 )
                ? Cucumber::Messages::Step::KEYWORDTYPE_UNKNOWN()
                : $self->_keyword_types->{$keyword}->[0];
        }

        $self->_set_token_matched(
            $token,
            $token_type,
            {
                indent       => $indent,
                keyword      => $keyword,
                text         => $text,
                keyword_type => $keyword_type,
            }
        );
        return 1;
    }
    return;
}
163+
164+
# Records a successful match on $token.
#
#   $matched_type - stored as the token's matched type
#   $options      - optional hash ref; recognised keys are text, keyword,
#                   keyword_type, indent and items
#
# 'items' defaults to an empty array ref (the default is written back into
# $options). A defined 'text' is chomped in place before being stored.
# Without an 'indent' option the line's indent is used, or 0 when the token
# has no line. The token's location column is set to the matched indent + 1
# and the current dialect name is stored on the token. Returns nothing.
sub _set_token_matched {
    my ( $self, $token, $matched_type, $options ) = @_;
    $options->{'items'} ||= [];
    $token->matched_type($matched_type);

    my $text = $options->{'text'};
    if ( defined $text ) {
        chomp $options->{'text'};
        $token->matched_text( $options->{'text'} );
    }

    # Keyword data is only stored when it was supplied.
    for my $field (qw/keyword keyword_type/) {
        next unless defined $options->{$field};
        my $setter = "matched_$field";
        $token->$setter( $options->{$field} );
    }

    my $indent = $options->{'indent'};
    if ( !defined $indent ) {
        $indent = $token->line ? $token->line->indent : 0;
    }
    $token->matched_indent($indent);

    # Always defined here because of the default applied above.
    $token->matched_items( $options->{'items'} );

    $token->location->{'column'} = $token->matched_indent + 1;
    $token->matched_gherkin_dialect( $self->dialect_name );
    return;
}
192+
193+
# Matches the end-of-file token.
#
# Marks the token as 'EOF' and returns 1 when $token->is_eof is true.
# Otherwise returns nothing (an empty list / undef), like the other matchers,
# instead of falling off the end of the sub with an implicit value.
sub match_EOF {
    my ( $self, $token ) = @_;
    return unless $token->is_eof;

    $self->_set_token_matched( $token, 'EOF' );
    return 1;
}
200+
201+
# Matches a line as Empty.
#
# Returns nothing when the token has no line. A line is Empty when it is
# empty, or when none of the other matchers accepts it; the matchers are
# tried in the order listed below and trying stops at the first one that
# succeeds. Note that those matchers run for real, so any state they change
# on the token or on this object is changed here as well.
#
# On success the token is marked as Empty with indent 0 and 1 is returned;
# otherwise nothing is returned.
sub match_Empty {
    my ( $self, $token ) = @_;
    return unless $token->line;

    if ( !$token->line->is_empty ) {
        my @other_matchers = qw/
            match_TagLine
            match_FeatureLine
            match_ScenarioLine
            match_BackgroundLine
            match_ExamplesLine
            match_RuleLine
            match_TableRow
            match_Comment
            match_Language
            match_DocStringSeparator
            match_EOF
            match_StepLine
            /;
        for my $matcher (@other_matchers) {

            # Something else recognises this line, so it is not Empty.
            return if $self->$matcher($token);
        }
    }

    $self->_set_token_matched( $token, Empty => { indent => 0 } );
    return 1;
}
227+
228+
# Matches a table separator row as a Comment.
#
# The line must start with '|' and its table cells must be recognised by
# _is_gfm_table_separator. On success the token is marked as a Comment
# carrying the full line text with indent 0, and 1 is returned. Returns
# nothing otherwise, including when the token has no line.
sub match_Comment {
    my ( $self, $token ) = @_;

    my $line = $token->line;
    return unless $line;
    return unless $line->startswith('|');
    return unless $self->_is_gfm_table_separator( $line->table_cells );

    $self->_set_token_matched( $token,
        Comment => { text => $line->line_text, indent => 0 } );
    return 1;
}
241+
242+
# Counts the cells that look like a table separator cell: one or more '-',
# optionally preceded and/or followed by a single ':'.
#
#   $table_cells - array ref of hashes, each with a 'text' entry
#
# Returns that count, so the result is true as soon as at least one cell
# qualifies.
sub _is_gfm_table_separator {
    my ( $self, $table_cells ) = @_;
    my $separator_pattern = qr/^:?-+:?$/;
    return scalar grep { $_->{'text'} =~ $separator_pattern } @{$table_cells};
}
247+
248+
# Matches any line as Other.
#
# The recorded text is the line text as returned by get_line_text for the
# current doc-string indent (_indent_to_remove); the recorded indent is 0.
# Returns 1, or nothing when the token has no line.
sub match_Other {
    my ( $self, $token ) = @_;

    my $line = $token->line;
    return unless $line;

    # Take the entire line, minus the indent of the enclosing DocString.
    my $indent_to_remove = $self->_indent_to_remove;
    my $text             = $line->get_line_text($indent_to_remove);

    $self->_set_token_matched( $token, Other => { indent => 0, text => $text } );
    return 1;
}
258+
259+
# Matches a bullet list item that starts with one of the cached step keywords
# (_non_star_step_keywords); no suffix is required after the keyword.
# Returns whatever _match_title_line returns.
sub match_StepLine {
    my ( $self, $token ) = @_;
    my $step_keywords = $self->_non_star_step_keywords;
    return $self->_match_title_line( $KEYWORD_PREFIX_BULLET, '', $token,
        'StepLine', $step_keywords );
}
264+
265+
# Matches an opening or closing code fence of a doc string.
#
# While the default separator pattern is active, any fence of three or more
# backticks opens a doc string: the active pattern becomes "exactly this
# fence on a line of its own" and the line's indent is remembered in
# _indent_to_remove. While such a closing pattern is active, a match closes
# the doc string and the default pattern is restored.
#
# The pattern is anchored at the start of the text, so it is applied to the
# trimmed line text: matching the untrimmed text would only ever recognise
# fences at column 0, and the indent recorded below would always be 0.
#
# On success the token is marked as DocStringSeparator with the fence as
# keyword and 1 is returned; otherwise nothing is returned.
#
# NOTE(review): the default pattern also captures the text after the opening
# fence (capture 2), but it is not passed on; 'text' is always ''. Confirm
# whether that text should be reported.
sub match_DocStringSeparator {
    my ( $self, $token ) = @_;
    return unless $token->line;

    my $active_separator = $self->_active_doc_string_separator;
    return
        unless $token->line->_trimmed_line_text =~ qr/$active_separator/;

    # Save the capture right away instead of reading $1 after further calls.
    my $fence = $1;

    if ( $active_separator eq $DEFAULT_DOC_STRING_SEPARATOR ) {
        $self->_active_doc_string_separator( '^(' . $fence . ')$' );
        $self->_indent_to_remove( $token->line->indent );
    }
    else {
        $self->_active_doc_string_separator($DEFAULT_DOC_STRING_SEPARATOR);
    }

    $self->_set_token_matched( $token,
        DocStringSeparator => { text => '', keyword => $fence } );
    return 1;
}
283+
284+
# Matches a table row.
#
# Gherkin tables must be indented by two to five whitespace characters so
# that they can be distinguished from non-Gherkin tables. Rows recognised by
# _is_gfm_table_separator are not table rows. On success the token is marked
# as a TableRow whose items are the line's table cells, and 1 is returned.
# Returns nothing otherwise, including when the token has no line.
sub match_TableRow {
    my ( $self, $token ) = @_;

    my $line = $token->line;
    return unless $line;
    return unless $line->line_text =~ m/^\s\s\s?\s?\s?\|/;

    my $cells = $line->table_cells;
    return if $self->_is_gfm_table_separator($cells);

    $self->_set_token_matched( $token, TableRow => { items => $cells } );
    return 1;
}
7298

8299
1;
9300

0 commit comments

Comments
 (0)