@@ -3,7 +3,269 @@ package Gherkin::MarkdownTokenMatcher;
33use strict;
44use warnings;
55
6- use base ' Gherkin::TokenMatcher' ;
# Regex source for an opening code fence: three or more backticks (capture 1,
# used as the DocString separator) followed by the rest of the line (capture 2).
my $DEFAULT_DOC_STRING_SEPARATOR = '^(```[`]*)(.*)';

# Regex source for the Markdown bullet marker (`*`, `+` or `-`) in front of a
# step keyword, including the whitespace on either side of it.
my $KEYWORD_PREFIX_BULLET = '^(\s*[*+-]\s*)';

# Regex source for the Markdown header marker (one to six `#` characters and
# one whitespace character) in front of a title keyword.
my $KEYWORD_PREFIX_HEADER = '^(#{1,6}\s)';
9+
10+ use Class::XSAccessor accessors => [
11+ qw/ dialect _default_dialect_name _indent_to_remove _active_doc_string_separator _keyword_types
12+ _matched_FeatureLine _non_star_step_keywords/ ,
13+ ];
14+
15+ use Cucumber::Messages;
16+ use Gherkin::Dialect;
17+
# Constructor.
#
# $options is an optional hash reference; it is blessed in place, so the
# caller's hash itself becomes the matcher object. When no 'dialect' entry is
# supplied (or it is false) an English Gherkin::Dialect is created.
#
# Returns the new matcher, already reset to its initial state.
sub new {
    my ( $class, $options ) = @_;
    $options->{'dialect'} ||= Gherkin::Dialect->new( { dialect => 'en' } );
    my $self = bless $options, $class;

    # Remember the starting dialect so that reset() can return to it.
    $self->_default_dialect_name( $self->dialect_name );

    # Collect every step keyword of the dialect except the `* ` keyword.
    my @step_keywords;
    for my $step_kind (qw/Given When Then And But/) {
        for my $keyword ( @{ $self->dialect->$step_kind } ) {
            next if $keyword eq '* ';
            push @step_keywords, $keyword;
        }
    }
    $self->_non_star_step_keywords( \@step_keywords );

    $self->reset();
    return $self;
}
31+
# Record that each keyword in @$keywords can denote step type $type.
#
# $keyword_types - hash ref mapping keyword => array ref of types (modified
#                  in place; missing entries are created)
# $keywords      - array ref of keyword strings
# $type          - the type value to append for every keyword
#
# A keyword registered under several types ends up with several entries.
sub _add_keyword_type_mappings {
    my ( $keyword_types, $keywords, $type ) = @_;

    foreach my $kw ( @{$keywords} ) {
        push @{ $keyword_types->{$kw} ||= [] }, $type;
    }
}
42+
# Name of the dialect currently in use, as reported by the dialect object.
sub dialect_name {
    my ($self) = @_;
    return $self->dialect->dialect;
}
44+
# Switch the underlying dialect and rebuild everything derived from it.
#
# All arguments are forwarded unchanged to the dialect's own change_dialect().
#
# Two caches are refreshed afterwards:
#   * _keyword_types          - keyword => [step keyword types]
#   * _non_star_step_keywords - step keywords used by match_StepLine
#
# The second one used to be computed only in new(), so after a dialect change
# step lines were still matched against the previous language's keywords.
sub change_dialect {
    my $self = shift;
    $self->dialect->change_dialect(@_);
    my $dialect = $self->dialect;

    my $keyword_types = {};
    _add_keyword_type_mappings( $keyword_types, $dialect->Given,
        Cucumber::Messages::Step::KEYWORDTYPE_CONTEXT );
    _add_keyword_type_mappings( $keyword_types, $dialect->When,
        Cucumber::Messages::Step::KEYWORDTYPE_ACTION );
    _add_keyword_type_mappings( $keyword_types, $dialect->Then,
        Cucumber::Messages::Step::KEYWORDTYPE_OUTCOME );
    _add_keyword_type_mappings( $keyword_types,
        [ @{ $dialect->And }, @{ $dialect->But } ],
        Cucumber::Messages::Step::KEYWORDTYPE_CONJUNCTION );

    # Same filtering as in new(): every step keyword except `* `.
    my @non_star_step_keywords =
        grep { $_ ne '* ' }
        map  { @{ $dialect->$_ } } qw/Given When Then And But/;
    $self->_non_star_step_keywords( \@non_star_step_keywords );

    # Kept as the final statement so the return value is unchanged.
    $self->_keyword_types($keyword_types);
}
60+
# Return the matcher to its initial state so it can process a new document:
# the default dialect is restored, no DocString indentation is stripped, the
# DocString separator is the opening-fence pattern again, and the
# "feature line already seen" flag is cleared.
#
# Clearing the flag matters when one matcher instance is reused: otherwise
# match_FeatureLine would refuse to match in every document after the first.
sub reset {
    my $self = shift;
    $self->change_dialect( $self->_default_dialect_name );
    $self->_indent_to_remove(0);
    $self->_matched_FeatureLine(0);
    $self->_active_doc_string_separator($DEFAULT_DOC_STRING_SEPARATOR);
}
67+
# Match the feature line. Only one line per matcher state can be the feature
# line: once one has been matched, later calls return nothing.
#
# Preferred form is a Markdown header such as "# Feature: blah". When the line
# is not of that form it is still accepted as the FeatureLine, using its
# trimmed text and no keyword, so that users are not forced to have the
# feature header as their first line.
sub match_FeatureLine {
    my ( $self, $token ) = @_;
    return if $self->_matched_FeatureLine;

    my $matched_header = $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        'FeatureLine', $self->dialect->Feature
    );

    if ( !$matched_header ) {
        my %fallback = ( text => $token->line->_trimmed_line_text );
        $self->_set_token_matched( $token, 'FeatureLine', \%fallback );
    }

    $self->_matched_FeatureLine(1);
}
84+
# Match a Markdown header of the form "<#...> <Rule keyword>: <title>" and
# mark the token as a RuleLine. Returns whatever _match_title_line returns.
sub match_RuleLine {
    my $self  = shift;
    my $token = shift;

    return $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        'RuleLine', $self->dialect->Rule
    );
}
90+
# Match a Markdown header whose keyword is one of the dialect's Scenario
# keywords or, failing that, one of its ScenarioOutline keywords. In both
# cases the token is marked as a ScenarioLine.
sub match_ScenarioLine {
    my ( $self, $token ) = @_;

    my $matched = $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        'ScenarioLine', $self->dialect->Scenario
    );
    return $matched if $matched;

    return $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        'ScenarioLine', $self->dialect->ScenarioOutline
    );
}
98+
# Match a Markdown header of the form "<#...> <Background keyword>: <title>"
# and mark the token as a BackgroundLine. Returns whatever _match_title_line
# returns.
sub match_BackgroundLine {
    my $self  = shift;
    my $token = shift;

    return $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        'BackgroundLine', $self->dialect->Background
    );
}
104+
# Match a Markdown header of the form "<#...> <Examples keyword>: <title>"
# and mark the token as an ExamplesLine. Returns whatever _match_title_line
# returns.
sub match_ExamplesLine {
    my $self  = shift;
    my $token = shift;

    return $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        'ExamplesLine', $self->dialect->Examples
    );
}
110+
# Language headers are never matched in Markdown.
#
# Not supporting `# language: [ISO 639-1]` headers (or anything similar) is a
# deliberate choice: users should specify the language globally instead.
#
# Always returns the empty string, i.e. "no match"; the token is not touched.
sub match_Language {
    my ( $self, $token ) = @_;
    return q{};
}
117+
# Match a line containing one or more tags written as inline code spans,
# e.g. `@smoke` `@slow`.
#
# Each tag becomes an item { column => N, text => '@tag' } where N is the
# 1-based position of the `@` within the line text. When at least one tag is
# found the token is marked as a TagLine; otherwise nothing is matched and an
# empty/undef value is returned.
sub match_TagLine {
    my ( $self, $token ) = @_;

    # Iterate over a private copy: a //g match in scalar context keeps its
    # position on the matched scalar, so matching directly against a method
    # call result only terminates if the accessor hands back the very same
    # scalar on every call.
    my $line_text = $token->line->line_text;

    my @tags;
    while ( $line_text =~ m/`(@[^`]+)`/g ) {
        # $-[1] is the 0-based offset of the captured tag text.
        push @tags, { column => $-[1] + 1, text => $1 };
    }

    return unless @tags;
    return $self->_set_token_matched( $token, TagLine => { items => \@tags } );
}
134+
# Match a line of the form  <prefix><keyword><keyword_suffix> <text>.
#
# $prefix         - regex source for the Markdown marker; it must contain
#                   exactly one capture group (the marker itself)
# $keyword_suffix - regex source that must follow the keyword
# $token          - token being matched; its line's trimmed text is examined
# $token_type     - type to record on the token when the line matches
# $keywords       - array ref of literal keywords to accept
#
# On a match the token is marked with the keyword, the title text (trailing
# whitespace removed), an indent of line indent + marker length, and - for
# step keywords only - a keyword type. Returns the result of
# _set_token_matched on a match and an empty/undef value otherwise.
sub _match_title_line {
    my ( $self, $prefix, $keyword_suffix, $token, $token_type, $keywords ) = @_;

    # Keywords are literal text: escape them so a keyword containing a regex
    # metacharacter cannot change the pattern.
    my @alternatives = map { quotemeta } @{ $keywords || [] };

    # An empty alternation would match any line carrying the marker.
    return unless @alternatives;

    my $regex = $prefix . '(' . join( '|', @alternatives ) . ')' . $keyword_suffix . '\s*(.*)';

    # Take the captures in one step instead of reading $1..$3 after further
    # method calls.
    my @captures = $token->line->_trimmed_line_text =~ qr/$regex/;
    return unless @captures;
    my ( $marker, $keyword, $text ) = @captures;

    my $indent = $token->line->indent + length $marker;
    $text =~ s/\s+$//;

    # Only step keywords are registered in _keyword_types. A keyword that is
    # registered for more than one step type is ambiguous.
    my $keyword_type;
    if ( exists $self->_keyword_types->{$keyword} ) {
        my $candidates = $self->_keyword_types->{$keyword};
        $keyword_type =
            @{$candidates} > 1
            ? Cucumber::Messages::Step::KEYWORDTYPE_UNKNOWN
            : $candidates->[0];
    }

    return $self->_set_token_matched(
        $token,
        $token_type,
        {
            indent       => $indent,
            keyword      => $keyword,
            text         => $text,
            keyword_type => $keyword_type,
        }
    );
}
155+
# Record a successful match on $token.
#
# $matched_type - token type to store
# $options      - optional hash ref with any of: text, keyword, keyword_type,
#                 indent, items. It is modified in place ('items' defaults to
#                 an empty array ref and 'text' is chomped).
#
# Always sets the matched type, indent, items, location column (indent + 1)
# and dialect name; text, keyword and keyword type only when they are defined.
# Returns the result of the final matched_gherkin_dialect call.
sub _set_token_matched {
    my ( $self, $token, $matched_type, $options ) = @_;
    $options->{'items'} ||= [];
    $token->matched_type($matched_type);

    if ( defined $options->{'text'} ) {
        chomp( $options->{'text'} );
        $token->matched_text( $options->{'text'} );
    }

    for my $field ( 'keyword', 'keyword_type' ) {
        next unless defined $options->{$field};
        my $setter = "matched_$field";
        $token->$setter( $options->{$field} );
    }

    # Without an explicit indent use the line's own; a token that has no
    # line gets 0.
    my $indent = $options->{'indent'};
    if ( !defined $indent ) {
        $indent = $token->line ? $token->line->indent : 0;
    }
    $token->matched_indent($indent);

    # 'items' was defaulted above, so it is always defined at this point.
    $token->matched_items( $options->{'items'} );

    $token->location->{'column'} = $token->matched_indent + 1;
    return $token->matched_gherkin_dialect( $self->dialect_name );
}
183+
# Mark the token as EOF when it reports being the end-of-file token.
# For any other token nothing is changed and a false value is returned.
sub match_EOF {
    my ( $self, $token ) = @_;

    my $at_eof = $token->is_eof;
    return $at_eof unless $at_eof;

    return $self->_set_token_matched( $token, 'EOF' );
}
190+
# Mark the token as Empty when its line is empty, or when no other matcher
# recognises it.
#
# For a non-empty line the other matchers are tried in the fixed order below
# and the first one that succeeds ends the search with a false result. Those
# matchers are the regular match_* methods, so any effect they have on the
# token or on the matcher's own state happens here as well.
sub match_Empty {
    my ( $self, $token ) = @_;

    if ( !$token->line->is_empty ) {
        my @other_matchers = qw/
            match_TagLine
            match_FeatureLine
            match_ScenarioLine
            match_BackgroundLine
            match_ExamplesLine
            match_RuleLine
            match_TableRow
            match_Comment
            match_Language
            match_DocStringSeparator
            match_EOF
            match_StepLine
        /;
        for my $matcher (@other_matchers) {
            return !1 if $self->$matcher($token);
        }
    }

    return $self->_set_token_matched( $token, Empty => { indent => 0 } );
}
213+
# Treat a table separator row (a line starting with `|` whose cells satisfy
# _is_gfm_table_separator) as a Comment carrying the full line text, with
# indent 0. Any other line is left alone and a false value is returned.
sub match_Comment {
    my ( $self, $token ) = @_;
    my $line = $token->line;

    my $is_separator_row = $line->startswith('|')
        && $self->_is_gfm_table_separator( $line->table_cells );
    return $is_separator_row unless $is_separator_row;

    return $self->_set_token_matched( $token,
        Comment => { text => $line->line_text, indent => 0 } );
}
223+
# Decide whether a row of table cells is a table separator row.
#
# $table_cells - array ref of hash refs, each with a 'text' entry
#
# Returns true when at least one cell consists solely of dashes, optionally
# with a leading and/or trailing colon (e.g. `---`, `:--`, `:-:`).
sub _is_gfm_table_separator {
    my ( $self, $table_cells ) = @_;
    my $separator_cell_count = grep { $_->{'text'} =~ m/^:?-+:?$/ } @{$table_cells};
    return $separator_cell_count > 0;
}
229+
# Accept any line as an Other token. The whole line is kept, except that the
# current DocString indentation (_indent_to_remove) is passed to the line's
# get_line_text so that it can be taken off. The indent is reported as 0.
sub match_Other {
    my ( $self, $token ) = @_;

    my $unindented = $token->line->get_line_text( $self->_indent_to_remove );

    return $self->_set_token_matched( $token, 'Other',
        { text => $unindented, indent => 0 } );
}
237+
# Match a step written as a Markdown bullet: a `*`, `+` or `-` marker followed
# directly by one of the dialect's step keywords (the `* ` keyword excluded).
# No extra suffix is required after the keyword. Returns whatever
# _match_title_line returns.
sub match_StepLine {
    my $self  = shift;
    my $token = shift;

    return $self->_match_title_line(
        $KEYWORD_PREFIX_BULLET, '', $token,
        'StepLine', $self->_non_star_step_keywords
    );
}
243+
# Match an opening or closing DocString fence.
#
# While no DocString is open, any fence of three or more backticks matches;
# the exact fence becomes the separator that must close the DocString, and the
# indentation of the opening line is remembered so that match_Other can strip
# it from the content lines. While a DocString is open, only a line consisting
# of exactly that fence matches, and the matcher goes back to looking for an
# opening fence.
#
# The token is marked as a DocStringSeparator with the fence as keyword and,
# for an opening fence, the text following the fence (e.g. `json`) as text.
# Returns the result of _set_token_matched, or an empty/undef value when the
# line is not a separator.
sub match_DocStringSeparator {
    my ( $self, $token ) = @_;
    my $active_separator = $self->_active_doc_string_separator;
    my $is_opening       = $active_separator eq $DEFAULT_DOC_STRING_SEPARATOR;

    # The patterns are anchored with ^, so they have to be applied to the
    # text without its indentation; against the raw line an indented fence
    # can never match and the remembered indent would always be 0.
    # NOTE(review): assumes _trimmed_line_text is the line text with leading
    # whitespace removed - confirm against the line class.
    my $line_text = $token->line->_trimmed_line_text;
    return unless defined $line_text && $line_text =~ qr/$active_separator/;

    # Copy the captures before calling anything else.
    my ( $separator, $media_type ) = ( $1, $2 );

    if ($is_opening) {
        $self->_active_doc_string_separator( '^(' . quotemeta($separator) . ')$' );
        $self->_indent_to_remove( $token->line->indent );
    }
    else {
        $self->_active_doc_string_separator($DEFAULT_DOC_STRING_SEPARATOR);
    }

    # The closing pattern has no second group, so there is no media type.
    $media_type = '' unless defined $media_type;
    $media_type =~ s/^\s+|\s+$//g;

    return $self->_set_token_matched( $token,
        DocStringSeparator => { text => $media_type, keyword => $separator } );
}
258+
# Match a Gherkin table row.
#
# Gherkin tables must be indented by 2-5 whitespace characters so that they
# can be distinguished from non-Gherkin Markdown tables. A row that satisfies
# _is_gfm_table_separator is not a TableRow. On success the token is marked as
# a TableRow whose items are the line's table cells.
sub match_TableRow {
    my ( $self, $token ) = @_;
    my $line = $token->line;

    my $is_indented_row = $line->line_text =~ m/^\s\s\s?\s?\s?\|/;
    return $is_indented_row unless $is_indented_row;

    my $cells = $line->table_cells;
    return if $self->_is_gfm_table_separator($cells);

    return $self->_set_token_matched( $token, TableRow => { items => $cells } );
}
7269
82701;
9271
0 commit comments