@@ -3,7 +3,268 @@ package Gherkin::MarkdownTokenMatcher;
33use strict;
44use warnings;
55
6- use base ' Gherkin::TokenMatcher' ;
# Regex sources used by the matchers below. They are kept as plain strings
# (not qr// objects): match_DocStringSeparator compares the active separator
# against $DEFAULT_DOC_STRING_SEPARATOR with `eq`, and _match_title_line
# concatenates a prefix with further pattern text before compiling it.
# Inside q//, `\\` yields a single backslash, so `\\s` is the regex `\s`.
#
# Opening code fence: a run of three or more backticks (capture 1) followed
# by the rest of the line (capture 2).
6+ my $DEFAULT_DOC_STRING_SEPARATOR = q/ ^(```[`]*)(.*)/ ;
# Prefix of a step line: a `*`, `+` or `-` bullet with optional surrounding
# whitespace (used by match_StepLine).
7+ my $KEYWORD_PREFIX_BULLET = q/ ^(\\s*[*+-]\\s*)/ ;
# Prefix of a title line: one to six `#` characters and one whitespace
# character (used by the Feature/Rule/Scenario/Background/Examples matchers).
8+ my $KEYWORD_PREFIX_HEADER = q/ ^(#{1,6}\\s)/ ;
9+
10+ use Class::XSAccessor accessors => [
11+ qw/ dialect _default_dialect_name _indent_to_remove _active_doc_string_separator _keyword_types
12+ _matched_FeatureLine _non_star_step_keywords/ ,
13+ ];
14+
15+ use Gherkin::Dialect;
16+
# Constructor.
#
# $options is an optional hash reference. It is blessed in place and becomes
# the matcher object. When it has no true 'dialect' entry, a Gherkin::Dialect
# for 'en' is created. The dialect name in effect at construction time is
# stored as the default that reset() switches back to, and every step keyword
# other than '* ' is cached for match_StepLine.
#
# Returns the new matcher.
sub new {
    my ( $class, $options ) = @_;
    $options->{'dialect'} ||= Gherkin::Dialect->new( { dialect => 'en' } );

    my $self = bless $options, $class;
    $self->_default_dialect_name( $self->dialect_name );

    # Collect the keywords of each step kind in turn, leaving out '* '
    # (presumably because '*' is already a Markdown bullet marker -- confirm).
    my @step_keywords;
    for my $step_kind (qw/Given When Then And But/) {
        push @step_keywords,
            grep { $_ ne '* ' } @{ $self->dialect->$step_kind };
    }
    $self->_non_star_step_keywords( \@step_keywords );

    $self->reset();
    return $self;
}
30+
# Plain function (not a method): appends $type to the list of keyword types
# recorded for each keyword.
#
#   $keyword_types - hash reference mapping keyword => array reference of
#                    types; modified in place
#   $keywords      - array reference of keywords to register
#   $type          - the type to append for every keyword in $keywords
sub _add_keyword_type_mappings {
    my ( $keyword_types, $keywords, $type ) = @_;

    # push autovivifies the per-keyword array the first time a keyword is
    # seen, so no explicit existence check is required.
    push @{ $keyword_types->{$_} }, $type for @{$keywords};
}
41+
# Returns the name of the dialect currently in use, as reported by the
# dialect object's own `dialect` accessor.
sub dialect_name {
    my ($self) = @_;
    my $dialect = $self->dialect;
    return $dialect->dialect;
}
43+
# Switches the underlying dialect object to another dialect and rebuilds the
# keyword => keyword-type table kept in _keyword_types.
#
# All arguments after the invocant are passed through unchanged to the
# dialect object's change_dialect method.
#
# A keyword that appears in more than one step kind ends up with several
# types in its list; _match_title_line reports such keywords as UNKNOWN.
#
# Returns whatever the _keyword_types setter returns.
sub change_dialect {
    my ( $self, @dialect_args ) = @_;
    $self->dialect->change_dialect(@dialect_args);

    my $dialect = $self->dialect;

    # The constants are called with explicit parentheses so that they are
    # resolved at run time. This file does not load Cucumber::Messages itself,
    # and a bare constant name only compiles under `use strict` when the
    # defining module happens to have been loaded earlier.
    my @keyword_groups = (
        [
            $dialect->Given,
            Cucumber::Messages::Step::KEYWORDTYPE_CONTEXT()
        ],
        [
            $dialect->When,
            Cucumber::Messages::Step::KEYWORDTYPE_ACTION()
        ],
        [
            $dialect->Then,
            Cucumber::Messages::Step::KEYWORDTYPE_OUTCOME()
        ],
        [
            [ @{ $dialect->And }, @{ $dialect->But } ],
            Cucumber::Messages::Step::KEYWORDTYPE_CONJUNCTION()
        ],
    );

    # The registration order is kept: Given, When, Then, then And/But.
    my $keyword_types = {};
    for my $group (@keyword_groups) {
        my ( $keywords, $type ) = @{$group};
        _add_keyword_type_mappings( $keyword_types, $keywords, $type );
    }
    return $self->_keyword_types($keyword_types);
}
59+
# Puts the matcher back into its initial state: the default dialect (the one
# in effect when the matcher was constructed) is selected again, no doc-string
# indentation is stripped, and the next code fence is treated as an opening
# fence.
sub reset {
    my ($self) = @_;
    my $default_dialect = $self->_default_dialect_name;
    $self->change_dialect($default_dialect);
    $self->_indent_to_remove(0);
    $self->_active_doc_string_separator($DEFAULT_DOC_STRING_SEPARATOR);
}
66+
# Matches the feature line. At most one line per matcher is ever reported as
# a FeatureLine: once _matched_FeatureLine is set this method returns early
# without touching the token. (This block does not clear that flag.)
#
# Returns nothing when a feature line was already matched; otherwise the
# result of setting the _matched_FeatureLine flag.
sub match_FeatureLine {
    my ( $self, $token ) = @_;
    return if $self->_matched_FeatureLine;

    # First choice: a proper header such as "# Feature: blah".
    my $matched_header = $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        FeatureLine => $self->dialect->Feature
    );

    # Otherwise the line is still taken as the FeatureLine, with its trimmed
    # text as the title. Users may not want to be constrained by having
    # "# Feature: ..." as their first line.
    if ( !$matched_header ) {
        my $title = $token->line->_trimmed_line_text;
        $self->_set_token_matched( $token,
            FeatureLine => { text => $title } );
    }

    $self->_matched_FeatureLine(1);
}
83+
# Matches a Markdown header whose text starts with one of the dialect's Rule
# keywords followed by ':'. Returns the result of _match_title_line.
sub match_RuleLine {
    my ( $self, $token ) = @_;
    my $rule_keywords = $self->dialect->Rule;
    return $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        RuleLine => $rule_keywords
    );
}
89+
# Matches a Markdown header that starts with a Scenario keyword or, failing
# that, a ScenarioOutline keyword, each followed by ':'. Both are reported as
# a ScenarioLine token.
#
# Returns the result of the first attempt when it is true, otherwise the
# result of the second attempt.
sub match_ScenarioLine {
    my ( $self, $token ) = @_;

    my $as_scenario = $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        ScenarioLine => $self->dialect->Scenario
    );
    return $as_scenario if $as_scenario;

    return $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        ScenarioLine => $self->dialect->ScenarioOutline
    );
}
97+
# Matches a Markdown header whose text starts with one of the dialect's
# Background keywords followed by ':'. Returns the result of
# _match_title_line.
sub match_BackgroundLine {
    my ( $self, $token ) = @_;
    my $background_keywords = $self->dialect->Background;
    return $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        BackgroundLine => $background_keywords
    );
}
103+
# Matches a Markdown header whose text starts with one of the dialect's
# Examples keywords followed by ':'. Returns the result of
# _match_title_line.
sub match_ExamplesLine {
    my ( $self, $token ) = @_;
    my $examples_keywords = $self->dialect->Examples;
    return $self->_match_title_line(
        $KEYWORD_PREFIX_HEADER, ':', $token,
        ExamplesLine => $examples_keywords
    );
}
109+
# Never matches: always returns the empty string (a false value) and leaves
# the token untouched; the invocant and token arguments are ignored.
sub match_Language {
    # We've made a deliberate choice not to support
    # `# language: [ISO 639-1]` headers or similar in Markdown.
    # Users should specify a language globally.
    return '';
}
116+
# Matches a line containing one or more tags written as inline code spans,
# e.g. `@smoke`. Every backtick-delimited span whose content starts with '@'
# becomes one item with:
#   text   - the tag including its leading '@'
#   column - the 1-based column of that '@' within the untrimmed line text
#
# Returns nothing when the line holds no tag; otherwise the token is marked
# as a TagLine and the result of _set_token_matched is returned.
sub match_TagLine {
    my ( $self, $token ) = @_;

    # Iterate over a lexical copy. A scalar-context m//g loop keeps its
    # position on the matched scalar, so matching the accessor's return value
    # directly only terminates if the accessor hands back the very same
    # scalar on every call.
    my $line_text = $token->line->line_text;

    my @tags;
    while ( $line_text =~ m/`(@[^`]+)`/g ) {
        # $-[1] is the 0-based offset where capture 1 (the tag) starts. It
        # replaces length($`) and so avoids the prematch variable.
        push @tags, { column => $-[1] + 1, text => $1 };
    }
    return unless @tags;

    return $self->_set_token_matched( $token,
        TagLine => { items => \@tags } );
}
133+
# Shared matcher for "<prefix><keyword><suffix> <text>" lines.
#
#   $prefix         - regex source with exactly one capture group; the length
#                     of what it captures is added to the line's indent
#   $keyword_suffix - regex source that must follow the keyword (the callers
#                     in this file pass ':' or '')
#   $token          - token whose line's trimmed text is examined
#   $token_type     - type to assign to the token on success
#   $keywords       - array reference of literal keywords to try
#
# On success the token is marked with the given type, the computed indent,
# the keyword as it appeared, the remaining text with trailing whitespace
# removed, and, only when the keyword is a known step keyword, its keyword
# type (UNKNOWN when the keyword is registered with more than one type).
#
# Returns the result of _set_token_matched on success, nothing otherwise.
sub _match_title_line {
    my ( $self, $prefix, $keyword_suffix, $token, $token_type, $keywords ) = @_;

    # Keywords are literal text: quote them so that a keyword containing a
    # regex metacharacter is matched verbatim.
    my $alternatives = join '|', map { quotemeta } @{$keywords};
    my $regex =
        $prefix . '(' . $alternatives . ')' . $keyword_suffix . '\s*(.*)';
    return unless $token->line->_trimmed_line_text =~ qr/$regex/;

    # Copy the captures right away, before anything else can run a regex.
    my ( $matched_prefix, $keyword, $text ) = ( $1, $2, $3 );

    my $indent = $token->line->indent + ( length($matched_prefix) || 0 );
    $text =~ s/\s+$//;

    my $keyword_type;
    my $known_types = $self->_keyword_types;
    if ( exists $known_types->{$keyword} ) {
        # only set the keyword type if this is a step keyword
        my @candidates = @{ $known_types->{$keyword} };

        # The constant is called with parentheses so that it is resolved at
        # run time; this file does not load Cucumber::Messages itself.
        $keyword_type =
            @candidates > 1
            ? Cucumber::Messages::Step::KEYWORDTYPE_UNKNOWN()
            : $candidates[0];
    }

    return $self->_set_token_matched(
        $token,
        $token_type,
        {
            indent       => $indent,
            keyword      => $keyword,
            text         => $text,
            keyword_type => $keyword_type,
        }
    );
}
154+
# Records a successful match on $token.
#
#   $matched_type - the token type to assign
#   $options      - optional hash reference with any of:
#       text          matched text; a trailing newline is chomped in place
#       keyword       matched keyword
#       keyword_type  keyword type of a step keyword
#       indent        indent to record; defaults to the line's indent, or 0
#                     when the token has no line
#       items         array reference of items; defaults to an empty list
#
# The token's location column is set to the recorded indent plus one, and
# the current dialect name is stored on the token.
#
# Returns the result of the final matched_gherkin_dialect call.
sub _set_token_matched {
    my ( $self, $token, $matched_type, $options ) = @_;

    # Also turns a missing $options into a hash reference.
    $options->{'items'} ||= [];
    $token->matched_type($matched_type);

    if ( defined $options->{'text'} ) {
        chomp $options->{'text'};
        $token->matched_text( $options->{'text'} );
    }

    # Optional fields are only written when a value was supplied.
    for my $field (qw/keyword keyword_type/) {
        next unless defined $options->{$field};
        my $setter = "matched_$field";
        $token->$setter( $options->{$field} );
    }

    my $indent =
          defined $options->{'indent'} ? $options->{'indent'}
        : $token->line                 ? $token->line->indent
        :                                0;
    $token->matched_indent($indent);

    # 'items' is always defined here because of the default set above.
    $token->matched_items( $options->{'items'} );

    $token->location->{'column'} = $token->matched_indent + 1;
    $token->matched_gherkin_dialect( $self->dialect_name );
}
182+
# Marks the token as EOF when it reports end of input.
#
# Returns the (false) value of $token->is_eof when the token is not at end
# of input, otherwise the result of _set_token_matched.
sub match_EOF {
    my ( $self, $token ) = @_;
    return $token->is_eof && $self->_set_token_matched( $token, 'EOF' );
}
189+
# Marks the token as Empty (indent 0) when its line is empty, or when none of
# the other matchers claims it.
#
# For a non-empty line the matchers are tried in a fixed order and trying
# stops at the first one that succeeds. A successful matcher has already
# marked the token with its own type (and match_FeatureLine sets the
# feature-line flag), so the order is significant.
#
# Returns a false value when another matcher claimed the line, otherwise the
# result of _set_token_matched.
sub match_Empty {
    my ( $self, $token ) = @_;

    if ( !$token->line->is_empty ) {
        my @matchers_in_order = qw/
            match_TagLine
            match_FeatureLine
            match_ScenarioLine
            match_BackgroundLine
            match_ExamplesLine
            match_RuleLine
            match_TableRow
            match_Comment
            match_Language
            match_DocStringSeparator
            match_EOF
            match_StepLine
            /;
        for my $matcher (@matchers_in_order) {
            # Claimed by another matcher: report "not Empty".
            return !1 if $self->$matcher($token);
        }
    }

    return $self->_set_token_matched( $token,
        Empty => { indent => 0 } );
}
212+
# Reports the separator row of a GFM table (e.g. "| --- | --- |") as a
# Comment token. The line must start with '|' and at least one of its cells
# must look like a separator; the comment text is the full line text and the
# indent is 0.
#
# Returns the first false test result when the line is not a separator row,
# otherwise the result of _set_token_matched.
sub match_Comment {
    my ( $self, $token ) = @_;
    my $line = $token->line;
    return $line->startswith('|')
        && $self->_is_gfm_table_separator( $line->table_cells )
        && $self->_set_token_matched( $token,
            Comment => { text => $line->line_text, indent => 0 } );
}
222+
# Tells whether a table row is a GFM separator row.
#
#   $table_cells - array reference of cells, each a hash reference with a
#                  'text' entry
#
# Returns true when at least one cell consists solely of dashes with an
# optional leading and/or trailing ':' (such as '---', ':--' or ':-:').
sub _is_gfm_table_separator {
    my ( $self, $table_cells ) = @_;

    # grep in scalar context yields the number of matching cells.
    my $separator_count =
        grep { $_->{'text'} =~ m/^:?-+:?$/ } @{$table_cells};
    return $separator_count > 0;
}
228+
# Always matches: marks the token as Other with indent 0. The text is the
# whole line as returned by get_line_text, which is handed the number of
# indent columns recorded for the current doc string so that this
# indentation can be removed.
#
# Returns the result of _set_token_matched.
sub match_Other {
    my ( $self, $token ) = @_;
    my $doc_string_indent = $self->_indent_to_remove;
    my $content = $token->line->get_line_text($doc_string_indent);
    return $self->_set_token_matched( $token,
        Other => { indent => 0, text => $content } );
}
236+
# Matches a bullet item ('*', '+' or '-') whose text starts with one of the
# cached step keywords (every step keyword except '* '). No suffix is
# required after the keyword. Returns the result of _match_title_line.
sub match_StepLine {
    my ( $self, $token ) = @_;
    my $step_keywords = $self->_non_star_step_keywords;
    return $self->_match_title_line(
        $KEYWORD_PREFIX_BULLET, '', $token,
        StepLine => $step_keywords
    );
}
242+
# Matches the opening or closing fence of a doc string.
#
# While no doc string is open, the active pattern is the default one and any
# run of three or more backticks opens a doc string. The active pattern then
# becomes "exactly that run of backticks and nothing else", and the line's
# indent is recorded for match_Other. A later line matching that pattern
# closes the doc string and restores the default pattern.
#
# The token is marked as a DocStringSeparator with the backtick run as its
# keyword and an empty text; the second capture of the default pattern is
# not used.
#
# Returns a false value when the line is not a fence, otherwise the result
# of _set_token_matched.
sub match_DocStringSeparator {
    my ( $self, $token ) = @_;
    my $separator_pattern = $self->_active_doc_string_separator;
    if ( $token->line->line_text =~ qr/$separator_pattern/ ) {
        # Keep the capture before anything else can run a regex.
        my $fence = $1;

        my $is_opening_fence =
            $separator_pattern eq $DEFAULT_DOC_STRING_SEPARATOR;
        if ($is_opening_fence) {
            $self->_active_doc_string_separator( '^(' . $fence . ')$' );
            $self->_indent_to_remove( $token->line->indent );
        } else {
            $self->_active_doc_string_separator(
                $DEFAULT_DOC_STRING_SEPARATOR);
        }

        $self->_set_token_matched( $token,
            DocStringSeparator => { text => '', keyword => $fence } );
    }
}
257+
# Matches a row of a Gherkin data table and stores its cells as the token's
# items. GFM separator rows are skipped here; match_Comment handles them.
#
# Returns nothing for a separator row, a false value when the line is not an
# indented table row, otherwise the result of _set_token_matched.
sub match_TableRow {
    my ( $self, $token ) = @_;
    my $line = $token->line;

    # Gherkin tables must be indented 2-5 spaces in order to be distinguished
    # from non-Gherkin tables.
    if ( $line->line_text =~ m/^\s\s\s?\s?\s?\|/ ) {
        my $cells = $line->table_cells;
        return if $self->_is_gfm_table_separator($cells);
        $self->_set_token_matched( $token,
            TableRow => { items => $cells } );
    }
}
7268
82691;
9270
0 commit comments