Skip to content

Commit 5f0321a

Browse files
sunshinecottaylorr
authored andcommitted
chainlint: latch start/end position of each token
When chainlint detects problems in a test, such as a broken &&-chain, it prints out the test with "?!FOO?!" annotations inserted at each problem location. However, rather than annotating the original test definition, it instead dumps out a parsed token representation of the test. Since it lacks comments, indentations, here-doc bodies, and so forth, this tokenized representation can be difficult for the test author to digest and relate back to the original test definition. To address this shortcoming, an upcoming change will make it print out an annotated copy of the original test definition rather than the tokenized representation. In order to do so, it will need to know the start and end positions of each token in the original test definition. As preparation, upgrade TestParser::scan_token() to latch the start and end position of the token being scanned, and return that information along with the token itself. A subsequent change will take advantage of this positional information. In terms of implementation, TestParser::scan_token() is retrofitted to return a tuple consisting of the token's lexeme and its start and end positions, rather than returning just the lexeme. However, an alternative would be to define a class which represents a token: package Token; sub new { my ($class, $lexeme, $start, $end) = @_; bless [$lexeme, $start, $end] => $class; } sub as_string { my $self = shift @_; return $self->[0]; } sub compare { my ($x, $y) = @_; if (UNIVERSAL::isa($y, 'Token')) { return $x->[0] cmp $y->[0]; } return $x->[0] cmp $y; } use overload ( '""' => 'as_string', 'cmp' => 'compare' ); The major benefit of the class-based approach is that it is entirely non-invasive; it requires no additional changes to the rest of the script since a Token converts automatically to a string, which is what scan_token() historically returned. The big downside to the Token approach, however, is that it is _slow_; on this developer's (old) machine, it increases user-time by an unacceptable seven seconds when scanning all test scripts in the project. Hence, the simple tuple approach is employed instead since it adds only a fraction of a second user-time. Signed-off-by: Eric Sunshine <[email protected]> Signed-off-by: Taylor Blau <[email protected]>
1 parent ca748f5 commit 5f0321a

File tree

1 file changed

+43
-37
lines changed

1 file changed

+43
-37
lines changed

t/chainlint.pl

Lines changed: 43 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,9 @@ sub scan_heredoc_tag {
7575
my $self = shift @_;
7676
${$self->{buff}} =~ /\G(-?)/gc;
7777
my $indented = $1;
78-
my $tag = $self->scan_token();
78+
my $token = $self->scan_token();
79+
return "<<$indented" unless $token;
80+
my $tag = $token->[0];
7981
$tag =~ s/['"\\]//g;
8082
push(@{$self->{heretags}}, $indented ? "\t$tag" : "$tag");
8183
return "<<$indented$tag";
@@ -149,7 +151,7 @@ sub scan_dollar {
149151
my $self = shift @_;
150152
my $b = $self->{buff};
151153
return $self->scan_balanced('(', ')') if $$b =~ /\G\((?=\()/gc; # $((...))
152-
return '(' . join(' ', $self->scan_subst()) . ')' if $$b =~ /\G\(/gc; # $(...)
154+
return '(' . join(' ', map {$_->[0]} $self->scan_subst()) . ')' if $$b =~ /\G\(/gc; # $(...)
153155
return $self->scan_balanced('{', '}') if $$b =~ /\G\{/gc; # ${...}
154156
return $1 if $$b =~ /\G(\w+)/gc; # $var
155157
return $1 if $$b =~ /\G([@*#?$!0-9-])/gc; # $*, $1, $$, etc.
@@ -170,9 +172,11 @@ sub scan_token {
170172
my $self = shift @_;
171173
my $b = $self->{buff};
172174
my $token = '';
175+
my $start;
173176
RESTART:
174177
$$b =~ /\G[ \t]+/gc; # skip whitespace (but not newline)
175-
return "\n" if $$b =~ /\G#[^\n]*(?:\n|\z)/gc; # comment
178+
$start = pos($$b) || 0;
179+
return ["\n", $start, pos($$b)] if $$b =~ /\G#[^\n]*(?:\n|\z)/gc; # comment
176180
while (1) {
177181
# slurp up non-special characters
178182
$token .= $1 if $$b =~ /\G([^\\;&|<>(){}'"\$\s]+)/gc;
@@ -197,7 +201,7 @@ sub scan_token {
197201
}
198202
die("internal error scanning character '$c'\n");
199203
}
200-
return length($token) ? $token : undef;
204+
return length($token) ? [$token, $start, pos($$b)] : undef;
201205
}
202206

203207
# ShellParser parses POSIX shell scripts (with minor extensions for Bash). It
@@ -239,14 +243,14 @@ sub stop_at {
239243
my ($self, $token) = @_;
240244
return 1 unless defined($token);
241245
my $stop = ${$self->{stop}}[-1] if @{$self->{stop}};
242-
return defined($stop) && $token =~ $stop;
246+
return defined($stop) && $token->[0] =~ $stop;
243247
}
244248

245249
sub expect {
246250
my ($self, $expect) = @_;
247251
my $token = $self->next_token();
248-
return $token if defined($token) && $token eq $expect;
249-
push(@{$self->{output}}, "?!ERR?! expected '$expect' but found '" . (defined($token) ? $token : "<end-of-input>") . "'\n");
252+
return $token if defined($token) && $token->[0] eq $expect;
253+
push(@{$self->{output}}, "?!ERR?! expected '$expect' but found '" . (defined($token) ? $token->[0] : "<end-of-input>") . "'\n");
250254
$self->untoken($token) if defined($token);
251255
return ();
252256
}
@@ -255,7 +259,7 @@ sub optional_newlines {
255259
my $self = shift @_;
256260
my @tokens;
257261
while (my $token = $self->peek()) {
258-
last unless $token eq "\n";
262+
last unless $token->[0] eq "\n";
259263
push(@tokens, $self->next_token());
260264
}
261265
return @tokens;
@@ -278,7 +282,7 @@ sub parse_case_pattern {
278282
my @tokens;
279283
while (defined(my $token = $self->next_token())) {
280284
push(@tokens, $token);
281-
last if $token eq ')';
285+
last if $token->[0] eq ')';
282286
}
283287
return @tokens;
284288
}
@@ -293,13 +297,13 @@ sub parse_case {
293297
$self->optional_newlines());
294298
while (1) {
295299
my $token = $self->peek();
296-
last unless defined($token) && $token ne 'esac';
300+
last unless defined($token) && $token->[0] ne 'esac';
297301
push(@tokens,
298302
$self->parse_case_pattern(),
299303
$self->optional_newlines(),
300304
$self->parse(qr/^(?:;;|esac)$/)); # item body
301305
$token = $self->peek();
302-
last unless defined($token) && $token ne 'esac';
306+
last unless defined($token) && $token->[0] ne 'esac';
303307
push(@tokens,
304308
$self->expect(';;'),
305309
$self->optional_newlines());
@@ -315,7 +319,7 @@ sub parse_for {
315319
$self->next_token(), # variable
316320
$self->optional_newlines());
317321
my $token = $self->peek();
318-
if (defined($token) && $token eq 'in') {
322+
if (defined($token) && $token->[0] eq 'in') {
319323
push(@tokens,
320324
$self->expect('in'),
321325
$self->optional_newlines());
@@ -339,11 +343,11 @@ sub parse_if {
339343
$self->optional_newlines(),
340344
$self->parse(qr/^(?:elif|else|fi)$/)); # if/elif body
341345
my $token = $self->peek();
342-
last unless defined($token) && $token eq 'elif';
346+
last unless defined($token) && $token->[0] eq 'elif';
343347
push(@tokens, $self->expect('elif'));
344348
}
345349
my $token = $self->peek();
346-
if (defined($token) && $token eq 'else') {
350+
if (defined($token) && $token->[0] eq 'else') {
347351
push(@tokens,
348352
$self->expect('else'),
349353
$self->optional_newlines(),
@@ -380,7 +384,7 @@ sub parse_bash_array_assignment {
380384
my @tokens = $self->expect('(');
381385
while (defined(my $token = $self->next_token())) {
382386
push(@tokens, $token);
383-
last if $token eq ')';
387+
last if $token->[0] eq ')';
384388
}
385389
return @tokens;
386390
}
@@ -398,29 +402,31 @@ sub parse_cmd {
398402
my $self = shift @_;
399403
my $cmd = $self->next_token();
400404
return () unless defined($cmd);
401-
return $cmd if $cmd eq "\n";
405+
return $cmd if $cmd->[0] eq "\n";
402406

403407
my $token;
404408
my @tokens = $cmd;
405-
if ($cmd eq '!') {
409+
if ($cmd->[0] eq '!') {
406410
push(@tokens, $self->parse_cmd());
407411
return @tokens;
408-
} elsif (my $f = $compound{$cmd}) {
412+
} elsif (my $f = $compound{$cmd->[0]}) {
409413
push(@tokens, $self->$f());
410-
} elsif (defined($token = $self->peek()) && $token eq '(') {
411-
if ($cmd !~ /\w=$/) {
414+
} elsif (defined($token = $self->peek()) && $token->[0] eq '(') {
415+
if ($cmd->[0] !~ /\w=$/) {
412416
push(@tokens, $self->parse_func());
413417
return @tokens;
414418
}
415-
$tokens[-1] .= join(' ', $self->parse_bash_array_assignment());
419+
my @array = $self->parse_bash_array_assignment();
420+
$tokens[-1]->[0] .= join(' ', map {$_->[0]} @array);
421+
$tokens[-1]->[2] = $array[$#array][2] if @array;
416422
}
417423

418424
while (defined(my $token = $self->next_token())) {
419425
$self->untoken($token), last if $self->stop_at($token);
420426
push(@tokens, $token);
421-
last if $token =~ /^(?:[;&\n|]|&&|\|\|)$/;
427+
last if $token->[0] =~ /^(?:[;&\n|]|&&|\|\|)$/;
422428
}
423-
push(@tokens, $self->next_token()) if $tokens[-1] ne "\n" && defined($token = $self->peek()) && $token eq "\n";
429+
push(@tokens, $self->next_token()) if $tokens[-1]->[0] ne "\n" && defined($token = $self->peek()) && $token->[0] eq "\n";
424430
return @tokens;
425431
}
426432

@@ -457,7 +463,7 @@ sub find_non_nl {
457463
my $tokens = shift @_;
458464
my $n = shift @_;
459465
$n = $#$tokens if !defined($n);
460-
$n-- while $n >= 0 && $$tokens[$n] eq "\n";
466+
$n-- while $n >= 0 && $$tokens[$n]->[0] eq "\n";
461467
return $n;
462468
}
463469

@@ -467,7 +473,7 @@ sub ends_with {
467473
for my $needle (reverse(@$needles)) {
468474
return undef if $n < 0;
469475
$n = find_non_nl($tokens, $n), next if $needle eq "\n";
470-
return undef if $$tokens[$n] !~ $needle;
476+
return undef if $$tokens[$n]->[0] !~ $needle;
471477
$n--;
472478
}
473479
return 1;
@@ -486,13 +492,13 @@ sub parse_loop_body {
486492
my $self = shift @_;
487493
my @tokens = $self->SUPER::parse_loop_body(@_);
488494
# did loop signal failure via "|| return" or "|| exit"?
489-
return @tokens if !@tokens || grep(/^(?:return|exit|\$\?)$/, @tokens);
495+
return @tokens if !@tokens || grep {$_->[0] =~ /^(?:return|exit|\$\?)$/} @tokens;
490496
# did loop upstream of a pipe signal failure via "|| echo 'impossible
491497
# text'" as the final command in the loop body?
492498
return @tokens if ends_with(\@tokens, [qr/^\|\|$/, "\n", qr/^echo$/, qr/^.+$/]);
493499
# flag missing "return/exit" handling explicit failure in loop body
494500
my $n = find_non_nl(\@tokens);
495-
splice(@tokens, $n + 1, 0, '?!LOOP?!');
501+
splice(@tokens, $n + 1, 0, ['?!LOOP?!', $tokens[$n]->[1], $tokens[$n]->[2]]);
496502
return @tokens;
497503
}
498504

@@ -510,28 +516,28 @@ sub accumulate {
510516
goto DONE unless @$tokens;
511517

512518
# new command is empty line; can't yet check if previous is missing "&&"
513-
goto DONE if @$cmd == 1 && $$cmd[0] eq "\n";
519+
goto DONE if @$cmd == 1 && $$cmd[0]->[0] eq "\n";
514520

515521
# did previous command end with "&&", "|", "|| return" or similar?
516522
goto DONE if match_ending($tokens, \@safe_endings);
517523

518524
# if this command handles "$?" specially, then okay for previous
519525
# command to be missing "&&"
520526
for my $token (@$cmd) {
521-
goto DONE if $token =~ /\$\?/;
527+
goto DONE if $token->[0] =~ /\$\?/;
522528
}
523529

524530
# if this command is "false", "return 1", or "exit 1" (which signal
525531
# failure explicitly), then okay for all preceding commands to be
526532
# missing "&&"
527-
if ($$cmd[0] =~ /^(?:false|return|exit)$/) {
528-
@$tokens = grep(!/^\?!AMP\?!$/, @$tokens);
533+
if ($$cmd[0]->[0] =~ /^(?:false|return|exit)$/) {
534+
@$tokens = grep {$_->[0] !~ /^\?!AMP\?!$/} @$tokens;
529535
goto DONE;
530536
}
531537

532538
# flag missing "&&" at end of previous command
533539
my $n = find_non_nl($tokens);
534-
splice(@$tokens, $n + 1, 0, '?!AMP?!') unless $n < 0;
540+
splice(@$tokens, $n + 1, 0, ['?!AMP?!', $$tokens[$n]->[1], $$tokens[$n]->[2]]) unless $n < 0;
535541

536542
DONE:
537543
$self->SUPER::accumulate($tokens, $cmd);
@@ -557,7 +563,7 @@ sub new {
557563
# composition of multiple strings and non-string character runs; for instance,
558564
# `"test body"` unwraps to `test body`; `word"a b"42'c d'` to `worda b42c d`
559565
sub unwrap {
560-
my $token = @_ ? shift @_ : $_;
566+
my $token = (@_ ? shift @_ : $_)->[0];
561567
# simple case: 'sqstring' or "dqstring"
562568
return $token if $token =~ s/^'([^']*)'$/$1/;
563569
return $token if $token =~ s/^"([^"]*)"$/$1/;
@@ -588,9 +594,9 @@ sub check_test {
588594
$self->{ntests}++;
589595
my $parser = TestParser->new(\$body);
590596
my @tokens = $parser->parse();
591-
return unless $emit_all || grep(/\?![^?]+\?!/, @tokens);
597+
return unless $emit_all || grep {$_->[0] =~ /\?![^?]+\?!/} @tokens;
592598
my $c = main::fd_colors(1);
593-
my $checked = join(' ', @tokens);
599+
my $checked = join(' ', map {$_->[0]} @tokens);
594600
$checked =~ s/^\n//;
595601
$checked =~ s/^ //mg;
596602
$checked =~ s/ $//mg;
@@ -602,9 +608,9 @@ sub check_test {
602608
sub parse_cmd {
603609
my $self = shift @_;
604610
my @tokens = $self->SUPER::parse_cmd();
605-
return @tokens unless @tokens && $tokens[0] =~ /^test_expect_(?:success|failure)$/;
611+
return @tokens unless @tokens && $tokens[0]->[0] =~ /^test_expect_(?:success|failure)$/;
606612
my $n = $#tokens;
607-
$n-- while $n >= 0 && $tokens[$n] =~ /^(?:[;&\n|]|&&|\|\|)$/;
613+
$n-- while $n >= 0 && $tokens[$n]->[0] =~ /^(?:[;&\n|]|&&|\|\|)$/;
608614
$self->check_test($tokens[1], $tokens[2]) if $n == 2; # title body
609615
$self->check_test($tokens[2], $tokens[3]) if $n > 2; # prereq title body
610616
return @tokens;

0 commit comments

Comments
 (0)