Skip to content

Commit f56f855

Browse files
committed
Fix lexing of nested compiler directives
1 parent 73e8b82 commit f56f855

File tree

2 files changed

+259
-48
lines changed

2 files changed

+259
-48
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Fixed
11+
12+
- Lexing of conditional directive expressions containing compiler directives, comments, or strings.
13+
1014
## [0.3.0] - 2024-05-29
1115

1216
### Removed

core/src/defaults/lexer.rs

Lines changed: 255 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,20 @@ impl LexArgs<'_, '_> {
179179
}
180180
}
181181

182+
// I can't seem to write a function signature for 'cloning' this type that the borrow checker is happy with.
183+
// It can't be cloned in the usual way because it contains a mutable reference.
184+
// The borrow checker is smart enough to see that the mutable references don't overlap when you construct the clone
185+
// in place, and this macro just makes it more convenient to do that.
186+
macro_rules! lex_args_copy {
187+
($args: ident) => {
188+
LexArgs {
189+
input: $args.input,
190+
offset: $args.offset,
191+
lex_state: $args.lex_state,
192+
}
193+
};
194+
}
195+
182196
type LexerFn = fn(LexArgs) -> OffsetAndTokenType;
183197

184198
const COMMON_LEXER_MAP: [Option<LexerFn>; 256] = make_byte_map(
@@ -980,54 +994,150 @@ fn consume_to_eof(input: &str, token_type: RawTokenType) -> (usize, RawTokenType
980994

981995
// region: directives/comments
982996

983-
fn compiler_directive_type(input: &str, offset: usize) -> RawTokenType {
984-
let count = count_matching(input, offset, |b| b.is_ascii_alphabetic());
985-
986-
let directive = &input[offset..(offset + count)];
987-
988-
if directive.eq_ignore_ascii_case("if") {
989-
TT::ConditionalDirective(CDK::If)
990-
} else if directive.eq_ignore_ascii_case("ifdef") {
991-
TT::ConditionalDirective(CDK::Ifdef)
992-
} else if directive.eq_ignore_ascii_case("ifndef") {
993-
TT::ConditionalDirective(CDK::Ifndef)
994-
} else if directive.eq_ignore_ascii_case("ifopt") {
995-
TT::ConditionalDirective(CDK::Ifopt)
996-
} else if directive.eq_ignore_ascii_case("elseif") {
997-
TT::ConditionalDirective(CDK::Elseif)
998-
} else if directive.eq_ignore_ascii_case("else") {
999-
TT::ConditionalDirective(CDK::Else)
1000-
} else if directive.eq_ignore_ascii_case("ifend") {
1001-
TT::ConditionalDirective(CDK::Ifend)
1002-
} else if directive.eq_ignore_ascii_case("endif") {
1003-
TT::ConditionalDirective(CDK::Endif)
1004-
} else {
1005-
TT::CompilerDirective
997+
fn conditional_directive_type(
998+
input: &str,
999+
offset: usize,
1000+
) -> (usize, Option<ConditionalDirectiveKind>) {
1001+
let end_offset = offset + count_matching(input, offset, |b| b.is_ascii_alphabetic());
1002+
1003+
let directive = &input[offset..end_offset];
1004+
1005+
let kind = {
1006+
if directive.eq_ignore_ascii_case("if") {
1007+
Some(CDK::If)
1008+
} else if directive.eq_ignore_ascii_case("ifdef") {
1009+
Some(CDK::Ifdef)
1010+
} else if directive.eq_ignore_ascii_case("ifndef") {
1011+
Some(CDK::Ifndef)
1012+
} else if directive.eq_ignore_ascii_case("ifopt") {
1013+
Some(CDK::Ifopt)
1014+
} else if directive.eq_ignore_ascii_case("elseif") {
1015+
Some(CDK::Elseif)
1016+
} else if directive.eq_ignore_ascii_case("else") {
1017+
Some(CDK::Else)
1018+
} else if directive.eq_ignore_ascii_case("ifend") {
1019+
Some(CDK::Ifend)
1020+
} else if directive.eq_ignore_ascii_case("endif") {
1021+
Some(CDK::Endif)
1022+
} else {
1023+
None
1024+
}
1025+
};
1026+
1027+
(end_offset, kind)
1028+
}
1029+
1030+
#[derive(Eq, PartialEq, Copy, Clone)]
1031+
enum BlockCommentKind {
1032+
ParenStar,
1033+
Brace,
1034+
}
1035+
1036+
fn parse_directive_expr(
1037+
mut args: LexArgs,
1038+
kind: BlockCommentKind,
1039+
) -> (RawTokenType, Option<usize>) {
1040+
let (offset, cdk) = conditional_directive_type(args.input, args.offset);
1041+
args.offset = offset;
1042+
1043+
match cdk {
1044+
Some(cdk @ (CDK::If | CDK::Elseif)) => (
1045+
TT::ConditionalDirective(cdk),
1046+
find_directive_expr_end(args, kind),
1047+
),
1048+
Some(cdk) => (
1049+
TT::ConditionalDirective(cdk),
1050+
find_block_comment_end(args, kind),
1051+
),
1052+
None => (TT::CompilerDirective, find_block_comment_end(args, kind)),
10061053
}
10071054
}
10081055

1009-
fn _compiler_directive<const START_LEN: usize>(
1010-
input: &str,
1011-
start_offset: usize,
1012-
end_offset: Option<usize>,
1013-
) -> OffsetAndTokenType {
1014-
let token_type = compiler_directive_type(input, start_offset);
1015-
if let Some(pos) = end_offset {
1016-
(pos, token_type)
1017-
} else {
1018-
warn_unterminated("compiler directive", input, start_offset - START_LEN);
1019-
consume_to_eof(input, token_type)
1056+
fn find_directive_expr_end(mut args: LexArgs, kind: BlockCommentKind) -> Option<usize> {
1057+
let input = args.input.as_bytes();
1058+
loop {
1059+
match (
1060+
input.get(args.offset),
1061+
input.get(args.offset + 1),
1062+
input.get(args.offset + 2),
1063+
) {
1064+
// end alt block comment or directive
1065+
(Some(b'*'), Some(b')'), _) if kind == BlockCommentKind::ParenStar => {
1066+
return Some(args.offset + 2);
1067+
}
1068+
// end block comment or directive
1069+
(Some(b'}'), _, _) if kind == BlockCommentKind::Brace => {
1070+
return Some(args.offset + 1);
1071+
}
1072+
// start alt directive
1073+
(Some(b'('), Some(b'*'), Some(b'$')) => {
1074+
args.offset += 3;
1075+
args.offset =
1076+
parse_directive_expr(lex_args_copy!(args), BlockCommentKind::ParenStar).1?;
1077+
}
1078+
// start directive
1079+
(Some(b'{'), Some(b'$'), _) => {
1080+
args.offset += 2;
1081+
args.offset =
1082+
parse_directive_expr(lex_args_copy!(args), BlockCommentKind::Brace).1?;
1083+
}
1084+
// start alt block
1085+
(Some(b'('), Some(b'*'), _) => {
1086+
args.offset += 2;
1087+
args.offset = block_comment_alt(lex_args_copy!(args)).0;
1088+
}
1089+
// start block
1090+
(Some(b'{'), _, _) => {
1091+
args.offset += 1;
1092+
args.offset = block_comment(lex_args_copy!(args)).0;
1093+
}
1094+
// start string
1095+
(Some(b'\''), _, _) => {
1096+
args.offset += 1;
1097+
args.offset = text_literal(lex_args_copy!(args)).0;
1098+
}
1099+
// start line comment
1100+
(Some(b'/'), Some(b'/'), _) => {
1101+
args.offset += 2;
1102+
args.offset = line_comment(lex_args_copy!(args)).0;
1103+
}
1104+
(None, _, _) => {
1105+
return None;
1106+
}
1107+
_ => {
1108+
args.offset += 1;
1109+
}
1110+
}
10201111
}
10211112
}
10221113

1023-
fn compiler_directive_alt(args: LexArgs) -> OffsetAndTokenType {
1024-
let end_offset = memchr::memmem::find(args.input.as_bytes(), b"*)").map(|o| o + 2);
1025-
_compiler_directive::<2>(args.input, args.offset, end_offset)
1114+
fn find_block_comment_end(
1115+
LexArgs { input, offset, .. }: LexArgs,
1116+
kind: BlockCommentKind,
1117+
) -> Option<usize> {
1118+
match kind {
1119+
BlockCommentKind::ParenStar => {
1120+
memchr::memmem::find(&input.as_bytes()[offset..], b"*)").map(|o| offset + o + 2)
1121+
}
1122+
BlockCommentKind::Brace => {
1123+
memchr::memchr(b'}', &input.as_bytes()[offset..]).map(|o| offset + o + 1)
1124+
}
1125+
}
10261126
}
10271127

1028-
fn compiler_directive(args: LexArgs) -> OffsetAndTokenType {
1029-
let end_offset = memchr::memchr(b'}', args.input.as_bytes()).map(|o| o + 1);
1030-
_compiler_directive::<1>(args.input, args.offset, end_offset)
1128+
fn compiler_directive(args: LexArgs, kind: BlockCommentKind) -> OffsetAndTokenType {
1129+
let (token_type, end_offset) = parse_directive_expr(lex_args_copy!(args), kind);
1130+
1131+
if let Some(pos) = end_offset {
1132+
(pos, token_type)
1133+
} else {
1134+
let start_len = match kind {
1135+
BlockCommentKind::ParenStar => 2,
1136+
BlockCommentKind::Brace => 1,
1137+
};
1138+
warn_unterminated("compiler directive", args.input, args.offset - start_len);
1139+
consume_to_eof(args.input, token_type)
1140+
}
10311141
}
10321142

10331143
fn block_comment_kind(
@@ -1045,12 +1155,13 @@ fn block_comment_kind(
10451155
}
10461156
}
10471157

1048-
fn _block_comment<const START_LEN: usize>(
1158+
fn _block_comment(
10491159
LexArgs {
10501160
input,
10511161
offset,
10521162
lex_state,
10531163
}: LexArgs,
1164+
start_len: usize,
10541165
end_offset: Option<usize>,
10551166
) -> OffsetAndTokenType {
10561167
if let Some(end_offset) = end_offset {
@@ -1059,19 +1170,19 @@ fn _block_comment<const START_LEN: usize>(
10591170
let comment_kind = block_comment_kind(nl_offset, offset, end_offset, lex_state);
10601171
(end_offset, TT::Comment(comment_kind))
10611172
} else {
1062-
warn_unterminated("block comment", input, offset - START_LEN);
1173+
warn_unterminated("block comment", input, offset - start_len);
10631174
consume_to_eof(input, TT::Comment(CommentKind::MultilineBlock))
10641175
}
10651176
}
10661177

10671178
fn block_comment_alt(args: LexArgs) -> OffsetAndTokenType {
1068-
let end_offset = memchr::memmem::find(args.input.as_bytes(), b"*)").map(|pos| pos + 2);
1069-
_block_comment::<2>(args, end_offset)
1179+
let end_offset = find_block_comment_end(lex_args_copy!(args), BlockCommentKind::ParenStar);
1180+
_block_comment(args, 2, end_offset)
10701181
}
10711182

10721183
fn block_comment(args: LexArgs) -> OffsetAndTokenType {
1073-
let end_offset = memchr::memchr(b'}', args.input.as_bytes()).map(|pos| pos + 1);
1074-
_block_comment::<1>(args, end_offset)
1184+
let end_offset = find_block_comment_end(lex_args_copy!(args), BlockCommentKind::Brace);
1185+
_block_comment(args, 1, end_offset)
10751186
}
10761187

10771188
fn line_comment(
@@ -1096,14 +1207,14 @@ fn line_comment(
10961207

10971208
fn compiler_directive_or_comment_alt(args: LexArgs) -> OffsetAndTokenType {
10981209
match args.next_byte() {
1099-
Some(b'$') => compiler_directive_alt(args.consume(1)),
1210+
Some(b'$') => compiler_directive(args.consume(1), BlockCommentKind::ParenStar),
11001211
_ => block_comment_alt(args),
11011212
}
11021213
}
11031214

11041215
fn compiler_directive_or_comment(args: LexArgs) -> OffsetAndTokenType {
11051216
match args.next_byte() {
1106-
Some(b'$') => compiler_directive(args.consume(1)),
1217+
Some(b'$') => compiler_directive(args.consume(1), BlockCommentKind::Brace),
11071218
_ => block_comment(args),
11081219
}
11091220
}
@@ -1449,6 +1560,102 @@ mod tests {
14491560
},
14501561
&[("(*$if\n// other comment\nFoo;", IF_DIRECTIVE)],
14511562
);
1563+
// nested unterminated block comment
1564+
run_test("{$if (*if } //", &[("{$if (*if } //", IF_DIRECTIVE)]);
1565+
// nested unterminated directive
1566+
run_test("{$if (*$if } //", &[("{$if (*$if } //", IF_DIRECTIVE)]);
1567+
// nested line comment
1568+
run_test("{$if // } //", &[("{$if // } //", IF_DIRECTIVE)]);
1569+
// nested string literal
1570+
run_test("{$if '} //", &[("{$if '} //", IF_DIRECTIVE)]);
1571+
}
1572+
1573+
#[test]
1574+
fn lex_complex_directive_expressions() {
1575+
/*
1576+
Since Delphi conditional directives contain expressions, they can also contain comments
1577+
and other conditional directives (to a limited and buggy extent). This means that we
1578+
can't just treat them as block comments when finding the bounds of the token for the
1579+
directive.
1580+
1581+
The only directives that can contain these expressions are `if` and `elseif`; all the
1582+
others can safely be lexed as a simple block comment.
1583+
*/
1584+
1585+
run_test(
1586+
indoc! {"
1587+
{$if {$i foo} = 0}
1588+
{$if (*$i foo*) = 0}
1589+
(*$if (*$i foo*) = 0*)
1590+
(*$if {$i foo} = 0*)
1591+
1592+
{$if {(*} (*{*) {(* {$if }}
1593+
1594+
{$if {{} }
1595+
{$if {$if {}} }
1596+
1597+
{$if {$i foo} = 0*) }
1598+
(*$if {$i foo} = 0} *)
1599+
1600+
{$if
1601+
{$if True}
1602+
FOO
1603+
(*$elseif {$if True}FOO{$else}BAR{$endif} *)
1604+
BAR
1605+
{$endif}
1606+
= 0
1607+
}
1608+
1609+
{$ifdef {$i inc}
1610+
{$if {$ifdef {}}
1611+
1612+
{$if a = '}'#10#13''''}
1613+
{$if a = #10}
1614+
1615+
{$if a = '''
1616+
}
1617+
'''
1618+
}
1619+
1620+
"},
1621+
&[
1622+
// Ensure directives and comments can be mixed in any order
1623+
("{$if {$i foo} = 0}", IF_DIRECTIVE),
1624+
("{$if (*$i foo*) = 0}", IF_DIRECTIVE),
1625+
("(*$if (*$i foo*) = 0*)", IF_DIRECTIVE),
1626+
("(*$if {$i foo} = 0*)", IF_DIRECTIVE),
1627+
("{$if {(*} (*{*) {(* {$if }}", IF_DIRECTIVE),
1628+
// Ensure that block comments cannot be started from within nested block comments
1629+
("{$if {{} }", IF_DIRECTIVE),
1630+
// Ensure that block comments can be started from within nested conditional directives
1631+
("{$if {$if {}} }", IF_DIRECTIVE),
1632+
// Ensure that the nested directives are closed with the right half of the pair
1633+
("{$if {$i foo} = 0*) }", IF_DIRECTIVE),
1634+
("(*$if {$i foo} = 0} *)", IF_DIRECTIVE),
1635+
// Ensure nesting works recursively
1636+
(
1637+
indoc! {"
1638+
{$if
1639+
{$if True}
1640+
FOO
1641+
(*$elseif {$if True}FOO{$else}BAR{$endif} *)
1642+
BAR
1643+
{$endif}
1644+
= 0
1645+
}"
1646+
},
1647+
IF_DIRECTIVE,
1648+
),
1649+
// Ensure that the nesting doesn't work with non-expression directives
1650+
("{$ifdef {$i inc}", IFDEF_DIRECTIVE),
1651+
// Ensure that the nesting doesn't work within a nested non-expression directive
1652+
("{$if {$ifdef {}}", IF_DIRECTIVE),
1653+
// Ensure that nested text literals work
1654+
("{$if a = '}'#10#13''''}", IF_DIRECTIVE),
1655+
("{$if a = #10}", IF_DIRECTIVE),
1656+
("{$if a = '''\n}\n'''\n}", IF_DIRECTIVE),
1657+
],
1658+
)
14521659
}
14531660

14541661
#[test]

0 commit comments

Comments
 (0)