@@ -17,28 +17,76 @@ namespace Carbon {
17
17
18
18
// Diagnostic emitter used throughout this file; it is instantiated with
// const char * as its location type. NOTE(review): the Emit calls visible in
// this file pass positions inside the literal text -- confirm against
// DiagnosticEmitter.
using LexerDiagnosticEmitter = DiagnosticEmitter<const char *>;
19
19
20
- static constexpr char MultiLineIndicator[] = R"( """)" ;
21
-
22
- // Return the number of opening characters of a multi-line string literal,
23
- // after any '#'s, including the file type indicator and following newline.
24
- static auto GetMultiLineStringLiteralPrefixSize (llvm::StringRef source_text)
25
- -> int {
26
- if (!source_text.startswith (MultiLineIndicator)) {
27
- return 0 ;
20
// Spelling that opens a multi-line string literal (triple single quote).
// Introducer::Lex also hands it back as the terminator of such a literal.
+ static constexpr char MultiLineIndicator[] = R"( ''')" ;
21
// Triple double quote spelling. Introducer::Lex still recognizes it for error
// recovery, and ComputeValue reports it with a diagnostic.
+ static constexpr char DoubleQuotedMultiLineIndicator[] = R"( """)" ;
22
+
23
// Describes the opening delimiter of a string literal: which kind of literal
// it starts, the terminator that closes it, and how many characters the opener
// occupies. Field order matters: the designated initializers in Lex rely on it.
+ struct LexedStringLiteral ::Introducer {
24
+ // The kind of string being introduced: NotMultiLine, MultiLine or MultiLineWithDoubleQuotes.
25
+ MultiLineKind kind;
26
+ // The terminator for the string, without any '#' suffixes; LexedStringLiteral::Lex appends one '#' per leading '#'.
27
+ llvm::StringRef terminator;
28
+ // The length of the introducer, counted from just after any leading '#'s. For a multi-line string literal it
29
+ // includes the file type indicator and the newline ending the first line; for a single-line literal it is 1.
30
+ int prefix_size;
31
+
32
+ // Lex the introducer for a string literal, after any '#'s. Returns llvm::None when no introducer is found.
33
+ static auto Lex (llvm::StringRef source_text) -> llvm::Optional<Introducer>;
34
+ };
35
+
36
+ // Lex the introducer for a string literal, after any '#'s. Returns llvm::None if source_text has no introducer.
37
+ //
38
+ // We lex multi-line literals when spelled with either ''' or """ for error
39
+ // recovery purposes, and reject """ literals after lexing.
40
+ auto LexedStringLiteral::Introducer::Lex (llvm::StringRef source_text)
41
+ -> llvm::Optional<Introducer> {
42
+ MultiLineKind kind = NotMultiLine;
43
+ llvm::StringRef indicator;
// Classify the opener by which multi-line indicator, if any, begins the text.
// `indicator` stays empty when neither matches.
44
+ if (source_text.startswith (MultiLineIndicator)) {
45
+ kind = MultiLine;
46
+ indicator = llvm::StringRef (MultiLineIndicator);
47
+ } else if (source_text.startswith (DoubleQuotedMultiLineIndicator)) {
48
+ kind = MultiLineWithDoubleQuotes;
49
+ indicator = llvm::StringRef (DoubleQuotedMultiLineIndicator);
28
50
}
29
51
30
- // The rest of the line must be a valid file type indicator: a sequence of
31
- // characters containing neither '#' nor '"' followed by a newline.
32
- auto prefix_end =
33
- source_text.find_first_of (" #\n\" " , strlen (MultiLineIndicator));
34
- if (prefix_end == llvm::StringRef::npos || source_text[prefix_end] != ' \n ' ) {
35
- return 0 ;
52
+ if (kind != NotMultiLine) {
53
+ // The rest of the line must be a valid file type indicator: a sequence of
54
+ // characters containing neither '#' nor '"' followed by a newline.
// The search starts just past the indicator and stops at the first of those
// characters; the introducer is accepted only when that stop is the newline.
55
+ auto prefix_end = source_text.find_first_of (" #\n\" " , indicator.size ());
56
+ if (prefix_end != llvm::StringRef::npos &&
57
+ source_text[prefix_end] == ' \n ' ) {
58
+ // Include the newline in the prefix size, and reuse the indicator as the terminator.
59
+ return Introducer{.kind = kind,
60
+ .terminator = indicator,
61
+ .prefix_size = static_cast <int >(prefix_end + 1 )};
62
+ }
63
+ }
64
+
// Otherwise fall back to a single-line literal opened by one double quote.
// This path is also reached when a multi-line indicator matched but the rest
// of its first line was not a valid file type indicator; in that case only
// the double-quoted spelling can still match here.
65
+ if (!source_text.empty () && source_text[0 ] == ' "' ) {
66
+ return Introducer{
67
+ .kind = NotMultiLine, .terminator = " \" " , .prefix_size = 1 };
36
68
}
37
69
38
- // Include the newline on return.
39
- return prefix_end + 1 ;
// Not a string literal introducer.
70
+ return llvm::None;
40
71
}
41
72
73
+ namespace {
74
+ // A set of 'char' values, stored as one bool per possible unsigned char value so a membership test is one array read.
75
+ struct alignas (8 ) CharSet {
76
// Elements[i] is true when the char whose unsigned char value is i is in the
// set.
+ bool Elements[UCHAR_MAX + 1 ];
77
+
78
// Builds the set holding exactly the listed chars. The Elements () member
// initializer value-initializes every entry to false before the loop marks
// the listed ones.
+ constexpr CharSet (std::initializer_list<char > chars) : Elements () {
79
+ for (char c : chars) {
80
+ Elements[static_cast <unsigned char >(c)] = true ;
81
+ }
82
+ }
83
+
84
// Returns whether c is in the set. Converting to unsigned char first keeps
// the index within 0..UCHAR_MAX even where plain char is signed.
+ constexpr auto operator [](char c) const -> bool {
85
+ return Elements[static_cast <unsigned char >(c)];
86
+ }
87
+ };
88
+ } // namespace
89
+
42
90
auto LexedStringLiteral::Lex (llvm::StringRef source_text)
43
91
-> llvm::Optional<LexedStringLiteral> {
44
92
int64_t cursor = 0 ;
@@ -50,23 +98,18 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
50
98
}
51
99
const int hash_level = cursor;
52
100
53
- llvm::SmallString<16 > terminator (" \" " );
54
- llvm::SmallString<16 > escape (" \\ " );
55
-
56
- const int multi_line_prefix_size =
57
- GetMultiLineStringLiteralPrefixSize (source_text.substr (hash_level));
58
- const bool multi_line = multi_line_prefix_size > 0 ;
59
- if (multi_line) {
60
- cursor += multi_line_prefix_size;
61
- terminator = MultiLineIndicator;
62
- } else if (cursor < source_text_size && source_text[cursor] == ' "' ) {
63
- ++cursor;
64
- } else {
101
+ const llvm::Optional<Introducer> introducer =
102
+ Introducer::Lex (source_text.substr (hash_level));
103
+ if (!introducer) {
65
104
return llvm::None;
66
105
}
67
106
107
+ cursor += introducer->prefix_size ;
68
108
const int prefix_len = cursor;
69
109
110
+ llvm::SmallString<16 > terminator (introducer->terminator );
111
+ llvm::SmallString<16 > escape (" \\ " );
112
+
70
113
// The terminator and escape sequence marker require a number of '#'s
71
114
// matching the leading sequence of '#'s.
72
115
terminator.resize (terminator.size () + hash_level, ' #' );
@@ -75,51 +118,56 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
75
118
// TODO: Detect indent / dedent for multi-line string literals in order to
76
119
// stop parsing on dedent before a terminator is found.
77
120
for (; cursor < source_text_size; ++cursor) {
121
+ // Use a lookup table to allow us to quickly skip uninteresting characters.
122
+ static constexpr CharSet InterestingChars = {' \\ ' , ' \n ' , ' "' , ' \' ' };
123
+ if (!InterestingChars[source_text[cursor]]) {
124
+ continue ;
125
+ }
126
+
78
127
// This switch and loop structure relies on multi-character terminators and
79
128
// escape sequences starting with a predictable character and not containing
80
129
// embedded and unescaped terminators or newlines.
81
130
switch (source_text[cursor]) {
82
131
case ' \\ ' :
83
132
if (escape.size () == 1 ||
84
- source_text.substr (cursor).startswith (escape)) {
133
+ source_text.substr (cursor + 1 ).startswith (escape. substr ( 1 ) )) {
85
134
cursor += escape.size ();
86
135
// If there's either not a character following the escape, or it's a
87
136
// single-line string and the escaped character is a newline, we
88
137
// should stop here.
89
- if (cursor >= source_text_size ||
90
- (!multi_line && source_text[cursor] == ' \n ' )) {
138
+ if (cursor >= source_text_size || (introducer-> kind == NotMultiLine &&
139
+ source_text[cursor] == ' \n ' )) {
91
140
llvm::StringRef text = source_text.take_front (cursor);
92
141
return LexedStringLiteral (text, text.drop_front (prefix_len),
93
- hash_level, multi_line ,
142
+ hash_level, introducer-> kind ,
94
143
/* is_terminated=*/ false );
95
144
}
96
145
}
97
146
break ;
98
147
case ' \n ' :
99
- if (!multi_line ) {
148
+ if (introducer-> kind == NotMultiLine ) {
100
149
llvm::StringRef text = source_text.take_front (cursor);
101
150
return LexedStringLiteral (text, text.drop_front (prefix_len),
102
- hash_level, multi_line ,
151
+ hash_level, introducer-> kind ,
103
152
/* is_terminated=*/ false );
104
153
}
105
154
break ;
106
- case ' \ "' : {
107
- if (terminator. size () == 1 ||
108
- source_text.substr (cursor).startswith (terminator)) {
155
+ case ' "' :
156
+ case ' \' ' :
157
+ if ( source_text.substr (cursor).startswith (terminator)) {
109
158
llvm::StringRef text =
110
159
source_text.substr (0 , cursor + terminator.size ());
111
160
llvm::StringRef content =
112
161
source_text.substr (prefix_len, cursor - prefix_len);
113
- return LexedStringLiteral (text, content, hash_level, multi_line ,
162
+ return LexedStringLiteral (text, content, hash_level, introducer-> kind ,
114
163
/* is_terminated=*/ true );
115
164
}
116
165
break ;
117
- }
118
166
}
119
167
}
120
168
// No terminator was found.
121
169
return LexedStringLiteral (source_text, source_text.drop_front (prefix_len),
122
- hash_level, multi_line ,
170
+ hash_level, introducer-> kind ,
123
171
/* is_terminated=*/ false );
124
172
}
125
173
@@ -153,7 +201,7 @@ static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
153
201
if (indent.end () != content.end ()) {
154
202
CARBON_DIAGNOSTIC (
155
203
ContentBeforeStringTerminator, Error,
156
- " Only whitespace is permitted before the closing `\"\"\" ` of a "
204
+ " Only whitespace is permitted before the closing `''' ` of a "
157
205
" multi-line string." );
158
206
emitter.Emit (indent.end (), ContentBeforeStringTerminator);
159
207
}
@@ -309,7 +357,7 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
309
357
if (!contents.startswith (" \n " )) {
310
358
CARBON_DIAGNOSTIC (
311
359
MismatchedIndentInString, Error,
312
- " Indentation does not match that of the closing \"\"\" in "
360
+ " Indentation does not match that of the closing `'''` in "
313
361
" multi-line string literal." );
314
362
emitter.Emit (line_start, MismatchedIndentInString);
315
363
}
@@ -386,6 +434,12 @@ auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
386
434
if (!is_terminated_) {
387
435
return " " ;
388
436
}
437
+ if (multi_line_ == MultiLineWithDoubleQuotes) {
438
+ CARBON_DIAGNOSTIC (
439
+ MultiLineStringWithDoubleQuotes, Error,
440
+ " Use `'''` delimiters for a multi-line string literal, not `\"\"\" `." );
441
+ emitter.Emit (text_.begin (), MultiLineStringWithDoubleQuotes);
442
+ }
389
443
llvm::StringRef indent =
390
444
multi_line_ ? CheckIndent (emitter, text_, content_) : llvm::StringRef ();
391
445
return ExpandEscapeSequencesAndRemoveIndent (emitter, content_, hash_level_,
0 commit comments