Skip to content

Commit dcc80f5

Browse files
authored
Lex '''-delimited multiline string literals. (#2133)
"""-delimited literals are still lexed for error recovery but produce an error.
1 parent e489b14 commit dcc80f5

File tree

7 files changed

+243
-100
lines changed

7 files changed

+243
-100
lines changed

toolchain/diagnostics/diagnostic_registry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ CARBON_DIAGNOSTIC_KIND(InvalidHorizontalWhitespaceInString)
2727
CARBON_DIAGNOSTIC_KIND(IrregularDigitSeparators)
2828
CARBON_DIAGNOSTIC_KIND(MismatchedClosing)
2929
CARBON_DIAGNOSTIC_KIND(MismatchedIndentInString)
30+
CARBON_DIAGNOSTIC_KIND(MultiLineStringWithDoubleQuotes)
3031
CARBON_DIAGNOSTIC_KIND(NoWhitespaceAfterCommentIntroducer)
3132
CARBON_DIAGNOSTIC_KIND(TooManyDigits)
3233
CARBON_DIAGNOSTIC_KIND(TrailingComment)

toolchain/lexer/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ cc_binary(
132132
srcs = ["string_literal_benchmark.cpp"],
133133
deps = [
134134
":string_literal",
135+
"//toolchain/diagnostics:null_diagnostics",
135136
"@com_github_google_benchmark//:benchmark_main",
136137
],
137138
)

toolchain/lexer/string_literal.cpp

Lines changed: 96 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -17,28 +17,76 @@ namespace Carbon {
1717

1818
using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
1919

20-
static constexpr char MultiLineIndicator[] = R"(""")";
21-
22-
// Return the number of opening characters of a multi-line string literal,
23-
// after any '#'s, including the file type indicator and following newline.
24-
static auto GetMultiLineStringLiteralPrefixSize(llvm::StringRef source_text)
25-
-> int {
26-
if (!source_text.startswith(MultiLineIndicator)) {
27-
return 0;
20+
static constexpr char MultiLineIndicator[] = R"(''')";
21+
static constexpr char DoubleQuotedMultiLineIndicator[] = R"(""")";
22+
23+
struct LexedStringLiteral::Introducer {
24+
// The kind of string being introduced.
25+
MultiLineKind kind;
26+
// The terminator for the string, without any '#' suffixes.
27+
llvm::StringRef terminator;
28+
// The length of the introducer, including the file type indicator and
29+
// newline for a multi-line string literal.
30+
int prefix_size;
31+
32+
// Lex the introducer for a string literal, after any '#'s.
33+
static auto Lex(llvm::StringRef source_text) -> llvm::Optional<Introducer>;
34+
};
35+
36+
// Lex the introducer for a string literal, after any '#'s.
37+
//
38+
// We lex multi-line literals when spelled with either ''' or """ for error
39+
// recovery purposes, and reject """ literals after lexing.
40+
auto LexedStringLiteral::Introducer::Lex(llvm::StringRef source_text)
41+
-> llvm::Optional<Introducer> {
42+
MultiLineKind kind = NotMultiLine;
43+
llvm::StringRef indicator;
44+
if (source_text.startswith(MultiLineIndicator)) {
45+
kind = MultiLine;
46+
indicator = llvm::StringRef(MultiLineIndicator);
47+
} else if (source_text.startswith(DoubleQuotedMultiLineIndicator)) {
48+
kind = MultiLineWithDoubleQuotes;
49+
indicator = llvm::StringRef(DoubleQuotedMultiLineIndicator);
2850
}
2951

30-
// The rest of the line must be a valid file type indicator: a sequence of
31-
// characters containing neither '#' nor '"' followed by a newline.
32-
auto prefix_end =
33-
source_text.find_first_of("#\n\"", strlen(MultiLineIndicator));
34-
if (prefix_end == llvm::StringRef::npos || source_text[prefix_end] != '\n') {
35-
return 0;
52+
if (kind != NotMultiLine) {
53+
// The rest of the line must be a valid file type indicator: a sequence of
54+
// characters containing neither '#' nor '"' followed by a newline.
55+
auto prefix_end = source_text.find_first_of("#\n\"", indicator.size());
56+
if (prefix_end != llvm::StringRef::npos &&
57+
source_text[prefix_end] == '\n') {
58+
// Include the newline in the prefix size.
59+
return Introducer{.kind = kind,
60+
.terminator = indicator,
61+
.prefix_size = static_cast<int>(prefix_end + 1)};
62+
}
63+
}
64+
65+
if (!source_text.empty() && source_text[0] == '"') {
66+
return Introducer{
67+
.kind = NotMultiLine, .terminator = "\"", .prefix_size = 1};
3668
}
3769

38-
// Include the newline on return.
39-
return prefix_end + 1;
70+
return llvm::None;
4071
}
4172

73+
namespace {
74+
// A set of 'char' values.
75+
struct alignas(8) CharSet {
76+
bool Elements[UCHAR_MAX + 1];
77+
78+
constexpr CharSet(std::initializer_list<char> chars) : Elements() {
79+
for (char c : chars) {
80+
Elements[static_cast<unsigned char>(c)] = true;
81+
}
82+
}
83+
84+
constexpr auto operator[](char c) const -> bool {
85+
return Elements[static_cast<unsigned char>(c)];
86+
}
87+
};
88+
} // namespace
89+
4290
auto LexedStringLiteral::Lex(llvm::StringRef source_text)
4391
-> llvm::Optional<LexedStringLiteral> {
4492
int64_t cursor = 0;
@@ -50,23 +98,18 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
5098
}
5199
const int hash_level = cursor;
52100

53-
llvm::SmallString<16> terminator("\"");
54-
llvm::SmallString<16> escape("\\");
55-
56-
const int multi_line_prefix_size =
57-
GetMultiLineStringLiteralPrefixSize(source_text.substr(hash_level));
58-
const bool multi_line = multi_line_prefix_size > 0;
59-
if (multi_line) {
60-
cursor += multi_line_prefix_size;
61-
terminator = MultiLineIndicator;
62-
} else if (cursor < source_text_size && source_text[cursor] == '"') {
63-
++cursor;
64-
} else {
101+
const llvm::Optional<Introducer> introducer =
102+
Introducer::Lex(source_text.substr(hash_level));
103+
if (!introducer) {
65104
return llvm::None;
66105
}
67106

107+
cursor += introducer->prefix_size;
68108
const int prefix_len = cursor;
69109

110+
llvm::SmallString<16> terminator(introducer->terminator);
111+
llvm::SmallString<16> escape("\\");
112+
70113
// The terminator and escape sequence marker require a number of '#'s
71114
// matching the leading sequence of '#'s.
72115
terminator.resize(terminator.size() + hash_level, '#');
@@ -75,51 +118,56 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
75118
// TODO: Detect indent / dedent for multi-line string literals in order to
76119
// stop parsing on dedent before a terminator is found.
77120
for (; cursor < source_text_size; ++cursor) {
121+
// Use a lookup table to allow us to quickly skip uninteresting characters.
122+
static constexpr CharSet InterestingChars = {'\\', '\n', '"', '\''};
123+
if (!InterestingChars[source_text[cursor]]) {
124+
continue;
125+
}
126+
78127
// This switch and loop structure relies on multi-character terminators and
79128
// escape sequences starting with a predictable character and not containing
80129
// embedded and unescaped terminators or newlines.
81130
switch (source_text[cursor]) {
82131
case '\\':
83132
if (escape.size() == 1 ||
84-
source_text.substr(cursor).startswith(escape)) {
133+
source_text.substr(cursor + 1).startswith(escape.substr(1))) {
85134
cursor += escape.size();
86135
// If there's either not a character following the escape, or it's a
87136
// single-line string and the escaped character is a newline, we
88137
// should stop here.
89-
if (cursor >= source_text_size ||
90-
(!multi_line && source_text[cursor] == '\n')) {
138+
if (cursor >= source_text_size || (introducer->kind == NotMultiLine &&
139+
source_text[cursor] == '\n')) {
91140
llvm::StringRef text = source_text.take_front(cursor);
92141
return LexedStringLiteral(text, text.drop_front(prefix_len),
93-
hash_level, multi_line,
142+
hash_level, introducer->kind,
94143
/*is_terminated=*/false);
95144
}
96145
}
97146
break;
98147
case '\n':
99-
if (!multi_line) {
148+
if (introducer->kind == NotMultiLine) {
100149
llvm::StringRef text = source_text.take_front(cursor);
101150
return LexedStringLiteral(text, text.drop_front(prefix_len),
102-
hash_level, multi_line,
151+
hash_level, introducer->kind,
103152
/*is_terminated=*/false);
104153
}
105154
break;
106-
case '\"': {
107-
if (terminator.size() == 1 ||
108-
source_text.substr(cursor).startswith(terminator)) {
155+
case '"':
156+
case '\'':
157+
if (source_text.substr(cursor).startswith(terminator)) {
109158
llvm::StringRef text =
110159
source_text.substr(0, cursor + terminator.size());
111160
llvm::StringRef content =
112161
source_text.substr(prefix_len, cursor - prefix_len);
113-
return LexedStringLiteral(text, content, hash_level, multi_line,
162+
return LexedStringLiteral(text, content, hash_level, introducer->kind,
114163
/*is_terminated=*/true);
115164
}
116165
break;
117-
}
118166
}
119167
}
120168
// No terminator was found.
121169
return LexedStringLiteral(source_text, source_text.drop_front(prefix_len),
122-
hash_level, multi_line,
170+
hash_level, introducer->kind,
123171
/*is_terminated=*/false);
124172
}
125173

@@ -153,7 +201,7 @@ static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
153201
if (indent.end() != content.end()) {
154202
CARBON_DIAGNOSTIC(
155203
ContentBeforeStringTerminator, Error,
156-
"Only whitespace is permitted before the closing `\"\"\"` of a "
204+
"Only whitespace is permitted before the closing `'''` of a "
157205
"multi-line string.");
158206
emitter.Emit(indent.end(), ContentBeforeStringTerminator);
159207
}
@@ -309,7 +357,7 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
309357
if (!contents.startswith("\n")) {
310358
CARBON_DIAGNOSTIC(
311359
MismatchedIndentInString, Error,
312-
"Indentation does not match that of the closing \"\"\" in "
360+
"Indentation does not match that of the closing `'''` in "
313361
"multi-line string literal.");
314362
emitter.Emit(line_start, MismatchedIndentInString);
315363
}
@@ -386,6 +434,12 @@ auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
386434
if (!is_terminated_) {
387435
return "";
388436
}
437+
if (multi_line_ == MultiLineWithDoubleQuotes) {
438+
CARBON_DIAGNOSTIC(
439+
MultiLineStringWithDoubleQuotes, Error,
440+
"Use `'''` delimiters for a multi-line string literal, not `\"\"\"`.");
441+
emitter.Emit(text_.begin(), MultiLineStringWithDoubleQuotes);
442+
}
389443
llvm::StringRef indent =
390444
multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
391445
return ExpandEscapeSequencesAndRemoveIndent(emitter, content_, hash_level_,

toolchain/lexer/string_literal.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,13 @@ class LexedStringLiteral {
3737
[[nodiscard]] auto is_terminated() const -> bool { return is_terminated_; }
3838

3939
private:
40+
enum MultiLineKind { NotMultiLine, MultiLine, MultiLineWithDoubleQuotes };
41+
42+
struct Introducer;
43+
4044
LexedStringLiteral(llvm::StringRef text, llvm::StringRef content,
41-
int hash_level, bool multi_line, bool is_terminated)
45+
int hash_level, MultiLineKind multi_line,
46+
bool is_terminated)
4247
: text_(text),
4348
content_(content),
4449
hash_level_(hash_level),
@@ -55,7 +60,7 @@ class LexedStringLiteral {
5560
// The number of `#`s preceding the opening `"`, `'''`, or `"""`.
5661
int hash_level_;
5762
// Whether this was a multi-line string literal and, if so, which delimiter it used.
58-
bool multi_line_;
63+
MultiLineKind multi_line_;
5964
// Whether the literal is valid, or should only be used for errors.
6065
bool is_terminated_;
6166
};

toolchain/lexer/string_literal_benchmark.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <benchmark/benchmark.h>
66

7+
#include "toolchain/diagnostics/null_diagnostics.h"
78
#include "toolchain/lexer/string_literal.h"
89

910
namespace Carbon::Testing {
@@ -24,6 +25,10 @@ static void BM_ValidString_Simple(benchmark::State& state) {
2425
}
2526

2627
static void BM_ValidString_Multiline(benchmark::State& state) {
28+
BM_ValidString(state, "'''\n", "\n'''");
29+
}
30+
31+
static void BM_ValidString_MultilineDoubleQuote(benchmark::State& state) {
2732
BM_ValidString(state, "\"\"\"\n", "\n\"\"\"");
2833
}
2934

@@ -33,6 +38,7 @@ static void BM_ValidString_Raw(benchmark::State& state) {
3338

3439
BENCHMARK(BM_ValidString_Simple);
3540
BENCHMARK(BM_ValidString_Multiline);
41+
BENCHMARK(BM_ValidString_MultilineDoubleQuote);
3642
BENCHMARK(BM_ValidString_Raw);
3743

3844
static void BM_IncompleteWithRepeatedEscapes(benchmark::State& state,
@@ -59,6 +65,11 @@ static void BM_IncompleteWithEscapes_Simple(benchmark::State& state) {
5965
}
6066

6167
static void BM_IncompleteWithEscapes_Multiline(benchmark::State& state) {
68+
BM_IncompleteWithRepeatedEscapes(state, "'''\n", "\\");
69+
}
70+
71+
static void BM_IncompleteWithEscapes_MultilineDoubleQuote(
72+
benchmark::State& state) {
6273
BM_IncompleteWithRepeatedEscapes(state, "\"\"\"\n", "\\");
6374
}
6475

@@ -68,7 +79,41 @@ static void BM_IncompleteWithEscapes_Raw(benchmark::State& state) {
6879

6980
BENCHMARK(BM_IncompleteWithEscapes_Simple);
7081
BENCHMARK(BM_IncompleteWithEscapes_Multiline);
82+
BENCHMARK(BM_IncompleteWithEscapes_MultilineDoubleQuote);
7183
BENCHMARK(BM_IncompleteWithEscapes_Raw);
7284

85+
static void BM_SimpleStringValue(benchmark::State& state,
86+
std::string_view introducer,
87+
std::string_view terminator) {
88+
std::string x(introducer);
89+
x.append(100000, 'a');
90+
x.append(terminator);
91+
for (auto _ : state) {
92+
LexedStringLiteral::Lex(x)->ComputeValue(
93+
NullDiagnosticEmitter<const char*>());
94+
}
95+
}
96+
97+
static void BM_SimpleStringValue_Simple(benchmark::State& state) {
98+
BM_SimpleStringValue(state, "\"", "\"");
99+
}
100+
101+
static void BM_SimpleStringValue_Multiline(benchmark::State& state) {
102+
BM_SimpleStringValue(state, "'''\n", "\n'''");
103+
}
104+
105+
static void BM_SimpleStringValue_MultilineDoubleQuote(benchmark::State& state) {
106+
BM_SimpleStringValue(state, "\"\"\"\n", "\n\"\"\"");
107+
}
108+
109+
static void BM_SimpleStringValue_Raw(benchmark::State& state) {
110+
BM_SimpleStringValue(state, "#\"", "\"#");
111+
}
112+
113+
BENCHMARK(BM_SimpleStringValue_Simple);
114+
BENCHMARK(BM_SimpleStringValue_Multiline);
115+
BENCHMARK(BM_SimpleStringValue_MultilineDoubleQuote);
116+
BENCHMARK(BM_SimpleStringValue_Raw);
117+
73118
} // namespace
74119
} // namespace Carbon::Testing

0 commit comments

Comments
 (0)