Skip to content

Commit 8fa8a8f

Browse files
committed
Correctly parse string literals with a prefix (e.g. raw string: R"...")
(this fixes some parser errors)
1 parent 667368f commit 8fa8a8f

File tree

2 files changed

+104
-11
lines changed

2 files changed

+104
-11
lines changed

generator/parser/lexer.cpp

Lines changed: 98 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,9 @@ void Lexer::scan_preprocessor()
243243
reportError("expected newline");
244244
}
245245

246-
void Lexer::scan_char_constant()
246+
void Lexer::scan_char_constant_with_prefix(const unsigned char* prefix)
247247
{
248-
const unsigned char *begin = cursor;
248+
const unsigned char *begin = prefix ? prefix : cursor;
249249

250250
++cursor;
251251
while (*cursor && *cursor != '\'')
@@ -269,9 +269,9 @@ void Lexer::scan_char_constant()
269269
token_stream[(int) index++].kind = Token_char_literal;
270270
}
271271

272-
void Lexer::scan_string_constant()
272+
void Lexer::scan_string_constant_with_prefix(const unsigned char* prefix)
273273
{
274-
const unsigned char *begin = cursor;
274+
const unsigned char *begin = prefix ? prefix : cursor;
275275

276276
++cursor;
277277
while (*cursor && *cursor != '"')
@@ -295,6 +295,71 @@ void Lexer::scan_string_constant()
295295
token_stream[(int) index++].kind = Token_string_literal;
296296
}
297297

298+
void Lexer::scan_raw_string_constant_with_prefix(const unsigned char* prefix)
299+
{
300+
// always starts with "
301+
const unsigned char* begin = prefix ? prefix : cursor;
302+
int delimiterLength = 0;
303+
int endSequenceLength = 0;
304+
bool stillValidDelimiter = true;
305+
++cursor;
306+
while (*cursor)
307+
{
308+
if (!delimiterLength)
309+
{
310+
if (*cursor == '"')
311+
{
312+
break;
313+
}
314+
else if (*cursor == '\n')
315+
{
316+
// this would probably not be a valid delimiter
317+
stillValidDelimiter = false;
318+
}
319+
else if (stillValidDelimiter && *cursor == '(' && (cursor - begin) < 16)
320+
{
321+
// delimiter sequence identified (see https://en.cppreference.com/w/cpp/language/string_literal)
322+
delimiterLength = cursor - begin;
323+
}
324+
}
325+
else if (endSequenceLength)
326+
{
327+
// possible end delimiter sequence
328+
if (endSequenceLength == delimiterLength && *cursor == '"')
329+
{
330+
break;
331+
}
332+
else if (endSequenceLength < delimiterLength && *cursor == begin[endSequenceLength])
333+
{
334+
endSequenceLength++;
335+
}
336+
else
337+
{
338+
// this is not the end of the string, go back to
339+
// after the starting ')' and try again
340+
cursor -= endSequenceLength;
341+
endSequenceLength = 0;
342+
}
343+
}
344+
else if (*cursor == ')')
345+
{
346+
// this might be the start of the end delimiter sequence
347+
endSequenceLength = 1;
348+
}
349+
++cursor;
350+
}
351+
352+
if (*cursor != '"')
353+
reportError("expected \"");
354+
355+
++cursor;
356+
357+
token_stream[(int)index].extra.symbol =
358+
control->findOrInsertName((const char*)begin, cursor - begin);
359+
360+
token_stream[(int)index++].kind = Token_string_literal;
361+
}
362+
298363
void Lexer::scan_newline()
299364
{
300365
if (location_table.current_line == location_table.size())
@@ -338,20 +403,44 @@ void Lexer::scan_identifier_or_literal()
338403
void Lexer::scan_identifier_or_keyword()
339404
{
340405
const unsigned char *skip = cursor;
406+
const unsigned char* start = cursor;
341407
while (isalnum(*skip) || *skip== '_')
342408
++skip;
343409

344410
int n = skip - cursor;
345-
Token *current_token = &token_stream[(int) index];
346-
(this->*s_scan_keyword_table[n < 17 ? n : 0])();
411+
if (*skip == '"' && n <= 3)
412+
{
413+
cursor = skip;
414+
// this should be a unicode and/or raw string -
415+
// we pass through anything that does not follow the standard, though
416+
if (skip[-1] == 'R')
417+
{
418+
scan_raw_string_constant_with_prefix(start);
419+
}
420+
else
421+
{
422+
scan_string_constant_with_prefix(start);
423+
}
424+
}
425+
else if (*skip == '\'' && n <= 2)
426+
{
427+
// probably some special encoding
428+
cursor = skip;
429+
scan_char_constant_with_prefix(start);
430+
}
431+
else
432+
{
433+
Token* current_token = &token_stream[(int)index];
434+
(this->*s_scan_keyword_table[n < 17 ? n : 0])();
347435

348-
if (current_token->kind == Token_identifier)
436+
if (current_token->kind == Token_identifier)
349437
{
350438
current_token->extra.symbol =
351-
control->findOrInsertName((const char*) cursor, n);
439+
control->findOrInsertName((const char*)cursor, n);
352440
}
353441

354-
cursor = skip;
442+
cursor = skip;
443+
}
355444
}
356445

357446
void Lexer::scan_int_constant()

generator/parser/lexer.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,8 +232,12 @@ class Lexer
232232
void scan_identifier_or_keyword();
233233
void scan_identifier_or_literal();
234234
void scan_int_constant();
235-
void scan_char_constant();
236-
void scan_string_constant();
235+
void scan_char_constant() { scan_char_constant_with_prefix(nullptr); }
236+
void scan_string_constant() { scan_string_constant_with_prefix(nullptr); }
237+
// the _with_prefix variants take the start of an optional prefix (e.g., the R in R"")
238+
void scan_char_constant_with_prefix(const unsigned char* prefix);
239+
void scan_string_constant_with_prefix(const unsigned char* prefix);
240+
void scan_raw_string_constant_with_prefix(const unsigned char* prefix);
237241
void scan_invalid_input();
238242
void scan_preprocessor();
239243

0 commit comments

Comments
 (0)