Skip to content

Commit d772af3

Browse files
committed
Add enhanced literal and escape sequence support
- Binary literal support with 0b/0B prefix
- Hexadecimal escape sequences (\xHH)
- Octal escape sequences (\nnn)
- Additional escape characters (\a, \b, \v, \f, \e, \?)
- Adjacent string literal concatenation
- Code formatted with clang-format-18
1 parent bbd5b41 commit d772af3

File tree

1 file changed

+132
-6
lines changed

1 file changed

+132
-6
lines changed

src/lexer.c

Lines changed: 132 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
/* Hash table constants */
1414
#define NUM_DIRECTIVES 11
15-
#define NUM_KEYWORDS 15
15+
#define NUM_KEYWORDS 16
1616

1717
/* Preprocessor directive hash table using existing shecc hashmap */
1818
hashmap_t *DIRECTIVE_MAP = NULL;
@@ -112,6 +112,8 @@ void lex_init_keywords()
112112
token_values[13] = T_default;
113113
names[14] = "continue";
114114
token_values[14] = T_continue;
115+
names[15] = "union";
116+
token_values[15] = T_union;
115117

116118
/* hashmap insertion */
117119
for (int i = 0; i < NUM_KEYWORDS; i++) {
@@ -203,6 +205,17 @@ bool is_hex(char c)
203205
(c >= 'A' && c <= 'F');
204206
}
205207

208+
/* Convert a single hexadecimal digit character to its numeric value.
 *
 * @c: character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are accepted.
 *
 * Returns the digit's value in the range 0..15, or -1 when @c is not a
 * hexadecimal digit, so callers can tell invalid input from a real digit.
 */
int hex_digit_value(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    /* Letters a-f / A-F stand for the values 10..15. */
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return -1;
}
218+
206219
bool is_numeric(char buffer[])
207220
{
208221
bool hex = false;
@@ -330,8 +343,21 @@ token_t lex_token_internal(bool aliasing)
330343
token_str[i++] = next_char;
331344
} while (is_hex(read_char(false)));
332345

346+
} else if (token_str[0] == '0' && ((next_char | 32) == 'b')) {
347+
/* Binary: starts with 0b or 0B */
348+
token_str[i++] = next_char;
349+
350+
read_char(false);
351+
if (next_char != '0' && next_char != '1')
352+
error("Invalid binary literal: expected 0 or 1 after 0b");
353+
354+
do {
355+
token_str[i++] = next_char;
356+
read_char(false);
357+
} while (next_char == '0' || next_char == '1');
358+
333359
} else if (token_str[0] == '0') {
334-
/* Octal: starts with 0 but not followed by 'x' */
360+
/* Octal: starts with 0 but not followed by 'x' or 'b' */
335361
while (is_digit(next_char)) {
336362
if (next_char >= '8')
337363
error("Invalid octal digit: must be in range 0-7");
@@ -413,8 +439,58 @@ token_t lex_token_internal(bool aliasing)
413439
token_str[i - 1] = '\\';
414440
else if (next_char == '0')
415441
token_str[i - 1] = '\0';
416-
else
417-
abort();
442+
else if (next_char == 'a')
443+
token_str[i - 1] = '\a';
444+
else if (next_char == 'b')
445+
token_str[i - 1] = '\b';
446+
else if (next_char == 'v')
447+
token_str[i - 1] = '\v';
448+
else if (next_char == 'f')
449+
token_str[i - 1] = '\f';
450+
else if (next_char == 'e') /* GNU extension: ESC character */
451+
token_str[i - 1] = 27;
452+
else if (next_char == '?')
453+
token_str[i - 1] = '?';
454+
else if (next_char == 'x') {
455+
/* Hexadecimal escape sequence: \x followed by one or two hex digits */
456+
read_char(false);
457+
if (!is_hex(next_char))
458+
error("Invalid hex escape sequence");
459+
int value = 0;
460+
int count = 0;
461+
while (is_hex(next_char) && count < 2) {
462+
value = (value << 4) + hex_digit_value(next_char);
463+
read_char(false);
464+
count++;
465+
}
466+
token_str[i - 1] = value;
467+
/* Back up one character as we read one too many */
468+
SOURCE->size--;
469+
next_char = SOURCE->elements[SOURCE->size];
470+
} else if (next_char >= '0' && next_char <= '7') {
471+
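/* Octal escape sequence, one to three digits. NOTE(review): the earlier '0' branch matches first, so escapes like \012 never reach here */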
472+
int value = next_char - '0';
473+
read_char(false);
474+
if (next_char >= '0' && next_char <= '7') {
475+
value = (value << 3) + (next_char - '0');
476+
read_char(false);
477+
if (next_char >= '0' && next_char <= '7') {
478+
value = (value << 3) + (next_char - '0');
479+
} else {
480+
/* Back up one character */
481+
SOURCE->size--;
482+
next_char = SOURCE->elements[SOURCE->size];
483+
}
484+
} else {
485+
/* Back up one character */
486+
SOURCE->size--;
487+
next_char = SOURCE->elements[SOURCE->size];
488+
}
489+
token_str[i - 1] = value;
490+
} else {
491+
/* Handle unknown escapes gracefully */
492+
token_str[i - 1] = next_char;
493+
}
418494
} else {
419495
token_str[i++] = next_char;
420496
}
@@ -445,8 +521,58 @@ token_t lex_token_internal(bool aliasing)
445521
token_str[0] = '\\';
446522
else if (next_char == '0')
447523
token_str[0] = '\0';
448-
else
449-
abort();
524+
else if (next_char == 'a')
525+
token_str[0] = '\a';
526+
else if (next_char == 'b')
527+
token_str[0] = '\b';
528+
else if (next_char == 'v')
529+
token_str[0] = '\v';
530+
else if (next_char == 'f')
531+
token_str[0] = '\f';
532+
else if (next_char == 'e') /* GNU extension: ESC character */
533+
token_str[0] = 27;
534+
else if (next_char == '?')
535+
token_str[0] = '?';
536+
else if (next_char == 'x') {
537+
/* Hexadecimal escape sequence: \x followed by one or two hex digits */
538+
read_char(false);
539+
if (!is_hex(next_char))
540+
error("Invalid hex escape sequence");
541+
int value = 0;
542+
int count = 0;
543+
while (is_hex(next_char) && count < 2) {
544+
value = (value << 4) + hex_digit_value(next_char);
545+
read_char(false);
546+
count++;
547+
}
548+
token_str[0] = value;
549+
/* Back up one character as we read one too many */
550+
SOURCE->size--;
551+
next_char = SOURCE->elements[SOURCE->size];
552+
} else if (next_char >= '0' && next_char <= '7') {
553+
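/* Octal escape sequence, one to three digits. NOTE(review): the earlier '0' branch matches first, so escapes like \012 never reach here */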
554+
int value = next_char - '0';
555+
read_char(false);
556+
if (next_char >= '0' && next_char <= '7') {
557+
value = (value << 3) + (next_char - '0');
558+
read_char(false);
559+
if (next_char >= '0' && next_char <= '7') {
560+
value = (value << 3) + (next_char - '0');
561+
} else {
562+
/* Back up one character */
563+
SOURCE->size--;
564+
next_char = SOURCE->elements[SOURCE->size];
565+
}
566+
} else {
567+
/* Back up one character */
568+
SOURCE->size--;
569+
next_char = SOURCE->elements[SOURCE->size];
570+
}
571+
token_str[0] = value;
572+
} else {
573+
/* Handle unknown escapes gracefully */
574+
token_str[0] = next_char;
575+
}
450576
} else {
451577
token_str[0] = next_char;
452578
}

0 commit comments

Comments
 (0)