Skip to content

Commit 39bc653

Browse files
committed
Replace strcmp chains with hashmap lookups
This commit modernizes lexer token recognition by reusing the existing hashmap infrastructure and arena allocator for performance and consistency, reducing directive and keyword lookup from an O(n) chain of string comparisons to an O(1) average-case hashmap lookup.
1 parent 72fc6b5 commit 39bc653

File tree

2 files changed

+169
-52
lines changed

2 files changed

+169
-52
lines changed

src/globals.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,8 +1113,14 @@ void global_init(void)
11131113
elf_section = strbuf_create(MAX_SECTION);
11141114
}
11151115

1116+
/* Forward declaration for lexer cleanup */
1117+
void lexer_cleanup(void);
1118+
11161119
void global_release(void)
11171120
{
1121+
/* Cleanup lexer hashmaps */
1122+
lexer_cleanup();
1123+
11181124
hashmap_free(MACROS_MAP);
11191125
free(TYPES);
11201126
arena_free(BLOCK_ARENA);

src/lexer.c

Lines changed: 163 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,163 @@
1010
#include "defs.h"
1111
#include "globals.c"
1212

13+
/* Hash table constants */
14+
#define NUM_DIRECTIVES 11
15+
#define NUM_KEYWORDS 15
16+
17+
/* Preprocessor directive hash table using existing shecc hashmap */
18+
hashmap_t *DIRECTIVE_MAP = NULL;
19+
/* C keywords hash table */
20+
hashmap_t *KEYWORD_MAP = NULL;
21+
/* Token arrays for cleanup */
22+
token_t *directive_tokens_storage = NULL;
23+
token_t *keyword_tokens_storage = NULL;
24+
25+
void lex_init_directives()
26+
{
27+
if (DIRECTIVE_MAP)
28+
return;
29+
30+
DIRECTIVE_MAP = hashmap_create(16); /* Small capacity for directives */
31+
32+
/* Initialization using indexed for-loop */
33+
directive_tokens_storage =
34+
arena_alloc(GENERAL_ARENA, NUM_DIRECTIVES * sizeof(token_t));
35+
36+
char *names[NUM_DIRECTIVES];
37+
token_t token_values[NUM_DIRECTIVES];
38+
39+
/* Populate arrays using index-based assignments for compatibility */
40+
names[0] = "#define";
41+
token_values[0] = T_cppd_define;
42+
names[1] = "#elif";
43+
token_values[1] = T_cppd_elif;
44+
names[2] = "#else";
45+
token_values[2] = T_cppd_else;
46+
names[3] = "#endif";
47+
token_values[3] = T_cppd_endif;
48+
names[4] = "#error";
49+
token_values[4] = T_cppd_error;
50+
names[5] = "#if";
51+
token_values[5] = T_cppd_if;
52+
names[6] = "#ifdef";
53+
token_values[6] = T_cppd_ifdef;
54+
names[7] = "#ifndef";
55+
token_values[7] = T_cppd_ifndef;
56+
names[8] = "#include";
57+
token_values[8] = T_cppd_include;
58+
names[9] = "#pragma";
59+
token_values[9] = T_cppd_pragma;
60+
names[10] = "#undef";
61+
token_values[10] = T_cppd_undef;
62+
63+
/* hashmap insertion */
64+
for (int i = 0; i < NUM_DIRECTIVES; i++) {
65+
directive_tokens_storage[i] = token_values[i];
66+
hashmap_put(DIRECTIVE_MAP, names[i], &directive_tokens_storage[i]);
67+
}
68+
}
69+
70+
void lex_init_keywords()
71+
{
72+
if (KEYWORD_MAP)
73+
return;
74+
75+
KEYWORD_MAP = hashmap_create(32); /* Capacity for keywords */
76+
77+
/* Initialization using indexed for-loop */
78+
keyword_tokens_storage =
79+
arena_alloc(GENERAL_ARENA, NUM_KEYWORDS * sizeof(token_t));
80+
81+
char *names[NUM_KEYWORDS];
82+
token_t token_values[NUM_KEYWORDS];
83+
84+
/* Populate arrays using index-based assignments for compatibility */
85+
names[0] = "if";
86+
token_values[0] = T_if;
87+
names[1] = "while";
88+
token_values[1] = T_while;
89+
names[2] = "for";
90+
token_values[2] = T_for;
91+
names[3] = "do";
92+
token_values[3] = T_do;
93+
names[4] = "else";
94+
token_values[4] = T_else;
95+
names[5] = "return";
96+
token_values[5] = T_return;
97+
names[6] = "typedef";
98+
token_values[6] = T_typedef;
99+
names[7] = "enum";
100+
token_values[7] = T_enum;
101+
names[8] = "struct";
102+
token_values[8] = T_struct;
103+
names[9] = "sizeof";
104+
token_values[9] = T_sizeof;
105+
names[10] = "switch";
106+
token_values[10] = T_switch;
107+
names[11] = "case";
108+
token_values[11] = T_case;
109+
names[12] = "break";
110+
token_values[12] = T_break;
111+
names[13] = "default";
112+
token_values[13] = T_default;
113+
names[14] = "continue";
114+
token_values[14] = T_continue;
115+
116+
/* hashmap insertion */
117+
for (int i = 0; i < NUM_KEYWORDS; i++) {
118+
keyword_tokens_storage[i] = token_values[i];
119+
hashmap_put(KEYWORD_MAP, names[i], &keyword_tokens_storage[i]);
120+
}
121+
}
122+
123+
/* Hash table lookup for preprocessor directives */
124+
token_t lookup_directive(char *token)
125+
{
126+
if (!DIRECTIVE_MAP)
127+
lex_init_directives();
128+
129+
token_t *result = hashmap_get(DIRECTIVE_MAP, token);
130+
if (result)
131+
return *result;
132+
133+
return T_identifier;
134+
}
135+
136+
/* Hash table lookup for C keywords */
137+
token_t lookup_keyword(char *token)
138+
{
139+
if (!KEYWORD_MAP)
140+
lex_init_keywords();
141+
142+
token_t *result = hashmap_get(KEYWORD_MAP, token);
143+
if (result)
144+
return *result;
145+
146+
return T_identifier;
147+
}
148+
149+
/* Cleanup function for lexer hashmaps */
150+
void lexer_cleanup()
151+
{
152+
if (DIRECTIVE_MAP) {
153+
hashmap_free(DIRECTIVE_MAP);
154+
DIRECTIVE_MAP = NULL;
155+
}
156+
157+
if (KEYWORD_MAP) {
158+
hashmap_free(KEYWORD_MAP);
159+
KEYWORD_MAP = NULL;
160+
}
161+
162+
/* Token storage arrays are allocated from GENERAL_ARENA and will be
163+
* automatically freed when the arena is freed in global_release().
164+
* No need to explicitly free them here.
165+
*/
166+
directive_tokens_storage = NULL;
167+
keyword_tokens_storage = NULL;
168+
}
169+
13170
bool is_whitespace(char c)
14171
{
15172
return c == ' ' || c == '\t';
@@ -112,28 +269,9 @@ token_t lex_token_internal(bool aliasing)
112269
token_str[i] = 0;
113270
skip_whitespace();
114271

115-
if (!strcmp(token_str, "#include"))
116-
return T_cppd_include;
117-
if (!strcmp(token_str, "#define"))
118-
return T_cppd_define;
119-
if (!strcmp(token_str, "#undef"))
120-
return T_cppd_undef;
121-
if (!strcmp(token_str, "#error"))
122-
return T_cppd_error;
123-
if (!strcmp(token_str, "#if"))
124-
return T_cppd_if;
125-
if (!strcmp(token_str, "#elif"))
126-
return T_cppd_elif;
127-
if (!strcmp(token_str, "#ifdef"))
128-
return T_cppd_ifdef;
129-
if (!strcmp(token_str, "#ifndef"))
130-
return T_cppd_ifndef;
131-
if (!strcmp(token_str, "#else"))
132-
return T_cppd_else;
133-
if (!strcmp(token_str, "#endif"))
134-
return T_cppd_endif;
135-
if (!strcmp(token_str, "#pragma"))
136-
return T_cppd_pragma;
272+
token_t directive = lookup_directive(token_str);
273+
if (directive != T_identifier)
274+
return directive;
137275
error("Unknown directive");
138276
}
139277

@@ -485,36 +623,9 @@ token_t lex_token_internal(bool aliasing)
485623
token_str[i] = 0;
486624
skip_whitespace();
487625

488-
if (!strcmp(token_str, "if"))
489-
return T_if;
490-
if (!strcmp(token_str, "while"))
491-
return T_while;
492-
if (!strcmp(token_str, "for"))
493-
return T_for;
494-
if (!strcmp(token_str, "do"))
495-
return T_do;
496-
if (!strcmp(token_str, "else"))
497-
return T_else;
498-
if (!strcmp(token_str, "return"))
499-
return T_return;
500-
if (!strcmp(token_str, "typedef"))
501-
return T_typedef;
502-
if (!strcmp(token_str, "enum"))
503-
return T_enum;
504-
if (!strcmp(token_str, "struct"))
505-
return T_struct;
506-
if (!strcmp(token_str, "sizeof"))
507-
return T_sizeof;
508-
if (!strcmp(token_str, "switch"))
509-
return T_switch;
510-
if (!strcmp(token_str, "case"))
511-
return T_case;
512-
if (!strcmp(token_str, "break"))
513-
return T_break;
514-
if (!strcmp(token_str, "default"))
515-
return T_default;
516-
if (!strcmp(token_str, "continue"))
517-
return T_continue;
626+
token_t keyword = lookup_keyword(token_str);
627+
if (keyword != T_identifier)
628+
return keyword;
518629

519630
if (aliasing) {
520631
alias = find_alias(token_str);

0 commit comments

Comments
 (0)