diff --git a/src/defs.h b/src/defs.h index 60037fbd..51c3ae69 100644 --- a/src/defs.h +++ b/src/defs.h @@ -224,6 +224,16 @@ typedef struct { int count; } token_buffer_t; +/* String pool for identifier deduplication */ +typedef struct { + hashmap_t *strings; /* Map string -> its interned copy */ +} string_pool_t; + +/* String literal pool for deduplicating string constants */ +typedef struct { + hashmap_t *literals; /* Map string literal -> ELF data offset */ +} string_literal_pool_t; + /* builtin types */ typedef enum { TYPE_void = 0, diff --git a/src/globals.c b/src/globals.c index e709332c..23bbb8e3 100644 --- a/src/globals.c +++ b/src/globals.c @@ -13,6 +13,9 @@ #include "defs.h" +/* Forward declaration: intern_string() is defined further down this file */ +char *intern_string(char *str); + /* Lexer */ char token_str[MAX_TOKEN_LEN]; token_t next_token; @@ -673,7 +676,8 @@ void add_alias(char *alias, char *value) printf("Failed to allocate alias_t\n"); return; } - strcpy(al->alias, alias); + /* Intern the alias name, then copy the interned text into al->alias */ + strcpy(al->alias, intern_string(alias)); hashmap_put(ALIASES_MAP, alias, al); } strcpy(al->value, value); @@ -707,7 +711,8 @@ macro_t *add_macro(char *name) printf("Failed to allocate macro_t\n"); return NULL; } - strcpy(ma->name, name); + /* Intern the macro name, then copy the interned text into ma->name */ + strcpy(ma->name, intern_string(name)); hashmap_put(MACROS_MAP, name, ma); } ma->disabled = false; @@ -733,6 +738,41 @@ bool remove_macro(char *name) } void error(char *msg); + +/* Global string pools, allocated in global_init() */ +string_pool_t *string_pool; +string_literal_pool_t *string_literal_pool; + +/* Safe string interning that works with self-hosting: returns the pooled copy of str, creating it on first use */ +char *intern_string(char *str) +{ + char *existing; + char *interned; + int len; + + /* Safety: a NULL input is handed back as NULL */ + if (!str) + return NULL; + + /* Safety: can't intern before the arena and pool exist; return str as-is */ + if (!GENERAL_ARENA || !string_pool) + return str; + + /* Check if already interned */ + existing = hashmap_get(string_pool->strings, str); + if (existing) 
+ return existing; + + /* Copy str into GENERAL_ARENA and register the copy as both key and value */ + len = strlen(str) + 1; + interned = arena_alloc(GENERAL_ARENA, len); + strcpy(interned, str); + + hashmap_put(string_pool->strings, interned, interned); + + return interned; +} + int find_macro_param_src_idx(char *name, block_t *parent) { macro_t *macro = parent->macro; @@ -761,7 +801,8 @@ type_t *add_type(void) type_t *add_named_type(char *name) { type_t *type = add_type(); - strcpy(type->type_name, name); + /* Intern the type name, then copy the interned text into type->type_name */ + strcpy(type->type_name, intern_string(name)); return type; } @@ -773,7 +814,8 @@ void add_constant(char alias[], int value) return; } - strcpy(constant->alias, alias); + /* Intern the constant name, then copy the interned text into constant->alias */ + strcpy(constant->alias, intern_string(alias)); constant->value = value; hashmap_put(CONSTANTS_MAP, alias, constant); } @@ -877,7 +919,8 @@ func_t *add_func(char *func_name, bool synthesize) func = arena_alloc_func(); hashmap_put(FUNC_MAP, func_name, func); - strcpy(func->return_def.var_name, func_name); + /* Intern the function name, then copy it into func->return_def.var_name */ + strcpy(func->return_def.var_name, intern_string(func_name)); func->stack_size = 4; if (synthesize) @@ -1042,7 +1085,7 @@ void add_insn(block_t *block, n->idx = 0; if (str) - strcpy(n->str, str); + strcpy(n->str, intern_string(str)); else n->str[0] = '\0'; @@ -1151,6 +1194,16 @@ void global_init(void) TYPES = arena_alloc(GENERAL_ARENA, MAX_TYPES * sizeof(type_t)); PH2_IR_FLATTEN = arena_alloc(GENERAL_ARENA, MAX_IR_INSTR * sizeof(ph2_ir_t *)); + + /* Initialize string pool for identifier deduplication */ + string_pool = arena_alloc(GENERAL_ARENA, sizeof(string_pool_t)); + string_pool->strings = hashmap_create(512); + + /* Initialize string literal pool for deduplicating string constants */ + string_literal_pool = + arena_alloc(GENERAL_ARENA, sizeof(string_literal_pool_t)); + string_literal_pool->literals = hashmap_create(256); + SOURCE = strbuf_create(MAX_SOURCE); FUNC_MAP = 
hashmap_create(DEFAULT_FUNCS_SIZE); INCLUSION_MAP = hashmap_create(DEFAULT_INCLUSIONS_SIZE); @@ -1273,6 +1326,13 @@ void global_release(void) lexer_cleanup(); hashmap_free(MACROS_MAP); + + /* Free only the interning hashmaps; the pool structs live in GENERAL_ARENA */ + if (string_pool && string_pool->strings) + hashmap_free(string_pool->strings); + if (string_literal_pool && string_literal_pool->literals) + hashmap_free(string_literal_pool->literals); + arena_free(BLOCK_ARENA); arena_free(INSN_ARENA); arena_free(BB_ARENA); diff --git a/src/parser.c b/src/parser.c index 3620b055..e043c586 100644 --- a/src/parser.c +++ b/src/parser.c @@ -554,7 +554,7 @@ bool read_preproc_directive(void) while (lex_peek(T_identifier, alias)) { lex_expect(T_identifier); strcpy(macro->param_defs[macro->num_param_defs++].var_name, - alias); + intern_string(alias)); lex_accept(T_comma); } if (lex_accept(T_elipsis)) @@ -1192,14 +1192,18 @@ void read_inner_var_decl(var_t *vd, int anon, int is_param) /* is it function pointer declaration? */ if (lex_accept(T_open_bracket)) { func_t func; + char temp_name[MAX_VAR_LEN]; lex_expect(T_asterisk); - lex_ident(T_identifier, vd->var_name); + lex_ident(T_identifier, temp_name); + strcpy(vd->var_name, intern_string(temp_name)); lex_expect(T_close_bracket); read_parameter_list_decl(&func, 1); vd->is_func = true; } else { if (anon == 0) { - lex_ident(T_identifier, vd->var_name); + char temp_name[MAX_VAR_LEN]; + lex_ident(T_identifier, temp_name); + strcpy(vd->var_name, intern_string(temp_name)); if (!lex_peek(T_open_bracket, NULL) && !is_param) { if (vd->is_global) { opstack_push(vd); @@ -2078,7 +2082,7 @@ void read_expr_operand(block_t *parent, basic_block_t **bb) /* indirective function pointer assignment */ vd = require_var(parent); vd->is_func = true; - strcpy(vd->var_name, token); + strcpy(vd->var_name, intern_string(token)); opstack_push(vd); } } else if (lex_accept(T_open_curly)) { @@ -4431,7 +4435,7 @@ void read_global_statement(void) if (!type) type = add_type(); - 
strcpy(type->type_name, token); + strcpy(type->type_name, intern_string(token)); type->base_type = TYPE_struct; lex_expect(T_open_curly); @@ -4469,7 +4473,7 @@ void read_global_statement(void) if (!type) type = add_type(); - strcpy(type->type_name, token); + strcpy(type->type_name, intern_string(token)); type->base_type = TYPE_union; lex_expect(T_open_curly); @@ -4520,7 +4524,7 @@ void read_global_statement(void) } while (lex_accept(T_comma)); lex_expect(T_close_curly); lex_ident(T_identifier, token); - strcpy(type->type_name, token); + strcpy(type->type_name, intern_string(token)); lex_expect(T_semicolon); } else if (lex_accept(T_struct)) { int i = 0, size = 0, has_struct_def = 0; @@ -4535,7 +4539,7 @@ void read_global_statement(void) if (!tag) { tag = add_type(); tag->base_type = TYPE_struct; - strcpy(tag->type_name, token); + strcpy(tag->type_name, intern_string(token)); } } @@ -4574,7 +4578,7 @@ void read_global_statement(void) strcpy(token, tag->type_name); memcpy(tag, type, sizeof(type_t)); tag->base_type = TYPE_struct; - strcpy(tag->type_name, token); + strcpy(tag->type_name, intern_string(token)); } else { /* If it is a forward declaration, build a connection between * structure tag and alias. In 'find_type', it will retrieve @@ -4597,7 +4601,7 @@ void read_global_statement(void) if (!tag) { tag = add_type(); tag->base_type = TYPE_union; - strcpy(tag->type_name, token); + strcpy(tag->type_name, intern_string(token)); } } @@ -4640,7 +4644,7 @@ void read_global_statement(void) strcpy(token, tag->type_name); memcpy(tag, type, sizeof(type_t)); tag->base_type = TYPE_union; - strcpy(tag->type_name, token); + strcpy(tag->type_name, intern_string(token)); } else { /* If it is a forward declaration, build a connection between * union tag and alias. In 'find_type', it will retrieve