diff --git a/Makefile b/Makefile
index b123215a..3e95adf2 100644
--- a/Makefile
+++ b/Makefile
@@ -73,6 +73,28 @@ endif
 ENABLE_ARCH_TEST ?= 0
 $(call set-feature, ARCH_TEST)
 
+# ThreadSanitizer support
+# TSAN x86-64 memory layout:
+#   Shadow: 0x02a000000000 - 0x7cefffffffff (reserved by TSAN)
+#   App:    0x7cf000000000 - 0x7ffffffff000 (usable by application)
+#
+# We use MAP_FIXED to allocate FULL4G's 4GB memory at a fixed address
+# (0x7d0000000000) within TSAN's app range, ensuring compatibility.
+#
+# IMPORTANT: TSAN requires ASLR (Address Space Layout Randomization) to be
+# disabled to prevent system allocations from landing in TSAN's shadow memory.
+# Tests are run with 'setarch $(uname -m) -R' to disable ASLR.
+ENABLE_TSAN ?= 0
+ifeq ("$(ENABLE_TSAN)", "1")
+override ENABLE_SDL := 0 # SDL (uninstrumented system lib) creates threads TSAN cannot track
+override ENABLE_LTO := 0 # LTO interferes with TSAN instrumentation
+CFLAGS += -DTSAN_ENABLED # Signal code to use TSAN-compatible allocations
+# Disable ASLR for TSAN tests to prevent allocations in TSAN shadow memory
+BIN_WRAPPER = setarch $(shell uname -m) -R
+else
+BIN_WRAPPER =
+endif
+
 # Enable link-time optimization (LTO)
 ENABLE_LTO ?= 1
 ifeq ($(call has, LTO), 1)
@@ -281,6 +303,12 @@
 CFLAGS += -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize-recover=all
 LDFLAGS += -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize-recover=all
 endif
 
+# ThreadSanitizer flags (ENABLE_TSAN is handled earlier to override SDL and LTO)
+ifeq ("$(ENABLE_TSAN)", "1")
+CFLAGS += -fsanitize=thread -g
+LDFLAGS += -fsanitize=thread
+endif
+
 $(OUT)/emulate.o: CFLAGS += -foptimize-sibling-calls -fomit-frame-pointer -fno-stack-check -fno-stack-protector
 
 # .DEFAULT_GOAL should be set to all since the very first target is not all
@@ -375,7 +403,7 @@ define check-test
 $(Q)true; \
 $(PRINTF) "Running $(3) ... 
"; \ OUTPUT_FILE="$$(mktemp)"; \ -if (LC_ALL=C $(BIN) $(1) $(2) > "$$OUTPUT_FILE") && \ +if (LC_ALL=C $(BIN_WRAPPER) $(BIN) $(1) $(2) > "$$OUTPUT_FILE") && \ [ "$$(cat "$$OUTPUT_FILE" | $(LOG_FILTER) | $(4))" = "$(5)" ]; then \ $(call notice, [OK]); \ else \ diff --git a/src/emulate.c b/src/emulate.c index 58585f8a..388c66e2 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -24,6 +24,7 @@ extern struct target_ops gdbstub_ops; #endif #include "decode.h" +#include "log.h" #include "mpool.h" #include "riscv.h" #include "riscv_private.h" @@ -283,6 +284,7 @@ static block_t *block_alloc(riscv_t *rv) block->hot2 = false; block->has_loops = false; block->n_invoke = 0; + block->func = NULL; INIT_LIST_HEAD(&block->list); #if RV32_HAS(T2C) block->compiled = false; @@ -1151,22 +1153,32 @@ void rv_step(void *arg) #if RV32_HAS(JIT) #if RV32_HAS(T2C) /* executed through the tier-2 JIT compiler */ - if (block->hot2) { + /* Use acquire semantics to ensure we see func write before using it */ + if (__atomic_load_n(&block->hot2, __ATOMIC_ACQUIRE)) { ((exec_t2c_func_t) block->func)(rv); prev = NULL; continue; } /* check if invoking times of t1 generated code exceed threshold */ - else if (!block->compiled && block->n_invoke >= THRESHOLD) { - block->compiled = true; + else if (!__atomic_load_n(&block->compiled, __ATOMIC_RELAXED) && + __atomic_load_n(&block->n_invoke, __ATOMIC_RELAXED) >= + THRESHOLD) { + __atomic_store_n(&block->compiled, true, __ATOMIC_RELAXED); queue_entry_t *entry = malloc(sizeof(queue_entry_t)); if (unlikely(!entry)) { /* Malloc failed - reset compiled flag to allow retry later */ - block->compiled = false; + __atomic_store_n(&block->compiled, false, __ATOMIC_RELAXED); continue; } - entry->block = block; + /* Store cache key instead of pointer to prevent use-after-free */ +#if RV32_HAS(SYSTEM) + entry->key = + (uint64_t) block->pc_start | ((uint64_t) block->satp << 32); +#else + entry->key = (uint64_t) block->pc_start; +#endif pthread_mutex_lock(&rv->wait_queue_lock); list_add(&entry->list, &rv->wait_queue); + pthread_cond_signal(&rv->wait_queue_cond); pthread_mutex_unlock(&rv->wait_queue_lock); } #endif @@ -1178,7 +1190,11 @@ void rv_step(void *arg) * entry in compiled binary buffer. */ if (block->hot) { +#if RV32_HAS(T2C) + __atomic_fetch_add(&block->n_invoke, 1, __ATOMIC_RELAXED); +#else block->n_invoke++; +#endif ((exec_block_func_t) state->buf)( rv, (uintptr_t) (state->buf + block->offset)); prev = NULL; @@ -1190,10 +1206,20 @@ void rv_step(void *arg) #endif ) { jit_translate(rv, block); - ((exec_block_func_t) state->buf)( - rv, (uintptr_t) (state->buf + block->offset)); - prev = NULL; - continue; + /* Only execute if translation succeeded (block is hot) */ + if (block->hot) { + rv_log_debug("JIT: Executing block pc=0x%08x, offset=%u", + block->pc_start, block->offset); + ((exec_block_func_t) state->buf)( + rv, (uintptr_t) (state->buf + block->offset)); + prev = NULL; + continue; + } + /* Fall through to interpreter if translation failed */ + rv_log_debug( + "JIT: Translation failed for block pc=0x%08x, using " + "interpreter", + block->pc_start); } set_reset(&pc_set); has_loops = false; diff --git a/src/io.c b/src/io.c index 4ff325d3..975013ee 100644 --- a/src/io.c +++ b/src/io.c @@ -27,12 +27,47 @@ memory_t *memory_new(uint32_t size) return NULL; assert(mem); #if HAVE_MMAP +#if defined(TSAN_ENABLED) + /* ThreadSanitizer compatibility: Use MAP_FIXED to allocate at a specific + * address to avoid conflicts with TSAN's shadow memory. 
+ */ +#if defined(__x86_64__) + /* x86_64: Allocate within TSAN's range (0x7cf000000000 - 0x7ffffffff000). + * + * Fixed address: 0x7d0000000000 + * Size: up to 4GB (0x100000000) + * End: 0x7d0100000000 (well within app range) + */ + void *fixed_addr = (void *) 0x7d0000000000UL; +#elif defined(__aarch64__) + /* ARM64 (macOS/Apple Silicon): Use higher address range. + * + * Fixed address: 0x150000000000 (21TB) + * Size: up to 4GB (0x100000000) + * End: 0x150100000000 + * + * This avoids TSAN's shadow memory and typical process allocations. + * Requires ASLR disabled via: setarch $(uname -m) -R + */ + void *fixed_addr = (void *) 0x150000000000UL; +#else +#error "TSAN is only supported on x86_64 and aarch64" +#endif + data_memory_base = mmap(fixed_addr, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (data_memory_base == MAP_FAILED) { + free(mem); + return NULL; + } +#else + /* Standard allocation without TSAN */ data_memory_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (data_memory_base == MAP_FAILED) { free(mem); return NULL; } +#endif #else data_memory_base = malloc(size); if (!data_memory_base) { diff --git a/src/jit.c b/src/jit.c index 158665d4..bc333fc2 100644 --- a/src/jit.c +++ b/src/jit.c @@ -42,6 +42,7 @@ #include "decode.h" #include "io.h" #include "jit.h" +#include "log.h" #include "riscv.h" #include "riscv_private.h" #include "utils.h" @@ -593,24 +594,30 @@ static void update_branch_imm(struct jit_state *state, assert((imm & 3) == 0); uint32_t insn; imm >>= 2; + rv_log_debug("JIT: Patching branch at offset=%u, imm=%d", offset, imm * 4); + /* Read instruction while in execute mode (MAP_JIT requirement) */ memcpy(&insn, state->buf + offset, sizeof(uint32_t)); if ((insn & 0xfe000000U) == 0x54000000U /* Conditional branch immediate. */ || (insn & 0x7e000000U) == 0x34000000U) { /* Compare and branch immediate. */ assert((imm >> 19) == INT64_C(-1) || (imm >> 19) == 0); + insn &= ~(0x7ffffU << 5); /* Clear old offset bits */ insn |= (imm & 0x7ffff) << 5; } else if ((insn & 0x7c000000U) == 0x14000000U) { /* Unconditional branch immediate. 
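            B and BL encode a signed 26-bit word offset (imm26), so the
            reachable range is +/-128 MiB; the assert below range-checks it.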
 */
         assert((imm >> 26) == INT64_C(-1) || (imm >> 26) == 0);
+        insn &= ~0x03ffffffU; /* Clear old offset bits */
         insn |= (imm & 0x03ffffffU) << 0;
     } else {
         assert(false);
         insn = BAD_OPCODE;
     }
 #if defined(__APPLE__) && defined(__aarch64__)
+    /* Switch to write mode only for writing */
     pthread_jit_write_protect_np(false);
 #endif
     memcpy(state->buf + offset, &insn, sizeof(uint32_t));
+    /* Flush the icache for the patched word. __builtin___clear_cache() is
+     * portable across Apple and Linux AArch64, unlike the Darwin-only
+     * sys_icache_invalidate(). */
+    __builtin___clear_cache((char *) (state->buf + offset),
+                            (char *) (state->buf + offset) + sizeof(uint32_t));
 #if defined(__APPLE__) && defined(__aarch64__)
     pthread_jit_write_protect_np(true);
 #endif
@@ -2164,9 +2171,12 @@ void clear_hot(block_t *block)
 
 static void code_cache_flush(struct jit_state *state, riscv_t *rv)
 {
+    rv_log_info("JIT: Flushing code cache (n_blocks=%d, n_jumps=%d, offset=%u)",
+                state->n_blocks, state->n_jumps, state->offset);
     should_flush = false;
     state->offset = state->org_size;
     state->n_blocks = 0;
+    state->n_jumps = 0; /* Reset jump count when flushing */
     set_reset(&state->set);
     clear_cache_hot(rv->block_cache, (clear_func_t) clear_hot);
 #if RV32_HAS(T2C)
@@ -2196,6 +2206,7 @@ static void translate(struct jit_state *state, riscv_t *rv, block_t *block)
 
 static void resolve_jumps(struct jit_state *state)
 {
+    rv_log_debug("JIT: Resolving %d jumps", state->n_jumps);
     for (int i = 0; i < state->n_jumps; i++) {
         struct jump jump = state->jumps[i];
         int target_loc;
@@ -2218,6 +2229,10 @@ static void resolve_jumps(struct jit_state *state)
                 (if (jump.target_satp == state->offset_map[i].satp), )
                 {
                     target_loc = state->offset_map[i].offset;
+                    rv_log_debug(
+                        "JIT: Jump %d resolved to block pc=0x%08x, "
+                        "offset=%d",
+                        i, jump.target_pc, target_loc);
                     break;
                 }
             }
@@ -2229,6 +2244,7 @@
 
         uint8_t *offset_ptr = &state->buf[jump.offset_loc];
         memcpy(offset_ptr, &rel, sizeof(uint32_t));
+        /* x86 keeps the instruction cache coherent with ordinary stores, so
+         * no explicit icache flush is needed on this path */
 #elif defined(__aarch64__)
         int32_t rel = target_loc - jump.offset_loc;
         update_branch_imm(state, jump.offset_loc, rel);
@@ -2308,23 +2324,35 @@ void jit_translate(riscv_t *rv, block_t *block)
             ) {
                 block->offset = state->offset_map[i].offset;
                 block->hot = true;
+                rv_log_debug("JIT: Cache hit for block pc=0x%08x, offset=%u",
+                             block->pc_start, block->offset);
                 return;
             }
         }
         assert(NULL);
         __UNREACHABLE;
     }
+    rv_log_debug("JIT: Starting translation for block pc=0x%08x",
+                 block->pc_start);
 restart:
     memset(state->jumps, 0, MAX_JUMPS * sizeof(struct jump));
     state->n_jumps = 0;
     block->offset = state->offset;
     translate_chained_block(state, rv, block);
     if (unlikely(should_flush)) {
+        /* Mark block as not translated since translation was incomplete */
+        block->hot = false;
+        /* Don't reset offset - it will be set correctly on restart */
+        rv_log_debug("JIT: Translation triggered flush for block pc=0x%08x",
+                     block->pc_start);
         code_cache_flush(state, rv);
         goto restart;
     }
     resolve_jumps(state);
     block->hot = true;
+    rv_log_debug(
+        "JIT: Translation completed for block pc=0x%08x, offset=%u, size=%u",
+        block->pc_start, block->offset, state->offset - block->offset);
 }
 
 struct jit_state *jit_state_init(size_t size)
@@ -2336,6 +2364,52 @@
     state->offset = 0;
     state->size = size;
 
+#if defined(TSAN_ENABLED)
+    /* ThreadSanitizer compatibility: Allocate JIT code buffer at a fixed
+     * address above the main memory region to avoid conflicts. 
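+     *
+     * Unlike the data region in io.c, this buffer must also be executable
+     * (PROT_EXEC), and on Apple Silicon it needs MAP_JIT together with
+     * pthread_jit_write_protect_np() toggling around every code write.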
+ */ +#if defined(__x86_64__) + /* x86_64 memory layout: + * Main memory: 0x7d0000000000 - 0x7d0100000000 (4GB for FULL4G) + * JIT buffer: 0x7d1000000000 + size + * + * This keeps both allocations in TSAN's app range (0x7cf000000000 - + * 0x7ffffffff000) and prevents overlap with main memory or TSAN shadow. + */ + void *jit_addr = (void *) 0x7d1000000000UL; + state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); +#elif defined(__aarch64__) + /* ARM64 memory layout (macOS/Apple Silicon): + * Main memory: 0x150000000000 - 0x150100000000 (4GB for FULL4G) + * JIT buffer: 0x151000000000 + size + * + * Apple Silicon requires MAP_JIT for executable memory. The fixed + * address is chosen to avoid TSAN's shadow memory and typical process + * allocations. Requires ASLR disabled via: setarch $(uname -m) -R + */ + void *jit_addr = (void *) 0x151000000000UL; + state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); +#else +#error "TSAN is only supported on x86_64 and aarch64" +#endif + if (state->buf == MAP_FAILED) { + free(state); + return NULL; + } +#else + /* Standard allocation without TSAN */ state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS #if defined(__APPLE__) @@ -2347,8 +2421,7 @@ struct jit_state *jit_state_init(size_t size) free(state); return NULL; } - assert(state->buf != MAP_FAILED); - +#endif state->n_blocks = 0; set_reset(&state->set); reset_reg(); diff --git a/src/main.c b/src/main.c index 8e079a50..aa55773d 100644 --- a/src/main.c +++ b/src/main.c @@ -19,6 +19,28 @@ #include "riscv.h" #include "utils.h" +/* ThreadSanitizer configuration for FULL4G compatibility + * + * We use MAP_FIXED to allocate emulated memory at 0x7d0000000000, which is + * within TSAN's application memory range (0x7cf000000000 - 0x7ffffffff000). + * This avoids conflicts with TSAN's shadow memory and allows race detection + * to work with FULL4G's 4GB address space. + * + * Configuration optimizes for race detection with minimal overhead. 
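+ *
+ * These are defaults only: the TSAN runtime consults this hook at startup,
+ * and flags passed via the TSAN_OPTIONS environment variable still take
+ * precedence over the string returned here.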
+ */
+/* GCC defines __SANITIZE_THREAD__ under -fsanitize=thread; Clang does not,
+ * so also honor the -DTSAN_ENABLED flag set by the Makefile. */
+#if defined(__SANITIZE_THREAD__) || defined(TSAN_ENABLED)
+const char *__tsan_default_options(void)
+{
+    return "halt_on_error=0"          /* Continue after errors */
+           ":report_bugs=1"           /* Report data races */
+           ":second_deadlock_stack=1" /* Full deadlock info */
+           ":verbosity=0"             /* Reduce noise */
+           ":memory_limit_mb=0"       /* No memory limit */
+           ":history_size=7"          /* Larger race detection window */
+           ":io_sync=0";              /* Don't sync on I/O */
+}
+#endif
+
 /* enable program trace mode */
 #if !RV32_HAS(SYSTEM) || (RV32_HAS(SYSTEM) && RV32_HAS(ELF_LOADER))
 static bool opt_trace = false;
@@ -282,7 +304,7 @@ int main(int argc, char **args)
         .args_offset_size = ARGS_OFFSET_SIZE,
         .argc = prog_argc,
         .argv = prog_args,
-        .log_level = LOG_TRACE,
+        .log_level = LOG_INFO,
         .run_flag = run_flag,
         .profile_output_file = prof_out_file,
         .cycle_per_step = CYCLE_PER_STEP,
diff --git a/src/riscv.c b/src/riscv.c
index dcaaa94a..47434c98 100644
--- a/src/riscv.c
+++ b/src/riscv.c
@@ -206,19 +206,41 @@ static pthread_t t2c_thread;
 static void *t2c_runloop(void *arg)
 {
     riscv_t *rv = (riscv_t *) arg;
+    pthread_mutex_lock(&rv->wait_queue_lock);
     while (!rv->quit) {
-        if (!list_empty(&rv->wait_queue)) {
-            queue_entry_t *entry =
-                list_last_entry(&rv->wait_queue, queue_entry_t, list);
-            pthread_mutex_lock(&rv->wait_queue_lock);
-            list_del_init(&entry->list);
-            pthread_mutex_unlock(&rv->wait_queue_lock);
-            pthread_mutex_lock(&rv->cache_lock);
-            t2c_compile(rv, entry->block);
-            pthread_mutex_unlock(&rv->cache_lock);
-            free(entry);
-        }
+        /* Wait for work or quit signal */
+        while (list_empty(&rv->wait_queue) && !rv->quit)
+            pthread_cond_wait(&rv->wait_queue_cond, &rv->wait_queue_lock);
+
+        if (rv->quit)
+            break;
+
+        /* Extract work item while holding the lock */
+        queue_entry_t *entry =
+            list_last_entry(&rv->wait_queue, queue_entry_t, list);
+        list_del_init(&entry->list);
+        pthread_mutex_unlock(&rv->wait_queue_lock);
+
+        /* Perform compilation with cache lock */
+        pthread_mutex_lock(&rv->cache_lock);
+        /* Look up block from cache using the key (might have been evicted) */
+        uint32_t pc = (uint32_t) entry->key;
+        block_t *block = (block_t *) cache_get(rv->block_cache, pc, false);
+#if RV32_HAS(SYSTEM)
+        /* Verify SATP matches (for system mode) */
+        uint32_t satp = (uint32_t) (entry->key >> 32);
+        if (block && block->satp != satp)
+            block = NULL;
+#endif
+        /* Compile only if block still exists in cache */
+        if (block)
+            t2c_compile(rv, block);
+        pthread_mutex_unlock(&rv->cache_lock);
+        free(entry);
+
+        pthread_mutex_lock(&rv->wait_queue_lock);
     }
+    pthread_mutex_unlock(&rv->wait_queue_lock);
     return NULL;
 }
 #endif
@@ -745,6 +767,7 @@ riscv_t *rv_create(riscv_user_t rv_attr)
     /* prepare wait queue. */
     pthread_mutex_init(&rv->wait_queue_lock, NULL);
     pthread_mutex_init(&rv->cache_lock, NULL);
+    pthread_cond_init(&rv->wait_queue_cond, NULL);
     INIT_LIST_HEAD(&rv->wait_queue);
     /* activate the background compilation thread. 
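      * The runloop above blocks on wait_queue_cond: rv_step() signals it
      * after queueing a newly hot block, and rv_delete() signals it with
      * rv->quit set so the worker wakes, exits, and can be joined.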
*/ pthread_create(&t2c_thread, NULL, t2c_runloop, rv); @@ -866,10 +889,24 @@ void rv_delete(riscv_t *rv) block_map_destroy(rv); #else #if RV32_HAS(T2C) + /* Signal the thread to quit */ + pthread_mutex_lock(&rv->wait_queue_lock); rv->quit = true; + pthread_cond_signal(&rv->wait_queue_cond); + pthread_mutex_unlock(&rv->wait_queue_lock); + pthread_join(t2c_thread, NULL); + + /* Clean up any remaining entries in wait queue */ + queue_entry_t *entry, *safe; + list_for_each_entry_safe (entry, safe, &rv->wait_queue, list) { + list_del(&entry->list); + free(entry); + } + pthread_mutex_destroy(&rv->wait_queue_lock); pthread_mutex_destroy(&rv->cache_lock); + pthread_cond_destroy(&rv->wait_queue_cond); jit_cache_exit(rv->jit_cache); #endif jit_state_exit(rv->jit_state); diff --git a/src/riscv_private.h b/src/riscv_private.h index ace3ca90..55879432 100644 --- a/src/riscv_private.h +++ b/src/riscv_private.h @@ -105,7 +105,7 @@ typedef struct block { #if RV32_HAS(JIT) && RV32_HAS(T2C) typedef struct { - block_t *block; + uint64_t key; /**< cache key (PC or PC|SATP) to look up block */ struct list_head list; } queue_entry_t; #endif @@ -197,6 +197,7 @@ struct riscv_internal { #if RV32_HAS(T2C) struct list_head wait_queue; pthread_mutex_t wait_queue_lock, cache_lock; + pthread_cond_t wait_queue_cond; volatile bool quit; /**< Determine the main thread is terminated or not */ #endif void *jit_state; diff --git a/src/t2c.c b/src/t2c.c index 343b85e6..2115adaf 100644 --- a/src/t2c.c +++ b/src/t2c.c @@ -346,7 +346,9 @@ void t2c_compile(riscv_t *rv, block_t *block) jit_cache_update(rv->jit_cache, key, block->func); - block->hot2 = true; + /* Use release semantics to ensure func write is visible before hot2 is set + */ + __atomic_store_n(&block->hot2, true, __ATOMIC_RELEASE); } struct jit_cache *jit_cache_init()
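
Note on the ordering scheme above: t2c_compile() publishes block->func and
only then sets hot2 with a release store, while rv_step() acquire-loads hot2
before calling through func. A minimal sketch of the pairing, condensed from
the t2c.c and emulate.c hunks above (compiled_entry stands in for whatever
pointer T2C compilation produces):

    /* T2C thread: write func first, then publish it with a release store */
    block->func = compiled_entry;
    __atomic_store_n(&block->hot2, true, __ATOMIC_RELEASE);

    /* interpreter thread: acquire-load hot2 before dereferencing func */
    if (__atomic_load_n(&block->hot2, __ATOMIC_ACQUIRE))
        ((exec_t2c_func_t) block->func)(rv);

The release/acquire pair guarantees that any thread observing hot2 == true
also observes the fully written func pointer, which is precisely the race on
block->func that TSAN would otherwise flag.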