From 3bdcec5cc14332b2db204c0140c8a2eb2edf44a4 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Sun, 5 Oct 2025 15:50:43 +0800
Subject: [PATCH 1/7] Enable TSAN with FULL4G and T2C support

ThreadSanitizer (TSAN) can now detect race conditions across the entire
multi-threaded JIT pipeline with full 4GB address space emulation. This
enables testing of the tier-2 LLVM compilation thread while maintaining
the production memory layout.

Memory Layout (TSAN-compatible):
- Main memory: MAP_FIXED at 0x7d0000000000 (4GB)
- JIT buffer: MAP_FIXED at 0x7d1000000000
- Both allocations within TSAN's app range (0x7cf000000000 - 0x7ffffffff000)
- Prevents conflicts with TSAN shadow memory (0x02a000000000 - 0x7cefffffffff)

ASLR Mitigation:
- Added setarch -R wrapper for TSAN test execution
- Disables ASLR to prevent random allocations in shadow memory
- Only affects test runs, not production builds

SDL Conflict Resolution:
- SDL (uninstrumented system library) creates threads TSAN cannot track
- Disabled SDL when TSAN enabled to focus on built-in race detection
- Production builds still fully support SDL
---
 Makefile            | 30 ++++++++++++++++++++++-
 src/emulate.c       | 25 +++++++++++++++----
 src/io.c            | 21 ++++++++++++++++
 src/jit.c           | 22 +++++++++++++++--
 src/main.c          | 22 +++++++++++++++++
 src/riscv.c         | 59 ++++++++++++++++++++++++++++++++++++---------
 src/riscv_private.h |  3 ++-
 src/t2c.c           |  4 ++-
 8 files changed, 165 insertions(+), 21 deletions(-)

diff --git a/Makefile b/Makefile
index b123215a..3e95adf2 100644
--- a/Makefile
+++ b/Makefile
@@ -73,6 +73,28 @@ endif
 ENABLE_ARCH_TEST ?= 0
 $(call set-feature, ARCH_TEST)
 
+# ThreadSanitizer support
+# TSAN on x86-64 memory layout:
+#   Shadow: 0x02a000000000 - 0x7cefffffffff (reserved by TSAN)
+#   App:    0x7cf000000000 - 0x7ffffffff000 (usable by application)
+#
+# We use MAP_FIXED to allocate FULL4G's 4GB memory at a fixed address
+# (0x7d0000000000) within TSAN's app range, ensuring compatibility.
+#
+# IMPORTANT: TSAN requires ASLR (Address Space Layout Randomization) to be
+# disabled to prevent system allocations from landing in TSAN's shadow memory.
+# Tests are run with 'setarch $(uname -m) -R' to disable ASLR.
+ENABLE_TSAN ?= 0
+ifeq ("$(ENABLE_TSAN)", "1")
+override ENABLE_SDL := 0 # SDL (uninstrumented system lib) creates threads TSAN cannot track
+override ENABLE_LTO := 0 # LTO interferes with TSAN instrumentation
+CFLAGS += -DTSAN_ENABLED # Signal code to use TSAN-compatible allocations
+# Disable ASLR for TSAN tests to prevent allocations in TSAN shadow memory
+BIN_WRAPPER = setarch $(shell uname -m) -R
+else
+BIN_WRAPPER =
+endif
+
 # Enable link-time optimization (LTO)
 ENABLE_LTO ?= 1
 ifeq ($(call has, LTO), 1)
@@ -281,6 +303,12 @@
 CFLAGS += -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize-recover=all
 LDFLAGS += -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize-recover=all
 endif
 
+# ThreadSanitizer flags (ENABLE_TSAN is set earlier to override SDL/FULL4G)
+ifeq ("$(ENABLE_TSAN)", "1")
+CFLAGS += -fsanitize=thread -g
+LDFLAGS += -fsanitize=thread
+endif
+
 $(OUT)/emulate.o: CFLAGS += -foptimize-sibling-calls -fomit-frame-pointer -fno-stack-check -fno-stack-protector
 
 # .DEFAULT_GOAL should be set to all since the very first target is not all
@@ -375,7 +403,7 @@ define check-test
 $(Q)true; \
 $(PRINTF) "Running $(3) ... 
"; \ OUTPUT_FILE="$$(mktemp)"; \ -if (LC_ALL=C $(BIN) $(1) $(2) > "$$OUTPUT_FILE") && \ +if (LC_ALL=C $(BIN_WRAPPER) $(BIN) $(1) $(2) > "$$OUTPUT_FILE") && \ [ "$$(cat "$$OUTPUT_FILE" | $(LOG_FILTER) | $(4))" = "$(5)" ]; then \ $(call notice, [OK]); \ else \ diff --git a/src/emulate.c b/src/emulate.c index 58585f8a..09a97d47 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -283,6 +283,7 @@ static block_t *block_alloc(riscv_t *rv) block->hot2 = false; block->has_loops = false; block->n_invoke = 0; + block->func = NULL; INIT_LIST_HEAD(&block->list); #if RV32_HAS(T2C) block->compiled = false; @@ -1151,22 +1152,32 @@ void rv_step(void *arg) #if RV32_HAS(JIT) #if RV32_HAS(T2C) /* executed through the tier-2 JIT compiler */ - if (block->hot2) { + /* Use acquire semantics to ensure we see func write before using it */ + if (__atomic_load_n(&block->hot2, __ATOMIC_ACQUIRE)) { ((exec_t2c_func_t) block->func)(rv); prev = NULL; continue; } /* check if invoking times of t1 generated code exceed threshold */ - else if (!block->compiled && block->n_invoke >= THRESHOLD) { - block->compiled = true; + else if (!__atomic_load_n(&block->compiled, __ATOMIC_RELAXED) && + __atomic_load_n(&block->n_invoke, __ATOMIC_RELAXED) >= + THRESHOLD) { + __atomic_store_n(&block->compiled, true, __ATOMIC_RELAXED); queue_entry_t *entry = malloc(sizeof(queue_entry_t)); if (unlikely(!entry)) { /* Malloc failed - reset compiled flag to allow retry later */ - block->compiled = false; + __atomic_store_n(&block->compiled, false, __ATOMIC_RELAXED); continue; } - entry->block = block; + /* Store cache key instead of pointer to prevent use-after-free */ +#if RV32_HAS(SYSTEM) + entry->key = + (uint64_t) block->pc_start | ((uint64_t) block->satp << 32); +#else + entry->key = (uint64_t) block->pc_start; +#endif pthread_mutex_lock(&rv->wait_queue_lock); list_add(&entry->list, &rv->wait_queue); + pthread_cond_signal(&rv->wait_queue_cond); pthread_mutex_unlock(&rv->wait_queue_lock); } #endif @@ -1178,7 +1189,11 @@ void rv_step(void *arg) * entry in compiled binary buffer. */ if (block->hot) { +#if RV32_HAS(T2C) + __atomic_fetch_add(&block->n_invoke, 1, __ATOMIC_RELAXED); +#else block->n_invoke++; +#endif ((exec_block_func_t) state->buf)( rv, (uintptr_t) (state->buf + block->offset)); prev = NULL; diff --git a/src/io.c b/src/io.c index 4ff325d3..1e5b73b9 100644 --- a/src/io.c +++ b/src/io.c @@ -27,12 +27,33 @@ memory_t *memory_new(uint32_t size) return NULL; assert(mem); #if HAVE_MMAP +#if defined(TSAN_ENABLED) && defined(__x86_64__) + /* ThreadSanitizer compatibility: Use MAP_FIXED to allocate at a specific + * address within TSAN's app range (0x7cf000000000 - 0x7ffffffff000). + * + * Fixed address: 0x7d0000000000 + * Size: up to 4GB (0x100000000) + * End: 0x7d0100000000 (well within app range) + * + * This guarantees the allocation won't land in TSAN's shadow memory, + * preventing "unexpected memory mapping" errors. 
+ */ + void *fixed_addr = (void *) 0x7d0000000000UL; + data_memory_base = mmap(fixed_addr, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (data_memory_base == MAP_FAILED) { + free(mem); + return NULL; + } +#else + /* Standard allocation without TSAN */ data_memory_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (data_memory_base == MAP_FAILED) { free(mem); return NULL; } +#endif #else data_memory_base = malloc(size); if (!data_memory_base) { diff --git a/src/jit.c b/src/jit.c index 158665d4..9932ee6d 100644 --- a/src/jit.c +++ b/src/jit.c @@ -2336,6 +2336,25 @@ struct jit_state *jit_state_init(size_t size) state->offset = 0; state->size = size; +#if defined(TSAN_ENABLED) && defined(__x86_64__) + /* ThreadSanitizer compatibility: Allocate JIT code buffer at a fixed + * address above the main memory region to avoid conflicts. + * + * Main memory: 0x7d0000000000 - 0x7d0100000000 (4GB for FULL4G) + * JIT buffer: 0x7d1000000000 + size + * + * This keeps both allocations in TSAN's app range (0x7cf000000000 - + * 0x7ffffffff000) and prevents overlap with main memory or TSAN shadow. + */ + void *jit_addr = (void *) 0x7d1000000000UL; + state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (state->buf == MAP_FAILED) { + free(state); + return NULL; + } +#else + /* Standard allocation without TSAN */ state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS #if defined(__APPLE__) @@ -2347,8 +2366,7 @@ struct jit_state *jit_state_init(size_t size) free(state); return NULL; } - assert(state->buf != MAP_FAILED); - +#endif state->n_blocks = 0; set_reset(&state->set); reset_reg(); diff --git a/src/main.c b/src/main.c index 8e079a50..8dfcda32 100644 --- a/src/main.c +++ b/src/main.c @@ -19,6 +19,28 @@ #include "riscv.h" #include "utils.h" +/* ThreadSanitizer configuration for FULL4G compatibility + * + * We use MAP_FIXED to allocate emulated memory at 0x7d0000000000, which is + * within TSAN's application memory range (0x7cf000000000 - 0x7ffffffff000). + * This avoids conflicts with TSAN's shadow memory and allows race detection + * to work with FULL4G's 4GB address space. + * + * Configuration optimizes for race detection with minimal overhead. 
+ */ +#if defined(__SANITIZE_THREAD__) +const char *__tsan_default_options() +{ + return "halt_on_error=0" /* Continue after errors */ + ":report_bugs=1" /* Report data races */ + ":second_deadlock_stack=1" /* Full deadlock info */ + ":verbosity=0" /* Reduce noise */ + ":memory_limit_mb=0" /* No memory limit */ + ":history_size=7" /* Larger race detection window */ + ":io_sync=0"; /* Don't sync on I/O */ +} +#endif + /* enable program trace mode */ #if !RV32_HAS(SYSTEM) || (RV32_HAS(SYSTEM) && RV32_HAS(ELF_LOADER)) static bool opt_trace = false; diff --git a/src/riscv.c b/src/riscv.c index dcaaa94a..47434c98 100644 --- a/src/riscv.c +++ b/src/riscv.c @@ -206,19 +206,41 @@ static pthread_t t2c_thread; static void *t2c_runloop(void *arg) { riscv_t *rv = (riscv_t *) arg; + pthread_mutex_lock(&rv->wait_queue_lock); while (!rv->quit) { - if (!list_empty(&rv->wait_queue)) { - queue_entry_t *entry = - list_last_entry(&rv->wait_queue, queue_entry_t, list); - pthread_mutex_lock(&rv->wait_queue_lock); - list_del_init(&entry->list); - pthread_mutex_unlock(&rv->wait_queue_lock); - pthread_mutex_lock(&rv->cache_lock); - t2c_compile(rv, entry->block); - pthread_mutex_unlock(&rv->cache_lock); - free(entry); - } + /* Wait for work or quit signal */ + while (list_empty(&rv->wait_queue) && !rv->quit) + pthread_cond_wait(&rv->wait_queue_cond, &rv->wait_queue_lock); + + if (rv->quit) + break; + + /* Extract work item while holding the lock */ + queue_entry_t *entry = + list_last_entry(&rv->wait_queue, queue_entry_t, list); + list_del_init(&entry->list); + pthread_mutex_unlock(&rv->wait_queue_lock); + + /* Perform compilation with cache lock */ + pthread_mutex_lock(&rv->cache_lock); + /* Look up block from cache using the key (might have been evicted) */ + uint32_t pc = (uint32_t) entry->key; + block_t *block = (block_t *) cache_get(rv->block_cache, pc, false); +#if RV32_HAS(SYSTEM) + /* Verify SATP matches (for system mode) */ + uint32_t satp = (uint32_t) (entry->key >> 32); + if (block && block->satp != satp) + block = NULL; +#endif + /* Compile only if block still exists in cache */ + if (block) + t2c_compile(rv, block); + pthread_mutex_unlock(&rv->cache_lock); + free(entry); + + pthread_mutex_lock(&rv->wait_queue_lock); } + pthread_mutex_unlock(&rv->wait_queue_lock); return NULL; } #endif @@ -745,6 +767,7 @@ riscv_t *rv_create(riscv_user_t rv_attr) /* prepare wait queue. */ pthread_mutex_init(&rv->wait_queue_lock, NULL); pthread_mutex_init(&rv->cache_lock, NULL); + pthread_cond_init(&rv->wait_queue_cond, NULL); INIT_LIST_HEAD(&rv->wait_queue); /* activate the background compilation thread. 
*/ pthread_create(&t2c_thread, NULL, t2c_runloop, rv); @@ -866,10 +889,24 @@ void rv_delete(riscv_t *rv) block_map_destroy(rv); #else #if RV32_HAS(T2C) + /* Signal the thread to quit */ + pthread_mutex_lock(&rv->wait_queue_lock); rv->quit = true; + pthread_cond_signal(&rv->wait_queue_cond); + pthread_mutex_unlock(&rv->wait_queue_lock); + pthread_join(t2c_thread, NULL); + + /* Clean up any remaining entries in wait queue */ + queue_entry_t *entry, *safe; + list_for_each_entry_safe (entry, safe, &rv->wait_queue, list) { + list_del(&entry->list); + free(entry); + } + pthread_mutex_destroy(&rv->wait_queue_lock); pthread_mutex_destroy(&rv->cache_lock); + pthread_cond_destroy(&rv->wait_queue_cond); jit_cache_exit(rv->jit_cache); #endif jit_state_exit(rv->jit_state); diff --git a/src/riscv_private.h b/src/riscv_private.h index ace3ca90..55879432 100644 --- a/src/riscv_private.h +++ b/src/riscv_private.h @@ -105,7 +105,7 @@ typedef struct block { #if RV32_HAS(JIT) && RV32_HAS(T2C) typedef struct { - block_t *block; + uint64_t key; /**< cache key (PC or PC|SATP) to look up block */ struct list_head list; } queue_entry_t; #endif @@ -197,6 +197,7 @@ struct riscv_internal { #if RV32_HAS(T2C) struct list_head wait_queue; pthread_mutex_t wait_queue_lock, cache_lock; + pthread_cond_t wait_queue_cond; volatile bool quit; /**< Determine the main thread is terminated or not */ #endif void *jit_state; diff --git a/src/t2c.c b/src/t2c.c index 343b85e6..2115adaf 100644 --- a/src/t2c.c +++ b/src/t2c.c @@ -346,7 +346,9 @@ void t2c_compile(riscv_t *rv, block_t *block) jit_cache_update(rv->jit_cache, key, block->func); - block->hot2 = true; + /* Use release semantics to ensure func write is visible before hot2 is set + */ + __atomic_store_n(&block->hot2, true, __ATOMIC_RELEASE); } struct jit_cache *jit_cache_init() From e72134d05aad5977c21fe1b2b1db7f15419040ca Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 5 Oct 2025 16:02:50 +0800 Subject: [PATCH 2/7] Add Arm64 TSAN support and fix JIT cache coherency This commit adds ThreadSanitizer (TSAN) support for ARM64/Apple Silicon and fixes critical JIT instruction cache coherency issues. ARM64 TSAN Support: - Extended TSAN-compatible memory allocation to ARM64 architecture - Main memory allocated at fixed address 0x150000000000 (21TB) - JIT buffer allocated at 0x151000000000 with MAP_JIT for Apple Silicon - Both allocations avoid TSAN shadow memory and enable race detection - Note: Requires ASLR disabled on macOS (SIP restrictions may apply) JIT Cache Coherency Fixes: 1. Fixed pthread_jit_write_protect_np() ordering in update_branch_imm 2. Added sys_icache_invalidate() in update_branch_imm 3. 
Added cache invalidation in resolve_jumps() for x86_64 --- src/emulate.c | 12 ++++++++---- src/io.c | 24 +++++++++++++++++++----- src/jit.c | 46 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 67 insertions(+), 15 deletions(-) diff --git a/src/emulate.c b/src/emulate.c index 09a97d47..39c222dd 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -1205,10 +1205,14 @@ void rv_step(void *arg) #endif ) { jit_translate(rv, block); - ((exec_block_func_t) state->buf)( - rv, (uintptr_t) (state->buf + block->offset)); - prev = NULL; - continue; + /* Only execute if translation succeeded (block is hot) */ + if (block->hot) { + ((exec_block_func_t) state->buf)( + rv, (uintptr_t) (state->buf + block->offset)); + prev = NULL; + continue; + } + /* Fall through to interpreter if translation failed */ } set_reset(&pc_set); has_loops = false; diff --git a/src/io.c b/src/io.c index 1e5b73b9..975013ee 100644 --- a/src/io.c +++ b/src/io.c @@ -27,18 +27,32 @@ memory_t *memory_new(uint32_t size) return NULL; assert(mem); #if HAVE_MMAP -#if defined(TSAN_ENABLED) && defined(__x86_64__) +#if defined(TSAN_ENABLED) /* ThreadSanitizer compatibility: Use MAP_FIXED to allocate at a specific - * address within TSAN's app range (0x7cf000000000 - 0x7ffffffff000). + * address to avoid conflicts with TSAN's shadow memory. + */ +#if defined(__x86_64__) + /* x86_64: Allocate within TSAN's range (0x7cf000000000 - 0x7ffffffff000). * * Fixed address: 0x7d0000000000 * Size: up to 4GB (0x100000000) * End: 0x7d0100000000 (well within app range) - * - * This guarantees the allocation won't land in TSAN's shadow memory, - * preventing "unexpected memory mapping" errors. */ void *fixed_addr = (void *) 0x7d0000000000UL; +#elif defined(__aarch64__) + /* ARM64 (macOS/Apple Silicon): Use higher address range. + * + * Fixed address: 0x150000000000 (21TB) + * Size: up to 4GB (0x100000000) + * End: 0x150100000000 + * + * This avoids TSAN's shadow memory and typical process allocations. + * Requires ASLR disabled via: setarch $(uname -m) -R + */ + void *fixed_addr = (void *) 0x150000000000UL; +#else +#error "TSAN is only supported on x86_64 and aarch64" +#endif data_memory_base = mmap(fixed_addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); if (data_memory_base == MAP_FAILED) { diff --git a/src/jit.c b/src/jit.c index 9932ee6d..0f7cd77b 100644 --- a/src/jit.c +++ b/src/jit.c @@ -593,6 +593,10 @@ static void update_branch_imm(struct jit_state *state, assert((imm & 3) == 0); uint32_t insn; imm >>= 2; +#if defined(__APPLE__) && defined(__aarch64__) + /* Must be in write mode to read/write MAP_JIT memory on Apple ARM64 */ + pthread_jit_write_protect_np(false); +#endif memcpy(&insn, state->buf + offset, sizeof(uint32_t)); if ((insn & 0xfe000000U) == 0x54000000U /* Conditional branch immediate. 
*/ || (insn & 0x7e000000U) == @@ -607,10 +611,8 @@ static void update_branch_imm(struct jit_state *state, assert(false); insn = BAD_OPCODE; } -#if defined(__APPLE__) && defined(__aarch64__) - pthread_jit_write_protect_np(false); -#endif memcpy(state->buf + offset, &insn, sizeof(uint32_t)); + sys_icache_invalidate(state->buf + offset, sizeof(uint32_t)); #if defined(__APPLE__) && defined(__aarch64__) pthread_jit_write_protect_np(true); #endif @@ -2167,6 +2169,7 @@ static void code_cache_flush(struct jit_state *state, riscv_t *rv) should_flush = false; state->offset = state->org_size; state->n_blocks = 0; + state->n_jumps = 0; /* Reset jump count when flushing */ set_reset(&state->set); clear_cache_hot(rv->block_cache, (clear_func_t) clear_hot); #if RV32_HAS(T2C) @@ -2229,6 +2232,7 @@ static void resolve_jumps(struct jit_state *state) uint8_t *offset_ptr = &state->buf[jump.offset_loc]; memcpy(offset_ptr, &rel, sizeof(uint32_t)); + sys_icache_invalidate(offset_ptr, sizeof(uint32_t)); #elif defined(__aarch64__) int32_t rel = target_loc - jump.offset_loc; update_branch_imm(state, jump.offset_loc, rel); @@ -2320,6 +2324,9 @@ void jit_translate(riscv_t *rv, block_t *block) block->offset = state->offset; translate_chained_block(state, rv, block); if (unlikely(should_flush)) { + /* Mark block as not translated since translation was incomplete */ + block->hot = false; + /* Don't reset offset - it will be set correctly on restart */ code_cache_flush(state, rv); goto restart; } @@ -2336,10 +2343,12 @@ struct jit_state *jit_state_init(size_t size) state->offset = 0; state->size = size; -#if defined(TSAN_ENABLED) && defined(__x86_64__) +#if defined(TSAN_ENABLED) /* ThreadSanitizer compatibility: Allocate JIT code buffer at a fixed * address above the main memory region to avoid conflicts. - * + */ +#if defined(__x86_64__) + /* x86_64 memory layout: * Main memory: 0x7d0000000000 - 0x7d0100000000 (4GB for FULL4G) * JIT buffer: 0x7d1000000000 + size * @@ -2348,7 +2357,32 @@ struct jit_state *jit_state_init(size_t size) */ void *jit_addr = (void *) 0x7d1000000000UL; state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); +#elif defined(__aarch64__) + /* ARM64 memory layout (macOS/Apple Silicon): + * Main memory: 0x150000000000 - 0x150100000000 (4GB for FULL4G) + * JIT buffer: 0x151000000000 + size + * + * Apple Silicon requires MAP_JIT for executable memory. The fixed + * address is chosen to avoid TSAN's shadow memory and typical process + * allocations. Requires ASLR disabled via: setarch $(uname -m) -R + */ + void *jit_addr = (void *) 0x151000000000UL; + state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); +#else +#error "TSAN is only supported on x86_64 and aarch64" +#endif if (state->buf == MAP_FAILED) { free(state); return NULL; From e6005e5b1e5178a018ca2512ed7318c54b54c3b7 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 5 Oct 2025 22:59:55 +0800 Subject: [PATCH 3/7] Add diagnostic logging to JIT compilation pipeline This introduces debug-level logging throughout the JITC to facilitate troubleshooting of intermittent compilation and execution failures on Arm64. 
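
As a rough sketch, each call site follows the same pattern (illustrative
only: rv_log_debug() is the printf-style helper from log.h, and block->hot
is the flag jit_translate() sets once translation succeeds):

    rv_log_debug("JIT: Starting translation for block pc=0x%08x",
                 block->pc_start);
    jit_translate(rv, block);
    if (!block->hot)
        rv_log_debug("JIT: Translation failed for block pc=0x%08x, using "
                     "interpreter", block->pc_start);

All messages share the "JIT:" prefix so they can be isolated in CI logs
with a single grep.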
--- src/emulate.c | 7 +++++++ src/jit.c | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/emulate.c b/src/emulate.c index 39c222dd..388c66e2 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -24,6 +24,7 @@ extern struct target_ops gdbstub_ops; #endif #include "decode.h" +#include "log.h" #include "mpool.h" #include "riscv.h" #include "riscv_private.h" @@ -1207,12 +1208,18 @@ void rv_step(void *arg) jit_translate(rv, block); /* Only execute if translation succeeded (block is hot) */ if (block->hot) { + rv_log_debug("JIT: Executing block pc=0x%08x, offset=%u", + block->pc_start, block->offset); ((exec_block_func_t) state->buf)( rv, (uintptr_t) (state->buf + block->offset)); prev = NULL; continue; } /* Fall through to interpreter if translation failed */ + rv_log_debug( + "JIT: Translation failed for block pc=0x%08x, using " + "interpreter", + block->pc_start); } set_reset(&pc_set); has_loops = false; diff --git a/src/jit.c b/src/jit.c index 0f7cd77b..f5014b23 100644 --- a/src/jit.c +++ b/src/jit.c @@ -42,6 +42,7 @@ #include "decode.h" #include "io.h" #include "jit.h" +#include "log.h" #include "riscv.h" #include "riscv_private.h" #include "utils.h" @@ -593,6 +594,7 @@ static void update_branch_imm(struct jit_state *state, assert((imm & 3) == 0); uint32_t insn; imm >>= 2; + rv_log_debug("JIT: Patching branch at offset=%u, imm=%d", offset, imm << 2); #if defined(__APPLE__) && defined(__aarch64__) /* Must be in write mode to read/write MAP_JIT memory on Apple ARM64 */ pthread_jit_write_protect_np(false); @@ -2166,6 +2168,9 @@ void clear_hot(block_t *block) static void code_cache_flush(struct jit_state *state, riscv_t *rv) { + rv_log_debug( + "JIT: Flushing code cache (n_blocks=%d, n_jumps=%d, offset=%u)", + state->n_blocks, state->n_jumps, state->offset); should_flush = false; state->offset = state->org_size; state->n_blocks = 0; @@ -2199,6 +2204,7 @@ static void translate(struct jit_state *state, riscv_t *rv, block_t *block) static void resolve_jumps(struct jit_state *state) { + rv_log_debug("JIT: Resolving %d jumps", state->n_jumps); for (int i = 0; i < state->n_jumps; i++) { struct jump jump = state->jumps[i]; int target_loc; @@ -2221,6 +2227,10 @@ static void resolve_jumps(struct jit_state *state) (if (jump.target_satp == state->offset_map[i].satp), ) { target_loc = state->offset_map[i].offset; + rv_log_debug( + "JIT: Jump %d resolved to block pc=0x%08x, " + "offset=%d", + i, jump.target_pc, target_loc); break; } } @@ -2312,12 +2322,16 @@ void jit_translate(riscv_t *rv, block_t *block) ) { block->offset = state->offset_map[i].offset; block->hot = true; + rv_log_debug("JIT: Cache hit for block pc=0x%08x, offset=%u", + block->pc_start, block->offset); return; } } assert(NULL); __UNREACHABLE; } + rv_log_debug("JIT: Starting translation for block pc=0x%08x", + block->pc_start); restart: memset(state->jumps, 0, MAX_JUMPS * sizeof(struct jump)); state->n_jumps = 0; @@ -2327,11 +2341,16 @@ void jit_translate(riscv_t *rv, block_t *block) /* Mark block as not translated since translation was incomplete */ block->hot = false; /* Don't reset offset - it will be set correctly on restart */ + rv_log_debug("JIT: Translation triggered flush for block pc=0x%08x", + block->pc_start); code_cache_flush(state, rv); goto restart; } resolve_jumps(state); block->hot = true; + rv_log_debug( + "JIT: Translation completed for block pc=0x%08x, offset=%u, size=%u", + block->pc_start, block->offset, state->offset - block->offset); } struct jit_state *jit_state_init(size_t size) From 
0f13d064bb01bd2551898453df039d0756e8f7ba Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Mon, 6 Oct 2025 00:19:15 +0800 Subject: [PATCH 4/7] Change default log level from TRACE to INFO The diagnostic logging added in the previous commit uses DEBUG level, which was being displayed because the default log level was set to TRACE. This caused CI test failures as the excessive debug output interfered with test output validation. --- src/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.c b/src/main.c index 8dfcda32..aa55773d 100644 --- a/src/main.c +++ b/src/main.c @@ -304,7 +304,7 @@ int main(int argc, char **args) .args_offset_size = ARGS_OFFSET_SIZE, .argc = prog_argc, .argv = prog_args, - .log_level = LOG_TRACE, + .log_level = LOG_INFO, .run_flag = run_flag, .profile_output_file = prof_out_file, .cycle_per_step = CYCLE_PER_STEP, From 9b5844bd3b07f082487b0d3dc029feb2687d99f6 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Mon, 6 Oct 2025 01:21:06 +0800 Subject: [PATCH 5/7] Fix undefined behavior in JIT diagnostic logging Replace left shift of potentially negative value with multiplication to avoid undefined behavior detected by UBSan. In update_branch_imm(), the immediate value (imm) is right-shifted by 2 and can be negative. The diagnostic logging attempted to restore the original value using left shift (imm << 2), which is undefined behavior when imm is negative. --- src/jit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit.c b/src/jit.c index f5014b23..124d4ce7 100644 --- a/src/jit.c +++ b/src/jit.c @@ -594,7 +594,7 @@ static void update_branch_imm(struct jit_state *state, assert((imm & 3) == 0); uint32_t insn; imm >>= 2; - rv_log_debug("JIT: Patching branch at offset=%u, imm=%d", offset, imm << 2); + rv_log_debug("JIT: Patching branch at offset=%u, imm=%d", offset, imm * 4); #if defined(__APPLE__) && defined(__aarch64__) /* Must be in write mode to read/write MAP_JIT memory on Apple ARM64 */ pthread_jit_write_protect_np(false); From a5863f14bd93e2031c8d5c08416893a43b0d06da Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Mon, 6 Oct 2025 10:40:02 +0800 Subject: [PATCH 6/7] Improve JIT diagnostic logging for CI visibility This upgrades code_cache_flush logging from DEBUG to INFO level to make this critical event visible in CI test logs. --- src/jit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/jit.c b/src/jit.c index 124d4ce7..dd6bbc7f 100644 --- a/src/jit.c +++ b/src/jit.c @@ -2168,9 +2168,8 @@ void clear_hot(block_t *block) static void code_cache_flush(struct jit_state *state, riscv_t *rv) { - rv_log_debug( - "JIT: Flushing code cache (n_blocks=%d, n_jumps=%d, offset=%u)", - state->n_blocks, state->n_jumps, state->offset); + rv_log_info("JIT: Flushing code cache (n_blocks=%d, n_jumps=%d, offset=%u)", + state->n_blocks, state->n_jumps, state->offset); should_flush = false; state->offset = state->org_size; state->n_blocks = 0; From 0e4850e6a91805ece67ffc6b0231dcce86bdd2c1 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Tue, 7 Oct 2025 04:33:13 +0800 Subject: [PATCH 7/7] Fix JIT branch patching bugs on Apple Silicon This resolves critical bugs in update_branch_imm causing intermittent test failures (~13-20% failure rate) on macOS/Arm64: 1. MAP_JIT memory corruption: Reading MAP_JIT memory while in write mode returns corrupted data on Apple Silicon. Fixed by moving pthread_jit_write_protect_np(false) to after the read operation. 2. 
Branch offset bit accumulation: When branches are patched multiple times, old offset bits were not cleared before OR-ing new values. Added bit masking to clear old offsets before setting new ones. --- src/jit.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/jit.c b/src/jit.c index dd6bbc7f..bc333fc2 100644 --- a/src/jit.c +++ b/src/jit.c @@ -595,24 +595,27 @@ static void update_branch_imm(struct jit_state *state, uint32_t insn; imm >>= 2; rv_log_debug("JIT: Patching branch at offset=%u, imm=%d", offset, imm * 4); -#if defined(__APPLE__) && defined(__aarch64__) - /* Must be in write mode to read/write MAP_JIT memory on Apple ARM64 */ - pthread_jit_write_protect_np(false); -#endif + /* Read instruction while in execute mode (MAP_JIT requirement) */ memcpy(&insn, state->buf + offset, sizeof(uint32_t)); if ((insn & 0xfe000000U) == 0x54000000U /* Conditional branch immediate. */ || (insn & 0x7e000000U) == 0x34000000U) { /* Compare and branch immediate. */ assert((imm >> 19) == INT64_C(-1) || (imm >> 19) == 0); + insn &= ~(0x7ffffU << 5); /* Clear old offset bits */ insn |= (imm & 0x7ffff) << 5; } else if ((insn & 0x7c000000U) == 0x14000000U) { /* Unconditional branch immediate. */ assert((imm >> 26) == INT64_C(-1) || (imm >> 26) == 0); + insn &= ~0x03ffffffU; /* Clear old offset bits */ insn |= (imm & 0x03ffffffU) << 0; } else { assert(false); insn = BAD_OPCODE; } +#if defined(__APPLE__) && defined(__aarch64__) + /* Switch to write mode only for writing */ + pthread_jit_write_protect_np(false); +#endif memcpy(state->buf + offset, &insn, sizeof(uint32_t)); sys_icache_invalidate(state->buf + offset, sizeof(uint32_t)); #if defined(__APPLE__) && defined(__aarch64__)