diff --git a/Makefile b/Makefile
index b123215a..3e95adf2 100644
--- a/Makefile
+++ b/Makefile
@@ -73,6 +73,28 @@ endif
 ENABLE_ARCH_TEST ?= 0
 $(call set-feature, ARCH_TEST)
 
+# ThreadSanitizer support
+# TSAN x86-64 memory layout:
+#   Shadow: 0x02a000000000 - 0x7cefffffffff (reserved by TSAN)
+#   App:    0x7cf000000000 - 0x7ffffffff000 (usable by application)
+#
+# We use MAP_FIXED to allocate FULL4G's 4GB memory at a fixed address
+# (0x7d0000000000) within TSAN's app range, ensuring compatibility.
+#
+# IMPORTANT: TSAN requires ASLR (Address Space Layout Randomization) to be
+# disabled to prevent system allocations from landing in TSAN's shadow memory.
+# Tests are run with 'setarch $(uname -m) -R' to disable ASLR.
+ENABLE_TSAN ?= 0
+ifeq ("$(ENABLE_TSAN)", "1")
+override ENABLE_SDL := 0 # SDL (uninstrumented system lib) creates threads TSAN cannot track
+override ENABLE_LTO := 0 # LTO interferes with TSAN instrumentation
+CFLAGS += -DTSAN_ENABLED # Signal code to use TSAN-compatible allocations
+# Disable ASLR for TSAN tests to prevent allocations in TSAN shadow memory
+BIN_WRAPPER = setarch $(shell uname -m) -R
+else
+BIN_WRAPPER =
+endif
+
 # Enable link-time optimization (LTO)
 ENABLE_LTO ?= 1
 ifeq ($(call has, LTO), 1)
@@ -281,6 +303,12 @@
 CFLAGS += -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize-recover=all
 LDFLAGS += -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize-recover=all
 endif
 
+# ThreadSanitizer flags (ENABLE_TSAN is handled earlier to override SDL and LTO)
+ifeq ("$(ENABLE_TSAN)", "1")
+CFLAGS += -fsanitize=thread -g
+LDFLAGS += -fsanitize=thread
+endif
+
 $(OUT)/emulate.o: CFLAGS += -foptimize-sibling-calls -fomit-frame-pointer -fno-stack-check -fno-stack-protector
 
 # .DEFAULT_GOAL should be set to all since the very first target is not all
@@ -375,7 +403,7 @@ define check-test
 $(Q)true; \
 $(PRINTF) "Running $(3) ... 
"; \ OUTPUT_FILE="$$(mktemp)"; \ -if (LC_ALL=C $(BIN) $(1) $(2) > "$$OUTPUT_FILE") && \ +if (LC_ALL=C $(BIN_WRAPPER) $(BIN) $(1) $(2) > "$$OUTPUT_FILE") && \ [ "$$(cat "$$OUTPUT_FILE" | $(LOG_FILTER) | $(4))" = "$(5)" ]; then \ $(call notice, [OK]); \ else \ diff --git a/src/emulate.c b/src/emulate.c index 58585f8a..388c66e2 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -24,6 +24,7 @@ extern struct target_ops gdbstub_ops; #endif #include "decode.h" +#include "log.h" #include "mpool.h" #include "riscv.h" #include "riscv_private.h" @@ -283,6 +284,7 @@ static block_t *block_alloc(riscv_t *rv) block->hot2 = false; block->has_loops = false; block->n_invoke = 0; + block->func = NULL; INIT_LIST_HEAD(&block->list); #if RV32_HAS(T2C) block->compiled = false; @@ -1151,22 +1153,32 @@ void rv_step(void *arg) #if RV32_HAS(JIT) #if RV32_HAS(T2C) /* executed through the tier-2 JIT compiler */ - if (block->hot2) { + /* Use acquire semantics to ensure we see func write before using it */ + if (__atomic_load_n(&block->hot2, __ATOMIC_ACQUIRE)) { ((exec_t2c_func_t) block->func)(rv); prev = NULL; continue; } /* check if invoking times of t1 generated code exceed threshold */ - else if (!block->compiled && block->n_invoke >= THRESHOLD) { - block->compiled = true; + else if (!__atomic_load_n(&block->compiled, __ATOMIC_RELAXED) && + __atomic_load_n(&block->n_invoke, __ATOMIC_RELAXED) >= + THRESHOLD) { + __atomic_store_n(&block->compiled, true, __ATOMIC_RELAXED); queue_entry_t *entry = malloc(sizeof(queue_entry_t)); if (unlikely(!entry)) { /* Malloc failed - reset compiled flag to allow retry later */ - block->compiled = false; + __atomic_store_n(&block->compiled, false, __ATOMIC_RELAXED); continue; } - entry->block = block; + /* Store cache key instead of pointer to prevent use-after-free */ +#if RV32_HAS(SYSTEM) + entry->key = + (uint64_t) block->pc_start | ((uint64_t) block->satp << 32); +#else + entry->key = (uint64_t) block->pc_start; +#endif pthread_mutex_lock(&rv->wait_queue_lock); list_add(&entry->list, &rv->wait_queue); + pthread_cond_signal(&rv->wait_queue_cond); pthread_mutex_unlock(&rv->wait_queue_lock); } #endif @@ -1178,7 +1190,11 @@ void rv_step(void *arg) * entry in compiled binary buffer. */ if (block->hot) { +#if RV32_HAS(T2C) + __atomic_fetch_add(&block->n_invoke, 1, __ATOMIC_RELAXED); +#else block->n_invoke++; +#endif ((exec_block_func_t) state->buf)( rv, (uintptr_t) (state->buf + block->offset)); prev = NULL; @@ -1190,10 +1206,20 @@ void rv_step(void *arg) #endif ) { jit_translate(rv, block); - ((exec_block_func_t) state->buf)( - rv, (uintptr_t) (state->buf + block->offset)); - prev = NULL; - continue; + /* Only execute if translation succeeded (block is hot) */ + if (block->hot) { + rv_log_debug("JIT: Executing block pc=0x%08x, offset=%u", + block->pc_start, block->offset); + ((exec_block_func_t) state->buf)( + rv, (uintptr_t) (state->buf + block->offset)); + prev = NULL; + continue; + } + /* Fall through to interpreter if translation failed */ + rv_log_debug( + "JIT: Translation failed for block pc=0x%08x, using " + "interpreter", + block->pc_start); } set_reset(&pc_set); has_loops = false; diff --git a/src/io.c b/src/io.c index 4ff325d3..975013ee 100644 --- a/src/io.c +++ b/src/io.c @@ -27,12 +27,47 @@ memory_t *memory_new(uint32_t size) return NULL; assert(mem); #if HAVE_MMAP +#if defined(TSAN_ENABLED) + /* ThreadSanitizer compatibility: Use MAP_FIXED to allocate at a specific + * address to avoid conflicts with TSAN's shadow memory. 
+ */ +#if defined(__x86_64__) + /* x86_64: Allocate within TSAN's range (0x7cf000000000 - 0x7ffffffff000). + * + * Fixed address: 0x7d0000000000 + * Size: up to 4GB (0x100000000) + * End: 0x7d0100000000 (well within app range) + */ + void *fixed_addr = (void *) 0x7d0000000000UL; +#elif defined(__aarch64__) + /* ARM64 (macOS/Apple Silicon): Use higher address range. + * + * Fixed address: 0x150000000000 (21TB) + * Size: up to 4GB (0x100000000) + * End: 0x150100000000 + * + * This avoids TSAN's shadow memory and typical process allocations. + * Requires ASLR disabled via: setarch $(uname -m) -R + */ + void *fixed_addr = (void *) 0x150000000000UL; +#else +#error "TSAN is only supported on x86_64 and aarch64" +#endif + data_memory_base = mmap(fixed_addr, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (data_memory_base == MAP_FAILED) { + free(mem); + return NULL; + } +#else + /* Standard allocation without TSAN */ data_memory_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (data_memory_base == MAP_FAILED) { free(mem); return NULL; } +#endif #else data_memory_base = malloc(size); if (!data_memory_base) { diff --git a/src/jit.c b/src/jit.c index 158665d4..bc333fc2 100644 --- a/src/jit.c +++ b/src/jit.c @@ -42,6 +42,7 @@ #include "decode.h" #include "io.h" #include "jit.h" +#include "log.h" #include "riscv.h" #include "riscv_private.h" #include "utils.h" @@ -593,24 +594,30 @@ static void update_branch_imm(struct jit_state *state, assert((imm & 3) == 0); uint32_t insn; imm >>= 2; + rv_log_debug("JIT: Patching branch at offset=%u, imm=%d", offset, imm * 4); + /* Read instruction while in execute mode (MAP_JIT requirement) */ memcpy(&insn, state->buf + offset, sizeof(uint32_t)); if ((insn & 0xfe000000U) == 0x54000000U /* Conditional branch immediate. */ || (insn & 0x7e000000U) == 0x34000000U) { /* Compare and branch immediate. */ assert((imm >> 19) == INT64_C(-1) || (imm >> 19) == 0); + insn &= ~(0x7ffffU << 5); /* Clear old offset bits */ insn |= (imm & 0x7ffff) << 5; } else if ((insn & 0x7c000000U) == 0x14000000U) { /* Unconditional branch immediate. 
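            B and BL encode a signed 26-bit word offset (imm26), so the
            reachable range is +/-128 MiB; the assert below range-checks it.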
 */
         assert((imm >> 26) == INT64_C(-1) || (imm >> 26) == 0);
+        insn &= ~0x03ffffffU; /* Clear old offset bits */
         insn |= (imm & 0x03ffffffU) << 0;
     } else {
         assert(false);
         insn = BAD_OPCODE;
     }
 #if defined(__APPLE__) && defined(__aarch64__)
+    /* Switch to write mode only for writing */
     pthread_jit_write_protect_np(false);
 #endif
     memcpy(state->buf + offset, &insn, sizeof(uint32_t));
+    /* Flush the icache for the patched word. __builtin___clear_cache() is
+     * portable across Apple and Linux AArch64, unlike the Darwin-only
+     * sys_icache_invalidate(). */
+    __builtin___clear_cache((char *) (state->buf + offset),
+                            (char *) (state->buf + offset) + sizeof(uint32_t));
 #if defined(__APPLE__) && defined(__aarch64__)
     pthread_jit_write_protect_np(true);
 #endif
@@ -2164,9 +2171,12 @@ void clear_hot(block_t *block)
 
 static void code_cache_flush(struct jit_state *state, riscv_t *rv)
 {
+    rv_log_info("JIT: Flushing code cache (n_blocks=%d, n_jumps=%d, offset=%u)",
+                state->n_blocks, state->n_jumps, state->offset);
     should_flush = false;
     state->offset = state->org_size;
     state->n_blocks = 0;
+    state->n_jumps = 0; /* Reset jump count when flushing */
     set_reset(&state->set);
     clear_cache_hot(rv->block_cache, (clear_func_t) clear_hot);
 #if RV32_HAS(T2C)
@@ -2196,6 +2206,7 @@ static void translate(struct jit_state *state, riscv_t *rv, block_t *block)
 
 static void resolve_jumps(struct jit_state *state)
 {
+    rv_log_debug("JIT: Resolving %d jumps", state->n_jumps);
     for (int i = 0; i < state->n_jumps; i++) {
         struct jump jump = state->jumps[i];
         int target_loc;
@@ -2218,6 +2229,10 @@ static void resolve_jumps(struct jit_state *state)
                 (if (jump.target_satp == state->offset_map[i].satp), )
                 {
                     target_loc = state->offset_map[i].offset;
+                    rv_log_debug(
+                        "JIT: Jump %d resolved to block pc=0x%08x, "
+                        "offset=%d",
+                        i, jump.target_pc, target_loc);
                     break;
                 }
             }
@@ -2229,6 +2244,7 @@
 
         uint8_t *offset_ptr = &state->buf[jump.offset_loc];
         memcpy(offset_ptr, &rel, sizeof(uint32_t));
+        /* x86 keeps the instruction cache coherent with ordinary stores, so
+         * no explicit icache flush is needed on this path */
 #elif defined(__aarch64__)
         int32_t rel = target_loc - jump.offset_loc;
         update_branch_imm(state, jump.offset_loc, rel);
@@ -2308,23 +2324,35 @@ void jit_translate(riscv_t *rv, block_t *block)
             ) {
                 block->offset = state->offset_map[i].offset;
                 block->hot = true;
+                rv_log_debug("JIT: Cache hit for block pc=0x%08x, offset=%u",
+                             block->pc_start, block->offset);
                 return;
             }
         }
         assert(NULL);
         __UNREACHABLE;
     }
+    rv_log_debug("JIT: Starting translation for block pc=0x%08x",
+                 block->pc_start);
 restart:
     memset(state->jumps, 0, MAX_JUMPS * sizeof(struct jump));
     state->n_jumps = 0;
     block->offset = state->offset;
     translate_chained_block(state, rv, block);
     if (unlikely(should_flush)) {
+        /* Mark block as not translated since translation was incomplete */
+        block->hot = false;
+        /* Don't reset offset - it will be set correctly on restart */
+        rv_log_debug("JIT: Translation triggered flush for block pc=0x%08x",
+                     block->pc_start);
         code_cache_flush(state, rv);
         goto restart;
     }
     resolve_jumps(state);
     block->hot = true;
+    rv_log_debug(
+        "JIT: Translation completed for block pc=0x%08x, offset=%u, size=%u",
+        block->pc_start, block->offset, state->offset - block->offset);
 }
 
 struct jit_state *jit_state_init(size_t size)
@@ -2336,6 +2364,52 @@
     state->offset = 0;
     state->size = size;
 
+#if defined(TSAN_ENABLED)
+    /* ThreadSanitizer compatibility: Allocate JIT code buffer at a fixed
+     * address above the main memory region to avoid conflicts. 
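+     *
+     * Unlike the data region in io.c, this buffer must also be executable
+     * (PROT_EXEC), and on Apple Silicon it needs MAP_JIT together with
+     * pthread_jit_write_protect_np() toggling around every code write.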
+ */ +#if defined(__x86_64__) + /* x86_64 memory layout: + * Main memory: 0x7d0000000000 - 0x7d0100000000 (4GB for FULL4G) + * JIT buffer: 0x7d1000000000 + size + * + * This keeps both allocations in TSAN's app range (0x7cf000000000 - + * 0x7ffffffff000) and prevents overlap with main memory or TSAN shadow. + */ + void *jit_addr = (void *) 0x7d1000000000UL; + state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); +#elif defined(__aarch64__) + /* ARM64 memory layout (macOS/Apple Silicon): + * Main memory: 0x150000000000 - 0x150100000000 (4GB for FULL4G) + * JIT buffer: 0x151000000000 + size + * + * Apple Silicon requires MAP_JIT for executable memory. The fixed + * address is chosen to avoid TSAN's shadow memory and typical process + * allocations. Requires ASLR disabled via: setarch $(uname -m) -R + */ + void *jit_addr = (void *) 0x151000000000UL; + state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); +#else +#error "TSAN is only supported on x86_64 and aarch64" +#endif + if (state->buf == MAP_FAILED) { + free(state); + return NULL; + } +#else + /* Standard allocation without TSAN */ state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS #if defined(__APPLE__) @@ -2347,8 +2421,7 @@ struct jit_state *jit_state_init(size_t size) free(state); return NULL; } - assert(state->buf != MAP_FAILED); - +#endif state->n_blocks = 0; set_reset(&state->set); reset_reg(); diff --git a/src/main.c b/src/main.c index 8e079a50..aa55773d 100644 --- a/src/main.c +++ b/src/main.c @@ -19,6 +19,28 @@ #include "riscv.h" #include "utils.h" +/* ThreadSanitizer configuration for FULL4G compatibility + * + * We use MAP_FIXED to allocate emulated memory at 0x7d0000000000, which is + * within TSAN's application memory range (0x7cf000000000 - 0x7ffffffff000). + * This avoids conflicts with TSAN's shadow memory and allows race detection + * to work with FULL4G's 4GB address space. + * + * Configuration optimizes for race detection with minimal overhead. 
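+ *
+ * These are defaults only: the TSAN runtime consults this hook at startup,
+ * and flags passed via the TSAN_OPTIONS environment variable still take
+ * precedence over the string returned here.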
+ */
+/* GCC defines __SANITIZE_THREAD__ under -fsanitize=thread; Clang does not,
+ * so also honor the -DTSAN_ENABLED flag set by the Makefile. */
+#if defined(__SANITIZE_THREAD__) || defined(TSAN_ENABLED)
+const char *__tsan_default_options(void)
+{
+    return "halt_on_error=0"          /* Continue after errors */
+           ":report_bugs=1"           /* Report data races */
+           ":second_deadlock_stack=1" /* Full deadlock info */
+           ":verbosity=0"             /* Reduce noise */
+           ":memory_limit_mb=0"       /* No memory limit */
+           ":history_size=7"          /* Larger race detection window */
+           ":io_sync=0";              /* Don't sync on I/O */
+}
+#endif
+
 /* enable program trace mode */
 #if !RV32_HAS(SYSTEM) || (RV32_HAS(SYSTEM) && RV32_HAS(ELF_LOADER))
 static bool opt_trace = false;
@@ -282,7 +304,7 @@ int main(int argc, char **args)
         .args_offset_size = ARGS_OFFSET_SIZE,
         .argc = prog_argc,
         .argv = prog_args,
-        .log_level = LOG_TRACE,
+        .log_level = LOG_INFO,
         .run_flag = run_flag,
         .profile_output_file = prof_out_file,
         .cycle_per_step = CYCLE_PER_STEP,
diff --git a/src/riscv.c b/src/riscv.c
index dcaaa94a..47434c98 100644
--- a/src/riscv.c
+++ b/src/riscv.c
@@ -206,19 +206,41 @@ static pthread_t t2c_thread;
 static void *t2c_runloop(void *arg)
 {
     riscv_t *rv = (riscv_t *) arg;
+    pthread_mutex_lock(&rv->wait_queue_lock);
     while (!rv->quit) {
-        if (!list_empty(&rv->wait_queue)) {
-            queue_entry_t *entry =
-                list_last_entry(&rv->wait_queue, queue_entry_t, list);
-            pthread_mutex_lock(&rv->wait_queue_lock);
-            list_del_init(&entry->list);
-            pthread_mutex_unlock(&rv->wait_queue_lock);
-            pthread_mutex_lock(&rv->cache_lock);
-            t2c_compile(rv, entry->block);
-            pthread_mutex_unlock(&rv->cache_lock);
-            free(entry);
-        }
+        /* Wait for work or quit signal */
+        while (list_empty(&rv->wait_queue) && !rv->quit)
+            pthread_cond_wait(&rv->wait_queue_cond, &rv->wait_queue_lock);
+
+        if (rv->quit)
+            break;
+
+        /* Extract work item while holding the lock */
+        queue_entry_t *entry =
+            list_last_entry(&rv->wait_queue, queue_entry_t, list);
+        list_del_init(&entry->list);
+        pthread_mutex_unlock(&rv->wait_queue_lock);
+
+        /* Perform compilation with cache lock */
+        pthread_mutex_lock(&rv->cache_lock);
+        /* Look up block from cache using the key (might have been evicted) */
+        uint32_t pc = (uint32_t) entry->key;
+        block_t *block = (block_t *) cache_get(rv->block_cache, pc, false);
+#if RV32_HAS(SYSTEM)
+        /* Verify SATP matches (for system mode) */
+        uint32_t satp = (uint32_t) (entry->key >> 32);
+        if (block && block->satp != satp)
+            block = NULL;
+#endif
+        /* Compile only if block still exists in cache */
+        if (block)
+            t2c_compile(rv, block);
+        pthread_mutex_unlock(&rv->cache_lock);
+        free(entry);
+
+        pthread_mutex_lock(&rv->wait_queue_lock);
     }
+    pthread_mutex_unlock(&rv->wait_queue_lock);
     return NULL;
 }
 #endif
@@ -745,6 +767,7 @@ riscv_t *rv_create(riscv_user_t rv_attr)
     /* prepare wait queue. */
     pthread_mutex_init(&rv->wait_queue_lock, NULL);
     pthread_mutex_init(&rv->cache_lock, NULL);
+    pthread_cond_init(&rv->wait_queue_cond, NULL);
     INIT_LIST_HEAD(&rv->wait_queue);
     /* activate the background compilation thread. 
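      * The runloop above blocks on wait_queue_cond: rv_step() signals it
      * after queueing a newly hot block, and rv_delete() signals it with
      * rv->quit set so the worker wakes, exits, and can be joined.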
*/ pthread_create(&t2c_thread, NULL, t2c_runloop, rv); @@ -866,10 +889,24 @@ void rv_delete(riscv_t *rv) block_map_destroy(rv); #else #if RV32_HAS(T2C) + /* Signal the thread to quit */ + pthread_mutex_lock(&rv->wait_queue_lock); rv->quit = true; + pthread_cond_signal(&rv->wait_queue_cond); + pthread_mutex_unlock(&rv->wait_queue_lock); + pthread_join(t2c_thread, NULL); + + /* Clean up any remaining entries in wait queue */ + queue_entry_t *entry, *safe; + list_for_each_entry_safe (entry, safe, &rv->wait_queue, list) { + list_del(&entry->list); + free(entry); + } + pthread_mutex_destroy(&rv->wait_queue_lock); pthread_mutex_destroy(&rv->cache_lock); + pthread_cond_destroy(&rv->wait_queue_cond); jit_cache_exit(rv->jit_cache); #endif jit_state_exit(rv->jit_state); diff --git a/src/riscv_private.h b/src/riscv_private.h index ace3ca90..55879432 100644 --- a/src/riscv_private.h +++ b/src/riscv_private.h @@ -105,7 +105,7 @@ typedef struct block { #if RV32_HAS(JIT) && RV32_HAS(T2C) typedef struct { - block_t *block; + uint64_t key; /**< cache key (PC or PC|SATP) to look up block */ struct list_head list; } queue_entry_t; #endif @@ -197,6 +197,7 @@ struct riscv_internal { #if RV32_HAS(T2C) struct list_head wait_queue; pthread_mutex_t wait_queue_lock, cache_lock; + pthread_cond_t wait_queue_cond; volatile bool quit; /**< Determine the main thread is terminated or not */ #endif void *jit_state; diff --git a/src/t2c.c b/src/t2c.c index 343b85e6..2115adaf 100644 --- a/src/t2c.c +++ b/src/t2c.c @@ -346,7 +346,9 @@ void t2c_compile(riscv_t *rv, block_t *block) jit_cache_update(rv->jit_cache, key, block->func); - block->hot2 = true; + /* Use release semantics to ensure func write is visible before hot2 is set + */ + __atomic_store_n(&block->hot2, true, __ATOMIC_RELEASE); } struct jit_cache *jit_cache_init()
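
Note on the ordering scheme above: t2c_compile() publishes block->func and
only then sets hot2 with a release store, while rv_step() acquire-loads hot2
before calling through func. A minimal sketch of the pairing, condensed from
the t2c.c and emulate.c hunks above (compiled_entry stands in for whatever
pointer T2C compilation produces):

    /* T2C thread: write func first, then publish it with a release store */
    block->func = compiled_entry;
    __atomic_store_n(&block->hot2, true, __ATOMIC_RELEASE);

    /* interpreter thread: acquire-load hot2 before dereferencing func */
    if (__atomic_load_n(&block->hot2, __ATOMIC_ACQUIRE))
        ((exec_t2c_func_t) block->func)(rv);

The release/acquire pair guarantees that any thread observing hot2 == true
also observes the fully written func pointer, which is precisely the race on
block->func that TSAN would otherwise flag.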