diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c
index 5bdd386..68a292f 100644
--- a/src/core/bootstrap.c
+++ b/src/core/bootstrap.c
@@ -279,7 +279,11 @@ int guest_bootstrap_prepare(guest_t *g,
         log_error("failed to build page tables");
         return -1;
     }
-    g->need_tlbi = true;
+    /* No TLBI request here: the shim's _start does TLBI VMALLE1IS before
+     * enabling the MMU (src/core/shim.S), and the per-vCPU accumulator is
+     * the wrong place to stage a bring-up flush -- bootstrap may run on a
+     * thread whose slot is later consumed by an unrelated syscall.
+     */
     guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len,
                      LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0,
@@ -440,5 +444,14 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     log_debug("main thread registered with SP_EL1=0x%llx",
               (unsigned long long) el1_sp);
 
+    /* guest_build_page_tables and the bootstrap-time guest_invalidate_ptes
+     * calls (stack guard, null page, etc.) accumulate TLBI requests on this
+     * (the main) thread's cpu_tlbi_req TLS slot. The shim's _start does
+     * TLBI VMALLE1IS before enabling the MMU, so any TLB state was already
+     * dropped before guest code runs. Clear the accumulator so the first
+     * guest syscall does not redundantly broadcast on top of that.
+     */
+    tlbi_request_clear();
+
     return 0;
 }
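
The two properties the bootstrap comments lean on -- C11 _Thread_local storage is zero-initialized in every new thread, and all-zero encodes TLBI_NONE -- are easy to check in isolation. A minimal standalone sketch (struct re-declared locally; this is not part of the patch):

    #include <assert.h>
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Local re-declaration for the sketch; mirrors tlbi_request_t in guest.h. */
    typedef struct {
        uint8_t kind;   /* 0 == TLBI_NONE */
        uint16_t pages;
        uint64_t start;
    } tlbi_request_t;

    static _Thread_local tlbi_request_t cpu_tlbi_req;

    static void *worker(void *arg)
    {
        (void) arg;
        /* Fresh thread: TLS is zero-initialized, i.e. kind == TLBI_NONE. */
        assert(cpu_tlbi_req.kind == 0 && cpu_tlbi_req.pages == 0);
        cpu_tlbi_req.kind = 1; /* mutate our own slot only */
        return NULL;
    }

    int main(void)
    {
        cpu_tlbi_req.kind = 2; /* main thread's slot */
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        /* The worker's write did not touch main's slot. */
        assert(cpu_tlbi_req.kind == 2);
        puts("per-thread TLBI slots are independent and zero-initialized");
        return 0;
    }

Build with -pthread; the asserts pass because each thread gets its own zeroed copy of the slot.
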
diff --git a/src/core/guest.c b/src/core/guest.c
index e9af7f8..5e3d294 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -42,6 +42,11 @@
 #include "utils.h"
 #include "runtime/thread.h" /* thread_destroy_all_vcpus */
 
+/* Per-vCPU pending TLBI request. Zero-initialized in every host pthread
+ * by virtue of TLS default-zeroing, which maps to TLBI_NONE.
+ */
+_Thread_local tlbi_request_t cpu_tlbi_req;
+
 static void guest_region_clear(guest_t *g);
 
 /* Page table descriptor bits. */
@@ -901,7 +906,7 @@ void guest_reset(guest_t *g)
     g->mmap_rw_gap_hint = 0;
     g->mmap_rx_gap_hint = 0;
     g->ttbr0 = 0;
-    g->need_tlbi = false;
+    tlbi_request_clear();
     g->elf_load_min = ELF_DEFAULT_BASE;
 
     /* Clear semantic region tracking (will be re-populated after exec) */
@@ -1650,8 +1655,8 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n)
 /* Extend page tables to cover [start, end) with 2MiB block descriptors.
  * Walks the existing L0->L1 structure (from g->ttbr0) and allocates new
  * L2 tables as needed. This is safe to call while the vCPU is paused
- * (during HVC #5 handling). Sets g->need_tlbi so the shim flushes the
- * TLB before returning to EL0.
+ * (during HVC #5 handling). Records a TLBI request covering the new range
+ * so the shim flushes the matching TLB entries before returning to EL0.
  */
 int guest_extend_page_tables(guest_t *g,
                              uint64_t start,
@@ -1717,7 +1722,12 @@ int guest_extend_page_tables(guest_t *g,
         }
     }
 
-    g->need_tlbi = true;
+    /* Use the page-aligned bounds the loop actually covered. Extend grows
+     * the mapped range; existing VAs may carry negative TLB entries from
+     * prior translation faults at this address, so a flush is still needed.
+     * Large extends will exceed the selective cap and become broadcast.
+     */
+    tlbi_request_range(addr_start + base, addr_end + base);
     guest_pt_gen_bump(g);
     return 0;
 }
@@ -1822,7 +1832,6 @@ static int split_l2_block(guest_t *g, uint64_t *l2_entry)
         l3[i] = make_page_desc(block_ipa + (uint64_t) i * PAGE_SIZE,
                                old_perms);
 
     *l2_entry = (g->ipa_base + l3_gpa) | PT_VALID | PT_TABLE;
-    g->need_tlbi = true;
     return 0;
 }
@@ -1830,7 +1839,10 @@ int guest_split_block(guest_t *g, uint64_t block_gpa)
 {
     uint64_t block_start = ALIGN_2MIB_DOWN(block_gpa);
     uint64_t *l2_entry = find_l2_entry(g, block_start);
-    return split_l2_block(g, l2_entry);
+    int rc = split_l2_block(g, l2_entry);
+    if (rc < 0)
+        return rc;
+    return 0;
 }
 
 int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
@@ -1862,9 +1874,12 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
         if ((*l2_entry & 3) == 1) {
             /* 2MiB block descriptor */
             if (start <= block_start && end >= block_end) {
-                /* Invalidating the entire 2MiB block: clear the L2 entry */
+                /* Invalidating the entire 2MiB block: clear the L2 entry.
+                 * The 2 MiB range exceeds the selective cap and upgrades
+                 * to broadcast.
+                 */
                 *l2_entry = 0;
-                g->need_tlbi = true;
+                tlbi_request_range(base + block_start, base + block_end);
                 addr = block_end;
                 continue;
             }
@@ -1889,7 +1904,7 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
             l3[l3_idx] = 0; /* Invalid descriptor */
         }
 
-        g->need_tlbi = true;
+        tlbi_request_range(base + page_start, base + page_end);
         addr = page_end;
     }
 
@@ -1936,7 +1951,7 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
             if (old_perms != perms) {
                 uint64_t ipa = *l2_entry & L2_BLOCK_ADDR_MASK;
                 *l2_entry = make_block_desc(ipa, perms);
-                g->need_tlbi = true;
+                tlbi_request_range(base + block_start, base + block_end);
             }
             addr = block_end;
             continue;
@@ -1959,9 +1974,13 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
         uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL;
         uint64_t *l3 = pt_at(g, l3_ipa - base);
 
-        /* Update pages within this 2MiB block that fall in [start, end) */
+        /* Update pages within this 2MiB block that fall in [start, end).
+         * Track the smallest sub-range that actually changed so the TLBI
+         * request covers only descriptors whose value changed.
+         */
         uint64_t page_start = (addr > block_start) ? addr : block_start;
         uint64_t page_end = (end < block_end) ? end : block_end;
+        uint64_t changed_lo = UINT64_MAX, changed_hi = 0;
 
         for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) {
             unsigned l3_idx =
@@ -1981,10 +2000,18 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
                 page_ipa = l3[l3_idx] & 0xFFFFFFFFF000ULL;
             else
                 page_ipa = base + (pa & ~(PAGE_SIZE - 1));
-            l3[l3_idx] = make_page_desc(page_ipa, perms);
+            uint64_t new_desc = make_page_desc(page_ipa, perms);
+            if (l3[l3_idx] != new_desc) {
+                l3[l3_idx] = new_desc;
+                if (pa < changed_lo)
+                    changed_lo = pa;
+                if (pa + PAGE_SIZE > changed_hi)
+                    changed_hi = pa + PAGE_SIZE;
+            }
         }
 
-        g->need_tlbi = true;
+        if (changed_hi > changed_lo)
+            tlbi_request_range(base + changed_lo, base + changed_hi);
         addr = page_end;
     }
 
@@ -2079,6 +2106,8 @@ int guest_materialize_lazy(guest_t *g, uint64_t fault_offset)
     memset((uint8_t *) g->host_base + materialize_start, 0,
            materialize_end - materialize_start);
 
-    g->need_tlbi = true;
+    /* The page-table helpers above already requested the matching TLBI;
+     * no additional flush is needed here.
+     */
     return 0;
 }
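
The changed-range tracking in guest_update_perms uses the (UINT64_MAX, 0) sentinel pair: the bounds tighten only when a descriptor is actually rewritten, and changed_hi > changed_lo is false exactly when nothing changed. A toy restatement of the idiom with made-up descriptor values (not tree code):

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096ULL
    #define NPAGES 8

    /* Apply new_value to desc[], tracking the byte bounds of what changed. */
    static void update(uint64_t *desc, uint64_t new_value,
                       uint64_t *lo, uint64_t *hi)
    {
        *lo = UINT64_MAX;   /* "empty" sentinel pair: lo > hi */
        *hi = 0;
        for (unsigned i = 0; i < NPAGES; i++) {
            uint64_t pa = (uint64_t) i * PAGE_SIZE;
            if (desc[i] != new_value) {
                desc[i] = new_value;
                if (pa < *lo)
                    *lo = pa;
                if (pa + PAGE_SIZE > *hi)
                    *hi = pa + PAGE_SIZE;
            }
        }
    }

    int main(void)
    {
        /* Pages 3..5 differ from the target value; the rest already match. */
        uint64_t desc[NPAGES] = {7, 7, 7, 1, 1, 1, 7, 7};
        uint64_t lo, hi;

        update(desc, 7, &lo, &hi);
        assert(lo == 3 * PAGE_SIZE && hi == 6 * PAGE_SIZE); /* exact sub-range */

        update(desc, 7, &lo, &hi);     /* idempotent second pass */
        assert(!(hi > lo));            /* nothing changed -> no TLBI request */
        return 0;
    }
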
diff --git a/src/core/guest.h b/src/core/guest.h
index c087d89..e0ea521 100644
--- a/src/core/guest.h
+++ b/src/core/guest.h
@@ -174,6 +174,39 @@ typedef struct {
     char name[64]; /* Label: "[heap]", "[stack]", ELF path, etc. */
 } guest_region_t;
 
+/* TLB invalidation request kinds. After every page-table modification, the
+ * shim flushes the TLB on syscall return. The host accumulates the smallest
+ * sufficient request across the syscall and emits it via the X8/X9/X10
+ * register channel. Kind values are an internal enum independent of the
+ * X8 wire codes; the syscall epilogue does the mapping (src/syscall/syscall.c
+ * holds the canonical table):
+ *   TLBI_NONE      -> X8 = 0 (no TLB flush)
+ *   TLBI_BROADCAST -> X8 = 1 (TLBI VMALLE1IS, broadest)
+ *   TLBI_RANGE     -> X8 = 3, X9 = start VA, X10 = page count
+ *                     (TLBI VAE1IS loop preserves unrelated TLB entries)
+ * X8 = 2 is reserved for the execve drop-frame marker the shim handles
+ * separately; it is never produced by the accumulator.
+ */
+typedef enum {
+    TLBI_NONE = 0,
+    TLBI_BROADCAST = 1,
+    TLBI_RANGE = 2,
+} tlbi_kind_t;
+
+/* Cap selective TLBI at this many 4 KiB pages. Beyond this, fall back to
+ * TLBI_BROADCAST: each TLBI VAE1IS broadcasts to all cores, so for large
+ * ranges the per-instruction issue cost outweighs the benefit of preserving
+ * unrelated TLB entries. 16 pages == 64 KiB covers RELRO and other typical
+ * mprotect / munmap targets.
+ */
+#define TLBI_SELECTIVE_MAX_PAGES 16
+
+typedef struct {
+    uint8_t kind;   /* tlbi_kind_t */
+    uint16_t pages; /* Page count when kind == TLBI_RANGE (1..TLBI_SELECTIVE_MAX_PAGES) */
+    uint64_t start; /* Page-aligned VA when kind == TLBI_RANGE */
+} tlbi_request_t;
+
 /* Guest state. */
 typedef struct {
     void *host_base; /* Host pointer to allocated guest memory */
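
The cap in concrete numbers, restated as compile-time checks (assuming the 4 KiB granule used throughout): 16 pages is exactly the 64 KiB RELRO-sized target, while a single 2 MiB block is 512 pages and therefore always broadcasts. Standalone sketch, not patch content:

    #include <stdint.h>

    #define TLBI_SELECTIVE_MAX_PAGES 16
    #define PAGE_SIZE 4096ULL

    /* 16 pages x 4 KiB == 64 KiB: a typical RELRO mprotect fits the cap. */
    _Static_assert(TLBI_SELECTIVE_MAX_PAGES * PAGE_SIZE == 64 * 1024,
                   "selective cap covers 64 KiB");

    /* A full 2 MiB block is 512 pages -- far past the cap, so invalidating
     * a whole block always upgrades to a broadcast TLBI VMALLE1IS. */
    _Static_assert((2 * 1024 * 1024) / PAGE_SIZE == 512,
                   "2 MiB block is 512 pages");

    int main(void) { return 0; }
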
@@ -221,9 +254,8 @@ typedef struct {
      */
     uint64_t mmap_rw_gap_hint, mmap_rx_gap_hint;
 
-    uint64_t ttbr0;       /* TTBR0 value (IPA of L0 page table) */
-    bool need_tlbi;       /* Signal shim to flush TLB after page table changes */
-    hv_vcpu_t vcpu;       /* vCPU handle */
+    uint64_t ttbr0;       /* TTBR0 value (IPA of L0 page table) */
+    hv_vcpu_t vcpu;       /* vCPU handle */
     hv_vcpu_exit_t *exit; /* vCPU exit info */
     uint32_t ipa_bits;    /* IPA bits requested from HVF */
     /* Semantic region tracking for munmap/mprotect/proc-self-maps */
@@ -253,6 +285,107 @@ static inline void guest_pt_gen_bump(guest_t *g)
 {
     atomic_fetch_add_explicit(&g->pt_gen, 1, memory_order_release);
 }
 
+/* TLB invalidation request helpers.
+ *
+ * Per-vCPU TLS slot. Each guest thread (= one host pthread + one HVF vCPU)
+ * accumulates its own pending TLBI request as its syscall handlers mutate
+ * the page tables. The syscall epilogue (src/syscall/syscall.c) reads its
+ * own thread's slot, emits the X8/X9/X10 protocol, and clears it.
+ *
+ * Why per-vCPU and not a guest_t-global accumulator: a global slot lets one
+ * vCPU's syscall epilogue drain (and clear) another vCPU's pending request
+ * before that vCPU has eret'd back to EL0, allowing the second vCPU to use
+ * a stale TLB until the broadcast TLBI from the first vCPU's shim catches
+ * up. A per-vCPU slot makes each thread strictly responsible for issuing
+ * the TLBI for its own changes before its own eret. Page-table changes are
+ * still global (guest memory and page tables are shared), but TLBI VAE1IS
+ * and TLBI VMALLE1IS in the inner-shareable domain broadcast to all PEs,
+ * so one vCPU's own TLBI is sufficient to invalidate stale entries on its
+ * own PE before resuming guest code.
+ *
+ * No locking is needed for the slot itself; only the owning thread reads
+ * or writes it. Page-table updates remain serialized by mmap_lock.
+ *
+ * Cross-vCPU shootdown window: between vCPU A releasing mmap_lock at the
+ * end of an mprotect/munmap and the shim on A issuing the TLBI, sibling
+ * vCPU B may continue executing EL0 code that hits A's now-stale TLB
+ * entries. Real Linux closes this with cross-CPU IPI synchronization in
+ * the kernel; user-space emulation on Hypervisor.framework cannot inject
+ * a synchronous IPI into a sibling vCPU thread, so the window remains.
+ * The guest is responsible for serializing concurrent PT mutations
+ * against concurrent accesses (futex / pthread_mutex), which is the same
+ * contract real Linux requires of well-behaved multi-threaded code. See
+ * TODO.md "Bounded retry on stale TLB data abort" (P3 hardening) for the
+ * tracked follow-up if a workload ever surfaces an actual reproducer.
+ */
+extern _Thread_local tlbi_request_t cpu_tlbi_req;
+
+static inline void tlbi_request_clear(void)
+{
+    cpu_tlbi_req.kind = TLBI_NONE;
+    cpu_tlbi_req.pages = 0;
+    cpu_tlbi_req.start = 0;
+}
+
+static inline void tlbi_request_broadcast(void)
+{
+    cpu_tlbi_req.kind = TLBI_BROADCAST;
+}
+
+static inline void tlbi_request_range(uint64_t start, uint64_t end)
+{
+    if (cpu_tlbi_req.kind == TLBI_BROADCAST)
+        return;
+    if (end <= start)
+        return;
+    /* Page-align: TLBI VAE1IS operates on 4 KiB granules. ALIGN_UP can
+     * overflow if end is within PAGE_SIZE-1 of UINT64_MAX; saturate to
+     * broadcast in that pathological case rather than wrap to 0.
+     */
+    const uint64_t mask = 0xFFFULL;
+    if (end > UINT64_MAX - mask) {
+        tlbi_request_broadcast();
+        return;
+    }
+    uint64_t s = start & ~mask;
+    uint64_t e = (end + mask) & ~mask;
+    uint64_t n = (e - s) >> 12;
+    if (n > TLBI_SELECTIVE_MAX_PAGES) {
+        tlbi_request_broadcast();
+        return;
+    }
+    if (cpu_tlbi_req.kind == TLBI_NONE) {
+        cpu_tlbi_req.kind = TLBI_RANGE;
+        cpu_tlbi_req.start = s;
+        cpu_tlbi_req.pages = (uint16_t) n;
+        return;
+    }
+    /* TLBI_RANGE: coalesce by union. Disjoint ranges still produce a single
+     * bounding interval; if it stays within the cap, the per-page TLBI loop
+     * still wins over a full flush by preserving the rest of the TLB.
+     */
+    uint64_t es = cpu_tlbi_req.start;
+    uint64_t pe = (uint64_t) cpu_tlbi_req.pages * 4096ULL;
+    /* The accumulator only ever holds page counts <= TLBI_SELECTIVE_MAX_PAGES
+     * (see the cap check above), so es + pe never overflows on real callers,
+     * but be explicit.
+     */
+    if (es > UINT64_MAX - pe) {
+        tlbi_request_broadcast();
+        return;
+    }
+    uint64_t ee = es + pe;
+    uint64_t us = s < es ? s : es;
+    uint64_t ue = e > ee ? e : ee;
+    uint64_t un = (ue - us) >> 12;
+    if (un > TLBI_SELECTIVE_MAX_PAGES) {
+        tlbi_request_broadcast();
+        return;
+    }
+    cpu_tlbi_req.start = us;
+    cpu_tlbi_req.pages = (uint16_t) un;
+}
+
 /* Convert a guest offset (0-based) to an IPA/VA (ipa_base + offset) */
 static inline uint64_t guest_ipa(const guest_t *g, uint64_t offset)
 {
@@ -387,7 +520,8 @@ uint64_t guest_build_page_tables(guest_t *g,
 /* Extend page tables to cover a new address range [start, end) with 2MiB
  * block descriptors. Reuses the existing L0->L1 table structure and
- * allocates new L2 tables as needed. Sets g->need_tlbi = true.
+ * allocates new L2 tables as needed. Records a TLBI request covering the
+ * affected range (range or broadcast).
  * Returns 0 on success, -1 on failure.
  */
 int guest_extend_page_tables(guest_t *g,
@@ -399,7 +533,17 @@ int guest_extend_page_tables(guest_t *g,
  * block_gpa must be within a currently-mapped 2MiB block. The block's
  * permissions are inherited by all 512 page entries. If the block is
  * already split (L2 entry is a table descriptor), this is a no-op.
- * Sets g->need_tlbi = true. Returns 0 on success, -1 on failure.
+ *
+ * No TLBI request is issued: the split alone preserves every VA->PA
+ * translation in the block (each L3 page descriptor inherits the block's
+ * permissions). Every caller follows the split with guest_invalidate_ptes
+ * or guest_update_perms on the actually-changing range; that subsequent
+ * call records the TLBI, and TLBI VAE1IS for any VA in the block also
+ * invalidates the cached 2 MiB block entry covering that VA (ARM ARM
+ * B2.2.5.6), so a single per-page TLBI suffices to retire the stale
+ * block translation as soon as any affected page is accessed.
+ *
+ * Returns 0 on success, -1 on failure.
  */
 int guest_split_block(guest_t *g, uint64_t block_gpa);
@@ -409,7 +553,8 @@ int guest_split_block(guest_t *g, uint64_t block_gpa);
  * PROT_NONE; the correct behavior is for the guest to fault.
  * If a 2MiB block is only partially invalidated, the block is split
  * into L3 pages first (preserving the non-invalidated pages).
- * Sets g->need_tlbi = true. Returns 0 on success, -1 on failure.
+ * Records a TLBI request covering the invalidated range.
+ * Returns 0 on success, -1 on failure.
  */
 int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end);
@@ -418,7 +563,8 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end);
  * updated), the block is automatically split into 4KiB L3 pages first.
  * If the entire 2MiB block is being updated, the block descriptor is
  * modified in place without splitting.
- * perms is a MEM_PERM_R/W/X combination. Sets g->need_tlbi = true.
+ * perms is a MEM_PERM_R/W/X combination. Records a TLBI request only for
+ * pages whose descriptor actually changed.
  * Returns 0 on success, -1 on failure.
  */
 int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms);
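
Because the helpers are static inlines over plain TLS state, the coalescing rules can be unit-tested outside the tree. A standalone harness, not patch content (a condensed copy of the header logic; the es + pe overflow check is elided for brevity), pinning the behaviours the comments promise:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { TLBI_NONE = 0, TLBI_BROADCAST = 1, TLBI_RANGE = 2 };
    #define TLBI_SELECTIVE_MAX_PAGES 16

    typedef struct {
        uint8_t kind;
        uint16_t pages;
        uint64_t start;
    } tlbi_request_t;

    static _Thread_local tlbi_request_t cpu_tlbi_req;

    static void tlbi_request_clear(void)
    {
        cpu_tlbi_req.kind = TLBI_NONE;
        cpu_tlbi_req.pages = 0;
        cpu_tlbi_req.start = 0;
    }

    static void tlbi_request_broadcast(void)
    {
        cpu_tlbi_req.kind = TLBI_BROADCAST;
    }

    static void tlbi_request_range(uint64_t start, uint64_t end)
    {
        if (cpu_tlbi_req.kind == TLBI_BROADCAST || end <= start)
            return;
        const uint64_t mask = 0xFFFULL;
        if (end > UINT64_MAX - mask) {       /* ALIGN_UP would wrap */
            tlbi_request_broadcast();
            return;
        }
        uint64_t s = start & ~mask, e = (end + mask) & ~mask;
        uint64_t n = (e - s) >> 12;
        if (n > TLBI_SELECTIVE_MAX_PAGES) {
            tlbi_request_broadcast();
            return;
        }
        if (cpu_tlbi_req.kind == TLBI_NONE) {
            cpu_tlbi_req.kind = TLBI_RANGE;
            cpu_tlbi_req.start = s;
            cpu_tlbi_req.pages = (uint16_t) n;
            return;
        }
        uint64_t es = cpu_tlbi_req.start;
        uint64_t ee = es + (uint64_t) cpu_tlbi_req.pages * 4096ULL;
        uint64_t us = s < es ? s : es, ue = e > ee ? e : ee;
        uint64_t un = (ue - us) >> 12;
        if (un > TLBI_SELECTIVE_MAX_PAGES) {
            tlbi_request_broadcast();
            return;
        }
        cpu_tlbi_req.start = us;
        cpu_tlbi_req.pages = (uint16_t) un;
    }

    int main(void)
    {
        /* Unaligned 1-byte range rounds out to one 4 KiB page. */
        tlbi_request_clear();
        tlbi_request_range(0x10001, 0x10002);
        assert(cpu_tlbi_req.kind == TLBI_RANGE && cpu_tlbi_req.pages == 1);

        /* Disjoint second range coalesces to the bounding interval. */
        tlbi_request_range(0x14000, 0x15000);
        assert(cpu_tlbi_req.start == 0x10000 && cpu_tlbi_req.pages == 5);

        /* A union past the cap upgrades to broadcast... */
        tlbi_request_range(0x100000, 0x101000);
        assert(cpu_tlbi_req.kind == TLBI_BROADCAST);

        /* ...and broadcast is sticky: later small ranges are absorbed. */
        tlbi_request_range(0x10000, 0x11000);
        assert(cpu_tlbi_req.kind == TLBI_BROADCAST);

        /* end within a page of UINT64_MAX saturates instead of wrapping. */
        tlbi_request_clear();
        tlbi_request_range(UINT64_MAX - 0x800, UINT64_MAX);
        assert(cpu_tlbi_req.kind == TLBI_BROADCAST);

        puts("tlbi_request_range coalescing: all checks passed");
        return 0;
    }
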
+ */ + mrs x0, far_el1 + lsr x0, x0, #12 /* TLBI VAE1IS operand: VA[55:12] */ + tlbi vae1is, x0 + dsb ish + ic iallu dsb ish isb /* Fall through to restore_eret */ @@ -550,30 +565,68 @@ handle_svc_0: * 1. Read X8 (syscall nr) and X0-X5 (args) * 2. Execute the syscall * 3. Write result to X0 - * 4. Set X8 to 1 if page tables were modified (TLBI needed), - * or 2 if execve replaced the full register state + * 4. Set X8 to indicate the post-syscall request: + * 0 = no TLB flush + * 1 = broadcast TLBI VMALLE1IS + * 2 = execve replaced register state (drop frame + flush) + * 3 = selective TLBI VAE1IS over X10 pages starting at X9 * 5. Resume vCPU (execution continues below) */ hvc #5 - /* Execve rebuilt the guest register state. Drop the old saved frame, - * flush stale translations, and return without restoring old GPRs. + /* Dispatch on X8. Encoded so the common case (X8 == 0, no flush) hits + * the cbz fast path; the other branches sort by frequency thereafter. */ + cbz x8, 1f + cmp x8, #1 + b.eq tlbi_full + cmp x8, #3 + b.eq tlbi_selective cmp x8, #2 - b.eq 2f + b.eq exec_drop_frame + /* Unknown X8: be conservative, broadcast and continue. */ - /* If the host modified page tables (X8 != 0), flush the TLB and I-cache. - * Normal syscalls skip this path: read/write/open/stat hot loops do not - * modify executable mappings, and the full IC IALLU+barrier sequence is - * measurable in syscall-heavy workloads. JIT/signal/exec paths use the - * drop-frame marker above, which still performs the full flush. +tlbi_full: + /* Broadcast TLB + I-cache flush. Used for page-table edits whose + * affected range exceeds the selective cap, or any time the host could + * not bound the change. */ - cbz x8, 1f tlbi vmalle1is dsb ish ic iallu dsb ish isb + b 1f + +tlbi_selective: + /* Selective TLBI VAE1IS loop. + * x9 = page-aligned VA of the first page to invalidate + * x10 = page count (1..TLBI_SELECTIVE_MAX_PAGES, see core/guest.h) + * TLBI VAE1IS takes a Xt operand of (VA[55:12] | (ASID << 48)). The + * guest runs single-ASID at EL0, so just shift the VA right by 12. + * Issue all TLBI ops, then a single DSB ISH + IC IALLU + DSB + ISB + * matches broadcast semantics (preserves I-cache invalidation behaviour + * for callers like file-backed mmap of executable pages). + * + * Defensive: if x10 == 0, skip the loop. The per-vCPU host-side + * accumulator (cpu_tlbi_req in core/guest.h) never sets pages == 0 + * alongside kind == TLBI_RANGE, but if a future helper bug or a stray + * write ever produced the pair X8=3, X10=0, the subs x12, x12, #1 + * below would underflow to 0xFFFFFFFFFFFFFFFF and the b.ne would loop + * ~2^64 iterations, hanging this vCPU. Cheap guard. + */ + cbz x10, 1f + lsr x11, x9, #12 /* x11 = VA >> 12 (current page operand) */ + mov x12, x10 /* x12 = remaining page counter */ +3: tlbi vae1is, x11 + add x11, x11, #1 /* next page (operand is in 4 KiB units) */ + subs x12, x12, #1 + b.ne 3b + dsb ish + ic iallu + dsb ish + isb + b 1f 1: /* Restore all guest registers except X0, which now holds the syscall @@ -586,7 +639,8 @@ handle_svc_0: */ eret -2: tlbi vmalle1is +exec_drop_frame: + tlbi vmalle1is dsb ish isb ic iallu diff --git a/src/syscall/exec.c b/src/syscall/exec.c index ac0e9cc..9b2118f 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -771,10 +771,11 @@ int64_t sys_execve(hv_vcpu_t vcpu, /* Tell the shim that execve replaced the full guest register state. * X8=2 means: flush TLB, discard the old syscall frame, and return without - * restoring pre-exec registers. 
diff --git a/src/syscall/exec.c b/src/syscall/exec.c
index ac0e9cc..9b2118f 100644
--- a/src/syscall/exec.c
+++ b/src/syscall/exec.c
@@ -771,10 +771,11 @@ int64_t sys_execve(hv_vcpu_t vcpu,
     /* Tell the shim that execve replaced the full guest register state.
      * X8=2 means: flush TLB, discard the old syscall frame, and return without
-     * restoring pre-exec registers.
+     * restoring pre-exec registers. This bypasses the normal syscall epilogue,
+     * which would otherwise overwrite X8 from cpu_tlbi_req.
      */
     hv_vcpu_set_reg(vcpu, HV_REG_X8, 2);
-    g->need_tlbi = false;
+    tlbi_request_clear();
 
     /* Readback forces HVF to commit sysreg/GPR writes before the run loop
      * resumes the vCPU.
diff --git a/src/syscall/mem.c b/src/syscall/mem.c
index f1e3eb9..289a9d6 100644
--- a/src/syscall/mem.c
+++ b/src/syscall/mem.c
@@ -658,7 +658,6 @@ static int rollback_fresh_mmap_allocation(guest_t *g,
     hvf_remove_file_overlay(g, overlay_ipa, overlay_len);
     if (guest_invalidate_ptes(g, start, start + length) < 0)
         return -LINUX_ENOMEM;
-    g->need_tlbi = true;
     g->mmap_next = saved_mmap_next;
     g->mmap_end = saved_mmap_end;
     g->mmap_rx_next = saved_mmap_rx_next;
@@ -1409,7 +1408,6 @@ int64_t sys_mmap(guest_t *g,
              * page table entries, making the range fault on access.
              */
            guest_invalidate_ptes(g, result_off, result_off + length);
-            g->need_tlbi = true;
         }
     }
 
@@ -1511,7 +1509,6 @@ int64_t sys_mmap(guest_t *g,
      */
     if (is_prot_none && !is_fixed) {
         guest_invalidate_ptes(g, result_off, result_off + length);
-        g->need_tlbi = true;
     }
 
     if (!is_prot_none && !is_fixed && !is_noreserve) {
@@ -1585,7 +1582,6 @@ int64_t sys_mmap(guest_t *g,
      */
     if (is_noreserve && !is_fixed) {
         guest_invalidate_ptes(g, result_off, result_off + length);
-        g->need_tlbi = true;
     }
 
     /* For file-backed mmap, populate the region with file contents.
@@ -2010,7 +2006,6 @@ int64_t sys_mremap(guest_t *g,
         }
         dispose_region_snapshots(&source_snaps, &source_nsnaps);
         dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
-        g->need_tlbi = true;
         return (int64_t) guest_ipa(g, new_off);
     }
 
@@ -2079,7 +2074,6 @@ int64_t sys_mremap(guest_t *g,
         mark_overlay_metadata_range(g, old_off, old_off + old_size,
                                     old_overlay_start, old_overlay_end);
-        g->need_tlbi = true;
 
         /* Update high-water marks */
         uint64_t hwm = old_off + new_size;
@@ -2193,7 +2187,6 @@ int64_t sys_mremap(guest_t *g,
                                         source_overlay_end, track_backing_fd,
                                         source_overlay_file_off);
             guest_invalidate_ptes(g, new_off, new_off + new_size);
-            g->need_tlbi = true;
             if (track_backing_fd >= 0)
                 close(track_backing_fd);
             return copy_err;
@@ -2232,7 +2225,6 @@ int64_t sys_mremap(guest_t *g,
         g->mmap_next = hwm;
     }
 
-    g->need_tlbi = true;
     return (int64_t) guest_ipa(g, new_off);
 }
 
@@ -2434,7 +2426,6 @@ static int munmap_guest_range(guest_t *g, uint64_t unmap_off, uint64_t end)
      */
     if (guest_invalidate_ptes(g, unmap_off, end) < 0)
         return -LINUX_ENOMEM;
-    g->need_tlbi = true;
     for (int i = 0; i < g->nregions; i++) {
         guest_region_t *r = &g->regions[i];
         if (r->start >= end)
@@ -2581,7 +2572,6 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot)
             } else {
                 guest_invalidate_ptes(g, mprot_off, mprot_end);
             }
-            g->need_tlbi = true;
         }
     }
     return 0;
+ */ + tlbi_request_clear(); break; } diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 5af65cb..e8f31db 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -1831,18 +1831,34 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) /* Write result back to X0 */ hv_vcpu_set_reg(vcpu, HV_REG_X0, (uint64_t) result); - /* Signal the shim to flush TLB if page tables were modified. - * The shim checks X8 after HVC #5: non-zero triggers TLBI. - * Must explicitly write X8=0 when no TLBI is needed, because the - * shim sees X8's pre-syscall value (the syscall number, always - * non-zero) and would spuriously TLBI on every return. + /* Signal the shim to flush TLB if this vCPU modified page tables. + * Protocol after HVC #5 (X8 carries the request): + * 0 -> skip + * 1 -> broadcast TLBI VMALLE1IS + * 2 -> reserved for execve (set by sys_execve, never reached here) + * 3 -> selective TLBI VAE1IS over X10 pages starting at X9 + * Must explicitly write X8 because the shim reads its post-HVC value; + * the pre-syscall X8 is the syscall number (always non-zero) and would + * spuriously TLBI on every return. + * + * cpu_tlbi_req is a per-vCPU TLS slot, so this read needs no lock and + * cannot be drained or torn by another vCPU's epilogue. */ - if (g->need_tlbi) { + switch ((tlbi_kind_t) cpu_tlbi_req.kind) { + case TLBI_BROADCAST: hv_vcpu_set_reg(vcpu, HV_REG_X8, 1); - g->need_tlbi = false; - } else { + break; + case TLBI_RANGE: + hv_vcpu_set_reg(vcpu, HV_REG_X8, 3); + hv_vcpu_set_reg(vcpu, HV_REG_X9, cpu_tlbi_req.start); + hv_vcpu_set_reg(vcpu, HV_REG_X10, cpu_tlbi_req.pages); + break; + case TLBI_NONE: + default: hv_vcpu_set_reg(vcpu, HV_REG_X8, 0); + break; } + tlbi_request_clear(); } return should_exit;