diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c
index 5bdd386..68a292f 100644
--- a/src/core/bootstrap.c
+++ b/src/core/bootstrap.c
@@ -279,7 +279,11 @@ int guest_bootstrap_prepare(guest_t *g,
         log_error("failed to build page tables");
         return -1;
     }
-    g->need_tlbi = true;
+    /* No TLBI request here: the shim's _start does TLBI VMALLE1IS before
+     * enabling the MMU (src/core/shim.S), and the per-vCPU accumulator is
+     * the wrong place to stage a bring-up flush -- bootstrap may run on a
+     * thread whose slot is later consumed by an unrelated syscall.
+     */
     guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len,
                      LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0,
@@ -440,5 +444,14 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     log_debug("main thread registered with SP_EL1=0x%llx",
               (unsigned long long) el1_sp);
 
+    /* guest_build_page_tables and the bootstrap-time guest_invalidate_ptes
+     * calls (stack guard, null page, etc.) accumulate TLBI requests on this
+     * (the main) thread's cpu_tlbi_req TLS slot. The shim's _start does
+     * TLBI VMALLE1IS before enabling the MMU, so any TLB state was already
+     * dropped before guest code runs. Clear the accumulator so the first
+     * guest syscall does not redundantly broadcast on top of that.
+     */
+    tlbi_request_clear();
+
     return 0;
 }
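
The two properties the bootstrap comments lean on -- C11 _Thread_local storage is zero-initialized in every new thread, and all-zero encodes TLBI_NONE -- are easy to check in isolation. A minimal standalone sketch (struct re-declared locally; this is not part of the patch):

    #include <assert.h>
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Local re-declaration for the sketch; mirrors tlbi_request_t in guest.h. */
    typedef struct {
        uint8_t kind;   /* 0 == TLBI_NONE */
        uint16_t pages;
        uint64_t start;
    } tlbi_request_t;

    static _Thread_local tlbi_request_t cpu_tlbi_req;

    static void *worker(void *arg)
    {
        (void) arg;
        /* Fresh thread: TLS is zero-initialized, i.e. kind == TLBI_NONE. */
        assert(cpu_tlbi_req.kind == 0 && cpu_tlbi_req.pages == 0);
        cpu_tlbi_req.kind = 1; /* mutate our own slot only */
        return NULL;
    }

    int main(void)
    {
        cpu_tlbi_req.kind = 2; /* main thread's slot */
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        /* The worker's write did not touch main's slot. */
        assert(cpu_tlbi_req.kind == 2);
        puts("per-thread TLBI slots are independent and zero-initialized");
        return 0;
    }

Build with -pthread; the asserts pass because each thread gets its own zeroed copy of the slot.
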
diff --git a/src/core/guest.c b/src/core/guest.c
index e9af7f8..5e3d294 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -42,6 +42,11 @@
 #include "utils.h"
 #include "runtime/thread.h" /* thread_destroy_all_vcpus */
 
+/* Per-vCPU pending TLBI request. Zero-initialized in every host pthread
+ * by virtue of TLS default-zeroing, which maps to TLBI_NONE.
+ */
+_Thread_local tlbi_request_t cpu_tlbi_req;
+
 static void guest_region_clear(guest_t *g);
 
 /* Page table descriptor bits. */
@@ -901,7 +906,7 @@ void guest_reset(guest_t *g)
     g->mmap_rw_gap_hint = 0;
     g->mmap_rx_gap_hint = 0;
     g->ttbr0 = 0;
-    g->need_tlbi = false;
+    tlbi_request_clear();
     g->elf_load_min = ELF_DEFAULT_BASE;
 
     /* Clear semantic region tracking (will be re-populated after exec) */
@@ -1650,8 +1655,8 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n)
 /* Extend page tables to cover [start, end) with 2MiB block descriptors.
  * Walks the existing L0->L1 structure (from g->ttbr0) and allocates new
  * L2 tables as needed. This is safe to call while the vCPU is paused
- * (during HVC #5 handling). Sets g->need_tlbi so the shim flushes the
- * TLB before returning to EL0.
+ * (during HVC #5 handling). Records a TLBI request covering the new range
+ * so the shim flushes the matching TLB entries before returning to EL0.
  */
 int guest_extend_page_tables(guest_t *g,
                              uint64_t start,
@@ -1717,7 +1722,12 @@ int guest_extend_page_tables(guest_t *g,
         }
     }
 
-    g->need_tlbi = true;
+    /* Use the page-aligned bounds the loop actually covered. Extend grows
+     * the mapped range; existing VAs may carry negative TLB entries from
+     * prior translation faults at this address, so a flush is still needed.
+     * Large extends will exceed the selective cap and become broadcast.
+     */
+    tlbi_request_range(addr_start + base, addr_end + base);
     guest_pt_gen_bump(g);
     return 0;
 }
@@ -1822,7 +1832,6 @@ static int split_l2_block(guest_t *g, uint64_t *l2_entry)
         l3[i] = make_page_desc(block_ipa + (uint64_t) i * PAGE_SIZE,
                                old_perms);
 
     *l2_entry = (g->ipa_base + l3_gpa) | PT_VALID | PT_TABLE;
-    g->need_tlbi = true;
     return 0;
 }
@@ -1830,7 +1839,10 @@ int guest_split_block(guest_t *g, uint64_t block_gpa)
 {
     uint64_t block_start = ALIGN_2MIB_DOWN(block_gpa);
     uint64_t *l2_entry = find_l2_entry(g, block_start);
-    return split_l2_block(g, l2_entry);
+    int rc = split_l2_block(g, l2_entry);
+    if (rc < 0)
+        return rc;
+    return 0;
 }
 
 int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
@@ -1862,9 +1874,12 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
         if ((*l2_entry & 3) == 1) {
             /* 2MiB block descriptor */
             if (start <= block_start && end >= block_end) {
-                /* Invalidating the entire 2MiB block: clear the L2 entry */
+                /* Invalidating the entire 2MiB block: clear the L2 entry.
+                 * The 2 MiB range exceeds the selective cap and upgrades
+                 * to broadcast.
+                 */
                 *l2_entry = 0;
-                g->need_tlbi = true;
+                tlbi_request_range(base + block_start, base + block_end);
                 addr = block_end;
                 continue;
             }
@@ -1889,7 +1904,7 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
             l3[l3_idx] = 0; /* Invalid descriptor */
         }
 
-        g->need_tlbi = true;
+        tlbi_request_range(base + page_start, base + page_end);
         addr = page_end;
     }
 
@@ -1936,7 +1951,7 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
             if (old_perms != perms) {
                 uint64_t ipa = *l2_entry & L2_BLOCK_ADDR_MASK;
                 *l2_entry = make_block_desc(ipa, perms);
-                g->need_tlbi = true;
+                tlbi_request_range(base + block_start, base + block_end);
             }
             addr = block_end;
             continue;
@@ -1959,9 +1974,13 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
         uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL;
         uint64_t *l3 = pt_at(g, l3_ipa - base);
 
-        /* Update pages within this 2MiB block that fall in [start, end) */
+        /* Update pages within this 2MiB block that fall in [start, end).
+         * Track the smallest sub-range that actually changed so the TLBI
+         * request covers only descriptors whose value changed.
+         */
         uint64_t page_start = (addr > block_start) ? addr : block_start;
         uint64_t page_end = (end < block_end) ? end : block_end;
+        uint64_t changed_lo = UINT64_MAX, changed_hi = 0;
 
         for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) {
             unsigned l3_idx =
@@ -1981,10 +2000,18 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
                 page_ipa = l3[l3_idx] & 0xFFFFFFFFF000ULL;
             else
                 page_ipa = base + (pa & ~(PAGE_SIZE - 1));
-            l3[l3_idx] = make_page_desc(page_ipa, perms);
+            uint64_t new_desc = make_page_desc(page_ipa, perms);
+            if (l3[l3_idx] != new_desc) {
+                l3[l3_idx] = new_desc;
+                if (pa < changed_lo)
+                    changed_lo = pa;
+                if (pa + PAGE_SIZE > changed_hi)
+                    changed_hi = pa + PAGE_SIZE;
+            }
         }
 
-        g->need_tlbi = true;
+        if (changed_hi > changed_lo)
+            tlbi_request_range(base + changed_lo, base + changed_hi);
         addr = page_end;
     }
 
@@ -2079,6 +2106,8 @@ int guest_materialize_lazy(guest_t *g, uint64_t fault_offset)
     memset((uint8_t *) g->host_base + materialize_start, 0,
            materialize_end - materialize_start);
 
-    g->need_tlbi = true;
+    /* The page-table helpers above already requested the matching TLBI;
+     * no additional flush is needed here.
+     */
     return 0;
 }
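
The changed-range tracking in guest_update_perms uses the (UINT64_MAX, 0) sentinel pair: the bounds tighten only when a descriptor is actually rewritten, and changed_hi > changed_lo is false exactly when nothing changed. A toy restatement of the idiom with made-up descriptor values (not tree code):

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096ULL
    #define NPAGES 8

    /* Apply new_value to desc[], tracking the byte bounds of what changed. */
    static void update(uint64_t *desc, uint64_t new_value,
                       uint64_t *lo, uint64_t *hi)
    {
        *lo = UINT64_MAX;   /* "empty" sentinel pair: lo > hi */
        *hi = 0;
        for (unsigned i = 0; i < NPAGES; i++) {
            uint64_t pa = (uint64_t) i * PAGE_SIZE;
            if (desc[i] != new_value) {
                desc[i] = new_value;
                if (pa < *lo)
                    *lo = pa;
                if (pa + PAGE_SIZE > *hi)
                    *hi = pa + PAGE_SIZE;
            }
        }
    }

    int main(void)
    {
        /* Pages 3..5 differ from the target value; the rest already match. */
        uint64_t desc[NPAGES] = {7, 7, 7, 1, 1, 1, 7, 7};
        uint64_t lo, hi;

        update(desc, 7, &lo, &hi);
        assert(lo == 3 * PAGE_SIZE && hi == 6 * PAGE_SIZE); /* exact sub-range */

        update(desc, 7, &lo, &hi);     /* idempotent second pass */
        assert(!(hi > lo));            /* nothing changed -> no TLBI request */
        return 0;
    }
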
diff --git a/src/core/guest.h b/src/core/guest.h
index c087d89..e0ea521 100644
--- a/src/core/guest.h
+++ b/src/core/guest.h
@@ -174,6 +174,39 @@ typedef struct {
     char name[64]; /* Label: "[heap]", "[stack]", ELF path, etc. */
 } guest_region_t;
 
+/* TLB invalidation request kinds. After every page-table modification, the
+ * shim flushes the TLB on syscall return. The host accumulates the smallest
+ * sufficient request across the syscall and emits it via the X8/X9/X10
+ * register channel. Kind values are an internal enum independent of the
+ * X8 wire codes; the syscall epilogue does the mapping (src/syscall/syscall.c
+ * holds the canonical table):
+ *   TLBI_NONE      -> X8 = 0 (no TLB flush)
+ *   TLBI_BROADCAST -> X8 = 1 (TLBI VMALLE1IS, broadest)
+ *   TLBI_RANGE     -> X8 = 3, X9 = start VA, X10 = page count
+ *                     (TLBI VAE1IS loop preserves unrelated TLB entries)
+ * X8 = 2 is reserved for the execve drop-frame marker the shim handles
+ * separately; it is never produced by the accumulator.
+ */
+typedef enum {
+    TLBI_NONE = 0,
+    TLBI_BROADCAST = 1,
+    TLBI_RANGE = 2,
+} tlbi_kind_t;
+
+/* Cap selective TLBI at this many 4 KiB pages. Beyond this, fall back to
+ * TLBI_BROADCAST: each TLBI VAE1IS broadcasts to all cores, so for large
+ * ranges the per-instruction issue cost outweighs the benefit of preserving
+ * unrelated TLB entries. 16 pages == 64 KiB covers RELRO and other typical
+ * mprotect / munmap targets.
+ */
+#define TLBI_SELECTIVE_MAX_PAGES 16
+
+typedef struct {
+    uint8_t kind;   /* tlbi_kind_t */
+    uint16_t pages; /* Page count when kind == TLBI_RANGE (1..TLBI_SELECTIVE_MAX_PAGES) */
+    uint64_t start; /* Page-aligned VA when kind == TLBI_RANGE */
+} tlbi_request_t;
+
 /* Guest state. */
 typedef struct {
     void *host_base; /* Host pointer to allocated guest memory */
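
The cap in concrete numbers, restated as compile-time checks (assuming the 4 KiB granule used throughout): 16 pages is exactly the 64 KiB RELRO-sized target, while a single 2 MiB block is 512 pages and therefore always broadcasts. Standalone sketch, not patch content:

    #include <stdint.h>

    #define TLBI_SELECTIVE_MAX_PAGES 16
    #define PAGE_SIZE 4096ULL

    /* 16 pages x 4 KiB == 64 KiB: a typical RELRO mprotect fits the cap. */
    _Static_assert(TLBI_SELECTIVE_MAX_PAGES * PAGE_SIZE == 64 * 1024,
                   "selective cap covers 64 KiB");

    /* A full 2 MiB block is 512 pages -- far past the cap, so invalidating
     * a whole block always upgrades to a broadcast TLBI VMALLE1IS. */
    _Static_assert((2 * 1024 * 1024) / PAGE_SIZE == 512,
                   "2 MiB block is 512 pages");

    int main(void) { return 0; }
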
@@ -221,9 +254,8 @@ typedef struct {
      */
     uint64_t mmap_rw_gap_hint, mmap_rx_gap_hint;
 
-    uint64_t ttbr0;       /* TTBR0 value (IPA of L0 page table) */
-    bool need_tlbi;       /* Signal shim to flush TLB after page table changes */
-    hv_vcpu_t vcpu;       /* vCPU handle */
+    uint64_t ttbr0;       /* TTBR0 value (IPA of L0 page table) */
+    hv_vcpu_t vcpu;       /* vCPU handle */
     hv_vcpu_exit_t *exit; /* vCPU exit info */
     uint32_t ipa_bits;    /* IPA bits requested from HVF */
     /* Semantic region tracking for munmap/mprotect/proc-self-maps */
@@ -253,6 +285,107 @@ static inline void guest_pt_gen_bump(guest_t *g)
 {
     atomic_fetch_add_explicit(&g->pt_gen, 1, memory_order_release);
 }
 
+/* TLB invalidation request helpers.
+ *
+ * Per-vCPU TLS slot. Each guest thread (= one host pthread + one HVF vCPU)
+ * accumulates its own pending TLBI request as its syscall handlers mutate
+ * the page tables. The syscall epilogue (src/syscall/syscall.c) reads its
+ * own thread's slot, emits the X8/X9/X10 protocol, and clears it.
+ *
+ * Why per-vCPU and not a guest_t-global accumulator: a global slot lets one
+ * vCPU's syscall epilogue drain (and clear) another vCPU's pending request
+ * before that vCPU has eret'd back to EL0, allowing the second vCPU to use
+ * a stale TLB until the broadcast TLBI from the first vCPU's shim catches
+ * up. A per-vCPU slot makes each thread strictly responsible for issuing
+ * the TLBI for its own changes before its own eret. Page-table changes are
+ * still global (guest memory and page tables are shared), but TLBI VAE1IS
+ * and TLBI VMALLE1IS in the inner-shareable domain broadcast to all PEs,
+ * so one vCPU's own TLBI is sufficient to invalidate stale entries on its
+ * own PE before resuming guest code.
+ *
+ * No locking is needed for the slot itself; only the owning thread reads
+ * or writes it. Page-table updates remain serialized by mmap_lock.
+ *
+ * Cross-vCPU shootdown window: between vCPU A releasing mmap_lock at the
+ * end of an mprotect/munmap and the shim on A issuing the TLBI, sibling
+ * vCPU B may continue executing EL0 code that hits A's now-stale TLB
+ * entries. Real Linux closes this with cross-CPU IPI synchronization in
+ * the kernel; user-space emulation on Hypervisor.framework cannot inject
+ * a synchronous IPI into a sibling vCPU thread, so the window remains.
+ * The guest is responsible for serializing concurrent PT mutations
+ * against concurrent accesses (futex / pthread_mutex), which is the same
+ * contract real Linux requires of well-behaved multi-threaded code. See
+ * TODO.md "Bounded retry on stale TLB data abort" (P3 hardening) for the
+ * tracked follow-up if a workload ever surfaces an actual reproducer.
+ */
+extern _Thread_local tlbi_request_t cpu_tlbi_req;
+
+static inline void tlbi_request_clear(void)
+{
+    cpu_tlbi_req.kind = TLBI_NONE;
+    cpu_tlbi_req.pages = 0;
+    cpu_tlbi_req.start = 0;
+}
+
+static inline void tlbi_request_broadcast(void)
+{
+    cpu_tlbi_req.kind = TLBI_BROADCAST;
+}
+
+static inline void tlbi_request_range(uint64_t start, uint64_t end)
+{
+    if (cpu_tlbi_req.kind == TLBI_BROADCAST)
+        return;
+    if (end <= start)
+        return;
+    /* Page-align: TLBI VAE1IS operates on 4 KiB granules. ALIGN_UP can
+     * overflow if end is within PAGE_SIZE-1 of UINT64_MAX; saturate to
+     * broadcast in that pathological case rather than wrap to 0.
+     */
+    const uint64_t mask = 0xFFFULL;
+    if (end > UINT64_MAX - mask) {
+        tlbi_request_broadcast();
+        return;
+    }
+    uint64_t s = start & ~mask;
+    uint64_t e = (end + mask) & ~mask;
+    uint64_t n = (e - s) >> 12;
+    if (n > TLBI_SELECTIVE_MAX_PAGES) {
+        tlbi_request_broadcast();
+        return;
+    }
+    if (cpu_tlbi_req.kind == TLBI_NONE) {
+        cpu_tlbi_req.kind = TLBI_RANGE;
+        cpu_tlbi_req.start = s;
+        cpu_tlbi_req.pages = (uint16_t) n;
+        return;
+    }
+    /* TLBI_RANGE: coalesce by union. Disjoint ranges still produce a single
+     * bounding interval; if it stays within the cap, the per-page TLBI loop
+     * still wins over a full flush by preserving the rest of the TLB.
+     */
+    uint64_t es = cpu_tlbi_req.start;
+    uint64_t pe = (uint64_t) cpu_tlbi_req.pages * 4096ULL;
+    /* The accumulator only ever holds page counts <= TLBI_SELECTIVE_MAX_PAGES
+     * (see the cap check above), so es + pe never overflows on real callers,
+     * but be explicit.
+     */
+    if (es > UINT64_MAX - pe) {
+        tlbi_request_broadcast();
+        return;
+    }
+    uint64_t ee = es + pe;
+    uint64_t us = s < es ? s : es;
+    uint64_t ue = e > ee ? e : ee;
+    uint64_t un = (ue - us) >> 12;
+    if (un > TLBI_SELECTIVE_MAX_PAGES) {
+        tlbi_request_broadcast();
+        return;
+    }
+    cpu_tlbi_req.start = us;
+    cpu_tlbi_req.pages = (uint16_t) un;
+}
+
 /* Convert a guest offset (0-based) to an IPA/VA (ipa_base + offset) */
 static inline uint64_t guest_ipa(const guest_t *g, uint64_t offset)
 {
@@ -387,7 +520,8 @@ uint64_t guest_build_page_tables(guest_t *g,
 /* Extend page tables to cover a new address range [start, end) with 2MiB
  * block descriptors. Reuses the existing L0->L1 table structure and
- * allocates new L2 tables as needed. Sets g->need_tlbi = true.
+ * allocates new L2 tables as needed. Records a TLBI request covering the
+ * affected range (range or broadcast).
  * Returns 0 on success, -1 on failure.
  */
 int guest_extend_page_tables(guest_t *g,
@@ -399,7 +533,17 @@ int guest_extend_page_tables(guest_t *g,
  * block_gpa must be within a currently-mapped 2MiB block. The block's
  * permissions are inherited by all 512 page entries. If the block is
  * already split (L2 entry is a table descriptor), this is a no-op.
- * Sets g->need_tlbi = true. Returns 0 on success, -1 on failure.
+ *
+ * No TLBI request is issued: the split alone preserves every VA->PA
+ * translation in the block (each L3 page descriptor inherits the block's
+ * permissions). Every caller follows the split with guest_invalidate_ptes
+ * or guest_update_perms on the actually-changing range; that subsequent
+ * call records the TLBI, and TLBI VAE1IS for any VA in the block also
+ * invalidates the cached 2 MiB block entry covering that VA (ARM ARM
+ * B2.2.5.6), so a single per-page TLBI suffices to retire the stale
+ * block translation as soon as any affected page is accessed.
+ *
+ * Returns 0 on success, -1 on failure.
  */
 int guest_split_block(guest_t *g, uint64_t block_gpa);
@@ -409,7 +553,8 @@ int guest_split_block(guest_t *g, uint64_t block_gpa);
  * PROT_NONE; the correct behavior is for the guest to fault.
  * If a 2MiB block is only partially invalidated, the block is split
  * into L3 pages first (preserving the non-invalidated pages).
- * Sets g->need_tlbi = true. Returns 0 on success, -1 on failure.
+ * Records a TLBI request covering the invalidated range.
+ * Returns 0 on success, -1 on failure.
  */
 int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end);
@@ -418,7 +563,8 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end);
  * updated), the block is automatically split into 4KiB L3 pages first.
  * If the entire 2MiB block is being updated, the block descriptor is
  * modified in place without splitting.
- * perms is a MEM_PERM_R/W/X combination. Sets g->need_tlbi = true.
+ * perms is a MEM_PERM_R/W/X combination. Records a TLBI request only for
+ * pages whose descriptor actually changed.
  * Returns 0 on success, -1 on failure.
  */
 int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms);
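
Because the helpers are static inlines over plain TLS state, the coalescing rules can be unit-tested outside the tree. A standalone harness, not patch content (a condensed copy of the header logic; the es + pe overflow check is elided for brevity), pinning the behaviours the comments promise:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { TLBI_NONE = 0, TLBI_BROADCAST = 1, TLBI_RANGE = 2 };
    #define TLBI_SELECTIVE_MAX_PAGES 16

    typedef struct {
        uint8_t kind;
        uint16_t pages;
        uint64_t start;
    } tlbi_request_t;

    static _Thread_local tlbi_request_t cpu_tlbi_req;

    static void tlbi_request_clear(void)
    {
        cpu_tlbi_req.kind = TLBI_NONE;
        cpu_tlbi_req.pages = 0;
        cpu_tlbi_req.start = 0;
    }

    static void tlbi_request_broadcast(void)
    {
        cpu_tlbi_req.kind = TLBI_BROADCAST;
    }

    static void tlbi_request_range(uint64_t start, uint64_t end)
    {
        if (cpu_tlbi_req.kind == TLBI_BROADCAST || end <= start)
            return;
        const uint64_t mask = 0xFFFULL;
        if (end > UINT64_MAX - mask) {       /* ALIGN_UP would wrap */
            tlbi_request_broadcast();
            return;
        }
        uint64_t s = start & ~mask, e = (end + mask) & ~mask;
        uint64_t n = (e - s) >> 12;
        if (n > TLBI_SELECTIVE_MAX_PAGES) {
            tlbi_request_broadcast();
            return;
        }
        if (cpu_tlbi_req.kind == TLBI_NONE) {
            cpu_tlbi_req.kind = TLBI_RANGE;
            cpu_tlbi_req.start = s;
            cpu_tlbi_req.pages = (uint16_t) n;
            return;
        }
        uint64_t es = cpu_tlbi_req.start;
        uint64_t ee = es + (uint64_t) cpu_tlbi_req.pages * 4096ULL;
        uint64_t us = s < es ? s : es, ue = e > ee ? e : ee;
        uint64_t un = (ue - us) >> 12;
        if (un > TLBI_SELECTIVE_MAX_PAGES) {
            tlbi_request_broadcast();
            return;
        }
        cpu_tlbi_req.start = us;
        cpu_tlbi_req.pages = (uint16_t) un;
    }

    int main(void)
    {
        /* Unaligned 1-byte range rounds out to one 4 KiB page. */
        tlbi_request_clear();
        tlbi_request_range(0x10001, 0x10002);
        assert(cpu_tlbi_req.kind == TLBI_RANGE && cpu_tlbi_req.pages == 1);

        /* Disjoint second range coalesces to the bounding interval. */
        tlbi_request_range(0x14000, 0x15000);
        assert(cpu_tlbi_req.start == 0x10000 && cpu_tlbi_req.pages == 5);

        /* A union past the cap upgrades to broadcast... */
        tlbi_request_range(0x100000, 0x101000);
        assert(cpu_tlbi_req.kind == TLBI_BROADCAST);

        /* ...and broadcast is sticky: later small ranges are absorbed. */
        tlbi_request_range(0x10000, 0x11000);
        assert(cpu_tlbi_req.kind == TLBI_BROADCAST);

        /* end within a page of UINT64_MAX saturates instead of wrapping. */
        tlbi_request_clear();
        tlbi_request_range(UINT64_MAX - 0x800, UINT64_MAX);
        assert(cpu_tlbi_req.kind == TLBI_BROADCAST);

        puts("tlbi_request_range coalescing: all checks passed");
        return 0;
    }
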
+ */ + mrs x0, far_el1 + lsr x0, x0, #12 /* TLBI VAE1IS operand: VA[55:12] */ + tlbi vae1is, x0 + dsb ish + ic iallu dsb ish isb /* Fall through to restore_eret */ @@ -550,30 +565,68 @@ handle_svc_0: * 1. Read X8 (syscall nr) and X0-X5 (args) * 2. Execute the syscall * 3. Write result to X0 - * 4. Set X8 to 1 if page tables were modified (TLBI needed), - * or 2 if execve replaced the full register state + * 4. Set X8 to indicate the post-syscall request: + * 0 = no TLB flush + * 1 = broadcast TLBI VMALLE1IS + * 2 = execve replaced register state (drop frame + flush) + * 3 = selective TLBI VAE1IS over X10 pages starting at X9 * 5. Resume vCPU (execution continues below) */ hvc #5 - /* Execve rebuilt the guest register state. Drop the old saved frame, - * flush stale translations, and return without restoring old GPRs. + /* Dispatch on X8. Encoded so the common case (X8 == 0, no flush) hits + * the cbz fast path; the other branches sort by frequency thereafter. */ + cbz x8, 1f + cmp x8, #1 + b.eq tlbi_full + cmp x8, #3 + b.eq tlbi_selective cmp x8, #2 - b.eq 2f + b.eq exec_drop_frame + /* Unknown X8: be conservative, broadcast and continue. */ - /* If the host modified page tables (X8 != 0), flush the TLB and I-cache. - * Normal syscalls skip this path: read/write/open/stat hot loops do not - * modify executable mappings, and the full IC IALLU+barrier sequence is - * measurable in syscall-heavy workloads. JIT/signal/exec paths use the - * drop-frame marker above, which still performs the full flush. +tlbi_full: + /* Broadcast TLB + I-cache flush. Used for page-table edits whose + * affected range exceeds the selective cap, or any time the host could + * not bound the change. */ - cbz x8, 1f tlbi vmalle1is dsb ish ic iallu dsb ish isb + b 1f + +tlbi_selective: + /* Selective TLBI VAE1IS loop. + * x9 = page-aligned VA of the first page to invalidate + * x10 = page count (1..TLBI_SELECTIVE_MAX_PAGES, see core/guest.h) + * TLBI VAE1IS takes a Xt operand of (VA[55:12] | (ASID << 48)). The + * guest runs single-ASID at EL0, so just shift the VA right by 12. + * Issue all TLBI ops, then a single DSB ISH + IC IALLU + DSB + ISB + * matches broadcast semantics (preserves I-cache invalidation behaviour + * for callers like file-backed mmap of executable pages). + * + * Defensive: if x10 == 0, skip the loop. The per-vCPU host-side + * accumulator (cpu_tlbi_req in core/guest.h) never sets pages == 0 + * alongside kind == TLBI_RANGE, but if a future helper bug or a stray + * write ever produced the pair X8=3, X10=0, the subs x12, x12, #1 + * below would underflow to 0xFFFFFFFFFFFFFFFF and the b.ne would loop + * ~2^64 iterations, hanging this vCPU. Cheap guard. + */ + cbz x10, 1f + lsr x11, x9, #12 /* x11 = VA >> 12 (current page operand) */ + mov x12, x10 /* x12 = remaining page counter */ +3: tlbi vae1is, x11 + add x11, x11, #1 /* next page (operand is in 4 KiB units) */ + subs x12, x12, #1 + b.ne 3b + dsb ish + ic iallu + dsb ish + isb + b 1f 1: /* Restore all guest registers except X0, which now holds the syscall @@ -586,7 +639,8 @@ handle_svc_0: */ eret -2: tlbi vmalle1is +exec_drop_frame: + tlbi vmalle1is dsb ish isb ic iallu diff --git a/src/syscall/exec.c b/src/syscall/exec.c index ac0e9cc..9b2118f 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -771,10 +771,11 @@ int64_t sys_execve(hv_vcpu_t vcpu, /* Tell the shim that execve replaced the full guest register state. * X8=2 means: flush TLB, discard the old syscall frame, and return without - * restoring pre-exec registers. 
diff --git a/src/syscall/exec.c b/src/syscall/exec.c
index ac0e9cc..9b2118f 100644
--- a/src/syscall/exec.c
+++ b/src/syscall/exec.c
@@ -771,10 +771,11 @@ int64_t sys_execve(hv_vcpu_t vcpu,
     /* Tell the shim that execve replaced the full guest register state.
      * X8=2 means: flush TLB, discard the old syscall frame, and return without
-     * restoring pre-exec registers.
+     * restoring pre-exec registers. This bypasses the normal syscall epilogue,
+     * which would otherwise overwrite X8 from cpu_tlbi_req.
      */
     hv_vcpu_set_reg(vcpu, HV_REG_X8, 2);
-    g->need_tlbi = false;
+    tlbi_request_clear();
 
     /* Readback forces HVF to commit sysreg/GPR writes before the run loop
      * resumes the vCPU.
diff --git a/src/syscall/mem.c b/src/syscall/mem.c
index f1e3eb9..289a9d6 100644
--- a/src/syscall/mem.c
+++ b/src/syscall/mem.c
@@ -658,7 +658,6 @@ static int rollback_fresh_mmap_allocation(guest_t *g,
     hvf_remove_file_overlay(g, overlay_ipa, overlay_len);
     if (guest_invalidate_ptes(g, start, start + length) < 0)
         return -LINUX_ENOMEM;
-    g->need_tlbi = true;
     g->mmap_next = saved_mmap_next;
     g->mmap_end = saved_mmap_end;
     g->mmap_rx_next = saved_mmap_rx_next;
@@ -1409,7 +1408,6 @@ int64_t sys_mmap(guest_t *g,
              * page table entries, making the range fault on access.
              */
            guest_invalidate_ptes(g, result_off, result_off + length);
-            g->need_tlbi = true;
         }
     }
 
@@ -1511,7 +1509,6 @@ int64_t sys_mmap(guest_t *g,
      */
     if (is_prot_none && !is_fixed) {
         guest_invalidate_ptes(g, result_off, result_off + length);
-        g->need_tlbi = true;
     }
 
     if (!is_prot_none && !is_fixed && !is_noreserve) {
@@ -1585,7 +1582,6 @@ int64_t sys_mmap(guest_t *g,
      */
     if (is_noreserve && !is_fixed) {
         guest_invalidate_ptes(g, result_off, result_off + length);
-        g->need_tlbi = true;
     }
 
     /* For file-backed mmap, populate the region with file contents.
@@ -2010,7 +2006,6 @@ int64_t sys_mremap(guest_t *g,
         }
         dispose_region_snapshots(&source_snaps, &source_nsnaps);
         dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
-        g->need_tlbi = true;
         return (int64_t) guest_ipa(g, new_off);
     }
 
@@ -2079,7 +2074,6 @@ int64_t sys_mremap(guest_t *g,
         mark_overlay_metadata_range(g, old_off, old_off + old_size,
                                     old_overlay_start, old_overlay_end);
-        g->need_tlbi = true;
 
         /* Update high-water marks */
         uint64_t hwm = old_off + new_size;
@@ -2193,7 +2187,6 @@ int64_t sys_mremap(guest_t *g,
                                         source_overlay_end, track_backing_fd,
                                         source_overlay_file_off);
             guest_invalidate_ptes(g, new_off, new_off + new_size);
-            g->need_tlbi = true;
             if (track_backing_fd >= 0)
                 close(track_backing_fd);
             return copy_err;
@@ -2232,7 +2225,6 @@ int64_t sys_mremap(guest_t *g,
         g->mmap_next = hwm;
     }
 
-    g->need_tlbi = true;
     return (int64_t) guest_ipa(g, new_off);
 }
 
@@ -2434,7 +2426,6 @@ static int munmap_guest_range(guest_t *g, uint64_t unmap_off, uint64_t end)
      */
     if (guest_invalidate_ptes(g, unmap_off, end) < 0)
         return -LINUX_ENOMEM;
-    g->need_tlbi = true;
     for (int i = 0; i < g->nregions; i++) {
         guest_region_t *r = &g->regions[i];
         if (r->start >= end)
@@ -2581,7 +2572,6 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot)
             } else {
                 guest_invalidate_ptes(g, mprot_off, mprot_end);
             }
-            g->need_tlbi = true;
         }
     }
     return 0;
+ */ + tlbi_request_clear(); break; } diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 5af65cb..e8f31db 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -1831,18 +1831,34 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) /* Write result back to X0 */ hv_vcpu_set_reg(vcpu, HV_REG_X0, (uint64_t) result); - /* Signal the shim to flush TLB if page tables were modified. - * The shim checks X8 after HVC #5: non-zero triggers TLBI. - * Must explicitly write X8=0 when no TLBI is needed, because the - * shim sees X8's pre-syscall value (the syscall number, always - * non-zero) and would spuriously TLBI on every return. + /* Signal the shim to flush TLB if this vCPU modified page tables. + * Protocol after HVC #5 (X8 carries the request): + * 0 -> skip + * 1 -> broadcast TLBI VMALLE1IS + * 2 -> reserved for execve (set by sys_execve, never reached here) + * 3 -> selective TLBI VAE1IS over X10 pages starting at X9 + * Must explicitly write X8 because the shim reads its post-HVC value; + * the pre-syscall X8 is the syscall number (always non-zero) and would + * spuriously TLBI on every return. + * + * cpu_tlbi_req is a per-vCPU TLS slot, so this read needs no lock and + * cannot be drained or torn by another vCPU's epilogue. */ - if (g->need_tlbi) { + switch ((tlbi_kind_t) cpu_tlbi_req.kind) { + case TLBI_BROADCAST: hv_vcpu_set_reg(vcpu, HV_REG_X8, 1); - g->need_tlbi = false; - } else { + break; + case TLBI_RANGE: + hv_vcpu_set_reg(vcpu, HV_REG_X8, 3); + hv_vcpu_set_reg(vcpu, HV_REG_X9, cpu_tlbi_req.start); + hv_vcpu_set_reg(vcpu, HV_REG_X10, cpu_tlbi_req.pages); + break; + case TLBI_NONE: + default: hv_vcpu_set_reg(vcpu, HV_REG_X8, 0); + break; } + tlbi_request_clear(); } return should_exit;