15 changes: 14 additions & 1 deletion src/core/bootstrap.c
@@ -279,7 +279,11 @@ int guest_bootstrap_prepare(guest_t *g,
log_error("failed to build page tables");
return -1;
}
g->need_tlbi = true;
/* No TLBI request here: the shim's _start does TLBI VMALLE1IS before
* enabling the MMU (src/core/shim.S), and the per-vCPU accumulator is the
* wrong place to stage a bring-up flush -- bootstrap may run on a thread
* whose slot is later consumed by an unrelated syscall.
*/

guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len,
LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0,
@@ -440,5 +444,14 @@ int guest_bootstrap_create_vcpu(guest_t *g,
log_debug("main thread registered with SP_EL1=0x%llx",
(unsigned long long) el1_sp);

/* guest_build_page_tables and the bootstrap-time guest_invalidate_ptes
* calls (stack guard, null page, etc.) accumulate TLBI requests on this
* (the main) thread's cpu_tlbi_req TLS slot. The shim's _start does TLBI
* VMALLE1IS before enabling the MMU, so any TLB state was already dropped
* before guest code runs. Clear the accumulator so the first guest syscall
* does not redundantly broadcast on top of that.
*/
tlbi_request_clear();

return 0;
}
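Both bootstrap.c comments above rely on the same invariant: the shim's _start in src/core/shim.S performs a full TLB invalidate before it turns the MMU on, so anything staged in the accumulator during bring-up is already covered. A sketch of that ordering, written as C inline assembly for illustration (the sequence is assumed, not quoted from shim.S):

/* Assumed EL1 bring-up ordering, for illustration only: drop every stale
 * EL1 translation, synchronize, then enable the MMU. Nothing recorded in
 * cpu_tlbi_req before this point can outlive the VMALLE1IS.
 */
static inline void bringup_flush_then_enable_mmu(uint64_t sctlr_el1_with_m)
{
    __asm__ volatile("tlbi vmalle1is\n\t"
                     "dsb  ish\n\t"
                     "isb\n\t"
                     "msr  sctlr_el1, %0\n\t"
                     "isb"
                     :
                     : "r"(sctlr_el1_with_m)
                     : "memory");
}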
57 changes: 43 additions & 14 deletions src/core/guest.c
@@ -42,6 +42,11 @@
#include "utils.h"
#include "runtime/thread.h" /* thread_destroy_all_vcpus */

/* Per-vCPU pending TLBI request. Zero-initialized in every host pthread
* by virtue of TLS default-zeroing, which maps to TLBI_NONE.
*/
_Thread_local tlbi_request_t cpu_tlbi_req;

static void guest_region_clear(guest_t *g);

/* Page table descriptor bits. */
@@ -901,7 +906,7 @@ void guest_reset(guest_t *g)
g->mmap_rw_gap_hint = 0;
g->mmap_rx_gap_hint = 0;
g->ttbr0 = 0;
g->need_tlbi = false;
tlbi_request_clear();
g->elf_load_min = ELF_DEFAULT_BASE;

/* Clear semantic region tracking (will be re-populated after exec) */
@@ -1650,8 +1655,8 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n)
/* Extend page tables to cover [start, end) with 2MiB block descriptors.
* Walks the existing L0->L1 structure (from g->ttbr0) and allocates new
* L2 tables as needed. This is safe to call while the vCPU is paused
* (during HVC #5 handling). Sets g->need_tlbi so the shim flushes the
* TLB before returning to EL0.
* (during HVC #5 handling). Records a TLBI request covering the new range
* so the shim flushes the matching TLB entries before returning to EL0.
*/
int guest_extend_page_tables(guest_t *g,
uint64_t start,
@@ -1717,7 +1722,12 @@ int guest_extend_page_tables(guest_t *g,
}
}

g->need_tlbi = true;
/* Use the page-aligned bounds the loop actually covered. Extend grows
* the mapped range; the newly covered VAs may carry negative TLB entries
* from prior translation faults at those addresses, so a flush is still
* needed. Large extends will exceed the selective cap and become broadcast.
*/
tlbi_request_range(addr_start + base, addr_end + base);
guest_pt_gen_bump(g);
return 0;
}
@@ -1822,15 +1832,17 @@ static int split_l2_block(guest_t *g, uint64_t *l2_entry)
l3[i] = make_page_desc(block_ipa + (uint64_t) i * PAGE_SIZE, old_perms);

*l2_entry = (g->ipa_base + l3_gpa) | PT_VALID | PT_TABLE;
g->need_tlbi = true;
return 0;
}

int guest_split_block(guest_t *g, uint64_t block_gpa)
{
uint64_t block_start = ALIGN_2MIB_DOWN(block_gpa);
uint64_t *l2_entry = find_l2_entry(g, block_start);
return split_l2_block(g, l2_entry);
int rc = split_l2_block(g, l2_entry);
if (rc < 0)
return rc;
return 0;
}

int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
@@ -1862,9 +1874,12 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
if ((*l2_entry & 3) == 1) {
/* 2MiB block descriptor */
if (start <= block_start && end >= block_end) {
/* Invalidating the entire 2MiB block: clear the L2 entry */
/* Invalidating the entire 2MiB block: clear the L2 entry.
* The 2 MiB range exceeds the selective cap and upgrades
* to broadcast.
*/
*l2_entry = 0;
g->need_tlbi = true;
tlbi_request_range(base + block_start, base + block_end);
addr = block_end;
continue;
}
@@ -1889,7 +1904,7 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
l3[l3_idx] = 0; /* Invalid descriptor */
}

g->need_tlbi = true;
tlbi_request_range(base + page_start, base + page_end);
addr = page_end;
}

@@ -1936,7 +1951,7 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
if (old_perms != perms) {
uint64_t ipa = *l2_entry & L2_BLOCK_ADDR_MASK;
*l2_entry = make_block_desc(ipa, perms);
g->need_tlbi = true;
tlbi_request_range(base + block_start, base + block_end);
}
addr = block_end;
continue;
@@ -1959,9 +1974,13 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL;
uint64_t *l3 = pt_at(g, l3_ipa - base);

/* Update pages within this 2MiB block that fall in [start, end) */
/* Update pages within this 2MiB block that fall in [start, end). Track
* the smallest sub-range whose descriptors actually changed so the TLBI
* request covers only pages that really need a flush, not the caller's
* whole range.
*/
uint64_t page_start = (addr > block_start) ? addr : block_start;
uint64_t page_end = (end < block_end) ? end : block_end;
uint64_t changed_lo = UINT64_MAX, changed_hi = 0;

for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) {
unsigned l3_idx =
@@ -1981,10 +2000,18 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
page_ipa = l3[l3_idx] & 0xFFFFFFFFF000ULL;
else
page_ipa = base + (pa & ~(PAGE_SIZE - 1));
l3[l3_idx] = make_page_desc(page_ipa, perms);
uint64_t new_desc = make_page_desc(page_ipa, perms);
if (l3[l3_idx] != new_desc) {
l3[l3_idx] = new_desc;
if (pa < changed_lo)
changed_lo = pa;
if (pa + PAGE_SIZE > changed_hi)
changed_hi = pa + PAGE_SIZE;
}
}

g->need_tlbi = true;
if (changed_hi > changed_lo)
tlbi_request_range(base + changed_lo, base + changed_hi);
addr = page_end;
}

@@ -2079,6 +2106,8 @@ int guest_materialize_lazy(guest_t *g, uint64_t fault_offset)
memset((uint8_t *) g->host_base + materialize_start, 0,
materialize_end - materialize_start);

g->need_tlbi = true;
/* The page-table helpers above already requested the matching TLBI;
* no additional flush is needed here.
*/
return 0;
}
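A usage sketch of the changed-range tracking above. The offsets are hypothetical, the MEM_PERM_* names are the flags guest.h documents for guest_update_perms, and the calls assume the thread already holds mmap_lock:

/* Re-applying permissions a range already has: no descriptor changes, so
 * changed_lo/changed_hi never move and nothing is recorded. A partially
 * covered 2MiB block may still be split first, and the split itself no
 * longer requests a flush either.
 */
tlbi_request_clear();
guest_update_perms(g, 0x200000, 0x204000, MEM_PERM_R);
/* cpu_tlbi_req.kind == TLBI_NONE */

/* Changing a single page inside that range: only that descriptor differs,
 * so the request covers one 4 KiB page instead of the whole range.
 */
guest_update_perms(g, 0x201000, 0x202000, MEM_PERM_R | MEM_PERM_W);
/* cpu_tlbi_req.kind == TLBI_RANGE, cpu_tlbi_req.pages == 1 */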
160 changes: 153 additions & 7 deletions src/core/guest.h
@@ -174,6 +174,39 @@ typedef struct {
char name[64]; /* Label: "[heap]", "[stack]", ELF path, etc. */
} guest_region_t;

/* TLB invalidation request kinds. After a syscall that modified the page
* tables, the shim flushes the TLB on its way back to EL0. The host
* accumulates the smallest sufficient request across the syscall and emits
* it via the X8/X9/X10
* register channel. Kind values are an internal enum independent of the
* X8 wire codes; the syscall epilogue does the mapping (src/syscall/syscall.c
* holds the canonical table):
* TLBI_NONE -> X8 = 0 (no TLB flush)
* TLBI_BROADCAST -> X8 = 1 (TLBI VMALLE1IS, broadest)
* TLBI_RANGE -> X8 = 3, X9 = start VA, X10 = page count
* (TLBI VAE1IS loop preserves unrelated TLB entries)
* X8 = 2 is reserved for the execve drop-frame marker the shim handles
* separately; it is never produced by the accumulator.
*/
typedef enum {
TLBI_NONE = 0,
TLBI_BROADCAST = 1,
TLBI_RANGE = 2,
} tlbi_kind_t;

/* Cap selective TLBI at this many 4 KiB pages. Beyond this, fall back to
* TLBI_BROADCAST: each TLBI VAE1IS broadcasts to all cores, so for large
* ranges the per-instruction issue cost outweighs the benefit of preserving
* unrelated TLB entries. 16 pages == 64 KiB covers RELRO and other typical
* mprotect / munmap targets.
*/
#define TLBI_SELECTIVE_MAX_PAGES 16

typedef struct {
uint8_t kind; /* tlbi_kind_t */
uint16_t pages; /* Page count when kind == TLBI_RANGE (1..MAX) */
uint64_t start; /* Page-aligned VA when kind == TLBI_RANGE */
} tlbi_request_t;

/* Guest state. */
typedef struct {
void *host_base; /* Host pointer to allocated guest memory */
@@ -221,9 +254,8 @@ typedef struct {
*/
uint64_t mmap_rw_gap_hint, mmap_rx_gap_hint;

uint64_t ttbr0; /* TTBR0 value (IPA of L0 page table) */
bool need_tlbi; /* Signal shim to flush TLB after page table changes */
hv_vcpu_t vcpu; /* vCPU handle */
uint64_t ttbr0; /* TTBR0 value (IPA of L0 page table) */
hv_vcpu_t vcpu; /* vCPU handle */
hv_vcpu_exit_t *exit; /* vCPU exit info */
uint32_t ipa_bits; /* IPA bits requested from HVF */
/* Semantic region tracking for munmap/mprotect/proc-self-maps */
@@ -253,6 +285,107 @@ static inline void guest_pt_gen_bump(guest_t *g)
atomic_fetch_add_explicit(&g->pt_gen, 1, memory_order_release);
}

/* TLB invalidation request helpers.
*
* Per-vCPU TLS slot. Each guest thread (= one host pthread + one HVF vCPU)
* accumulates its own pending TLBI request as its syscall handlers mutate
* the page tables. The syscall epilogue (src/syscall/syscall.c) reads its
* own thread's slot, emits the X8/X9/X10 protocol, and clears it.
*
* Why per-vCPU and not a guest_t-global accumulator: a global slot lets one
* vCPU's syscall epilogue drain (and clear) another vCPU's pending request
* before that vCPU has eret'd back to EL0, allowing the second vCPU to use
* a stale TLB until the broadcast TLBI from the first vCPU's shim catches
* up. A per-vCPU slot makes each thread strictly responsible for issuing
* the TLBI for its own changes before its own eret. Page-table changes are
* still global (guest memory and page tables are shared), but TLBI VAE1IS
* and TLBI VMALLE1IS in the inner-shareable domain broadcast to all PEs,
* so one vCPU's own TLBI is sufficient to invalidate stale entries on its
* own PE before resuming guest code.
*
* No locking is needed for the slot itself; only the owning thread reads
* or writes it. Page-table updates remain serialized by mmap_lock.
*
* Cross-vCPU shootdown window: between vCPU A releasing mmap_lock at the
* end of an mprotect/munmap and the shim on A issuing the TLBI, sibling
* vCPU B may continue executing EL0 code that hits A's now-stale TLB
* entries. Real Linux closes this with cross-CPU IPI synchronization in
* the kernel; user-space emulation on Hypervisor.framework cannot inject
* a synchronous IPI into a sibling vCPU thread, so the window remains.
* The guest is responsible for serializing concurrent PT mutations
* against concurrent accesses (futex / pthread_mutex), which is the same
* contract real Linux requires of well-behaved multi-threaded code. See
* TODO.md "Bounded retry on stale TLB data abort" (P3 hardening) for the
* tracked follow-up if a workload ever surfaces an actual reproducer.
*/
extern _Thread_local tlbi_request_t cpu_tlbi_req;

static inline void tlbi_request_clear(void)
{
cpu_tlbi_req.kind = TLBI_NONE;
cpu_tlbi_req.pages = 0;
cpu_tlbi_req.start = 0;
}

static inline void tlbi_request_broadcast(void)
{
cpu_tlbi_req.kind = TLBI_BROADCAST;
}

static inline void tlbi_request_range(uint64_t start, uint64_t end)
{
if (cpu_tlbi_req.kind == TLBI_BROADCAST)
return;
if (end <= start)
return;
/* Page-align: TLBI VAE1IS operates on 4 KiB granules. Rounding end up to
* the next page boundary can overflow if end is within PAGE_SIZE-1 of
* UINT64_MAX; saturate to broadcast in that pathological case rather than
* wrap to 0.
*/
const uint64_t mask = 0xFFFULL;
if (end > UINT64_MAX - mask) {
tlbi_request_broadcast();
return;
}
uint64_t s = start & ~mask;
uint64_t e = (end + mask) & ~mask;
uint64_t n = (e - s) >> 12;
if (n > TLBI_SELECTIVE_MAX_PAGES) {
tlbi_request_broadcast();
return;
}
if (cpu_tlbi_req.kind == TLBI_NONE) {
cpu_tlbi_req.kind = TLBI_RANGE;
cpu_tlbi_req.start = s;
cpu_tlbi_req.pages = (uint16_t) n;
return;
}
/* TLBI_RANGE: coalesce by union. Disjoint ranges still produce a single
* bounding interval; if it stays within the cap, the per-page TLBI loop
* still wins over a full flush by preserving the rest of the TLB.
*/
uint64_t es = cpu_tlbi_req.start;
uint64_t pe = (uint64_t) cpu_tlbi_req.pages * 4096ULL;
/* The accumulator only ever holds page counts <= TLBI_SELECTIVE_MAX_PAGES
* (see the cap check above), so es + pe never overflows on real callers,
* but be explicit.
*/
if (es > UINT64_MAX - pe) {
tlbi_request_broadcast();
return;
}
uint64_t ee = es + pe;
uint64_t us = s < es ? s : es;
uint64_t ue = e > ee ? e : ee;
uint64_t un = (ue - us) >> 12;
if (un > TLBI_SELECTIVE_MAX_PAGES) {
tlbi_request_broadcast();
return;
}
cpu_tlbi_req.start = us;
cpu_tlbi_req.pages = (uint16_t) un;
}
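/* Worked example (illustrative only, not part of this header): how requests
 * accumulated during one syscall coalesce. The addresses are made up.
 */
#if 0
tlbi_request_clear();
tlbi_request_range(0x400000, 0x402000); /* kind=RANGE, start=0x400000, pages=2 */
tlbi_request_range(0x404000, 0x405000); /* union of both: still RANGE, pages=5 */
tlbi_request_range(0x600000, 0x800000); /* 512 pages > cap: upgrades to BROADCAST */
tlbi_request_range(0x400000, 0x401000); /* BROADCAST is sticky: returns immediately */
#endif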

/* Convert a guest offset (0-based) to an IPA/VA (ipa_base + offset) */
static inline uint64_t guest_ipa(const guest_t *g, uint64_t offset)
{
@@ -387,7 +520,8 @@ uint64_t guest_build_page_tables(guest_t *g,

/* Extend page tables to cover a new address range [start, end) with 2MiB
* block descriptors. Reuses the existing L0->L1 table structure and
* allocates new L2 tables as needed. Sets g->need_tlbi = true.
* allocates new L2 tables as needed. Records a TLBI request covering the
* affected range (range or broadcast).
* Returns 0 on success, -1 on failure.
*/
int guest_extend_page_tables(guest_t *g,
@@ -399,7 +533,17 @@ int guest_extend_page_tables(guest_t *g,
* block_gpa must be within a currently-mapped 2MiB block. The block's
* permissions are inherited by all 512 page entries. If the block is
* already split (L2 entry is a table descriptor), this is a no-op.
* Sets g->need_tlbi = true. Returns 0 on success, -1 on failure.
*
* No TLBI request is issued: the split alone preserves every VA->PA
* translation in the block (each L3 page descriptor inherits the block's
* permissions). Every caller follows the split with guest_invalidate_ptes
* or guest_update_perms on the actually-changing range; that subsequent
* call records the TLBI, and a TLBI VAE1IS for any VA inside the block
* also invalidates a cached 2 MiB block entry covering that VA (ARM ARM
* B2.2.5.6), so a per-page TLBI for any affected page retires the stale
* block translation for the whole region before guest code resumes.
*
* Returns 0 on success, -1 on failure.
*/
int guest_split_block(guest_t *g, uint64_t block_gpa);
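/* Hypothetical call sequence illustrating the contract above (g and off are
 * placeholders; off is a guest offset inside an existing 2MiB block): the
 * split records nothing, and the page-level change that follows records the
 * only flush needed.
 */
#if 0
guest_split_block(g, off);                   /* cpu_tlbi_req unchanged */
guest_invalidate_ptes(g, off, off + 0x1000); /* records the TLBI; VAE1IS for
                                              * this page also retires the old
                                              * block entry */
#endif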

@@ -409,7 +553,8 @@ int guest_split_block(guest_t *g, uint64_t block_gpa);
* PROT_NONE; the correct behavior is for the guest to fault.
* If a 2MiB block is only partially invalidated, the block is split
* into L3 pages first (preserving the non-invalidated pages).
* Sets g->need_tlbi = true. Returns 0 on success, -1 on failure.
* Records a TLBI request covering the invalidated range.
* Returns 0 on success, -1 on failure.
*/
int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end);

@@ -418,7 +563,8 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end);
* updated), the block is automatically split into 4KiB L3 pages first.
* If the entire 2MiB block is being updated, the block descriptor is
* modified in place without splitting.
* perms is a MEM_PERM_R/W/X combination. Sets g->need_tlbi = true.
* perms is a MEM_PERM_R/W/X combination. Records a TLBI request only for
* pages whose descriptor actually changed.
* Returns 0 on success, -1 on failure.
*/
int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms);
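The consumer of the accumulator is not part of this diff. A minimal sketch of how the syscall epilogue in src/syscall/syscall.c could translate the pending request into the X8/X9/X10 wire protocol, assuming direct hv_vcpu_set_reg() writes, the include path shown, and a hypothetical tlbi_emit() helper (the real epilogue holds the canonical table and may stage registers through its own cache):

#include <Hypervisor/Hypervisor.h>

#include "guest.h" /* assumed include path for cpu_tlbi_req and the helpers */

/* Sketch only: emit this vCPU's pending request and clear the slot before
 * the shim erets back to EL0. Error handling for hv_vcpu_set_reg is elided.
 */
static void tlbi_emit(hv_vcpu_t vcpu)
{
    switch (cpu_tlbi_req.kind) {
    case TLBI_BROADCAST:
        hv_vcpu_set_reg(vcpu, HV_REG_X8, 1);   /* shim runs TLBI VMALLE1IS */
        break;
    case TLBI_RANGE:
        hv_vcpu_set_reg(vcpu, HV_REG_X8, 3);   /* shim runs a TLBI VAE1IS loop */
        hv_vcpu_set_reg(vcpu, HV_REG_X9, cpu_tlbi_req.start);
        hv_vcpu_set_reg(vcpu, HV_REG_X10, cpu_tlbi_req.pages);
        break;
    default:                                   /* TLBI_NONE */
        hv_vcpu_set_reg(vcpu, HV_REG_X8, 0);   /* no flush requested */
        break;
    }
    tlbi_request_clear();
}

X8 = 2 stays reserved for the execve drop-frame marker, exactly as the guest.h comment specifies; the accumulator never produces it.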