
Commit ad76dac

Author: Sangho Lee
Commit message: addressed some feedbacks
1 parent 2ed4daf commit ad76dac

File tree: 6 files changed, +253 -146 lines


litebox_common_linux/src/vmap.rs

Lines changed: 4 additions & 0 deletions
@@ -174,4 +174,8 @@ pub enum PhysPointerError {
     CopyFailed,
     #[error("Duplicate physical page address {0:#x} in the input array")]
     DuplicatePhysicalAddress(usize),
+    #[error("Virtual address space exhausted in vmap region")]
+    VaSpaceExhausted,
+    #[error("Page-table frame allocation failed (out of memory)")]
+    FrameAllocationFailed,
 }
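For illustration, a minimal sketch of how a caller might surface the two new variants. Only the variant names come from this diff; the module path and the helper function are assumptions.

    use litebox_common_linux::vmap::PhysPointerError; // assumed module path

    /// Hypothetical helper mapping vmap errors to user-facing messages.
    fn describe(err: &PhysPointerError) -> &'static str {
        match err {
            PhysPointerError::VaSpaceExhausted => "vmap region out of virtual address space",
            PhysPointerError::FrameAllocationFailed => "no frames left for page-table allocation",
            // Variants that predate this commit (CopyFailed, DuplicatePhysicalAddress, ...).
            _ => "other vmap error",
        }
    }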

litebox_platform_lvbs/src/arch/x86/mm/paging.rs

Lines changed: 131 additions & 63 deletions
@@ -3,6 +3,7 @@
 
 use litebox::mm::linux::{PageFaultError, PageRange, VmFlags, VmemPageFaultHandler};
 use litebox::platform::page_mgmt;
+use litebox::utils::TruncateExt;
 use x86_64::{
     PhysAddr, VirtAddr,
     structures::{
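`TruncateExt` is imported for the `.truncate()` calls on page counts added below (u64 narrowed to usize). Its definition lives in `litebox::utils` and is not shown in this diff; conceptually it behaves like the following sketch, which is an assumed shape rather than the actual code.

    // Assumed shape only; the real litebox::utils::TruncateExt may differ.
    pub trait TruncateExt {
        fn truncate(self) -> usize;
    }

    impl TruncateExt for u64 {
        fn truncate(self) -> usize {
            // Lossy narrowing; page counts here comfortably fit in usize.
            self as usize
        }
    }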
@@ -30,6 +31,49 @@ const FLUSH_TLB: bool = true;
 #[cfg(test)]
 const FLUSH_TLB: bool = false;
 
+/// When flushing more than this many pages, use a full CR3 reload instead of
+/// per-page `invlpg` instructions. This heuristic is from the Linux kernel.
+/// https://elixir.bootlin.com/linux/v6.18.6/source/arch/x86/mm/tlb.c#L1394
+#[cfg(not(test))]
+const TLB_SINGLE_PAGE_FLUSH_CEILING: usize = 33;
+
+/// Flush TLB entries for a contiguous page range.
+///
+/// For small ranges (<= [`TLB_SINGLE_PAGE_FLUSH_CEILING`]) this issues `invlpg` per
+/// page. For larger ranges it reloads CR3 to flush the entire TLB.
+///
+/// TODO(security): Both `invlpg` and CR3 reload are local to the executing core.
+/// Threads of the same process (CLONE_VM) can be scheduled on different cores
+/// sharing the same page table, so a remote core may retain stale TLB entries
+/// after unmap/mprotect. While threads sharing a page table are not themselves
+/// a security boundary, a stale TLB entry can allow access to a physical frame
+/// that has been freed and reallocated to a *different* process, breaking
+/// process isolation. This function should send IPIs to remote cores for a
+/// cross-core TLB shootdown (similar to Linux's `flush_tlb_others` /
+/// `native_flush_tlb_others`). On AMD CPUs with INVLPGB support, use the
+/// `invlpgb`/`tlbsync` instructions instead of IPIs for broadcast invalidation.
+/// On newer Intel CPUs, use Remote Action Requests (RAR) for the same purpose.
+///
+/// Note: OP-TEE TAs only unmap pages when tearing down the entire TA instance
+/// and its sessions, so multi-session TAs sharing a page table are not affected.
+#[cfg(not(test))]
+fn flush_tlb_range(start: Page<Size4KiB>, count: usize) {
+    if count == 0 {
+        return;
+    }
+    if count <= TLB_SINGLE_PAGE_FLUSH_CEILING {
+        let end = start + count as u64;
+        for page in Page::range(start, end) {
+            x86_64::instructions::tlb::flush(page.start_address());
+        }
+    } else {
+        x86_64::instructions::tlb::flush_all();
+    }
+}
+
+#[cfg(test)]
+fn flush_tlb_range(_start: Page<Size4KiB>, _count: usize) {}
+
 #[inline]
 fn frame_to_pointer<M: MemoryProvider>(frame: PhysFrame) -> *mut PageTable {
     let virt = M::pa_to_va(frame.start_address());
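The TODO(security) above asks for a cross-core shootdown. As a rough sketch of the shape such a follow-up could take (the `ShootdownIpi` trait and both of its methods are invented here for illustration and do not exist in this codebase):

    use x86_64::structures::paging::{Page, Size4KiB};

    /// Hypothetical platform hooks for broadcasting an invalidation request.
    trait ShootdownIpi {
        fn send_flush_ipi(start: Page<Size4KiB>, count: usize);
        fn wait_for_acks();
    }

    /// Sketch of the cross-core variant: flush locally, then ask every other
    /// core sharing this page table to invalidate and wait for acknowledgement,
    /// mirroring Linux's flush_tlb_others.
    fn flush_tlb_range_all_cores<P: ShootdownIpi>(start: Page<Size4KiB>, count: usize) {
        flush_tlb_range(start, count); // local flush, as defined in this commit
        P::send_flush_ipi(start, count);
        P::wait_for_acks();
    }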
@@ -102,39 +146,42 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
         UserMutPtr::from_ptr(range.start as *mut u8)
     }
 
-    /// Unmap 4KiB pages from the page table
-    /// Set `dealloc_frames` to `true` to free the corresponding physical frames.
+    /// Unmap a range of 4KiB pages from the page table.
+    ///
+    /// Set `dealloc_frames` to `true` to free the corresponding physical frames. Skip this
+    /// when the corresponding physical frames are managed elsewhere (e.g., VTL0).
     /// Set `flush_tlb` to `true` to flush TLB entries after unmapping (not needed when
     /// the page table is being destroyed).
-    ///
-    /// Note it does not free the allocated frames for page table itself (only those allocated to
-    /// user space).
+    /// Set `clean_up_page_tables` to `true` to free intermediate page-table frames
+    /// (P1/P2/P3) that become empty after unmapping. Skip this when the VA range
+    /// will be reused soon, as the intermediate frames would just be re-allocated.
     pub(crate) unsafe fn unmap_pages(
         &self,
         range: PageRange<ALIGN>,
         dealloc_frames: bool,
         flush_tlb: bool,
+        clean_up_page_tables: bool,
     ) -> Result<(), page_mgmt::DeallocationError> {
-        let start_va = VirtAddr::new(range.start as _);
-        let start = Page::<Size4KiB>::from_start_address(start_va)
+        if range.is_empty() {
+            return Ok(());
+        }
+        let start = Page::<Size4KiB>::from_start_address(VirtAddr::new(range.start as _))
             .or(Err(page_mgmt::DeallocationError::Unaligned))?;
-        let end_va = VirtAddr::new(range.end as _);
-        let end = Page::<Size4KiB>::from_start_address(end_va)
+        let end = Page::<Size4KiB>::from_start_address(VirtAddr::new(range.end as _))
             .or(Err(page_mgmt::DeallocationError::Unaligned))?;
         let mut allocator = PageTableAllocator::<M>::new();
 
+        // Note: TLB entries are batch-flushed after all pages are unmapped, consistent
+        // with the Linux kernel's mmu_gather approach.
         // Note this implementation is slow as each page requires a full page table walk.
         // If we have N pages, it will be N times slower.
         let mut inner = self.inner.lock();
         for page in Page::range(start, end) {
             match inner.unmap(page) {
-                Ok((frame, fl)) => {
+                Ok((frame, _)) => {
                     if dealloc_frames {
                         unsafe { allocator.deallocate_frame(frame) };
                     }
-                    if flush_tlb && FLUSH_TLB {
-                        fl.flush();
-                    }
                 }
                 Err(X64UnmapError::PageNotMapped) => {}
                 Err(X64UnmapError::ParentEntryHugePage) => {
@@ -145,6 +192,20 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
                 }
             }
         }
+
+        if flush_tlb {
+            let page_count = (end.start_address() - start.start_address()) / Size4KiB::SIZE;
+            flush_tlb_range(start, page_count.truncate());
+        }
+
+        if clean_up_page_tables {
+            // Safety: all leaf entries in the range have been unmapped above;
+            // the caller guarantees this VA range is no longer in use.
+            unsafe {
+                inner.clean_up_addr_range(Page::range_inclusive(start, end - 1u64), &mut allocator);
+            }
+        }
+
         Ok(())
     }
 
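To illustrate the updated signature, a hypothetical call site (the page-table handle and the range are placeholders, not names from this diff): tearing down a scratch mapping, freeing its frames, flushing, and reclaiming the now-empty intermediate tables.

    // Hypothetical caller; `page_table` and `scratch_range` are placeholders.
    // Safety: no references into the unmapped range remain (same contract as before).
    unsafe {
        page_table.unmap_pages(
            scratch_range,
            /* dealloc_frames */ true,
            /* flush_tlb */ true,
            /* clean_up_page_tables */ true,
        )?;
    }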
@@ -180,7 +241,7 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
             end: user_addr_max,
         };
         // Safety: The caller ensures no references to the unmapped pages exist.
-        let _ = unsafe { self.unmap_pages(user_range, true, false) };
+        let _ = unsafe { self.unmap_pages(user_range, true, false, false) };
 
         // Clean up all empty P1 - P3 tables
         let mut allocator = PageTableAllocator::<M>::new();
@@ -204,18 +265,23 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
         let end: Page<Size4KiB> = Page::from_start_address(VirtAddr::new(old_range.end as u64))
             .or(Err(page_mgmt::RemapError::Unaligned))?;
 
+        // Note: TLB entries for the old addresses are batch-flushed after all pages
+        // are remapped, consistent with the Linux kernel's approach. The new addresses
+        // are not-present → present transitions, which also need flushing under
+        // Hyper-V's virtual TLB (see comment in `map_phys_frame_range`).
         // Note this implementation is slow as each page requires three full page table walks.
         // If we have N pages, it will be 3N times slower.
         let mut allocator = PageTableAllocator::<M>::new();
         let mut inner = self.inner.lock();
+        let flush_start = start;
         while start < end {
             match inner.translate(start.start_address()) {
                 TranslateResult::Mapped {
                     frame: _,
                     offset: _,
                     flags,
                 } => match inner.unmap(start) {
-                    Ok((frame, fl)) => {
+                    Ok((frame, _)) => {
                         match unsafe { inner.map_to(new_start, frame, flags, &mut allocator) } {
                             Ok(_) => {}
                             Err(e) => match e {
@@ -230,9 +296,6 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
                                 }
                             },
                         }
-                        if FLUSH_TLB {
-                            fl.flush();
-                        }
                     }
                     Err(X64UnmapError::PageNotMapped) => {
                         unreachable!()
@@ -255,6 +318,16 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
             new_start += 1;
         }
 
+        // Flush old (unmapped) addresses. The new (mapped) addresses are not-present →
+        // present transitions, which also require flushing under Hyper-V's virtual TLB
+        // (see comment in `map_phys_frame_range`).
+        let page_count = (end.start_address() - flush_start.start_address()) / Size4KiB::SIZE;
+        flush_tlb_range(flush_start, page_count.truncate());
+        let new_flush_start =
+            Page::<Size4KiB>::from_start_address(VirtAddr::new(new_range.start as u64))
+                .or(Err(page_mgmt::RemapError::Unaligned))?;
+        flush_tlb_range(new_flush_start, page_count.truncate());
+
         Ok(UserMutPtr::from_ptr(new_range.start as *mut u8))
     }
 
@@ -270,6 +343,8 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
             Page::from_start_address(start).or(Err(page_mgmt::PermissionUpdateError::Unaligned))?;
         let end: Page<Size4KiB> = Page::containing_address(end - 1);
 
+        // Note: TLB entries are batch-flushed after all permission updates, consistent
+        // with the Linux kernel's flush_tlb_range approach.
         // TODO: this implementation is slow as each page requires two full page table walks.
         // If we have N pages, it will be 2N times slower.
         let mut inner = self.inner.lock();
@@ -292,11 +367,7 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
             match unsafe {
                 inner.update_flags(page, (flags & !Self::MPROTECT_PTE_MASK) | new_flags)
             } {
-                Ok(fl) => {
-                    if FLUSH_TLB {
-                        fl.flush();
-                    }
-                }
+                Ok(_) => {}
                 Err(e) => match e {
                     FlagUpdateError::PageNotMapped => unreachable!(),
                     FlagUpdateError::ParentEntryHugePage => {
@@ -313,6 +384,9 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
             }
         }
 
+        let page_count = (end.start_address() - start.start_address()) / Size4KiB::SIZE + 1;
+        flush_tlb_range(start, page_count.truncate());
+
         Ok(())
     }
 
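The `+ 1` is needed because `end` here is the page containing the last byte (inclusive), unlike the exclusive end page used in `unmap_pages`. A small self-contained check of the formula, using arbitrary example addresses rather than anything from the codebase:

    use x86_64::{VirtAddr, structures::paging::{Page, PageSize, Size4KiB}};

    fn main() {
        // Updating permissions for VAs 0x1000..0x3000 touches two 4 KiB pages.
        let start: Page<Size4KiB> = Page::containing_address(VirtAddr::new(0x1000));
        let end: Page<Size4KiB> = Page::containing_address(VirtAddr::new(0x3000 - 1));
        let page_count = (end.start_address() - start.start_address()) / Size4KiB::SIZE + 1;
        assert_eq!(page_count, 2);
    }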
@@ -353,15 +427,26 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
             match unsafe {
                 inner.map_to_with_table_flags(page, target_frame, flags, flags, &mut allocator)
             } {
-                Ok(fl) => {
-                    if FLUSH_TLB {
-                        fl.flush();
-                    }
-                }
+                Ok(_) => {}
                 Err(e) => return Err(e),
             }
         }
 
+        // Note: On bare-metal x86, TLB invalidation is not architecturally required
+        // when changing a PTE from not-present to present (Intel SDM §4.10.4.3).
+        // However, Hyper-V's virtual TLB may cache not-present translations, so
+        // the guest must invalidate when transitioning from not-present to present.
+        // See TLFS §11.6.1 "Recommendations for Address Spaces":
+        // "it is recommended that operating systems running as a guest of a
+        // hypervisor invalidate [...] TLB entries that correspond to a transition
+        // from a not-present to a present state"
+        // Without this flush, the hypervisor may serve stale "not-present"
+        // translations, causing spurious page faults.
+        let start =
+            Page::<Size4KiB>::containing_address(M::pa_to_va(frame_range.start.start_address()));
+        let page_count = frame_range.len();
+        flush_tlb_range(start, page_count.truncate());
+
         Ok(M::pa_to_va(frame_range.start.start_address()).as_mut_ptr())
     }
 
@@ -399,29 +484,6 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
             .map_err(|_| MapToError::FrameAllocationFailed)?;
         let end_page = start_page + frames.len() as u64;
 
-        // Pre-scan: verify all target VAs are unmapped before modifying any page table entries.
-        // The caller provides freshly allocated VAs from the vmap allocator so collisions
-        // indicate a bug: panic in debug builds, return an error in release builds.
-        for page in Page::range(start_page, end_page) {
-            let va = page.start_address();
-            match inner.translate(va) {
-                TranslateResult::Mapped { frame, .. } => {
-                    let existing_frame =
-                        PhysFrame::<Size4KiB>::containing_address(frame.start_address());
-                    debug_assert!(
-                        false,
-                        "vmap: VA {va:?} is already mapped to {existing_frame:?}"
-                    );
-                    return Err(MapToError::PageAlreadyMapped(existing_frame));
-                }
-                TranslateResult::NotMapped => {}
-                TranslateResult::InvalidFrameAddress(_) => {
-                    debug_assert!(false, "vmap: invalid frame address at VA {va:?}");
-                    return Err(MapToError::FrameAllocationFailed);
-                }
-            }
-        }
-
         let table_flags = PageTableFlags::PRESENT | PageTableFlags::WRITABLE;
         for (page, &target_frame) in Page::range(start_page, end_page).zip(frames.iter()) {
             // Note: Since we lock the entire page table for the duration of this function (`self.inner.lock()`),
@@ -437,13 +499,14 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
                     &mut allocator,
                 )
             } {
-                Ok(fl) => {
+                Ok(_) => {
                     mapped_count += 1;
-                    if FLUSH_TLB {
-                        fl.flush();
-                    }
                 }
                 Err(e) => {
+                    debug_assert!(
+                        false,
+                        "vmap: map_to_with_table_flags failed at page {page:?}: {e:?}"
+                    );
                     if mapped_count > 0 {
                         Self::rollback_mapped_pages(
                             &mut inner,
@@ -459,6 +522,10 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
             }
         }
 
+        // See comment in `map_phys_frame_range` — Hyper-V requires TLB notification
+        // even for not-present → present transitions.
+        flush_tlb_range(start_page, mapped_count);
+
         Ok(base_va.as_mut_ptr())
     }
 
@@ -472,18 +539,19 @@ impl<M: MemoryProvider, const ALIGN: usize> X64PageTable<'_, M, ALIGN> {
         pages: x86_64::structures::paging::page::PageRangeInclusive<Size4KiB>,
         allocator: &mut PageTableAllocator<M>,
     ) {
+        let first_page = pages.start;
+        let mut count: usize = 0;
         for page in pages {
-            if let Ok((_frame, fl)) = inner.unmap(page)
-                && FLUSH_TLB
-            {
-                fl.flush();
+            if inner.unmap(page).is_ok() {
+                count += 1;
             }
         }
 
-        // Free any intermediate page-table frames (P1/P2/P3) that are now
-        // empty after unmapping.
-        // Safety: the vmap VA range is used exclusively by this page table
-        // and all leaf entries have just been unmapped above.
+        flush_tlb_range(first_page, count);
+
+        // Safety: all leaf entries in `pages` have been unmapped above while
+        // holding `self.inner`, so any P1/P2/P3 frames that became empty can
+        // be safely freed.
         unsafe {
             inner.clean_up_addr_range(pages, allocator);
         }
