diff --git a/pkg/ring0/pagetables/BUILD b/pkg/ring0/pagetables/BUILD index b111991e01..186fcf596d 100644 --- a/pkg/ring0/pagetables/BUILD +++ b/pkg/ring0/pagetables/BUILD @@ -49,6 +49,7 @@ go_library( "pagetables_aarch64.go", "pagetables_amd64.go", "pagetables_arm64.go", + "pagetables_unsafe.go", "pagetables_x86.go", "pcids.go", "pcids_aarch64.go", diff --git a/pkg/ring0/pagetables/pagetables.go b/pkg/ring0/pagetables/pagetables.go index 04f9195d71..e0c67d65fc 100644 --- a/pkg/ring0/pagetables/pagetables.go +++ b/pkg/ring0/pagetables/pagetables.go @@ -110,8 +110,9 @@ func New(a Allocator) *PageTables { type mapVisitor struct { target uintptr // Input. physical uintptr // Input. - opts MapOpts // Input. - prev bool // Output. + // opts is a pointer just to reduce stack usage. It should never be changed. + opts *MapOpts // Input. + prev bool // Output. } // visit is used for map. @@ -119,7 +120,7 @@ type mapVisitor struct { //go:nosplit func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool { p := v.physical + (start - v.target) - if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) { + if pte.Valid() && (pte.Address() != p || pte.Opts() != *v.opts) { v.prev = true } if p&align != 0 { @@ -169,7 +170,7 @@ func (p *PageTables) Map(addr hostarch.Addr, length uintptr, opts MapOpts, physi visitor: mapVisitor{ target: uintptr(addr), physical: physical, - opts: opts, + opts: &opts, }, } w.iterateRange(uintptr(addr), uintptr(addr)+length) diff --git a/pkg/ring0/pagetables/pagetables_aarch64.go b/pkg/ring0/pagetables/pagetables_aarch64.go index 97ce934e08..ca0e8c384b 100644 --- a/pkg/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/ring0/pagetables/pagetables_aarch64.go @@ -91,6 +91,9 @@ type MapOpts struct { // User indicates the page is a user page. User bool + // Static indicates the entries should not be cleared/freed. + Static bool + // MemoryType is the memory type. MemoryType hostarch.MemoryType } @@ -156,7 +159,7 @@ func (p *PTE) IsSect() bool { // This does not change the sect page property. // //go:nosplit -func (p *PTE) Set(addr uintptr, opts MapOpts) { +func (p *PTE) Set(addr uintptr, opts *MapOpts) { v := (addr &^ optionMask) | nG | readOnly | protDefault // Note: p.IsSect is manually inlined to reduce stack size for // nosplit-ness. diff --git a/pkg/ring0/pagetables/pagetables_unsafe.go b/pkg/ring0/pagetables/pagetables_unsafe.go new file mode 100644 index 0000000000..b9f5724bc3 --- /dev/null +++ b/pkg/ring0/pagetables/pagetables_unsafe.go @@ -0,0 +1,26 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +import ( + "unsafe" +) + +// Get returns the entry with the specified index.
+// +//go:nosplit +func (p *PTEs) Get(idx uint16) *PTE { + return (*PTE)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + 8*uintptr(idx))) +} diff --git a/pkg/ring0/pagetables/pagetables_x86.go b/pkg/ring0/pagetables/pagetables_x86.go index 2109ccdf33..5b0947c643 100644 --- a/pkg/ring0/pagetables/pagetables_x86.go +++ b/pkg/ring0/pagetables/pagetables_x86.go @@ -73,6 +73,9 @@ type MapOpts struct { // User indicates the page is a user page. User bool + // Static indicates the entries should not be cleared/freed. + Static bool + // MemoryType is the memory type. MemoryType hostarch.MemoryType } @@ -91,7 +94,7 @@ func (p *PTE) Clear() { // //go:nosplit func (p *PTE) Valid() bool { - return atomic.LoadUintptr((*uintptr)(p))&present != 0 + return atomic.LoadUintptr((*uintptr)(p)) != 0 } // Opts returns the PTE options. @@ -138,8 +141,8 @@ func (p *PTE) IsSuper() bool { // This does not change the super page property. // //go:nosplit -func (p *PTE) Set(addr uintptr, opts MapOpts) { - if !opts.AccessType.Any() { +func (p *PTE) Set(addr uintptr, opts *MapOpts) { + if !opts.AccessType.Any() && !opts.Static { p.Clear() return } diff --git a/pkg/ring0/pagetables/walker_amd64.go b/pkg/ring0/pagetables/walker_amd64.go index 1168e82f88..0e10cbc331 100644 --- a/pkg/ring0/pagetables/walker_amd64.go +++ b/pkg/ring0/pagetables/walker_amd64.go @@ -43,7 +43,7 @@ func (w *Walker) walkPTEs(entries *PTEs, start, end uintptr) (bool, uint16) { var clearEntries uint16 for start < end { pteIndex := uint16((start & pteMask) >> pteShift) - entry := &entries[pteIndex] + entry := entries.Get(pteIndex) if !entry.Valid() && !w.visitor.requiresAlloc() { clearEntries++ start += pteSize @@ -81,7 +81,7 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) { var pteEntries *PTEs nextBoundary := addrEnd(start, end, pmdSize) pmdIndex := uint16((start & pmdMask) >> pmdShift) - pmdEntry := &pmdEntries[pmdIndex] + pmdEntry := pmdEntries.Get(pmdIndex) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. @@ -114,9 +114,10 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) { // Install the relevant entries. pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { + opts := pmdEntry.Opts() pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), - pmdEntry.Opts()) + &opts) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { @@ -173,7 +174,7 @@ func (w *Walker) walkPUDs(pudEntries *PTEs, start, end uintptr) (bool, uint16) { var pmdEntries *PTEs nextBoundary := addrEnd(start, end, pudSize) pudIndex := uint16((start & pudMask) >> pudShift) - pudEntry := &pudEntries[pudIndex] + pudEntry := pudEntries.Get(pudIndex) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. @@ -209,9 +210,10 @@ func (w *Walker) walkPUDs(pudEntries *PTEs, start, end uintptr) (bool, uint16) { pmdEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above. 
for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSuper() + opts := pudEntry.Opts() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), - pudEntry.Opts()) + &opts) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { @@ -261,7 +263,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool { var pudEntries *PTEs nextBoundary := addrEnd(start, end, pgdSize) pgdIndex := uint16((start & pgdMask) >> pgdShift) - pgdEntry := &w.pageTables.root[pgdIndex] + pgdEntry := w.pageTables.root.Get(pgdIndex) if !w.pageTables.largeAddressesEnabled { if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { diff --git a/pkg/ring0/pagetables/walker_arm64.go b/pkg/ring0/pagetables/walker_arm64.go index 726672f817..92dcfe6d3c 100644 --- a/pkg/ring0/pagetables/walker_arm64.go +++ b/pkg/ring0/pagetables/walker_arm64.go @@ -87,9 +87,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSect() + opts := pudEntry.Opts() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), - pudEntry.Opts()) + &opts) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { @@ -152,9 +153,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool { // Install the relevant entries. pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { + opts := pmdEntry.Opts() pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), - pmdEntry.Opts()) + &opts) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 4e51e64aea..1675d55db8 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -25,11 +25,14 @@ import ( var ( // faultBlockSize is the size used for servicing memory faults. // - // This should be large enough to avoid frequent faults and avoid using - // all available KVM slots (~512), but small enough that KVM does not - // complain about slot sizes (~4GB). See handleBluepillFault for how - // this block is used. - faultBlockSize = uintptr(2 << 30) + // This should be large enough so that the total number of slots + // required to cover the 47-bit virtual address space does not exceed + // the KVM slot limit (e.g. 32764). Linux doesn't allocate virtual + // address space above 47-bit by default. + // It must be small enough to limit the memory overhead associated with + // KVM slot allocation. For example, using a 46-bit address space + // results in an overhead of ~250 MB. + faultBlockSize = uintptr(8 << 30) // faultBlockMask is the mask for the fault blocks. // @@ -56,13 +59,17 @@ func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, leng } // Adjust the block to match our size. 
- physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask - virtualStart = pr.virtual + (physicalStart - pr.physical) + physicalStart = pr.physical / faultBlockSize * faultBlockSize + physicalStart = physicalStart + (alignedPhysical-physicalStart)&faultBlockMask physicalEnd := physicalStart + faultBlockSize + if physicalStart < pr.physical { + physicalStart = pr.physical + } if physicalEnd > end { physicalEnd = end } length = physicalEnd - physicalStart + virtualStart = pr.virtual + (physicalStart - pr.physical) return virtualStart, physicalStart, length, &physicalRegions[i] } diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index d64db6a5fd..8e8deacb77 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -39,6 +39,7 @@ import ( var dummyFPState fpu.State type testHarness interface { + Logf(format string, args ...any) Errorf(format string, args ...any) Fatalf(format string, args ...any) } @@ -146,6 +147,7 @@ func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn f // done for regular user code, but is fine for test // purposes.) applyPhysicalRegions(func(pr physicalRegion) bool { + t.Logf("Map %x-%x", pr.virtual, pr.virtual+pr.length) pt.Map(hostarch.Addr(pr.virtual), pr.length, pagetables.MapOpts{ AccessType: hostarch.AnyAccess, User: true, diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 78b7de31a8..43ba744686 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -332,26 +332,33 @@ func newMachine(vm int, config *Config) (*machine, error) { // faultBlockSize has to equal or less than KVM_MEM_MAX_NR_PAGES. faultBlockSize = uintptr(1) << 42 faultBlockMask = ^uintptr(faultBlockSize - 1) + for _, r := range physicalRegions { + m.mapPhysical(r.physical, r.length) + } } else { + // Apply the physical mappings. Note that these mappings may point to + // guest physical addresses that are not actually available. These + // physical pages are mapped on demand, see kernel_unsafe.go. + applyPhysicalRegions(func(pr physicalRegion) bool { + physical := pr.physical + for physical < pr.physical+pr.length { + virtualStart, physicalStart, length, _ := calculateBluepillFault(physical) + // Pre-allocate page tables in the lower half. + m.kernel.PageTables.Map( + hostarch.Addr(virtualStart), + length, + pagetables.MapOpts{Static: true}, + physicalStart) + physical += length + } + + return true // Keep iterating. + }) // Install seccomp rules to trap runtime mmap system calls. They will // be handled by seccompMmapHandler. seccompMmapRules(m) } - // Apply the physical mappings. Note that these mappings may point to - // guest physical addresses that are not actually available. These - // physical pages are mapped on demand, see kernel_unsafe.go. - applyPhysicalRegions(func(pr physicalRegion) bool { - // Map everything in the lower half. - m.kernel.PageTables.Map( - hostarch.Addr(pr.virtual), - pr.length, - pagetables.MapOpts{AccessType: hostarch.ReadWrite}, - pr.physical) - - return true // Keep iterating. - }) - // Ensure that the currently mapped virtual regions are actually // available in the VM. Note that this doesn't guarantee no future // faults, however it should guarantee that everything is available to @@ -368,6 +375,9 @@ func newMachine(vm int, config *Config) (*machine, error) { // Cap the length to the end of the area. 
length = vr.virtual + vr.length - virtual } + // Ensure the physical range is mapped. + m.mapPhysical(physical, length) + // Update page tables for executable mappings. if vr.accessType.Execute { if vr.accessType.Write { @@ -380,8 +390,6 @@ func newMachine(vm int, config *Config) (*machine, error) { physical) } - // Ensure the physical range is mapped. - m.mapPhysical(physical, length) virtual += length } } @@ -404,11 +412,6 @@ func newMachine(vm int, config *Config) (*machine, error) { mapRegion(vr, 0) }) - if mapEntireAddressSpace { - for _, r := range physicalRegions { - m.mapPhysical(r.physical, r.length) - } - } enableAsyncPreemption() // Initialize architecture state. if err := m.initArchState(); err != nil { @@ -458,8 +461,15 @@ func (m *machine) mapPhysical(physical, length uintptr) { } // Is this already mapped? Check the usedSlots. - if !pr.mmio && !m.hasSlot(physicalStart) { - m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly) + if !m.hasSlot(physicalStart) { + m.kernel.PageTables.Map( + hostarch.Addr(virtualStart), + length, + pagetables.MapOpts{AccessType: hostarch.ReadWrite}, + physicalStart) + if !pr.mmio { + m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly) + } } // Move to the next chunk. diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 9784e52041..95ed0790f4 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -66,6 +66,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { pSize := uintptr(1) << ring0.PhysicalAddressBits pSize -= reservedMemory + maxUserAddr := uintptr(0) // Add specifically excluded regions; see excludeVirtualRegion. if err := applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) { @@ -81,10 +82,17 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { }) log.Infof("mmio: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length) } + if vr.filename != "[vsyscall]" { + maxUserAddr = vr.region.virtual + vr.region.length + } }); err != nil { panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err)) } + var archRegions []specialVirtualRegion + vSize, archRegions = archSpecialRegions(vSize, maxUserAddr) + specialRegions = append(specialRegions, archRegions...) + // Do we need any more work? if vSize < pSize { return specialRegions @@ -109,7 +117,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { current := required // Attempted mmap size. filled := uintptr(0) suggestedAddr := uintptr(0) - if ring0.VirtualAddressBits > 48 { + if exendedAddressSpaceAllowed && ring0.VirtualAddressBits > 48 { // Pass a hint address above 47 bits to indicate to the kernel that // we can handle, and want, mappings above 47 bits: // https://docs.kernel.org/arch/x86/x86_64/5level-paging.html#user-space-and-large-virtual-address-space. diff --git a/pkg/sentry/platform/kvm/physical_map_amd64.go b/pkg/sentry/platform/kvm/physical_map_amd64.go index c5adfb577f..664421e90a 100644 --- a/pkg/sentry/platform/kvm/physical_map_amd64.go +++ b/pkg/sentry/platform/kvm/physical_map_amd64.go @@ -14,9 +14,62 @@ package kvm +import ( + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/ring0" +) + const ( // reservedMemory is a chunk of physical memory reserved starting at // physical address zero. There are some special pages in this region, // so we just call the whole thing off. 
reservedMemory = 0x100000000 ) + +const ( + // defaultAddressSpaceSize is the default limit for the user virtual + // address space, which is 47 bits (2^47 bytes). The mmap syscall + // respects this limit by default, even with 5-level page tables + // enabled. + defaultAddressSpaceSize = uintptr(1) << 47 + + // exendedAddressSpaceAllowed controls address space usage beyond + // the default 47-bit limit. It is set to 'false' for several reasons: + // * There are no known use cases requiring the extended address space. + // * By restricting the size, we avoid the overhead of: + // a) Aligning the virtual address space size to the physical + // address space size. + // b) Creating unnecessary page table entries for the unused + // extended range. + // * The memory slot size is currently configured only to cover + // the default 47-bit address space. + // * 5-level page table support was primarily introduced to work around + // a specific kernel bug where VDSO could be mapped above the 47-bit + // boundary (v6.9-rc1~186^2~7). + exendedAddressSpaceAllowed = false +) + +// archSpecialRegions returns special regions that are excluded from the virtual +// address space. Linux doesn't map VMAs above 47-bit by default. +func archSpecialRegions(vSize uintptr, maxUserAddr uintptr) (uintptr, []specialVirtualRegion) { + var specialRegions []specialVirtualRegion + if exendedAddressSpaceAllowed || vSize <= defaultAddressSpaceSize { + return vSize, nil + } + // This is a workaround for the kernel bug where the vdso can be + // mapped above the 47-bit address space boundary. + if defaultAddressSpaceSize > maxUserAddr { + maxUserAddr = defaultAddressSpaceSize + } + r := region{ + virtual: maxUserAddr, + length: ring0.MaximumUserAddress - defaultAddressSpaceSize, + } + specialRegions = append(specialRegions, specialVirtualRegion{ + region: r, + }) + vSize -= r.length + log.Infof("excluded: virtual [%x,%x)", r.virtual, r.virtual+r.length) + + return vSize, specialRegions +} diff --git a/pkg/sentry/platform/kvm/physical_map_arm64.go index 4d85614539..bd4d06aa36 100644 --- a/pkg/sentry/platform/kvm/physical_map_arm64.go +++ b/pkg/sentry/platform/kvm/physical_map_arm64.go @@ -16,4 +16,10 @@ package kvm const ( reservedMemory = 0 + // 5-level page tables are not implemented on arm64. + exendedAddressSpaceAllowed = false ) + +func archSpecialRegions(vSize uintptr, maxUserAddr uintptr) (uintptr, []specialVirtualRegion) { + return vSize, nil +}
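
The pagetables changes pass MapOpts into PTE.Set by pointer and index PTEs through the new unsafe Get helper; both exist to keep the //go:nosplit walker's stack frames small (no struct copy, no bounds-check panic path). Below is a minimal standalone sketch of the unsafe indexing pattern only; entry, entries, and get are illustrative stand-ins, not the real PTE/PTEs types.

package main

import (
	"fmt"
	"unsafe"
)

// entry stands in for a page table entry: one machine word.
type entry uintptr

// entries stands in for PTEs: a page worth of entries.
type entries [512]entry

// get mirrors the PTEs.Get pattern from the diff: the element address is
// computed with raw pointer arithmetic instead of &e[idx], which avoids the
// bounds check and keeps the generated code trivially nosplit-safe.
//
//go:nosplit
func (e *entries) get(idx uint16) *entry {
	return (*entry)(unsafe.Pointer(uintptr(unsafe.Pointer(&e[0])) +
		unsafe.Sizeof(entry(0))*uintptr(idx)))
}

func main() {
	var e entries
	e[7] = 0x1234
	// Same element either way; only the generated access code differs.
	fmt.Println(e.get(7) == &e[7], *e.get(7))
}

The real Get hard-codes the 8-byte entry size; unsafe.Sizeof is used here only to keep the sketch self-describing.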
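
The faultBlockSize comment in bluepill_fault.go can be sanity-checked with simple arithmetic, assuming the 47-bit user address space and the 32764-slot limit it cites:

package main

import "fmt"

func main() {
	const (
		addressSpace = uint64(1) << 47 // 47-bit user virtual address space.
		slotLimit    = 32764           // KVM memory slot limit cited in the comment.
		oldBlockSize = uint64(2) << 30 // previous faultBlockSize (2 GiB).
		newBlockSize = uint64(8) << 30 // new faultBlockSize (8 GiB).
	)
	// 2^47 / 2 GiB = 65536 slots: over the limit.
	fmt.Println(addressSpace/oldBlockSize, addressSpace/oldBlockSize <= slotLimit)
	// 2^47 / 8 GiB = 16384 slots: well under the limit.
	fmt.Println(addressSpace/newBlockSize, addressSpace/newBlockSize <= slotLimit)
}

This is why 2 GiB blocks could exhaust slots once the whole 47-bit range is covered, while 8 GiB blocks leave ample headroom.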
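
The reworked calculateBluepillFault aligns the fault block to a global faultBlockSize boundary and then clamps it to the physical region, instead of aligning relative to the region start. A standalone sketch of just that arithmetic follows; blockFor and the region numbers are made up for illustration and elide the page-alignment and region-lookup steps of the real function.

package main

import "fmt"

const faultBlockSize = uintptr(8) << 30 // 8 GiB, as in the diff.
const faultBlockMask = ^uintptr(faultBlockSize - 1)

// blockFor mirrors the adjusted block computation: align down to a global
// faultBlockSize boundary, then clamp the block to [regionStart, regionEnd).
func blockFor(alignedPhysical, regionStart, regionEnd uintptr) (start, length uintptr) {
	start = regionStart / faultBlockSize * faultBlockSize
	start = start + (alignedPhysical-start)&faultBlockMask
	end := start + faultBlockSize
	if start < regionStart {
		start = regionStart
	}
	if end > regionEnd {
		end = regionEnd
	}
	return start, end - start
}

func main() {
	// A hypothetical 20 GiB region starting at 3 GiB, with a fault 10 GiB in.
	regionStart := uintptr(3) << 30
	regionEnd := regionStart + (uintptr(20) << 30)
	start, length := blockFor(regionStart+(uintptr(10)<<30), regionStart, regionEnd)
	// Prints a block aligned to an 8 GiB boundary: [0x200000000, 0x400000000).
	fmt.Printf("block [%#x, %#x), %d GiB\n", start, start+length, length>>30)
}

One apparent benefit is that a given physical address now falls on the same block boundary regardless of where its region starts, which suits the newMachine loop that pre-populates page tables by stepping through regions in calculateBluepillFault-sized chunks.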
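
fillAddressSpace now tracks maxUserAddr (ignoring [vsyscall]) so that archSpecialRegions can exclude the space between the top of the existing user mappings and the upper end of the user address range. Below is a rough, self-contained sketch of that scan over /proc/self/maps; it is not gVisor's applyVirtualRegions, and the exclusion against ring0.MaximumUserAddress is only indicated in a comment since that value is architecture-specific.

package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

func main() {
	f, err := os.Open("/proc/self/maps")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	maxUserAddr := uint64(0)
	s := bufio.NewScanner(f)
	for s.Scan() {
		fields := strings.Fields(s.Text())
		if len(fields) == 0 {
			continue
		}
		// Skip [vsyscall]: it sits at a fixed, very high address and says
		// nothing about where real user mappings end.
		if fields[len(fields)-1] == "[vsyscall]" {
			continue
		}
		addrs := strings.SplitN(fields[0], "-", 2)
		if len(addrs) != 2 {
			continue
		}
		end, err := strconv.ParseUint(addrs[1], 16, 64)
		if err != nil {
			continue
		}
		if end > maxUserAddr {
			maxUserAddr = end
		}
	}

	// Mirror archSpecialRegions on amd64: never exclude anything below the
	// 47-bit boundary, then carve out [maxUserAddr, ring0.MaximumUserAddress).
	const defaultAddressSpaceSize = uint64(1) << 47
	if maxUserAddr < defaultAddressSpaceSize {
		maxUserAddr = defaultAddressSpaceSize
	}
	fmt.Printf("would exclude user VA starting at %#x\n", maxUserAddr)
}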