Minor page tables improvements.

* Make split safe.
* Enable looking up next valid address.
* Support mappings with !accessType.Any(), distinct from unmap.

These changes allow for the use of pagetables in low-level OS packages, such
as ring0, and allow for the use of pagetables for more generic address space
reservation (by writing entries with no access specified).

Updates #5039

PiperOrigin-RevId: 355109016
Adin Scannell authored 2021-02-02 00:08:37 -08:00, committed by gVisor bot
parent 0c8cc66117
commit 5fa683ffdf
10 changed files with 212 additions and 247 deletions

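Taken together, these changes let a caller reserve address space by writing entries with no access specified, and later probe the tables with the extended Lookup. What follows is a minimal, hypothetical sketch of the resulting API surface; the addresses, the allocator choice, and the import paths are illustrative assumptions, not part of this commit.

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
	"gvisor.dev/gvisor/pkg/usermem"
)

func main() {
	pt := pagetables.New(pagetables.NewRuntimeAllocator())

	// Reserve 2MB with no access specified: the entries are written with
	// their physical addresses but left non-valid in hardware. This is now
	// distinct from Unmap, which clears the entries entirely.
	pt.Map(usermem.Addr(0x400000), 0x200000, pagetables.MapOpts{}, 0x1234000)

	// Later, make the first page of the reservation actually usable.
	pt.Map(usermem.Addr(0x400000), usermem.PageSize, pagetables.MapOpts{
		AccessType: usermem.ReadWrite,
	}, 0x1234000)

	// Find the next valid mapping at or after an address (findFirst=true).
	virtual, physical, size, opts := pt.Lookup(0, true)
	fmt.Println(virtual, physical, size, opts)
}

Because a no-access entry is written but not hardware-valid, the reservation survives in the tables while remaining distinct from an unmapped (cleared) range.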
BUILD

@@ -9,7 +9,10 @@ package(licenses = ["notice"])
 # architecture builds.
 go_template(
     name = "generic_walker_%s" % arch,
-    srcs = ["walker_%s.go" % arch],
+    srcs = [
+        "walker_generic.go",
+        "walker_%s.go" % arch,
+    ],
     opt_types = [
         "Visitor",
     ],
@@ -50,6 +53,7 @@ go_library(
         "pcids_x86.go",
         "walker_amd64.go",
         "walker_arm64.go",
+        "walker_generic.go",
         ":walker_empty_amd64",
         ":walker_empty_arm64",
         ":walker_lookup_amd64",

pagetables.go

@@ -60,6 +60,7 @@ type PageTables struct {
 // Init initializes a set of PageTables.
 //
+// +checkescape:hard,stack
 //go:nosplit
 func (p *PageTables) Init(allocator Allocator) {
 	p.Allocator = allocator
@@ -92,7 +93,6 @@ func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uin
 	}
 	p.InitArch(a)
 	return p
 }
@@ -112,7 +112,7 @@ type mapVisitor struct {
 // visit is used for map.
 //
 //go:nosplit
-func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
 	p := v.physical + (start - uintptr(v.target))
 	if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) {
 		v.prev = true
@@ -122,9 +122,10 @@ func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 		// install a valid entry here, however we must zap any existing
 		// entry to ensure this happens.
 		pte.Clear()
-		return
+		return true
 	}
 	pte.Set(p, v.opts)
+	return true
 }
 
 //go:nosplit
@@ -140,7 +141,6 @@ func (*mapVisitor) requiresSplit() bool { return true }
 // Precondition: addr & length must be page-aligned, their sum must not overflow.
 //
 // +checkescape:hard,stack
-//
 //go:nosplit
 func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
 	if p.readOnlyShared {
@@ -158,9 +158,6 @@ func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physic
 			length = p.upperStart - uintptr(addr)
 		}
 	}
-	if !opts.AccessType.Any() {
-		return p.Unmap(addr, length)
-	}
 	w := mapWalker{
 		pageTables: p,
 		visitor: mapVisitor{
@@ -187,9 +184,10 @@ func (*unmapVisitor) requiresSplit() bool { return true }
 // visit unmaps the given entry.
 //
 //go:nosplit
-func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
 	pte.Clear()
 	v.count++
+	return true
 }
 
 // Unmap unmaps the given range.
@@ -199,7 +197,6 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 // Precondition: addr & length must be page-aligned, their sum must not overflow.
 //
 // +checkescape:hard,stack
-//
 //go:nosplit
 func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
 	if p.readOnlyShared {
@@ -241,8 +238,9 @@ func (*emptyVisitor) requiresSplit() bool { return false }
 // visit unmaps the given entry.
 //
 //go:nosplit
-func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
 	v.count++
+	return true
 }
 
 // IsEmpty checks if the given range is empty.
@@ -250,7 +248,6 @@ func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 // Precondition: addr & length must be page-aligned.
 //
 // +checkescape:hard,stack
-//
 //go:nosplit
 func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
 	w := emptyWalker{
@@ -262,20 +259,28 @@ func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
 // lookupVisitor is used for lookup.
 type lookupVisitor struct {
-	target   uintptr // Input.
-	physical uintptr // Output.
-	opts     MapOpts // Output.
+	target    uintptr // Input & Output.
+	findFirst bool    // Input.
+	physical  uintptr // Output.
+	size      uintptr // Output.
+	opts      MapOpts // Output.
 }
 
 // visit matches the given address.
 //
 //go:nosplit
-func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
 	if !pte.Valid() {
-		return
+		// If looking for the first, then we just keep iterating until
+		// we find a valid entry.
+		return v.findFirst
 	}
-	v.physical = pte.Address() + (start - uintptr(v.target))
+	// Is this within the current range?
+	v.target = start
+	v.physical = pte.Address()
+	v.size = (align + 1)
 	v.opts = pte.Opts()
+	return false
 }
 
 //go:nosplit
@@ -286,20 +291,29 @@ func (*lookupVisitor) requiresSplit() bool { return false }
 // Lookup returns the physical address for the given virtual address.
 //
-// +checkescape:hard,stack
+// If findFirst is true, then the next valid address after addr is returned.
+// If findFirst is false, then only a mapping for addr will be returned.
 //
+// Note that if size is zero, then no matching entry was found.
+//
+// +checkescape:hard,stack
 //go:nosplit
-func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
+func (p *PageTables) Lookup(addr usermem.Addr, findFirst bool) (virtual usermem.Addr, physical, size uintptr, opts MapOpts) {
 	mask := uintptr(usermem.PageSize - 1)
-	offset := uintptr(addr) & mask
+	addr &^= usermem.Addr(mask)
 	w := lookupWalker{
 		pageTables: p,
 		visitor: lookupVisitor{
-			target: uintptr(addr &^ usermem.Addr(mask)),
+			target:    uintptr(addr),
+			findFirst: findFirst,
 		},
 	}
-	w.iterateRange(uintptr(addr), uintptr(addr)+1)
-	return w.visitor.physical + offset, w.visitor.opts
+	end := ^usermem.Addr(0) &^ usermem.Addr(mask)
+	if !findFirst {
+		end = addr + 1
+	}
+	w.iterateRange(uintptr(addr), uintptr(end))
+	return usermem.Addr(w.visitor.target), w.visitor.physical, w.visitor.size, w.visitor.opts
 }
 
 // MarkReadOnlyShared marks the pagetables read-only and can be shared.

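The findFirst mode above makes it cheap to enumerate every valid mapping: start at zero and resume after each returned range. A hedged sketch against the new signature (forEachMapping is a hypothetical helper, not part of this commit):

// forEachMapping invokes f for each valid mapping, in address order.
func forEachMapping(pt *pagetables.PageTables, f func(virtual usermem.Addr, physical, size uintptr, opts pagetables.MapOpts)) {
	addr := usermem.Addr(0)
	for {
		virtual, physical, size, opts := pt.Lookup(addr, true /* findFirst */)
		if size == 0 {
			return // No valid mappings at or after addr.
		}
		f(virtual, physical, size, opts)
		next := virtual + usermem.Addr(size)
		if next <= addr {
			return // Wrapped around or failed to advance.
		}
		addr = next
	}
}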
pagetables_aarch64.go

@@ -156,12 +156,7 @@ func (p *PTE) IsSect() bool {
 //
 //go:nosplit
 func (p *PTE) Set(addr uintptr, opts MapOpts) {
-	if !opts.AccessType.Any() {
-		p.Clear()
-		return
-	}
-	v := (addr &^ optionMask) | protDefault | nG | readOnly
+	v := (addr &^ optionMask) | nG | readOnly | protDefault
 	if p.IsSect() {
 		// Note that this is inherited from the previous instance. Set
 		// does not change the value of Sect. See above.
@@ -169,6 +164,10 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
 	} else {
 		v |= typePage
 	}
+	if !opts.AccessType.Any() {
+		// Leave as non-valid if no access is available.
+		v &^= pteValid
+	}
 	if opts.Global {
 		v = v &^ nG
pagetables_amd64.go

@@ -43,6 +43,7 @@ const (
 // InitArch does some additional initialization related to the architecture.
 //
+// +checkescape:hard,stack
 //go:nosplit
 func (p *PageTables) InitArch(allocator Allocator) {
 	if p.upperSharedPageTables != nil {
@@ -50,6 +51,7 @@ func (p *PageTables) InitArch(allocator Allocator) {
 	}
 }
 
+//go:nosplit
 func pgdIndex(upperStart uintptr) uintptr {
 	if upperStart&(pgdSize-1) != 0 {
 		panic("upperStart should be pgd size aligned")

pagetables_arm64.go

@@ -44,6 +44,7 @@ const (
 // InitArch does some additional initialization related to the architecture.
 //
+// +checkescape:hard,stack
 //go:nosplit
 func (p *PageTables) InitArch(allocator Allocator) {
 	if p.upperSharedPageTables != nil {

pagetables_test.go

@@ -34,7 +34,7 @@ type checkVisitor struct {
 	failed string // Output.
 }
 
-func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
 	v.found = append(v.found, mapping{
 		start:  start,
 		length: align + 1,
@@ -43,7 +43,7 @@ func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 	})
 	if v.failed != "" {
 		// Don't keep looking for errors.
-		return
+		return false
 	}
 	if v.current >= len(v.expected) {
@@ -58,6 +58,7 @@ func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 		v.failed = "opts didn't match"
 	}
 	v.current++
+	return true
 }
 
 func (*checkVisitor) requiresAlloc() bool { return false }

pagetables_x86.go

@@ -137,7 +137,10 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
 		p.Clear()
 		return
 	}
-	v := (addr &^ optionMask) | present | accessed
+	v := (addr &^ optionMask)
+	if opts.AccessType.Any() {
+		v |= present | accessed
+	}
 	if opts.User {
 		v |= user
 	}

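This hunk is the x86 half of the no-access mapping support: the physical address is always encoded into the entry, while present and accessed are set only when some access is requested, so a reservation stays in the tables without being hardware-valid. A simplified, self-contained illustration; the bit positions are the standard x86 ones, but the names and mask are illustrative rather than the package's actual constants:

package main

import "fmt"

const (
	present  uintptr = 1 << 0 // x86 P bit.
	accessed uintptr = 1 << 5 // x86 A bit.
	optMask  uintptr = 0xfff  // Illustrative option mask.
)

// encode mirrors the new PTE.Set logic: the address is always stored, but
// the entry is marked present only when some access is requested.
func encode(addr uintptr, hasAccess bool) uintptr {
	v := addr &^ optMask
	if hasAccess {
		v |= present | accessed
	}
	return v
}

func main() {
	fmt.Printf("%#x\n", encode(0x1234000, false)) // 0x1234000: reserved, not present.
	fmt.Printf("%#x\n", encode(0x1234000, true))  // 0x1234021: present|accessed set.
}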
walker_amd64.go

@@ -16,104 +16,10 @@
 package pagetables
 
-// Visitor is a generic type.
-type Visitor interface {
-	// visit is called on each PTE.
-	visit(start uintptr, pte *PTE, align uintptr)
-
-	// requiresAlloc indicates that new entries should be allocated within
-	// the walked range.
-	requiresAlloc() bool
-
-	// requiresSplit indicates that entries in the given range should be
-	// split if they are huge or jumbo pages.
-	requiresSplit() bool
-}
-
-// Walker walks page tables.
-type Walker struct {
-	// pageTables are the tables to walk.
-	pageTables *PageTables
-
-	// Visitor is the set of arguments.
-	visitor Visitor
-}
-
-// iterateRange iterates over all appropriate levels of page tables for the given range.
-//
-// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
-// exception is super pages. If a valid super page (huge or jumbo) cannot be
-// installed, then the walk will continue to individual entries.
-//
-// This algorithm will attempt to maximize the use of super pages whenever
-// possible. Whether a super page is provided will be clear through the range
-// provided in the callback.
-//
-// Note that if requiresAlloc is true, then no gaps will be present. However,
-// if alloc is not set, then the iteration will likely be full of gaps.
-//
-// Note that this function should generally be avoided in favor of Map, Unmap,
-// etc. when not necessary.
-//
-// Precondition: start must be page-aligned.
-//
-// Precondition: start must be less than end.
-//
-// Precondition: If requiresAlloc is true, then start and end should not span
-// non-canonical ranges. If they do, a panic will result.
-//
-//go:nosplit
-func (w *Walker) iterateRange(start, end uintptr) {
-	if start%pteSize != 0 {
-		panic("unaligned start")
-	}
-	if end < start {
-		panic("start > end")
-	}
-	if start < lowerTop {
-		if end <= lowerTop {
-			w.iterateRangeCanonical(start, end)
-		} else if end > lowerTop && end <= upperBottom {
-			if w.visitor.requiresAlloc() {
-				panic("alloc spans non-canonical range")
-			}
-			w.iterateRangeCanonical(start, lowerTop)
-		} else {
-			if w.visitor.requiresAlloc() {
-				panic("alloc spans non-canonical range")
-			}
-			w.iterateRangeCanonical(start, lowerTop)
-			w.iterateRangeCanonical(upperBottom, end)
-		}
-	} else if start < upperBottom {
-		if end <= upperBottom {
-			if w.visitor.requiresAlloc() {
-				panic("alloc spans non-canonical range")
-			}
-		} else {
-			if w.visitor.requiresAlloc() {
-				panic("alloc spans non-canonical range")
-			}
-			w.iterateRangeCanonical(upperBottom, end)
-		}
-	} else {
-		w.iterateRangeCanonical(start, end)
-	}
-}
-
-// next returns the next address quantized by the given size.
-//
-//go:nosplit
-func next(start uintptr, size uintptr) uintptr {
-	start &= ^(size - 1)
-	start += size
-	return start
-}
-
 // iterateRangeCanonical walks a canonical range.
 //
 //go:nosplit
-func (w *Walker) iterateRangeCanonical(start, end uintptr) {
+func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
 	for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
 		var (
 			pgdEntry   = &w.pageTables.root[pgdIndex]
@@ -127,10 +33,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 			}
 
 			// Allocate a new pgd.
-			pudEntries = w.pageTables.Allocator.NewPTEs()
+			pudEntries = w.pageTables.Allocator.NewPTEs() // escapes: depends on allocator.
 			pgdEntry.setPageTable(w.pageTables, pudEntries)
 		} else {
-			pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+			pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) // escapes: see above.
 		}
 
 		// Map the next level.
@@ -155,7 +61,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 				// new page for the pmd.
 				if start&(pudSize-1) == 0 && end-start >= pudSize {
 					pudEntry.SetSuper()
-					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+						return false
+					}
 					if pudEntry.Valid() {
 						start = next(start, pudSize)
 						continue
@@ -163,14 +71,14 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 				}
 
 				// Allocate a new pud.
-				pmdEntries = w.pageTables.Allocator.NewPTEs()
+				pmdEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above.
 				pudEntry.setPageTable(w.pageTables, pmdEntries)
 
 			} else if pudEntry.IsSuper() {
 				// Does this page need to be split?
 				if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) {
 					// Install the relevant entries.
-					pmdEntries = w.pageTables.Allocator.NewPTEs()
+					pmdEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above.
 					for index := uint16(0); index < entriesPerPage; index++ {
 						pmdEntries[index].SetSuper()
 						pmdEntries[index].Set(
@@ -180,7 +88,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 					pudEntry.setPageTable(w.pageTables, pmdEntries)
 				} else {
 					// A super page to be checked directly.
-					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+						return false
+					}
 
 					// Might have been cleared.
 					if !pudEntry.Valid() {
@@ -192,7 +102,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 					continue
 				}
 			} else {
-				pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+				pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) // escapes: see above.
 			}
 
 			// Map the next level, since this is valid.
@@ -216,7 +126,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 					// As above, we can skip allocating a new page.
 					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
 						pmdEntry.SetSuper()
-						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+							return false
+						}
 						if pmdEntry.Valid() {
 							start = next(start, pmdSize)
 							continue
@@ -224,7 +136,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 					}
 
 					// Allocate a new pmd.
-					pteEntries = w.pageTables.Allocator.NewPTEs()
+					pteEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above.
 					pmdEntry.setPageTable(w.pageTables, pteEntries)
 
 				} else if pmdEntry.IsSuper() {
@@ -240,7 +152,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 						pmdEntry.setPageTable(w.pageTables, pteEntries)
 					} else {
 						// A huge page to be checked directly.
-						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+							return false
+						}
 
 						// Might have been cleared.
 						if !pmdEntry.Valid() {
@@ -252,7 +166,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 						continue
 					}
 				} else {
-					pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+					pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) // escapes: see above.
 				}
 
 				// Map the next level, since this is valid.
@@ -269,11 +183,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 					}
 
 					// At this point, we are guaranteed that start%pteSize == 0.
-					w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
-					if !pteEntry.Valid() {
-						if w.visitor.requiresAlloc() {
-							panic("PTE not set after iteration with requiresAlloc!")
-						}
+					if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) {
+						return false
+					}
+					if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
 						clearPTEEntries++
 					}
@@ -285,7 +198,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 				// Check if we no longer need this page.
 				if clearPTEEntries == entriesPerPage {
 					pmdEntry.Clear()
-					w.pageTables.Allocator.FreePTEs(pteEntries)
+					w.pageTables.Allocator.FreePTEs(pteEntries) // escapes: see above.
 					clearPMDEntries++
 				}
 			}
@@ -293,7 +206,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 			// Check if we no longer need this page.
 			if clearPMDEntries == entriesPerPage {
 				pudEntry.Clear()
-				w.pageTables.Allocator.FreePTEs(pmdEntries)
+				w.pageTables.Allocator.FreePTEs(pmdEntries) // escapes: see above.
 				clearPUDEntries++
 			}
 		}
@@ -301,7 +214,8 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 		// Check if we no longer need this page.
 		if clearPUDEntries == entriesPerPage {
 			pgdEntry.Clear()
-			w.pageTables.Allocator.FreePTEs(pudEntries)
+			w.pageTables.Allocator.FreePTEs(pudEntries) // escapes: see above.
 		}
 	}
+	return true
 }

walker_arm64.go

@@ -16,104 +16,10 @@
 package pagetables
 
-// Visitor is a generic type.
-type Visitor interface {
-	// visit is called on each PTE.
-	visit(start uintptr, pte *PTE, align uintptr)
-
-	// requiresAlloc indicates that new entries should be allocated within
-	// the walked range.
-	requiresAlloc() bool
-
-	// requiresSplit indicates that entries in the given range should be
-	// split if they are huge or jumbo pages.
-	requiresSplit() bool
-}
-
-// Walker walks page tables.
-type Walker struct {
-	// pageTables are the tables to walk.
-	pageTables *PageTables
-
-	// Visitor is the set of arguments.
-	visitor Visitor
-}
-
-// iterateRange iterates over all appropriate levels of page tables for the given range.
-//
-// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
-// exception is sect pages. If a valid sect page (huge or jumbo) cannot be
-// installed, then the walk will continue to individual entries.
-//
-// This algorithm will attempt to maximize the use of sect pages whenever
-// possible. Whether a sect page is provided will be clear through the range
-// provided in the callback.
-//
-// Note that if requiresAlloc is true, then no gaps will be present. However,
-// if alloc is not set, then the iteration will likely be full of gaps.
-//
-// Note that this function should generally be avoided in favor of Map, Unmap,
-// etc. when not necessary.
-//
-// Precondition: start must be page-aligned.
-//
-// Precondition: start must be less than end.
-//
-// Precondition: If requiresAlloc is true, then start and end should not span
-// non-canonical ranges. If they do, a panic will result.
-//
-//go:nosplit
-func (w *Walker) iterateRange(start, end uintptr) {
-	if start%pteSize != 0 {
-		panic("unaligned start")
-	}
-	if end < start {
-		panic("start > end")
-	}
-	if start < lowerTop {
-		if end <= lowerTop {
-			w.iterateRangeCanonical(start, end)
-		} else if end > lowerTop && end <= upperBottom {
-			if w.visitor.requiresAlloc() {
-				panic("alloc spans non-canonical range")
-			}
-			w.iterateRangeCanonical(start, lowerTop)
-		} else {
-			if w.visitor.requiresAlloc() {
-				panic("alloc spans non-canonical range")
-			}
-			w.iterateRangeCanonical(start, lowerTop)
-			w.iterateRangeCanonical(upperBottom, end)
-		}
-	} else if start < upperBottom {
-		if end <= upperBottom {
-			if w.visitor.requiresAlloc() {
-				panic("alloc spans non-canonical range")
-			}
-		} else {
-			if w.visitor.requiresAlloc() {
-				panic("alloc spans non-canonical range")
-			}
-			w.iterateRangeCanonical(upperBottom, end)
-		}
-	} else {
-		w.iterateRangeCanonical(start, end)
-	}
-}
-
-// next returns the next address quantized by the given size.
-//
-//go:nosplit
-func next(start uintptr, size uintptr) uintptr {
-	start &= ^(size - 1)
-	start += size
-	return start
-}
-
 // iterateRangeCanonical walks a canonical range.
 //
 //go:nosplit
-func (w *Walker) iterateRangeCanonical(start, end uintptr) {
+func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
 	pgdEntryIndex := w.pageTables.root
 	if start >= upperBottom {
 		pgdEntryIndex = w.pageTables.archPageTables.root
@@ -160,7 +66,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 				// new page for the pmd.
 				if start&(pudSize-1) == 0 && end-start >= pudSize {
 					pudEntry.SetSect()
-					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+						return false
+					}
 					if pudEntry.Valid() {
 						start = next(start, pudSize)
 						continue
@@ -185,7 +93,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 					pudEntry.setPageTable(w.pageTables, pmdEntries)
 				} else {
 					// A sect page to be checked directly.
-					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+						return false
+					}
 
 					// Might have been cleared.
 					if !pudEntry.Valid() {
@@ -222,7 +132,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 					// As above, we can skip allocating a new page.
 					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
 						pmdEntry.SetSect()
-						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+							return false
+						}
 						if pmdEntry.Valid() {
 							start = next(start, pmdSize)
 							continue
@@ -246,7 +158,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 						pmdEntry.setPageTable(w.pageTables, pteEntries)
 					} else {
 						// A huge page to be checked directly.
-						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+							return false
+						}
 
 						// Might have been cleared.
 						if !pmdEntry.Valid() {
@@ -276,7 +190,9 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 					}
 
 					// At this point, we are guaranteed that start%pteSize == 0.
-					w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+					if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) {
+						return false
+					}
 					if !pteEntry.Valid() {
 						if w.visitor.requiresAlloc() {
 							panic("PTE not set after iteration with requiresAlloc!")
@@ -311,4 +227,5 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 			w.pageTables.Allocator.FreePTEs(pudEntries)
 		}
 	}
+	return true
 }

walker_generic.go (new file)

@@ -0,0 +1,110 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Visitor is a generic type.
+type Visitor interface {
+	// visit is called on each PTE. The returned boolean indicates whether
+	// the walk should continue.
+	visit(start uintptr, pte *PTE, align uintptr) bool
+
+	// requiresAlloc indicates that new entries should be allocated within
+	// the walked range.
+	requiresAlloc() bool
+
+	// requiresSplit indicates that entries in the given range should be
+	// split if they are huge or jumbo pages.
+	requiresSplit() bool
+}
+
+// Walker walks page tables.
+type Walker struct {
+	// pageTables are the tables to walk.
+	pageTables *PageTables
+
+	// Visitor is the set of arguments.
+	visitor Visitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *Walker) iterateRange(start, end uintptr) {
+	if start%pteSize != 0 {
+		panic("unaligned start")
+	}
+	if end < start {
+		panic("start > end")
+	}
+	if start < lowerTop {
+		if end <= lowerTop {
+			w.iterateRangeCanonical(start, end)
+		} else if end > lowerTop && end <= upperBottom {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+		} else {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			if !w.iterateRangeCanonical(start, lowerTop) {
+				return
+			}
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else if start < upperBottom {
+		if end <= upperBottom {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+		} else {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else {
+		w.iterateRangeCanonical(start, end)
+	}
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func next(start uintptr, size uintptr) uintptr {
+	start &= ^(size - 1)
+	start += size
+	return start
+}
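As an illustration of the new contract, here is a hypothetical visitor (not part of this commit) that uses the boolean return to terminate the walk early, much like the findFirst path of lookupVisitor:

// findFirstVisitor stops the walk at the first valid entry it sees.
type findFirstVisitor struct {
	start uintptr // Output.
	opts  MapOpts // Output.
	found bool    // Output.
}

//go:nosplit
func (v *findFirstVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
	if !pte.Valid() {
		return true // Nothing here; keep walking.
	}
	v.start = start
	v.opts = pte.Opts()
	v.found = true
	return false // Found one; stop the walk immediately.
}

//go:nosplit
func (*findFirstVisitor) requiresAlloc() bool { return false }

//go:nosplit
func (*findFirstVisitor) requiresSplit() bool { return false }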