// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build go1.12
// +build !go1.14

// Check go:linkname function signatures when updating Go version.

package vfs

import (
	"fmt"
	"math/bits"
	"reflect"
	"sync/atomic"
	"unsafe"

	"gvisor.dev/gvisor/third_party/gvsync"
)

// mountKey represents the location at which a Mount is mounted. It is
// structurally identical to VirtualDentry, but stores its fields as
// unsafe.Pointer since mutators synchronize with VFS path traversal using
// seqcounts.
type mountKey struct {
	parent unsafe.Pointer // *Mount
	point  unsafe.Pointer // *Dentry
}

// Invariant: mnt.key's fields are nil. parent and point are non-nil.
func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
}

func (mnt *Mount) loadKey() (*Mount, *Dentry) {
	return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}

func (mnt *Mount) parent() *Mount {
	return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
}

func (mnt *Mount) point() *Dentry {
	return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}

// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
//
// mountTable.Init() must be called on new mountTables before use.
type mountTable struct {
	// mountTable is implemented as a seqcount-protected hash table that
	// resolves collisions with linear probing, featuring Robin Hood
	// insertion and backward shift deletion. These minimize probe length
	// variance, significantly improving the performance of linear probing
	// at high load factors. (mountTable doesn't use bucketing, which is the
	// other major technique commonly used in high-performance hash tables;
	// the efficiency of bucketing is largely due to SIMD lookup, and Go
	// lacks both SIMD intrinsics and inline assembly, limiting the
	// performance of this approach.)

	seq  gvsync.SeqCount
	seed uint32 // for hashing keys

	// size holds both length (number of elements) and capacity (number of
	// slots): capacity is stored as its base-2 log (referred to as order)
	// in the least significant bits of size, and length is stored in the
	// remaining bits. Go defines bit shifts >= width of shifted unsigned
	// operand as shifting to 0, which differs from x86's SHL, so the Go
	// compiler inserts a bounds check for each bit shift unless we mask
	// order anyway (cf. runtime.bucketShift()), and length isn't used by
	// lookup; thus this bit packing gets us more bits for the length (vs.
	// storing length and cap in separate uint32s) for ~free.
	size uint64

	slots unsafe.Pointer // []mountSlot; never nil after Init
}
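
// unpackMountTableSize is an illustrative helper (a sketch, not part of the
// original implementation) making the bit packing described above explicit:
// order occupies the low mtSizeOrderBits of size, and length the remaining
// bits. For example, a table holding 3 elements with capacity 1<<2 = 4 slots
// packs size = 3<<mtSizeLenLSB | 2.
func unpackMountTableSize(size uint64) (length uint64, order uint32) {
	return size >> mtSizeLenLSB, uint32(size & mtSizeOrderMask)
}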

type mountSlot struct {
	// We don't store keys in slots; instead, we just check Mount.parent and
	// Mount.point directly. Any practical use of lookup will need to touch
	// Mounts anyway, and comparing hashes means that false positives are
	// extremely rare, so this isn't an extra cache line touch overall.
	value unsafe.Pointer // *Mount
	hash  uintptr
}

const (
	mtSizeOrderBits = 6 // log2 of pointer size in bits
	mtSizeOrderMask = (1 << mtSizeOrderBits) - 1
	mtSizeOrderOne  = 1
	mtSizeLenLSB    = mtSizeOrderBits
	mtSizeLenOne    = 1 << mtSizeLenLSB
	mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB

	mountSlotBytes = unsafe.Sizeof(mountSlot{})
	mountKeyBytes  = unsafe.Sizeof(mountKey{})

	// Tuning parameters.
	//
	// Essentially every mountTable will contain at least /proc, /sys, and
	// /dev/shm, so there is ~no reason for mtInitCap to be < 4.
	mtInitOrder  = 2
	mtInitCap    = 1 << mtInitOrder
	mtMaxLoadNum = 13
	mtMaxLoadDen = 16 // maximum load factor is 13/16 = 0.8125
)

func init() {
	// We can't just define mtSizeOrderBits as follows because Go doesn't
	// have constexpr.
	if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) {
		panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits))
	}
	if bits.OnesCount(uint(mountSlotBytes)) != 1 {
		panic(fmt.Sprintf("mountSlotBytes (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
	}
	if mtInitCap <= 1 {
		panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
	}
	if mtMaxLoadNum >= mtMaxLoadDen {
		panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
	}
}

// Init must be called exactly once on each mountTable before use.
func (mt *mountTable) Init() {
	mt.seed = rand32()
	mt.size = mtInitOrder
	mt.slots = newMountTableSlots(mtInitCap)
}

// newMountTableSlots returns a pointer to the first element of a
// newly-allocated []mountSlot with the given capacity. Holding this pointer
// in mountTable.slots keeps the backing array reachable by the garbage
// collector.
func newMountTableSlots(cap uintptr) unsafe.Pointer {
	slice := make([]mountSlot, cap)
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
	return unsafe.Pointer(hdr.Data)
}

// Lookup returns the Mount with the given parent, mounted at the given
// point. If no such Mount exists, Lookup returns nil.
//
// Lookup may be called even if there are concurrent mutators of mt.
func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
	hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)

loop:
	for {
		epoch := mt.seq.BeginRead()
		size := atomic.LoadUint64(&mt.size)
		slots := atomic.LoadPointer(&mt.slots)
		if !mt.seq.ReadOk(epoch) {
			continue
		}
		tcap := uintptr(1) << (size & mtSizeOrderMask)
		mask := tcap - 1
		off := (hash & mask) * mountSlotBytes
		offmask := mask * mountSlotBytes
		for {
			// Computing slot addresses by offsetting from the base
			// pointer avoids bounds checking.
			slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
			slotValue := atomic.LoadPointer(&slot.value)
			slotHash := atomic.LoadUintptr(&slot.hash)
			if !mt.seq.ReadOk(epoch) {
				// The element we're looking for might have been
				// moved into a slot we've previously checked, so
				// restart entirely.
				continue loop
			}
			if slotValue == nil {
				return nil
			}
			if slotHash == hash {
				mount := (*Mount)(slotValue)
				var mountKey mountKey
				mountKey.parent = atomic.LoadPointer(&mount.key.parent)
				mountKey.point = atomic.LoadPointer(&mount.key.point)
				if !mt.seq.ReadOk(epoch) {
					continue loop
				}
				if key == mountKey {
					return mount
				}
			}
			off = (off + mountSlotBytes) & offmask
		}
	}
}
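
// A typical use of Lookup during path traversal looks like the following
// sketch (illustrative only; curMount, curDentry, and the root field are
// assumptions, not names defined in this file):
//
//	if child := mt.Lookup(curMount, curDentry); child != nil {
//		// Step into the mount whose parent is curMount and whose
//		// mount point is curDentry, continuing at its root.
//		curMount, curDentry = child, child.root
//	}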

// Insert inserts the given mount into mt.
//
// Preconditions: There are no concurrent mutators of mt. mt must not already
// contain a Mount with the same mount point and parent.
func (mt *mountTable) Insert(mount *Mount) {
	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)

	// We're under the maximum load factor if:
	//
	//	(len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
	//	(len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
	tlen := mt.size >> mtSizeLenLSB
	order := mt.size & mtSizeOrderMask
	tcap := uintptr(1) << order
	if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
		// Atomically insert the new element into the table.
		mt.seq.BeginWrite()
		atomic.AddUint64(&mt.size, mtSizeLenOne)
		mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
		mt.seq.EndWrite()
		return
	}

	// Otherwise, we have to expand. Double the number of slots in the new
	// table.
	newOrder := order + 1
	if newOrder > mtSizeOrderMask {
		panic("mount table size overflow")
	}
	newCap := uintptr(1) << newOrder
	newSlots := newMountTableSlots(newCap)
	// Copy existing elements to the new table.
	oldCur := mt.slots
	// Go does not permit pointers just past the end of allocated objects,
	// so we iterate using a pointer to the last element of the old table.
	// The following expression is equivalent to
	// `slots + (tcap-1)*mountSlotBytes`, but has a critical path length of
	// 2 arithmetic instructions instead of 3.
	oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
	for {
		oldSlot := (*mountSlot)(oldCur)
		if oldSlot.value != nil {
			// We don't need to lock mt.seq yet since newSlots isn't
			// visible to readers.
			mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
		}
		if oldCur == oldLast {
			break
		}
		oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
	}
	// Insert the new element into the new table.
	mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
	// Atomically switch to the new table.
	mt.seq.BeginWrite()
	atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
	atomic.StorePointer(&mt.slots, newSlots)
	mt.seq.EndWrite()
}

// Preconditions: There are no concurrent mutators of the table (slots, cap).
// If the table is visible to readers, then mt.seq must be in a writer
// critical section. cap must be a power of 2.
func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
	mask := cap - 1
	off := (hash & mask) * mountSlotBytes
	offmask := mask * mountSlotBytes
	disp := uintptr(0)
	for {
		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
		slotValue := slot.value
		if slotValue == nil {
			atomic.StorePointer(&slot.value, value)
			atomic.StoreUintptr(&slot.hash, hash)
			return
		}
		// If we've been displaced farther from our first-probed slot
		// than the element stored in this one, swap elements and switch
		// to inserting the replaced one. (This is Robin Hood insertion.)
		slotHash := slot.hash
		slotDisp := ((off / mountSlotBytes) - slotHash) & mask
		if disp > slotDisp {
			atomic.StorePointer(&slot.value, value)
			atomic.StoreUintptr(&slot.hash, hash)
			value = slotValue
			hash = slotHash
			disp = slotDisp
		}
		off = (off + mountSlotBytes) & offmask
		disp++
	}
}
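
// probeDisplacement is an illustrative helper (a sketch, not part of the
// original implementation) making the displacement computation in
// mtInsertLocked explicit: an element's displacement is the distance, in
// slots and modulo table capacity, from its first-probed slot (hash & mask)
// to the slot it currently occupies. Robin Hood insertion swaps whenever the
// incoming element's displacement exceeds that of the resident element.
func probeDisplacement(slotIndex, hash, mask uintptr) uintptr {
	return (slotIndex - hash) & mask
}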

// Remove removes the given mount from mt.
//
// Preconditions: There are no concurrent mutators of mt. mt must contain
// mount.
func (mt *mountTable) Remove(mount *Mount) {
	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
	mask := tcap - 1
	slots := mt.slots
	off := (hash & mask) * mountSlotBytes
	offmask := mask * mountSlotBytes
	for {
		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
		slotValue := slot.value
		if slotValue == unsafe.Pointer(mount) {
			// Found the element to remove. Move all subsequent
			// elements backward until we either find an empty slot,
			// or an element that is already in its first-probed slot.
			// (This is backward shift deletion.)
			mt.seq.BeginWrite()
			for {
				nextOff := (off + mountSlotBytes) & offmask
				nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
				nextSlotValue := nextSlot.value
				if nextSlotValue == nil {
					break
				}
				nextSlotHash := nextSlot.hash
				if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
					break
				}
				atomic.StorePointer(&slot.value, nextSlotValue)
				atomic.StoreUintptr(&slot.hash, nextSlotHash)
				off = nextOff
				slot = nextSlot
			}
			atomic.StorePointer(&slot.value, nil)
			atomic.AddUint64(&mt.size, mtSizeLenNegOne)
			mt.seq.EndWrite()
			return
		}
		if checkInvariants && slotValue == nil {
			panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
		}
		off = (off + mountSlotBytes) & offmask
	}
}
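
// Backward shift deletion, illustrated (a sketch, not part of the original
// file): letters are elements, the number in parentheses is each element's
// first-probed slot index, and _ is an empty slot.
//
//	before: [A(0) B(0) C(1) _]   remove A from slot 0
//	after:  [B(0) C(1) _    _]
//
// B and C each shift one slot back toward their first-probed slots; the
// shift stops at an empty slot or at an element already in its first-probed
// slot, so no element's probe sequence is ever broken and no tombstones are
// needed.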

//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, seed, s uintptr) uintptr

//go:linkname rand32 runtime.fastrand
func rand32() uint32

// This is copy/pasted from runtime.noescape(), and is needed because
// arguments apparently escape from all functions defined by linkname.
//
//go:nosplit
func noescape(p unsafe.Pointer) unsafe.Pointer {
	x := uintptr(p)
	return unsafe.Pointer(x ^ 0)
}