784 lines
22 KiB
Go
784 lines
22 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package futex provides an implementation of the futex interface as found in
|
|
// the Linux kernel. It allows one to easily transform Wait() calls into waits
|
|
// on a channel, which is useful in a Go-based kernel, for example.
|
|
package futex
|
|
|
|
import (
|
|
"sync"
|
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/sentry/memmap"
|
|
"gvisor.dev/gvisor/pkg/sentry/usermem"
|
|
"gvisor.dev/gvisor/pkg/syserror"
|
|
)
|
|
|
|
// KeyKind indicates the type of a Key.
|
|
type KeyKind int
|
|
|
|
const (
|
|
// KindPrivate indicates a private futex (a futex syscall with the
|
|
// FUTEX_PRIVATE_FLAG set).
|
|
KindPrivate KeyKind = iota
|
|
|
|
// KindSharedPrivate indicates a shared futex on a private memory mapping.
|
|
// Although KindPrivate and KindSharedPrivate futexes both use memory
|
|
// addresses to identify futexes, they do not interoperate (in Linux, the
|
|
// two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
|
|
// comparison).
|
|
KindSharedPrivate
|
|
|
|
// KindSharedMappable indicates a shared futex on a memory mapping other
|
|
// than a private anonymous memory mapping.
|
|
KindSharedMappable
|
|
)
|
|
|
|
// Key represents something that a futex waiter may wait on.
|
|
type Key struct {
|
|
// Kind is the type of the Key.
|
|
Kind KeyKind
|
|
|
|
// Mappable is the memory-mapped object that is represented by the Key.
|
|
// Mappable is always nil if Kind is not KindSharedMappable, and may be nil
|
|
// even if it is.
|
|
Mappable memmap.Mappable
|
|
|
|
// MappingIdentity is the MappingIdentity associated with Mappable.
|
|
// MappingIdentity is always nil is Mappable is nil, and may be nil even if
|
|
// it isn't.
|
|
MappingIdentity memmap.MappingIdentity
|
|
|
|
// If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
|
|
// memory address. Otherwise, Offset is the represented offset into
|
|
// Mappable.
|
|
Offset uint64
|
|
}
|
|
|
|
func (k *Key) release() {
|
|
if k.MappingIdentity != nil {
|
|
k.MappingIdentity.DecRef()
|
|
}
|
|
k.Mappable = nil
|
|
k.MappingIdentity = nil
|
|
}
|
|
|
|
func (k *Key) clone() Key {
|
|
if k.MappingIdentity != nil {
|
|
k.MappingIdentity.IncRef()
|
|
}
|
|
return *k
|
|
}
|
|
|
|
// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
|
|
func (k *Key) addr() usermem.Addr {
|
|
return usermem.Addr(k.Offset)
|
|
}
|
|
|
|
// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
|
|
func (k *Key) matches(k2 *Key) bool {
|
|
// k.MappingIdentity is ignored; it's only used for reference counting.
|
|
return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
|
|
}
|
|
|
|
// Target abstracts memory accesses and keys.
|
|
type Target interface {
|
|
// SwapUint32 gives access to usermem.IO.SwapUint32.
|
|
SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
|
|
|
|
// CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32.
|
|
CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)
|
|
|
|
// LoadUint32 gives access to usermem.IO.LoadUint32.
|
|
LoadUint32(addr usermem.Addr) (uint32, error)
|
|
|
|
// GetSharedKey returns a Key with kind KindSharedPrivate or
|
|
// KindSharedMappable corresponding to the memory mapped at address addr.
|
|
//
|
|
// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
|
|
// reference is held on the MappingIdentity, which must be dropped by the
|
|
// caller when the Key is no longer in use.
|
|
GetSharedKey(addr usermem.Addr) (Key, error)
|
|
}
|
|
|
|
// check performs a basic equality check on the given address.
|
|
func check(t Target, addr usermem.Addr, val uint32) error {
|
|
cur, err := t.LoadUint32(addr)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if cur != val {
|
|
return syserror.EAGAIN
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// atomicOp performs a complex operation on the given address.
|
|
func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
|
|
opType := (opIn >> 28) & 0xf
|
|
cmp := (opIn >> 24) & 0xf
|
|
opArg := (opIn >> 12) & 0xfff
|
|
cmpArg := opIn & 0xfff
|
|
|
|
if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
|
|
opArg = 1 << opArg
|
|
opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
|
|
}
|
|
|
|
var (
|
|
oldVal uint32
|
|
err error
|
|
)
|
|
if opType == linux.FUTEX_OP_SET {
|
|
oldVal, err = t.SwapUint32(addr, opArg)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
} else {
|
|
for {
|
|
oldVal, err = t.LoadUint32(addr)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
var newVal uint32
|
|
switch opType {
|
|
case linux.FUTEX_OP_ADD:
|
|
newVal = oldVal + opArg
|
|
case linux.FUTEX_OP_OR:
|
|
newVal = oldVal | opArg
|
|
case linux.FUTEX_OP_ANDN:
|
|
newVal = oldVal &^ opArg
|
|
case linux.FUTEX_OP_XOR:
|
|
newVal = oldVal ^ opArg
|
|
default:
|
|
return false, syserror.ENOSYS
|
|
}
|
|
prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if prev == oldVal {
|
|
break // Success.
|
|
}
|
|
}
|
|
}
|
|
|
|
switch cmp {
|
|
case linux.FUTEX_OP_CMP_EQ:
|
|
return oldVal == cmpArg, nil
|
|
case linux.FUTEX_OP_CMP_NE:
|
|
return oldVal != cmpArg, nil
|
|
case linux.FUTEX_OP_CMP_LT:
|
|
return oldVal < cmpArg, nil
|
|
case linux.FUTEX_OP_CMP_LE:
|
|
return oldVal <= cmpArg, nil
|
|
case linux.FUTEX_OP_CMP_GT:
|
|
return oldVal > cmpArg, nil
|
|
case linux.FUTEX_OP_CMP_GE:
|
|
return oldVal >= cmpArg, nil
|
|
default:
|
|
return false, syserror.ENOSYS
|
|
}
|
|
}
|
|
|
|
// Waiter is the struct which gets enqueued into buckets for wake up routines
|
|
// and requeue routines to scan and notify. Once a Waiter has been enqueued by
|
|
// WaitPrepare(), callers may listen on C for wake up events.
|
|
type Waiter struct {
|
|
// Synchronization:
|
|
//
|
|
// - A Waiter that is not enqueued in a bucket is exclusively owned (no
|
|
// synchronization applies).
|
|
//
|
|
// - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
|
|
// waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
|
|
// lock") of the containing bucket, and bitmask is immutable. Note that
|
|
// since bucket is mutated using atomic memory operations, bucket.Load()
|
|
// may be called without holding the bucket lock, although it may change
|
|
// racily. See WaitComplete().
|
|
//
|
|
// - A Waiter is only guaranteed to be no longer queued after calling
|
|
// WaitComplete().
|
|
|
|
// waiterEntry links Waiter into bucket.waiters.
|
|
waiterEntry
|
|
|
|
// bucket is the bucket this waiter is queued in. If bucket is nil, the
|
|
// waiter is not waiting and is not in any bucket.
|
|
bucket AtomicPtrBucket
|
|
|
|
// C is sent to when the Waiter is woken.
|
|
C chan struct{}
|
|
|
|
// key is what this waiter is waiting on.
|
|
key Key
|
|
|
|
// The bitmask we're waiting on.
|
|
// This is used the case of a FUTEX_WAKE_BITSET.
|
|
bitmask uint32
|
|
|
|
// tid is the thread ID for the waiter in case this is a PI mutex.
|
|
tid uint32
|
|
}
|
|
|
|
// NewWaiter returns a new unqueued Waiter.
|
|
func NewWaiter() *Waiter {
|
|
return &Waiter{
|
|
C: make(chan struct{}, 1),
|
|
}
|
|
}
|
|
|
|
// woken returns true if w has been woken since the last call to WaitPrepare.
|
|
func (w *Waiter) woken() bool {
|
|
return len(w.C) != 0
|
|
}
|
|
|
|
// bucket holds a list of waiters for a given address hash.
|
|
//
|
|
// +stateify savable
|
|
type bucket struct {
|
|
// mu protects waiters and contained Waiter state. See comment in Waiter.
|
|
mu sync.Mutex `state:"nosave"`
|
|
|
|
waiters waiterList `state:"zerovalue"`
|
|
}
|
|
|
|
// wakeLocked wakes up to n waiters matching the bitmask at the addr for this
|
|
// bucket and returns the number of waiters woken.
|
|
//
|
|
// Preconditions: b.mu must be locked.
|
|
func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
|
|
done := 0
|
|
for w := b.waiters.Front(); done < n && w != nil; {
|
|
if !w.key.matches(key) || w.bitmask&bitmask == 0 {
|
|
// Not matching.
|
|
w = w.Next()
|
|
continue
|
|
}
|
|
|
|
// Remove from the bucket and wake the waiter.
|
|
woke := w
|
|
w = w.Next() // Next iteration.
|
|
b.wakeWaiterLocked(woke)
|
|
done++
|
|
}
|
|
return done
|
|
}
|
|
|
|
func (b *bucket) wakeWaiterLocked(w *Waiter) {
|
|
// Remove from the bucket and wake the waiter.
|
|
b.waiters.Remove(w)
|
|
w.C <- struct{}{}
|
|
|
|
// NOTE: The above channel write establishes a write barrier according
|
|
// to the memory model, so nothing may be ordered around it. Since
|
|
// we've dequeued w and will never touch it again, we can safely
|
|
// store nil to w.bucket here and allow the WaitComplete() to
|
|
// short-circuit grabbing the bucket lock. If they somehow miss the
|
|
// store, we are still holding the lock, so we can know that they won't
|
|
// dequeue w, assume it's free and have the below operation
|
|
// afterwards.
|
|
w.bucket.Store(nil)
|
|
}
|
|
|
|
// requeueLocked takes n waiters from the bucket and moves them to naddr on the
|
|
// bucket "to".
|
|
//
|
|
// Preconditions: b and to must be locked.
|
|
func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
|
|
done := 0
|
|
for w := b.waiters.Front(); done < n && w != nil; {
|
|
if !w.key.matches(key) {
|
|
// Not matching.
|
|
w = w.Next()
|
|
continue
|
|
}
|
|
|
|
requeued := w
|
|
w = w.Next() // Next iteration.
|
|
b.waiters.Remove(requeued)
|
|
requeued.key.release()
|
|
requeued.key = nkey.clone()
|
|
to.waiters.PushBack(requeued)
|
|
requeued.bucket.Store(to)
|
|
done++
|
|
}
|
|
return done
|
|
}
|
|
|
|
const (
|
|
// bucketCount is the number of buckets per Manager. By having many of
|
|
// these we reduce contention when concurrent yet unrelated calls are made.
|
|
bucketCount = 1 << bucketCountBits
|
|
bucketCountBits = 10
|
|
)
|
|
|
|
// getKey returns a Key representing address addr in c.
|
|
func getKey(t Target, addr usermem.Addr, private bool) (Key, error) {
|
|
// Ensure the address is aligned.
|
|
// It must be a DWORD boundary.
|
|
if addr&0x3 != 0 {
|
|
return Key{}, syserror.EINVAL
|
|
}
|
|
if private {
|
|
return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
|
|
}
|
|
return t.GetSharedKey(addr)
|
|
}
|
|
|
|
// bucketIndexForAddr returns the index into Manager.buckets for addr.
|
|
func bucketIndexForAddr(addr usermem.Addr) uintptr {
|
|
// - The bottom 2 bits of addr must be 0, per getKey.
|
|
//
|
|
// - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
|
|
// for a canonical address, and (on all existing platforms) bit 47 must be
|
|
// 0 for an application address.
|
|
//
|
|
// Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
|
|
// bits. We choose one of the simplest possible hash functions that at
|
|
// least uses all 45 useful bits in the output, given that bucketCountBits
|
|
// == 10. This hash function also has the property that it will usually map
|
|
// adjacent addresses to adjacent buckets, slightly improving memory
|
|
// locality when an application synchronization structure uses multiple
|
|
// nearby futexes.
|
|
//
|
|
// Note that despite the large number of arithmetic operations in the
|
|
// function, many components can be computed in parallel, such that the
|
|
// critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
|
|
// is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
|
|
// (addr >> 42)" without any additional grouping, the compiler puts all 4
|
|
// additions in the critical path.
|
|
h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22)
|
|
h2 := uintptr(addr>>32) + uintptr(addr>>42)
|
|
return (h1 + h2) % bucketCount
|
|
}
|
|
|
|
// Manager holds futex state for a single virtual address space.
|
|
//
|
|
// +stateify savable
|
|
type Manager struct {
|
|
// privateBuckets holds buckets for KindPrivate and KindSharedPrivate
|
|
// futexes.
|
|
privateBuckets [bucketCount]bucket `state:"zerovalue"`
|
|
|
|
// sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
|
|
// may be shared by multiple Managers. The sharedBucket pointer is
|
|
// immutable.
|
|
sharedBucket *bucket
|
|
}
|
|
|
|
// NewManager returns an initialized futex manager.
|
|
func NewManager() *Manager {
|
|
return &Manager{
|
|
sharedBucket: &bucket{},
|
|
}
|
|
}
|
|
|
|
// Fork returns a new Manager. Shared futex clients using the returned Manager
|
|
// may interoperate with those using m.
|
|
func (m *Manager) Fork() *Manager {
|
|
return &Manager{
|
|
sharedBucket: m.sharedBucket,
|
|
}
|
|
}
|
|
|
|
// lockBucket returns a locked bucket for the given key.
|
|
func (m *Manager) lockBucket(k *Key) *bucket {
|
|
var b *bucket
|
|
if k.Kind == KindSharedMappable {
|
|
b = m.sharedBucket
|
|
} else {
|
|
b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
|
|
}
|
|
b.mu.Lock()
|
|
return b
|
|
}
|
|
|
|
// lockBuckets returns locked buckets for the given keys.
|
|
func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
|
|
// Buckets must be consistently ordered to avoid circular lock
|
|
// dependencies. We order buckets in m.privateBuckets by index (lowest
|
|
// index first), and all buckets in m.privateBuckets precede
|
|
// m.sharedBucket.
|
|
|
|
// Handle the common case first:
|
|
if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
|
|
i1 := bucketIndexForAddr(k1.addr())
|
|
i2 := bucketIndexForAddr(k2.addr())
|
|
b1 := &m.privateBuckets[i1]
|
|
b2 := &m.privateBuckets[i2]
|
|
switch {
|
|
case i1 < i2:
|
|
b1.mu.Lock()
|
|
b2.mu.Lock()
|
|
case i2 < i1:
|
|
b2.mu.Lock()
|
|
b1.mu.Lock()
|
|
default:
|
|
b1.mu.Lock()
|
|
}
|
|
return b1, b2
|
|
}
|
|
|
|
// At least one of b1 or b2 should be m.sharedBucket.
|
|
b1 := m.sharedBucket
|
|
b2 := m.sharedBucket
|
|
if k1.Kind != KindSharedMappable {
|
|
b1 = m.lockBucket(k1)
|
|
} else if k2.Kind != KindSharedMappable {
|
|
b2 = m.lockBucket(k2)
|
|
}
|
|
m.sharedBucket.mu.Lock()
|
|
return b1, b2
|
|
}
|
|
|
|
// Wake wakes up to n waiters matching the bitmask on the given addr.
|
|
// The number of waiters woken is returned.
|
|
func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) {
|
|
// This function is very hot; avoid defer.
|
|
k, err := getKey(t, addr, private)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
b := m.lockBucket(&k)
|
|
r := b.wakeLocked(&k, bitmask, n)
|
|
|
|
b.mu.Unlock()
|
|
k.release()
|
|
return r, nil
|
|
}
|
|
|
|
func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
|
|
k1, err := getKey(t, addr, private)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
defer k1.release()
|
|
k2, err := getKey(t, naddr, private)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
defer k2.release()
|
|
|
|
b1, b2 := m.lockBuckets(&k1, &k2)
|
|
defer b1.mu.Unlock()
|
|
if b2 != b1 {
|
|
defer b2.mu.Unlock()
|
|
}
|
|
|
|
if checkval {
|
|
if err := check(t, addr, val); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
|
|
// Wake the number required.
|
|
done := b1.wakeLocked(&k1, ^uint32(0), nwake)
|
|
|
|
// Requeue the number required.
|
|
b1.requeueLocked(b2, &k1, &k2, nreq)
|
|
|
|
return done, nil
|
|
}
|
|
|
|
// Requeue wakes up to nwake waiters on the given addr, and unconditionally
|
|
// requeues up to nreq waiters on naddr.
|
|
func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) {
|
|
return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
|
|
}
|
|
|
|
// RequeueCmp atomically checks that the addr contains val (via the Target),
|
|
// wakes up to nwake waiters on addr and then unconditionally requeues nreq
|
|
// waiters on naddr.
|
|
func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
|
|
return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
|
|
}
|
|
|
|
// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
|
|
// waiters unconditionally from addr1, and, based on the original value at addr2
|
|
// and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
|
|
// It returns the total number of waiters woken.
|
|
func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
|
|
k1, err := getKey(t, addr1, private)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
defer k1.release()
|
|
k2, err := getKey(t, addr2, private)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
defer k2.release()
|
|
|
|
b1, b2 := m.lockBuckets(&k1, &k2)
|
|
defer b1.mu.Unlock()
|
|
if b2 != b1 {
|
|
defer b2.mu.Unlock()
|
|
}
|
|
|
|
done := 0
|
|
cond, err := atomicOp(t, addr2, op)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
// Wake up up to nwake1 entries from the first bucket.
|
|
done = b1.wakeLocked(&k1, ^uint32(0), nwake1)
|
|
|
|
// Wake up up to nwake2 entries from the second bucket if the
|
|
// operation yielded true.
|
|
if cond {
|
|
done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
|
|
}
|
|
|
|
return done, nil
|
|
}
|
|
|
|
// WaitPrepare atomically checks that addr contains val (via the Checker), then
|
|
// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
|
|
// Waiter must be subsequently removed by calling WaitComplete, whether or not
|
|
// a wakeup is received on w.C.
|
|
func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error {
|
|
k, err := getKey(t, addr, private)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Ownership of k is transferred to w below.
|
|
|
|
// Prepare the Waiter before taking the bucket lock.
|
|
select {
|
|
case <-w.C:
|
|
default:
|
|
}
|
|
w.key = k
|
|
w.bitmask = bitmask
|
|
|
|
b := m.lockBucket(&k)
|
|
// This function is very hot; avoid defer.
|
|
|
|
// Perform our atomic check.
|
|
if err := check(t, addr, val); err != nil {
|
|
b.mu.Unlock()
|
|
w.key.release()
|
|
return err
|
|
}
|
|
|
|
// Add the waiter to the bucket.
|
|
b.waiters.PushBack(w)
|
|
w.bucket.Store(b)
|
|
|
|
b.mu.Unlock()
|
|
return nil
|
|
}
|
|
|
|
// WaitComplete must be called when a Waiter previously added by WaitPrepare is
|
|
// no longer eligible to be woken.
|
|
func (m *Manager) WaitComplete(w *Waiter) {
|
|
// Remove w from the bucket it's in.
|
|
for {
|
|
b := w.bucket.Load()
|
|
|
|
// If b is nil, the waiter isn't in any bucket anymore. This can't be
|
|
// racy because the waiter can't be concurrently re-queued in another
|
|
// bucket.
|
|
if b == nil {
|
|
break
|
|
}
|
|
|
|
// Take the bucket lock. Note that without holding the bucket lock, the
|
|
// waiter is not guaranteed to stay in that bucket, so after we take
|
|
// the bucket lock, we must ensure that the bucket hasn't changed: if
|
|
// it happens to have changed, we release the old bucket lock and try
|
|
// again with the new bucket; if it hasn't changed, we know it won't
|
|
// change now because we hold the lock.
|
|
b.mu.Lock()
|
|
if b != w.bucket.Load() {
|
|
b.mu.Unlock()
|
|
continue
|
|
}
|
|
|
|
// Remove waiter from bucket.
|
|
b.waiters.Remove(w)
|
|
w.bucket.Store(nil)
|
|
b.mu.Unlock()
|
|
break
|
|
}
|
|
|
|
// Release references held by the waiter.
|
|
w.key.release()
|
|
}
|
|
|
|
// LockPI attempts to lock the futex following the Priority-inheritance futex
|
|
// rules. The lock is acquired only when 'addr' points to 0. The TID of the
|
|
// calling task is set to 'addr' to indicate the futex is owned. It returns true
|
|
// if the futex was successfully acquired.
|
|
//
|
|
// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see
|
|
// exit_robust_list()). Given we don't support robust lists, although handled
|
|
// below, it's never set.
|
|
func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) {
|
|
k, err := getKey(t, addr, private)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
// Ownership of k is transferred to w below.
|
|
|
|
// Prepare the Waiter before taking the bucket lock.
|
|
select {
|
|
case <-w.C:
|
|
default:
|
|
}
|
|
w.key = k
|
|
w.tid = tid
|
|
|
|
b := m.lockBucket(&k)
|
|
// Hot function: avoid defers.
|
|
|
|
success, err := m.lockPILocked(w, t, addr, tid, b, try)
|
|
if err != nil {
|
|
w.key.release()
|
|
b.mu.Unlock()
|
|
return false, err
|
|
}
|
|
if success || try {
|
|
// Release waiter if it's not going to be a wait.
|
|
w.key.release()
|
|
}
|
|
b.mu.Unlock()
|
|
return success, nil
|
|
}
|
|
|
|
func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) {
|
|
for {
|
|
cur, err := t.LoadUint32(addr)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if (cur & linux.FUTEX_TID_MASK) == tid {
|
|
return false, syserror.EDEADLK
|
|
}
|
|
|
|
if (cur & linux.FUTEX_TID_MASK) == 0 {
|
|
// No owner and no waiters, try to acquire the futex.
|
|
|
|
// Set TID and preserve owner died status.
|
|
val := tid
|
|
val |= cur & linux.FUTEX_OWNER_DIED
|
|
prev, err := t.CompareAndSwapUint32(addr, cur, val)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if prev != cur {
|
|
// CAS failed, retry...
|
|
// Linux reacquires the bucket lock on retries, which will re-lookup the
|
|
// mapping at the futex address. However, retrying while holding the
|
|
// lock is more efficient and reduces the chance of another conflict.
|
|
continue
|
|
}
|
|
// Futex acquired.
|
|
return true, nil
|
|
}
|
|
|
|
// Futex is already owned, prepare to wait.
|
|
|
|
if try {
|
|
// Caller doesn't want to wait.
|
|
return false, nil
|
|
}
|
|
|
|
// Set waiters bit if not set yet.
|
|
if cur&linux.FUTEX_WAITERS == 0 {
|
|
prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if prev != cur {
|
|
// CAS failed, retry...
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Add the waiter to the bucket.
|
|
b.waiters.PushBack(w)
|
|
w.bucket.Store(b)
|
|
return false, nil
|
|
}
|
|
}
|
|
|
|
// UnlockPI unlock the futex following the Priority-inheritance futex
|
|
// rules. The address provided must contain the caller's TID. If there are
|
|
// waiters, TID of the next waiter (FIFO) is set to the given address, and the
|
|
// waiter woken up. If there are no waiters, 0 is set to the address.
|
|
func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
|
|
k, err := getKey(t, addr, private)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
b := m.lockBucket(&k)
|
|
|
|
err = m.unlockPILocked(t, addr, tid, b)
|
|
|
|
k.release()
|
|
b.mu.Unlock()
|
|
return err
|
|
}
|
|
|
|
func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error {
|
|
cur, err := t.LoadUint32(addr)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if (cur & linux.FUTEX_TID_MASK) != tid {
|
|
return syserror.EPERM
|
|
}
|
|
|
|
if b.waiters.Empty() {
|
|
// It's safe to set 0 because there are no waiters, no new owner, and the
|
|
// executing task is the current owner (no owner died bit).
|
|
prev, err := t.CompareAndSwapUint32(addr, cur, 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if prev != cur {
|
|
// Let user mode handle CAS races. This is different than lock, which
|
|
// retries when CAS fails.
|
|
return syserror.EAGAIN
|
|
}
|
|
return nil
|
|
}
|
|
|
|
next := b.waiters.Front()
|
|
|
|
// Set next owner's TID, waiters if there are any. Resets owner died bit, if
|
|
// set, because the executing task takes over as the owner.
|
|
val := next.tid
|
|
if next.Next() != nil {
|
|
val |= linux.FUTEX_WAITERS
|
|
}
|
|
|
|
prev, err := t.CompareAndSwapUint32(addr, cur, val)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if prev != cur {
|
|
return syserror.EINVAL
|
|
}
|
|
|
|
b.wakeWaiterLocked(next)
|
|
return nil
|
|
}
|