From 0b76887147820a809beaa497ede8dc4f7b7b120a Mon Sep 17 00:00:00 2001
From: Fabricio Voznika
Date: Tue, 5 Mar 2019 23:39:14 -0800
Subject: [PATCH] Priority-inheritance futex implementation

It is implemented without the priority-inheritance part, given that
gVisor defers scheduling decisions to the Go runtime and doesn't have
control over it.

PiperOrigin-RevId: 236989545
Change-Id: I714c8ca0798743ecf3167b14ffeb5cd834302560
---
 pkg/abi/linux/futex.go                      |   6 +
 pkg/sentry/kernel/futex/BUILD               |   2 +
 pkg/sentry/kernel/futex/futex.go            | 215 ++++++++++++++++--
 pkg/sentry/kernel/futex/futex_test.go       |   4 +
 pkg/sentry/kernel/task_futex.go             |   7 +
 pkg/sentry/mm/io.go                         |  45 ++++
 pkg/sentry/platform/platform.go             |  10 +
 pkg/sentry/platform/safecopy/BUILD          |   4 +-
 pkg/sentry/platform/safecopy/atomic_amd64.s |  28 +++
 pkg/sentry/platform/safecopy/atomic_arm64.s |  28 +++
 pkg/sentry/platform/safecopy/safecopy.go    |   4 +
 .../platform/safecopy/safecopy_unsafe.go    |  20 ++
 .../platform/safecopy/sighandler_amd64.s    |   9 +
 .../platform/safecopy/sighandler_arm64.s    |  11 +
 pkg/sentry/safemem/block_unsafe.go          |  10 +
 pkg/sentry/syscalls/linux/sys_futex.go      |  68 +++++-
 pkg/sentry/usermem/bytes_io_unsafe.go       |   8 +
 pkg/sentry/usermem/usermem.go               |   7 +
 pkg/syserror/syserror.go                    |   1 +
 runsc/boot/compat.go                        |   4 +-
 test/syscalls/linux/BUILD                   |   1 +
 test/syscalls/linux/futex.cc                | 115 +++++++++-
 22 files changed, 578 insertions(+), 29 deletions(-)

diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go
index 5dff01fba..afdf4123b 100644
--- a/pkg/abi/linux/futex.go
+++ b/pkg/abi/linux/futex.go
@@ -54,3 +54,9 @@ const (
 
 // FUTEX_TID_MASK is the TID portion of a PI futex word.
 const FUTEX_TID_MASK = 0x3fffffff
+
+// Constants used for priority-inheritance futexes.
+const (
+	FUTEX_WAITERS    = 0x80000000
+	FUTEX_OWNER_DIED = 0x40000000
+)
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 91feeb5ed..b6af5b20b 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -37,6 +37,8 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/sentry/context",
         "//pkg/sentry/memmap",
         "//pkg/sentry/usermem",
         "//pkg/syserror",
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index b3e628fd4..cd7d51621 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -95,12 +95,15 @@ func (k *Key) matches(k2 *Key) bool {
 // Target abstracts memory accesses and keys.
 type Target interface {
-	// SwapUint32 gives access to usermem.SwapUint32.
+	// SwapUint32 gives access to usermem.IO.SwapUint32.
 	SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
 
-	// CompareAndSwap gives access to usermem.CompareAndSwapUint32.
+	// CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32.
 	CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)
 
+	// LoadUint32 gives access to usermem.IO.LoadUint32.
+	LoadUint32(addr usermem.Addr) (uint32, error)
+
 	// GetSharedKey returns a Key with kind KindSharedPrivate or
 	// KindSharedMappable corresponding to the memory mapped at address addr.
 	//
@@ -112,11 +115,11 @@ type Target interface {
 
 // check performs a basic equality check on the given address.
 func check(t Target, addr usermem.Addr, val uint32) error {
-	prev, err := t.CompareAndSwapUint32(addr, val, val)
+	cur, err := t.LoadUint32(addr)
 	if err != nil {
 		return err
 	}
-	if prev != val {
+	if cur != val {
 		return syserror.EAGAIN
 	}
 	return nil
 }
@@ -140,11 +143,14 @@ func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
 	)
 	if opType == linux.FUTEX_OP_SET {
 		oldVal, err = t.SwapUint32(addr, opArg)
+		if err != nil {
+			return false, err
+		}
 	} else {
 		for {
-			oldVal, err = t.CompareAndSwapUint32(addr, 0, 0)
+			oldVal, err = t.LoadUint32(addr)
 			if err != nil {
-				break
+				return false, err
 			}
 			var newVal uint32
 			switch opType {
@@ -161,7 +167,7 @@ func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
 			}
 			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
 			if err != nil {
-				break
+				return false, err
 			}
 			if prev == oldVal {
 				break // Success.
@@ -222,6 +228,9 @@ type Waiter struct {
 	// The bitmask we're waiting on.
 	// This is used the case of a FUTEX_WAKE_BITSET.
 	bitmask uint32
+
+	// tid is the thread ID for the waiter in case this is a PI mutex.
+	tid uint32
 }
 
 // NewWaiter returns a new unqueued Waiter.
@@ -262,23 +271,28 @@ func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
 		// Remove from the bucket and wake the waiter.
 		woke := w
 		w = w.Next() // Next iteration.
-		b.waiters.Remove(woke)
-		woke.C <- struct{}{}
-
-		// NOTE: The above channel write establishes a write barrier according
-		// to the memory model, so nothing may be ordered around it. Since
-		// we've dequeued woke and will never touch it again, we can safely
-		// store nil to woke.bucket here and allow the WaitComplete() to
-		// short-circuit grabbing the bucket lock. If they somehow miss the
-		// store, we are still holding the lock, so we can know that they won't
-		// dequeue woke, assume it's free and have the below operation
-		// afterwards.
-		woke.bucket.Store(nil)
+		b.wakeWaiterLocked(woke)
 		done++
 	}
 	return done
 }
 
+func (b *bucket) wakeWaiterLocked(w *Waiter) {
+	// Remove from the bucket and wake the waiter.
+	b.waiters.Remove(w)
+	w.C <- struct{}{}
+
+	// NOTE: The above channel write establishes a write barrier according
+	// to the memory model, so nothing may be ordered around it. Since
+	// we've dequeued w and will never touch it again, we can safely
+	// store nil to w.bucket here and allow the WaitComplete() to
+	// short-circuit grabbing the bucket lock. If they somehow miss the
+	// store, we are still holding the lock, so we can know that they won't
+	// dequeue w, assume it's free and have the below operation
+	// afterwards.
+	w.bucket.Store(nil)
+}
+
 // requeueLocked takes n waiters from the bucket and moves them to naddr on the
 // bucket "to".
 //
@@ -596,7 +610,7 @@ func (m *Manager) WaitComplete(w *Waiter) {
 			continue
 		}
 
-		// Remove w from b.
+		// Remove waiter from bucket.
 		b.waiters.Remove(w)
 		w.bucket.Store(nil)
 		b.mu.Unlock()
@@ -606,3 +620,164 @@ func (m *Manager) WaitComplete(w *Waiter) {
 	// Release references held by the waiter.
 	w.key.release()
 }
+
+// LockPI attempts to lock the futex following the priority-inheritance futex
+// rules. The lock is acquired only when 'addr' points to 0. The TID of the
+// calling task is stored at 'addr' to indicate that the futex is owned. It
+// returns true if the futex was successfully acquired.
+//
+// FUTEX_OWNER_DIED is only set by Linux when robust lists are in use (see
+// exit_robust_list()). Since we don't support robust lists, the bit is
+// handled below but never set.
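+//
+// The futex word encodes the lock state: 0 means unlocked; a value whose
+// FUTEX_TID_MASK bits hold a TID means locked by that task; and the
+// FUTEX_WAITERS bit is set whenever other tasks are blocked on the futex.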
+func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) {
+	k, err := getKey(t, addr, private)
+	if err != nil {
+		return false, err
+	}
+	// Ownership of k is transferred to w below.
+
+	// Prepare the Waiter before taking the bucket lock.
+	select {
+	case <-w.C:
+	default:
+	}
+	w.key = k
+	w.tid = tid
+
+	b := m.lockBucket(&k)
+	// Hot function: avoid defers.
+
+	success, err := m.lockPILocked(w, t, addr, tid, b, try)
+	if err != nil {
+		w.key.release()
+		b.mu.Unlock()
+		return false, err
+	}
+	if success || try {
+		// Release waiter if it's not going to be a wait.
+		w.key.release()
+	}
+	b.mu.Unlock()
+	return success, nil
+}
+
+func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) {
+	for {
+		cur, err := t.LoadUint32(addr)
+		if err != nil {
+			return false, err
+		}
+		if (cur & linux.FUTEX_TID_MASK) == tid {
+			return false, syserror.EDEADLK
+		}
+
+		if (cur & linux.FUTEX_TID_MASK) == 0 {
+			// No owner and no waiters, try to acquire the futex.
+
+			// Set TID and preserve owner died status.
+			val := tid
+			val |= cur & linux.FUTEX_OWNER_DIED
+			prev, err := t.CompareAndSwapUint32(addr, cur, val)
+			if err != nil {
+				return false, err
+			}
+			if prev != cur {
+				// CAS failed, retry...
+				// Linux reacquires the bucket lock on retries, which will re-lookup the
+				// mapping at the futex address. However, retrying while holding the
+				// lock is more efficient and reduces the chance of another conflict.
+				continue
+			}
+			// Futex acquired.
+			return true, nil
+		}
+
+		// Futex is already owned, prepare to wait.
+
+		if try {
+			// Caller doesn't want to wait.
+			return false, nil
+		}
+
+		// Set waiters bit if not set yet.
+		if cur&linux.FUTEX_WAITERS == 0 {
+			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
+			if err != nil {
+				return false, err
+			}
+			if prev != cur {
+				// CAS failed, retry...
+				continue
+			}
+		}
+
+		// Add the waiter to the bucket.
+		b.waiters.PushBack(w)
+		w.bucket.Store(b)
+		return false, nil
+	}
+}
+
+// UnlockPI unlocks the futex following the priority-inheritance futex
+// rules. The address provided must contain the caller's TID. If there are
+// waiters, the TID of the next waiter (FIFO) is stored at the given address
+// and that waiter is woken up. If there are no waiters, 0 is stored at the
+// address.
+func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
+	k, err := getKey(t, addr, private)
+	if err != nil {
+		return err
+	}
+	b := m.lockBucket(&k)
+
+	err = m.unlockPILocked(t, addr, tid, b)
+
+	k.release()
+	b.mu.Unlock()
+	return err
+}
+
+func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error {
+	cur, err := t.LoadUint32(addr)
+	if err != nil {
+		return err
+	}
+
+	if (cur & linux.FUTEX_TID_MASK) != tid {
+		return syserror.EPERM
+	}
+
+	if b.waiters.Empty() {
+		// It's safe to set 0 because there are no waiters, no new owner, and the
+		// executing task is the current owner (no owner died bit).
+		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
+		if err != nil {
+			return err
+		}
+		if prev != cur {
+			// Let user mode handle CAS races. This is different from lock, which
+			// retries when CAS fails.
+			return syserror.EAGAIN
+		}
+		return nil
+	}
+
+	next := b.waiters.Front()
+
+	// Set the next owner's TID and the waiters bit if there are more waiters.
+	// The owner-died bit, if set, is cleared because the executing task takes
+	// over as the owner.
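+	// For example, if cur == tid|FUTEX_WAITERS and another waiter remains
+	// queued behind next, the word becomes next.tid|FUTEX_WAITERS; if next
+	// is the only waiter, the word becomes just next.tid.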
+	val := next.tid
+	if next.Next() != nil {
+		val |= linux.FUTEX_WAITERS
+	}
+
+	prev, err := t.CompareAndSwapUint32(addr, cur, val)
+	if err != nil {
+		return err
+	}
+	if prev != cur {
+		return syserror.EINVAL
+	}
+
+	b.wakeWaiterLocked(next)
+	return nil
+}
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index a7ab9f229..9d44ee8e5 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -49,6 +49,10 @@ func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint
 	return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
 }
 
+func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) {
+	return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
+}
+
 func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) {
 	return Key{
 		Kind: KindSharedMappable,
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index 921f7bdbc..351cf47d7 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -41,6 +41,13 @@ func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32,
 	})
 }
 
+// LoadUint32 implements futex.Target.LoadUint32.
+func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
+	return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+}
+
 // GetSharedKey implements futex.Target.GetSharedKey.
 func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) {
 	return t.MemoryManager().GetSharedFutexKey(t, addr)
diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go
index 6600ddd78..e0cebef84 100644
--- a/pkg/sentry/mm/io.go
+++ b/pkg/sentry/mm/io.go
@@ -346,6 +346,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new
 		if err != nil {
 			return 0, translateIOError(ctx, err)
 		}
+		// Return the number of bytes read.
 		return 4, nil
 	})
 	return old, err
@@ -388,11 +389,55 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.
 		if err != nil {
 			return 0, translateIOError(ctx, err)
 		}
+		// Return the number of bytes read.
 		return 4, nil
 	})
 	return prev, err
 }
 
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+	ar, ok := mm.CheckIORange(addr, 4)
+	if !ok {
+		return 0, syserror.EFAULT
+	}
+
+	// Do AddressSpace IO if applicable.
+	if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+		for {
+			val, err := mm.as.LoadUint32(addr)
+			if err == nil {
+				return val, nil
+			}
+			if f, ok := err.(platform.SegmentationFault); ok {
+				if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil {
+					return 0, err
+				}
+				continue
+			}
+			return 0, translateIOError(ctx, err)
+		}
+	}
+
+	// Go through internal mappings.
+	var val uint32
+	_, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+		if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+			// Atomicity is unachievable across mappings.
+			return 0, syserror.EFAULT
+		}
+		im := ims.Head()
+		var err error
+		val, err = safemem.LoadUint32(im)
+		if err != nil {
+			return 0, translateIOError(ctx, err)
+		}
+		// Return the number of bytes read.
+		return 4, nil
+	})
+	return val, err
+}
+
 // handleASIOFault handles a page fault at address addr for an AddressSpaceIO
 // operation spanning ioar.
//
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index f16588e6e..a9e76bd45 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -254,6 +254,11 @@ type AddressSpaceIO interface {
 	//
 	// Preconditions: addr must be aligned to a 4-byte boundary.
 	CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)
+
+	// LoadUint32 atomically loads the uint32 value at addr and returns it.
+	//
+	// Preconditions: addr must be aligned to a 4-byte boundary.
+	LoadUint32(addr usermem.Addr) (uint32, error)
 }
 
 // NoAddressSpaceIO implements AddressSpaceIO methods by panicing.
@@ -284,6 +289,11 @@ func (NoAddressSpaceIO) CompareAndSwapUint32(addr usermem.Addr, old, new uint32)
 	panic("This platform does not support AddressSpaceIO")
 }
 
+// LoadUint32 implements AddressSpaceIO.LoadUint32.
+func (NoAddressSpaceIO) LoadUint32(addr usermem.Addr) (uint32, error) {
+	panic("This platform does not support AddressSpaceIO")
+}
+
 // SegmentationFault is an error returned by AddressSpaceIO methods when IO
 // fails due to access of an unmapped page, or a mapped page with insufficient
 // permissions.
diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD
index 05a6a61ae..d97a40297 100644
--- a/pkg/sentry/platform/safecopy/BUILD
+++ b/pkg/sentry/platform/safecopy/BUILD
@@ -18,9 +18,7 @@ go_library(
     ],
     importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy",
     visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/syserror",
-    ],
+    deps = ["//pkg/syserror"],
 )
 
 go_test(
diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s
index 873ffa046..f90b4bfd1 100644
--- a/pkg/sentry/platform/safecopy/atomic_amd64.s
+++ b/pkg/sentry/platform/safecopy/atomic_amd64.s
@@ -106,3 +106,31 @@ TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24
 	CMPXCHGL DX, 0(DI)
 	MOVL AX, prev+16(FP)
 	RET
+
+// handleLoadUint32Fault returns the value stored in DI. Control is transferred
+// to it when loadUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as loadUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
+	MOVL DI, sig+12(FP)
+	RET
+
+// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
+// signal is received, the value returned is unspecified, and sig is the number
+// of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
TEXT ·loadUint32(SB), NOSPLIT, $0-16
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleLoadUint32Fault will store a different value in this address.
+	MOVL $0, sig+12(FP)
+
+	MOVQ addr+0(FP), AX
+	MOVL (AX), BX
+	MOVL BX, val+8(FP)
+	RET
diff --git a/pkg/sentry/platform/safecopy/atomic_arm64.s b/pkg/sentry/platform/safecopy/atomic_arm64.s
index 554a5c1e1..d58ed71f7 100644
--- a/pkg/sentry/platform/safecopy/atomic_arm64.s
+++ b/pkg/sentry/platform/safecopy/atomic_arm64.s
@@ -96,3 +96,31 @@ again:
 done:
 	MOVW R3, prev+16(FP)
 	RET
+
+// handleLoadUint32Fault returns the value stored in R1. Control is transferred
+// to it when loadUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in R1.
+//
+// It must have the same frame configuration as loadUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
+	MOVW R1, sig+12(FP)
+	RET
+
+// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
+// signal is received, the value returned is unspecified, and sig is the number
+// of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+TEXT ·loadUint32(SB), NOSPLIT, $0-16
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleLoadUint32Fault will store a different value in this address.
+	MOVW $0, sig+12(FP)
+
+	MOVD addr+0(FP), R0
+	LDARW (R0), R1
+	MOVW R1, val+8(FP)
+	RET
diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go
index c60f73103..69c66a3b7 100644
--- a/pkg/sentry/platform/safecopy/safecopy.go
+++ b/pkg/sentry/platform/safecopy/safecopy.go
@@ -75,6 +75,8 @@ var (
 	swapUint64End             uintptr
 	compareAndSwapUint32Begin uintptr
 	compareAndSwapUint32End   uintptr
+	loadUint32Begin           uintptr
+	loadUint32End             uintptr
 
 	// savedSigSegVHandler is a pointer to the SIGSEGV handler that was
 	// configured before we replaced it with our own. We still call into it
@@ -119,6 +121,8 @@ func initializeAddresses() {
 	swapUint64End = FindEndAddress(swapUint64Begin)
 	compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer()
 	compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin)
+	loadUint32Begin = reflect.ValueOf(loadUint32).Pointer()
+	loadUint32End = FindEndAddress(loadUint32Begin)
 }
 
 func init() {
diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go
index e78a6714e..f84527484 100644
--- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go
+++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go
@@ -79,6 +79,14 @@ func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
 
 //go:noescape
 func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
 
+// loadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
+// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+
 // CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
 // copied and an error if SIGSEGV or SIGBUS is received while reading from src.
 func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
@@ -260,6 +268,18 @@ func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) {
 	return prev, errorFromFaultSignal(ptr, sig)
 }
 
+// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
+// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+func LoadUint32(ptr unsafe.Pointer) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + val, sig := loadUint32(ptr) + return val, errorFromFaultSignal(ptr, sig) +} + func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { switch sig { case 0: diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s index 06614f1b4..db7701a29 100644 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s @@ -101,6 +101,15 @@ not_swapuint64: JMP handle_fault not_casuint32: + CMPQ CX, ·loadUint32Begin(SB) + JB not_loaduint32 + CMPQ CX, ·loadUint32End(SB) + JAE not_loaduint32 + + LEAQ handleLoadUint32Fault(SB), CX + JMP handle_fault + +not_loaduint32: original_handler: // Jump to the previous signal handler, which is likely the golang one. XORQ CX, CX diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s index 5e8e193e7..cdfca8207 100644 --- a/pkg/sentry/platform/safecopy/sighandler_arm64.s +++ b/pkg/sentry/platform/safecopy/sighandler_arm64.s @@ -110,6 +110,17 @@ not_swapuint64: B handle_fault not_casuint32: + MOVD ·loadUint32Begin(SB), R8 + CMP R8, R7 + BLO not_loaduint32 + MOVD ·loadUint32End(SB), R8 + CMP R8, R7 + BHS not_loaduint32 + + MOVD $handleLoadUint32Fault(SB), R7 + B handle_fault + +not_loaduint32: original_handler: // Jump to the previous signal handler, which is likely the golang one. MOVD ·savedSigBusHandler(SB), R7 diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go index e91ff66ae..c3a9780d2 100644 --- a/pkg/sentry/safemem/block_unsafe.go +++ b/pkg/sentry/safemem/block_unsafe.go @@ -267,3 +267,13 @@ func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { } return safecopy.CompareAndSwapUint32(b.start, old, new) } + +// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b. +// +// Preconditions: b.Len() >= 4. +func LoadUint32(b Block) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.LoadUint32(b.start) +} diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 7a1d396ec..f0c89cba4 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -124,6 +124,46 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add return 0, kernel.ERESTART_RESTARTBLOCK } +func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error { + w := t.FutexWaiter() + locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false) + if err != nil { + return err + } + if locked { + // Futex acquired, we're done! 
+ return nil + } + + if forever { + err = t.Block(w.C) + } else { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) + timer.Swap(ktime.Setting{ + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + err = t.BlockWithTimer(w.C, tchan) + timer.Destroy() + } + + t.Futex().WaitComplete(w) + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error { + w := t.FutexWaiter() + locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true) + if err != nil { + return err + } + if !locked { + return syserror.EWOULDBLOCK + } + return nil +} + // Futex implements linux syscall futex(2). // It provides a method for a program to wait for a value at a given address to // change, and a method to wake up anyone waiting on a particular address. @@ -144,7 +184,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: // WAIT{_BITSET} wait forever if the timeout isn't passed. - forever := timeout == 0 + forever := (timeout == 0) var timespec linux.Timespec if !forever { @@ -205,8 +245,30 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) return uintptr(n), nil, err - case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: - // We don't support any priority inversion futexes. + case linux.FUTEX_LOCK_PI: + forever := (timeout == 0) + + var timespec linux.Timespec + if !forever { + var err error + timespec, err = copyTimespecIn(t, timeout) + if err != nil { + return 0, nil, err + } + } + err := futexLockPI(t, timespec, forever, addr, private) + return 0, nil, err + + case linux.FUTEX_TRYLOCK_PI: + err := tryLockPI(t, addr, private) + return 0, nil, err + + case linux.FUTEX_UNLOCK_PI: + err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private) + return 0, nil, err + + case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: + t.Kernel().EmitUnimplementedEvent(t) return 0, nil, syserror.ENOSYS default: diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go index 8bdf3a508..7add8bc82 100644 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ b/pkg/sentry/usermem/bytes_io_unsafe.go @@ -37,3 +37,11 @@ func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new } return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil } + +// LoadUint32 implements IO.LoadUint32. +func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) { + if _, err := b.rangeCheck(addr, 4); err != nil { + return 0, err + } + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil +} diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 75ac4d22d..c3c9c153b 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -103,6 +103,13 @@ type IO interface { // any following locks in the lock order. addr must be aligned to a 4-byte // boundary. CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) + + // LoadUint32 atomically loads the uint32 value at addr and returns it. 
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order. addr must be aligned to a 4-byte
+	// boundary.
+	LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error)
 }
 
 // IOOpts contains options applicable to all IO methods.
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 4228707f4..5558cccff 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -33,6 +33,7 @@ var (
 	ECHILD = error(syscall.ECHILD)
 	ECONNREFUSED = error(syscall.ECONNREFUSED)
 	ECONNRESET = error(syscall.ECONNRESET)
+	EDEADLK = error(syscall.EDEADLK)
 	EEXIST = error(syscall.EEXIST)
 	EFAULT = error(syscall.EFAULT)
 	EFBIG = error(syscall.EFBIG)
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 37d0c31fd..b3499bcde 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -99,8 +99,8 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
 		// args: cmd, ...
 		tr = newArgsTracker(0)
 
-	case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL:
-		// args: fd, cmd, ...
+	case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX:
+		// args: fd/addr, cmd, ...
 		tr = newArgsTracker(1)
 
 	case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT:
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 4c818238b..2c214925e 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -808,6 +808,7 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
        "//test/util:memory_util",
+        "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc
index 6fa284013..35933b660 100644
--- a/test/syscalls/linux/futex.cc
+++ b/test/syscalls/linux/futex.cc
@@ -32,6 +32,7 @@
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/memory_util.h"
+#include "test/util/save_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -118,6 +119,30 @@ int futex_wake_op(bool priv, std::atomic<int>* uaddr1, std::atomic<int>* uaddr2,
   return syscall(SYS_futex, uaddr1, op, nwake1, nwake2, uaddr2, sub_op);
 }
 
+int futex_lock_pi(bool priv, std::atomic<int>* uaddr) {
+  int op = FUTEX_LOCK_PI;
+  if (priv) {
+    op |= FUTEX_PRIVATE_FLAG;
+  }
+  return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
+int futex_trylock_pi(bool priv, std::atomic<int>* uaddr) {
+  int op = FUTEX_TRYLOCK_PI;
+  if (priv) {
+    op |= FUTEX_PRIVATE_FLAG;
+  }
+  return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
+int futex_unlock_pi(bool priv, std::atomic<int>* uaddr) {
+  int op = FUTEX_UNLOCK_PI;
+  if (priv) {
+    op |= FUTEX_PRIVATE_FLAG;
+  }
+  return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
 // Fixture for futex tests parameterized by whether to use private or shared
 // futexes.
 class PrivateAndSharedFutexTest : public ::testing::TestWithParam<bool> {
@@ -589,7 +614,95 @@ TEST(SharedFutexTest, WakeInterprocessFile_NoRandomSave) {
       << " status " << status;
 }
 
-} // namespace
+TEST_P(PrivateAndSharedFutexTest, PIBasic) {
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  ASSERT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallSucceeds());
+  EXPECT_EQ(a.load(), gettid());
+  EXPECT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EDEADLK));
+
+  ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds());
+  EXPECT_EQ(a.load(), 0);
+  EXPECT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_P(PrivateAndSharedFutexTest, PIConcurrency_NoRandomSave) {
+  DisableSave ds;  // Too many syscalls.
+
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  const bool is_priv = IsPrivate();
+
+  std::unique_ptr<ScopedThread> threads[100];
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) {
+    threads[i] = absl::make_unique<ScopedThread>([is_priv, &a] {
+      for (size_t j = 0; j < 10; ++j) {
+        ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+        EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid());
+        SleepSafe(absl::Milliseconds(5));
+        ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+      }
+    });
+  }
+}
+
+TEST_P(PrivateAndSharedFutexTest, PIWaiters) {
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  const bool is_priv = IsPrivate();
+
+  ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+  EXPECT_EQ(a.load(), gettid());
+
+  ScopedThread th([is_priv, &a] {
+    ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+    ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+  });
+
+  // Wait until the thread blocks on the futex, setting the waiters bit.
+  auto start = absl::Now();
+  while (a.load() != (FUTEX_WAITERS | gettid())) {
+    ASSERT_LT(absl::Now() - start, absl::Seconds(5));
+    absl::SleepFor(absl::Milliseconds(100));
+  }
+  ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+}
+
+TEST_P(PrivateAndSharedFutexTest, PITryLock) {
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  const bool is_priv = IsPrivate();
+
+  ASSERT_THAT(futex_trylock_pi(IsPrivate(), &a), SyscallSucceeds());
+  EXPECT_EQ(a.load(), gettid());
+
+  EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EDEADLK));
+  ScopedThread th([is_priv, &a] {
+    EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EAGAIN));
+  });
+  th.Join();
+
+  ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds());
+}
+
+TEST_P(PrivateAndSharedFutexTest, PITryLockConcurrency_NoRandomSave) {
+  DisableSave ds;  // Too many syscalls.
+
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  const bool is_priv = IsPrivate();
+
+  std::unique_ptr<ScopedThread> threads[100];
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) {
+    threads[i] = absl::make_unique<ScopedThread>([is_priv, &a] {
+      for (size_t j = 0; j < 10;) {
+        if (futex_trylock_pi(is_priv, &a) >= 0) {
+          ++j;
+          EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid());
+          SleepSafe(absl::Milliseconds(5));
+          ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+        }
+      }
+    });
+  }
+}
+
+} // namespace
 } // namespace testing
 } // namespace gvisor
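
For readers tracing the algorithm, the word transitions performed by lockPILocked and unlockPILocked above can be exercised in isolation. The following is a minimal, illustrative Go sketch, not part of the patch and not the sentry's API: it models the PI futex word as an in-process uint32 updated with sync/atomic (standing in for the user-memory Target), the helper names and the demo main are hypothetical, and the constants mirror pkg/abi/linux/futex.go.

// pi_futex_word_sketch.go - illustrative only; not part of the patch.
package main

import (
	"fmt"
	"sync/atomic"
)

// Constants mirror pkg/abi/linux.
const (
	FUTEX_TID_MASK   = 0x3fffffff
	FUTEX_WAITERS    = 0x80000000 // set while other tasks are queued
	FUTEX_OWNER_DIED = 0x40000000 // preserved on lock, cleared on ownership handoff
)

// tryLockPI mirrors the uncontended path of lockPILocked: acquisition
// succeeds only when no TID is set, and the CAS installs the caller's TID
// while preserving the owner-died bit.
func tryLockPI(word *uint32, tid uint32) bool {
	for {
		cur := atomic.LoadUint32(word)
		if cur&FUTEX_TID_MASK != 0 {
			// Owned; the real code returns EDEADLK if owned by the caller,
			// or queues the waiter (setting FUTEX_WAITERS) otherwise.
			return false
		}
		if atomic.CompareAndSwapUint32(word, cur, tid|(cur&FUTEX_OWNER_DIED)) {
			return true // Futex acquired.
		}
		// CAS raced with a concurrent lock/unlock; retry.
	}
}

// unlockPI mirrors the no-waiters path of unlockPILocked: only the owner may
// unlock, and the word is CASed back to 0. (With waiters, the real code
// instead stores the next waiter's TID and wakes it.)
func unlockPI(word *uint32, tid uint32) error {
	cur := atomic.LoadUint32(word)
	if cur&FUTEX_TID_MASK != tid {
		return fmt.Errorf("EPERM: word %#x is not owned by tid %d", cur, tid)
	}
	if !atomic.CompareAndSwapUint32(word, cur, 0) {
		return fmt.Errorf("EAGAIN: racing update, caller retries")
	}
	return nil
}

func main() {
	var word uint32
	const tid = 1234
	fmt.Println(tryLockPI(&word, tid)) // true; word now holds tid
	fmt.Println(tryLockPI(&word, tid)) // false; already owned
	fmt.Println(unlockPI(&word, tid))  // <nil>; word back to 0
}

These are the same transitions the tests above observe through the futex word: a.load() equals gettid() after FUTEX_LOCK_PI, gains FUTEX_WAITERS while another thread blocks, and returns to 0 after the final FUTEX_UNLOCK_PI.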