Priority-inheritance futex implementation

It is Implemented without the priority inheritance part given that gVisor defers scheduling decisions to Go runtime and doesn't have control over it. PiperOrigin-RevId: 236989545 Change-Id: I714c8ca0798743ecf3167b14ffeb5cd834302560
2019-03-05 23:39:14 -08:00 · 2019-03-05 23:39:14 -08:00 · 0b76887147
parent fcba4e8f04
commit 0b76887147
22 changed files with 578 additions and 29 deletions
--- a/pkg/abi/linux/futex.go
+++ b/pkg/abi/linux/futex.go
@ -54,3 +54,9 @@ const (

 // FUTEX_TID_MASK is the TID portion of a PI futex word.
 const FUTEX_TID_MASK = 0x3fffffff
+
+// Constants used for priority-inheritance futexes.
+const (
+	FUTEX_WAITERS    = 0x80000000
+	FUTEX_OWNER_DIED = 0x40000000
+)
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@ -37,6 +37,8 @@ go_library(
    visibility = ["//pkg/sentry:internal"],
    deps = [
        "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/sentry/context",
        "//pkg/sentry/memmap",
        "//pkg/sentry/usermem",
        "//pkg/syserror",
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@ -95,12 +95,15 @@ func (k *Key) matches(k2 *Key) bool {

 // Target abstracts memory accesses and keys.
 type Target interface {
-	// SwapUint32 gives access to usermem.SwapUint32.
+	// SwapUint32 gives access to usermem.IO.SwapUint32.
 	SwapUint32(addr usermem.Addr, new uint32) (uint32, error)

-	// CompareAndSwap gives access to usermem.CompareAndSwapUint32.
+	// CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32.
 	CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)

+	// LoadUint32 gives access to usermem.IO.LoadUint32.
+	LoadUint32(addr usermem.Addr) (uint32, error)
+
 	// GetSharedKey returns a Key with kind KindSharedPrivate or
 	// KindSharedMappable corresponding to the memory mapped at address addr.
 	//
@ -112,11 +115,11 @@ type Target interface {

 // check performs a basic equality check on the given address.
 func check(t Target, addr usermem.Addr, val uint32) error {
-	prev, err := t.CompareAndSwapUint32(addr, val, val)
+	cur, err := t.LoadUint32(addr)
 	if err != nil {
 		return err
 	}
-	if prev != val {
+	if cur != val {
 		return syserror.EAGAIN
 	}
 	return nil
@ -140,11 +143,14 @@ func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
 	)
 	if opType == linux.FUTEX_OP_SET {
 		oldVal, err = t.SwapUint32(addr, opArg)
+		if err != nil {
+			return false, err
+		}
 	} else {
 		for {
-			oldVal, err = t.CompareAndSwapUint32(addr, 0, 0)
+			oldVal, err = t.LoadUint32(addr)
 			if err != nil {
-				break
+				return false, err
 			}
 			var newVal uint32
 			switch opType {
@ -161,7 +167,7 @@ func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
 			}
 			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
 			if err != nil {
-				break
+				return false, err
 			}
 			if prev == oldVal {
 				break // Success.
@ -222,6 +228,9 @@ type Waiter struct {
 	// The bitmask we're waiting on.
 	// This is used the case of a FUTEX_WAKE_BITSET.
 	bitmask uint32
+
+	// tid is the thread ID for the waiter in case this is a PI mutex.
+	tid uint32
 }

 // NewWaiter returns a new unqueued Waiter.
@ -262,23 +271,28 @@ func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
 		// Remove from the bucket and wake the waiter.
 		woke := w
 		w = w.Next() // Next iteration.
-		b.waiters.Remove(woke)
-		woke.C <- struct{}{}
-
-		// NOTE: The above channel write establishes a write barrier according
-		// to the memory model, so nothing may be ordered around it. Since
-		// we've dequeued woke and will never touch it again, we can safely
-		// store nil to woke.bucket here and allow the WaitComplete() to
-		// short-circuit grabbing the bucket lock. If they somehow miss the
-		// store, we are still holding the lock, so we can know that they won't
-		// dequeue woke, assume it's free and have the below operation
-		// afterwards.
-		woke.bucket.Store(nil)
+		b.wakeWaiterLocked(woke)
 		done++
 	}
 	return done
 }

+func (b *bucket) wakeWaiterLocked(w *Waiter) {
+	// Remove from the bucket and wake the waiter.
+	b.waiters.Remove(w)
+	w.C <- struct{}{}
+
+	// NOTE: The above channel write establishes a write barrier according
+	// to the memory model, so nothing may be ordered around it. Since
+	// we've dequeued w and will never touch it again, we can safely
+	// store nil to w.bucket here and allow the WaitComplete() to
+	// short-circuit grabbing the bucket lock. If they somehow miss the
+	// store, we are still holding the lock, so we can know that they won't
+	// dequeue w, assume it's free and have the below operation
+	// afterwards.
+	w.bucket.Store(nil)
+}
+
 // requeueLocked takes n waiters from the bucket and moves them to naddr on the
 // bucket "to".
 //
@ -596,7 +610,7 @@ func (m *Manager) WaitComplete(w *Waiter) {
 			continue
 		}

-		// Remove w from b.
+		// Remove waiter from bucket.
 		b.waiters.Remove(w)
 		w.bucket.Store(nil)
 		b.mu.Unlock()
@ -606,3 +620,164 @@ func (m *Manager) WaitComplete(w *Waiter) {
 	// Release references held by the waiter.
 	w.key.release()
 }
+
+// LockPI attempts to lock the futex following the Priority-inheritance futex
+// rules. The lock is acquired only when 'addr' points to 0. The TID of the
+// calling task is set to 'addr' to indicate the futex is owned. It returns true
+// if the futex was successfully acquired.
+//
+// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see
+// exit_robust_list()). Given we don't support robust lists, although handled
+// below, it's never set.
+func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) {
+	k, err := getKey(t, addr, private)
+	if err != nil {
+		return false, err
+	}
+	// Ownership of k is transferred to w below.
+
+	// Prepare the Waiter before taking the bucket lock.
+	select {
+	case <-w.C:
+	default:
+	}
+	w.key = k
+	w.tid = tid
+
+	b := m.lockBucket(&k)
+	// Hot function: avoid defers.
+
+	success, err := m.lockPILocked(w, t, addr, tid, b, try)
+	if err != nil {
+		w.key.release()
+		b.mu.Unlock()
+		return false, err
+	}
+	if success || try {
+		// Release waiter if it's not going to be a wait.
+		w.key.release()
+	}
+	b.mu.Unlock()
+	return success, nil
+}
+
+func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) {
+	for {
+		cur, err := t.LoadUint32(addr)
+		if err != nil {
+			return false, err
+		}
+		if (cur & linux.FUTEX_TID_MASK) == tid {
+			return false, syserror.EDEADLK
+		}
+
+		if (cur & linux.FUTEX_TID_MASK) == 0 {
+			// No owner and no waiters, try to acquire the futex.
+
+			// Set TID and preserve owner died status.
+			val := tid
+			val |= cur & linux.FUTEX_OWNER_DIED
+			prev, err := t.CompareAndSwapUint32(addr, cur, val)
+			if err != nil {
+				return false, err
+			}
+			if prev != cur {
+				// CAS failed, retry...
+				// Linux reacquires the bucket lock on retries, which will re-lookup the
+				// mapping at the futex address. However, retrying while holding the
+				// lock is more efficient and reduces the chance of another conflict.
+				continue
+			}
+			// Futex acquired.
+			return true, nil
+		}
+
+		// Futex is already owned, prepare to wait.
+
+		if try {
+			// Caller doesn't want to wait.
+			return false, nil
+		}
+
+		// Set waiters bit if not set yet.
+		if cur&linux.FUTEX_WAITERS == 0 {
+			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
+			if err != nil {
+				return false, err
+			}
+			if prev != cur {
+				// CAS failed, retry...
+				continue
+			}
+		}
+
+		// Add the waiter to the bucket.
+		b.waiters.PushBack(w)
+		w.bucket.Store(b)
+		return false, nil
+	}
+}
+
+// UnlockPI unlock the futex following the Priority-inheritance futex
+// rules. The address provided must contain the caller's TID. If there are
+// waiters, TID of the next waiter (FIFO) is set to the given address, and the
+// waiter woken up. If there are no waiters, 0 is set to the address.
+func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
+	k, err := getKey(t, addr, private)
+	if err != nil {
+		return err
+	}
+	b := m.lockBucket(&k)
+
+	err = m.unlockPILocked(t, addr, tid, b)
+
+	k.release()
+	b.mu.Unlock()
+	return err
+}
+
+func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error {
+	cur, err := t.LoadUint32(addr)
+	if err != nil {
+		return err
+	}
+
+	if (cur & linux.FUTEX_TID_MASK) != tid {
+		return syserror.EPERM
+	}
+
+	if b.waiters.Empty() {
+		// It's safe to set 0 because there are no waiters, no new owner, and the
+		// executing task is the current owner (no owner died bit).
+		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
+		if err != nil {
+			return err
+		}
+		if prev != cur {
+			// Let user mode handle CAS races. This is different than lock, which
+			// retries when CAS fails.
+			return syserror.EAGAIN
+		}
+		return nil
+	}
+
+	next := b.waiters.Front()
+
+	// Set next owner's TID, waiters if there are any. Resets owner died bit, if
+	// set, because the executing task takes over as the owner.
+	val := next.tid
+	if next.Next() != nil {
+		val |= linux.FUTEX_WAITERS
+	}
+
+	prev, err := t.CompareAndSwapUint32(addr, cur, val)
+	if err != nil {
+		return err
+	}
+	if prev != cur {
+		return syserror.EINVAL
+	}
+
+	b.wakeWaiterLocked(next)
+	return nil
+}
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@ -49,6 +49,10 @@ func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint
 	return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
 }

+func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) {
+	return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
+}
+
 func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) {
 	return Key{
 		Kind:   KindSharedMappable,
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@ -41,6 +41,13 @@ func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32,
 	})
 }

+// LoadUint32 implemets futex.Target.LoadUint32.
+func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
+	return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+}
+
 // GetSharedKey implements futex.Target.GetSharedKey.
 func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) {
 	return t.MemoryManager().GetSharedFutexKey(t, addr)
--- a/pkg/sentry/mm/io.go
+++ b/pkg/sentry/mm/io.go
@ -346,6 +346,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new
 		if err != nil {
 			return 0, translateIOError(ctx, err)
 		}
+		// Return the number of bytes read.
 		return 4, nil
 	})
 	return old, err
@ -388,11 +389,55 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.
 		if err != nil {
 			return 0, translateIOError(ctx, err)
 		}
+		// Return the number of bytes read.
 		return 4, nil
 	})
 	return prev, err
 }

+// LoadUint32 implements usermem.IO.LoadUint32.
+func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+	ar, ok := mm.CheckIORange(addr, 4)
+	if !ok {
+		return 0, syserror.EFAULT
+	}
+
+	// Do AddressSpace IO if applicable.
+	if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+		for {
+			val, err := mm.as.LoadUint32(addr)
+			if err == nil {
+				return val, nil
+			}
+			if f, ok := err.(platform.SegmentationFault); ok {
+				if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil {
+					return 0, err
+				}
+				continue
+			}
+			return 0, translateIOError(ctx, err)
+		}
+	}
+
+	// Go through internal mappings.
+	var val uint32
+	_, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+		if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+			// Atomicity is unachievable across mappings.
+			return 0, syserror.EFAULT
+		}
+		im := ims.Head()
+		var err error
+		val, err = safemem.LoadUint32(im)
+		if err != nil {
+			return 0, translateIOError(ctx, err)
+		}
+		// Return the number of bytes read.
+		return 4, nil
+	})
+	return val, err
+}
+
 // handleASIOFault handles a page fault at address addr for an AddressSpaceIO
 // operation spanning ioar.
 //
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@ -254,6 +254,11 @@ type AddressSpaceIO interface {
 	//
 	// Preconditions: addr must be aligned to a 4-byte boundary.
 	CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)
+
+	// LoadUint32 atomically loads the uint32 value at addr and returns it.
+	//
+	// Preconditions: addr must be aligned to a 4-byte boundary.
+	LoadUint32(addr usermem.Addr) (uint32, error)
 }

 // NoAddressSpaceIO implements AddressSpaceIO methods by panicing.
@ -284,6 +289,11 @@ func (NoAddressSpaceIO) CompareAndSwapUint32(addr usermem.Addr, old, new uint32)
 	panic("This platform does not support AddressSpaceIO")
 }

+// LoadUint32 implements AddressSpaceIO.LoadUint32.
+func (NoAddressSpaceIO) LoadUint32(addr usermem.Addr) (uint32, error) {
+	panic("This platform does not support AddressSpaceIO")
+}
+
 // SegmentationFault is an error returned by AddressSpaceIO methods when IO
 // fails due to access of an unmapped page, or a mapped page with insufficient
 // permissions.
--- a/pkg/sentry/platform/safecopy/BUILD
+++ b/pkg/sentry/platform/safecopy/BUILD
@ -18,9 +18,7 @@ go_library(
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy",
    visibility = ["//pkg/sentry:internal"],
-    deps = [
-        "//pkg/syserror",
-    ],
+    deps = ["//pkg/syserror"],
 )

 go_test(
--- a/pkg/sentry/platform/safecopy/atomic_amd64.s
+++ b/pkg/sentry/platform/safecopy/atomic_amd64.s
@ -106,3 +106,31 @@ TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24
  CMPXCHGL DX, 0(DI)
  MOVL AX, prev+16(FP)
  RET
+
+// handleLoadUint32Fault returns the value stored in DI. Control is transferred
+// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as loadUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
+  MOVL DI, sig+12(FP)
+  RET
+
+// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
+// signal is received, the value returned is unspecified, and sig is the number
+// of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+TEXT ·loadUint32(SB), NOSPLIT, $0-16
+  // Store 0 as the returned signal number. If we run to completion,
+  // this is the value the caller will see; if a signal is received,
+  // handleLoadUint32Fault will store a different value in this address.
+  MOVL $0, sig+12(FP)
+
+  MOVQ addr+0(FP), AX
+  MOVL (AX), BX
+  MOVL BX, val+8(FP)
+  RET
--- a/pkg/sentry/platform/safecopy/atomic_arm64.s
+++ b/pkg/sentry/platform/safecopy/atomic_arm64.s
@ -96,3 +96,31 @@ again:
 done:
 	MOVW R3, prev+16(FP)
 	RET
+
+// handleLoadUint32Fault returns the value stored in DI. Control is transferred
+// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as loadUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
+	MOVW R1, sig+12(FP)
+	RET
+
+// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
+// signal is received, the value returned is unspecified, and sig is the number
+// of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+TEXT ·loadUint32(SB), NOSPLIT, $0-16
+	// Store 0 as the returned signal number. If we run to completion,
+	// this is the value the caller will see; if a signal is received,
+	// handleLoadUint32Fault will store a different value in this address.
+	MOVW $0, sig+12(FP)
+
+	MOVD addr+0(FP), R0
+	LDARW (R0), R1
+	MOVW R1, val+8(FP)
+	RET
--- a/pkg/sentry/platform/safecopy/safecopy.go
+++ b/pkg/sentry/platform/safecopy/safecopy.go
@ -75,6 +75,8 @@ var (
 	swapUint64End             uintptr
 	compareAndSwapUint32Begin uintptr
 	compareAndSwapUint32End   uintptr
+	loadUint32Begin           uintptr
+	loadUint32End             uintptr

 	// savedSigSegVHandler is a pointer to the SIGSEGV handler that was
 	// configured before we replaced it with our own. We still call into it
@ -119,6 +121,8 @@ func initializeAddresses() {
 	swapUint64End = FindEndAddress(swapUint64Begin)
 	compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer()
 	compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin)
+	loadUint32Begin = reflect.ValueOf(loadUint32).Pointer()
+	loadUint32End = FindEndAddress(loadUint32Begin)
 }

 func init() {
--- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go
+++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go
@ -79,6 +79,14 @@ func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
 //go:noescape
 func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)

+// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
+// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+
 // CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
 // copied and an error if SIGSEGV or SIGBUS is received while reading from src.
 func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
@ -260,6 +268,18 @@ func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) {
 	return prev, errorFromFaultSignal(ptr, sig)
 }

+// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
+// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+func LoadUint32(ptr unsafe.Pointer) (uint32, error) {
+	if addr := uintptr(ptr); addr&3 != 0 {
+		return 0, AlignmentError{addr, 4}
+	}
+	val, sig := loadUint32(ptr)
+	return val, errorFromFaultSignal(ptr, sig)
+}
+
 func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error {
 	switch sig {
 	case 0:
--- a/pkg/sentry/platform/safecopy/sighandler_amd64.s
+++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s
@ -101,6 +101,15 @@ not_swapuint64:
 	JMP handle_fault

 not_casuint32:
+	CMPQ CX, ·loadUint32Begin(SB)
+	JB not_loaduint32
+	CMPQ CX, ·loadUint32End(SB)
+	JAE not_loaduint32
+
+	LEAQ handleLoadUint32Fault(SB), CX
+	JMP handle_fault
+
+not_loaduint32:
 original_handler:
 	// Jump to the previous signal handler, which is likely the golang one.
 	XORQ CX, CX
--- a/pkg/sentry/platform/safecopy/sighandler_arm64.s
+++ b/pkg/sentry/platform/safecopy/sighandler_arm64.s
@ -110,6 +110,17 @@ not_swapuint64:
 	B handle_fault

 not_casuint32:
+	MOVD ·loadUint32Begin(SB), R8
+	CMP R8, R7
+	BLO not_loaduint32
+	MOVD ·loadUint32End(SB), R8
+	CMP R8, R7
+	BHS not_loaduint32
+
+	MOVD $handleLoadUint32Fault(SB), R7
+	B handle_fault
+
+not_loaduint32:
 original_handler:
 	// Jump to the previous signal handler, which is likely the golang one.
 	MOVD ·savedSigBusHandler(SB), R7
--- a/pkg/sentry/safemem/block_unsafe.go
+++ b/pkg/sentry/safemem/block_unsafe.go
@ -267,3 +267,13 @@ func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) {
 	}
 	return safecopy.CompareAndSwapUint32(b.start, old, new)
 }
+
+// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b.
+//
+// Preconditions: b.Len() >= 4.
+func LoadUint32(b Block) (uint32, error) {
+	if b.length < 4 {
+		panic(fmt.Sprintf("insufficient length: %d", b.length))
+	}
+	return safecopy.LoadUint32(b.start)
+}
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@ -124,6 +124,46 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add
 	return 0, kernel.ERESTART_RESTARTBLOCK
 }

+func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error {
+	w := t.FutexWaiter()
+	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false)
+	if err != nil {
+		return err
+	}
+	if locked {
+		// Futex acquired, we're done!
+		return nil
+	}
+
+	if forever {
+		err = t.Block(w.C)
+	} else {
+		notifier, tchan := ktime.NewChannelNotifier()
+		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
+		timer.Swap(ktime.Setting{
+			Enabled: true,
+			Next:    ktime.FromTimespec(ts),
+		})
+		err = t.BlockWithTimer(w.C, tchan)
+		timer.Destroy()
+	}
+
+	t.Futex().WaitComplete(w)
+	return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error {
+	w := t.FutexWaiter()
+	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true)
+	if err != nil {
+		return err
+	}
+	if !locked {
+		return syserror.EWOULDBLOCK
+	}
+	return nil
+}
+
 // Futex implements linux syscall futex(2).
 // It provides a method for a program to wait for a value at a given address to
 // change, and a method to wake up anyone waiting on a particular address.
@ -144,7 +184,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	switch cmd {
 	case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET:
 		// WAIT{_BITSET} wait forever if the timeout isn't passed.
-		forever := timeout == 0
+		forever := (timeout == 0)

 		var timespec linux.Timespec
 		if !forever {
@ -205,8 +245,30 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
 		return uintptr(n), nil, err

-	case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
-		// We don't support any priority inversion futexes.
+	case linux.FUTEX_LOCK_PI:
+		forever := (timeout == 0)
+
+		var timespec linux.Timespec
+		if !forever {
+			var err error
+			timespec, err = copyTimespecIn(t, timeout)
+			if err != nil {
+				return 0, nil, err
+			}
+		}
+		err := futexLockPI(t, timespec, forever, addr, private)
+		return 0, nil, err
+
+	case linux.FUTEX_TRYLOCK_PI:
+		err := tryLockPI(t, addr, private)
+		return 0, nil, err
+
+	case linux.FUTEX_UNLOCK_PI:
+		err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private)
+		return 0, nil, err
+
+	case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
+		t.Kernel().EmitUnimplementedEvent(t)
 		return 0, nil, syserror.ENOSYS

 	default:
--- a/pkg/sentry/usermem/bytes_io_unsafe.go
+++ b/pkg/sentry/usermem/bytes_io_unsafe.go
@ -37,3 +37,11 @@ func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new
 	}
 	return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil
 }
+
+// LoadUint32 implements IO.LoadUint32.
+func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) {
+	if _, err := b.rangeCheck(addr, 4); err != nil {
+		return 0, err
+	}
+	return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil
+}
--- a/pkg/sentry/usermem/usermem.go
+++ b/pkg/sentry/usermem/usermem.go
@ -103,6 +103,13 @@ type IO interface {
 	// any following locks in the lock order. addr must be aligned to a 4-byte
 	// boundary.
 	CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error)
+
+	// LoadUint32 atomically loads the uint32 value at addr and returns it.
+	//
+	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
+	// any following locks in the lock order. addr must be aligned to a 4-byte
+	// boundary.
+	LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error)
 }

 // IOOpts contains options applicable to all IO methods.
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@ -33,6 +33,7 @@ var (
 	ECHILD       = error(syscall.ECHILD)
 	ECONNREFUSED = error(syscall.ECONNREFUSED)
 	ECONNRESET   = error(syscall.ECONNRESET)
+	EDEADLK      = error(syscall.EDEADLK)
 	EEXIST       = error(syscall.EEXIST)
 	EFAULT       = error(syscall.EFAULT)
 	EFBIG        = error(syscall.EFBIG)
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@ -99,8 +99,8 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
 			// args: cmd, ...
 			tr = newArgsTracker(0)

-		case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL:
-			// args: fd, cmd, ...
+		case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX:
+			// args: fd/addr, cmd, ...
 			tr = newArgsTracker(1)

 		case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT:
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@ -808,6 +808,7 @@ cc_binary(
        "//test/util:cleanup",
        "//test/util:file_descriptor",
        "//test/util:memory_util",
+        "//test/util:save_util",
        "//test/util:temp_path",
        "//test/util:test_main",
        "//test/util:test_util",
--- a/test/syscalls/linux/futex.cc
+++ b/test/syscalls/linux/futex.cc
@ -32,6 +32,7 @@
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/memory_util.h"
+#include "test/util/save_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@ -118,6 +119,30 @@ int futex_wake_op(bool priv, std::atomic<int>* uaddr1, std::atomic<int>* uaddr2,
  return syscall(SYS_futex, uaddr1, op, nwake1, nwake2, uaddr2, sub_op);
 }

+int futex_lock_pi(bool priv, std::atomic<int>* uaddr) {
+  int op = FUTEX_LOCK_PI;
+  if (priv) {
+    op |= FUTEX_PRIVATE_FLAG;
+  }
+  return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
+int futex_trylock_pi(bool priv, std::atomic<int>* uaddr) {
+  int op = FUTEX_TRYLOCK_PI;
+  if (priv) {
+    op |= FUTEX_PRIVATE_FLAG;
+  }
+  return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
+int futex_unlock_pi(bool priv, std::atomic<int>* uaddr) {
+  int op = FUTEX_UNLOCK_PI;
+  if (priv) {
+    op |= FUTEX_PRIVATE_FLAG;
+  }
+  return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
 // Fixture for futex tests parameterized by whether to use private or shared
 // futexes.
 class PrivateAndSharedFutexTest : public ::testing::TestWithParam<bool> {
@ -589,7 +614,95 @@ TEST(SharedFutexTest, WakeInterprocessFile_NoRandomSave) {
      << " status " << status;
 }

-}  // namespace
+TEST_P(PrivateAndSharedFutexTest, PIBasic) {
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);

+  ASSERT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallSucceeds());
+  EXPECT_EQ(a.load(), gettid());
+  EXPECT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EDEADLK));
+
+  ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds());
+  EXPECT_EQ(a.load(), 0);
+  EXPECT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_P(PrivateAndSharedFutexTest, PIConcurrency_NoRandomSave) {
+  DisableSave ds;  // Too many syscalls.
+
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  const bool is_priv = IsPrivate();
+
+  std::unique_ptr<ScopedThread> threads[100];
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) {
+    threads[i] = absl::make_unique<ScopedThread>([is_priv, &a] {
+      for (size_t j = 0; j < 10; ++j) {
+        ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+        EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid());
+        SleepSafe(absl::Milliseconds(5));
+        ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+      }
+    });
+  }
+}
+
+TEST_P(PrivateAndSharedFutexTest, PIWaiters) {
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  const bool is_priv = IsPrivate();
+
+  ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+  EXPECT_EQ(a.load(), gettid());
+
+  ScopedThread th([is_priv, &a] {
+    ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+    ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+  });
+
+  // Wait until the thread blocks on the futex, setting the waiters bit.
+  auto start = absl::Now();
+  while (a.load() != (FUTEX_WAITERS | gettid())) {
+    ASSERT_LT(absl::Now() - start, absl::Seconds(5));
+    absl::SleepFor(absl::Milliseconds(100));
+  }
+  ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+}
+
+TEST_P(PrivateAndSharedFutexTest, PITryLock) {
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  const bool is_priv = IsPrivate();
+
+  ASSERT_THAT(futex_trylock_pi(IsPrivate(), &a), SyscallSucceeds());
+  EXPECT_EQ(a.load(), gettid());
+
+  EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EDEADLK));
+  ScopedThread th([is_priv, &a] {
+    EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EAGAIN));
+  });
+  th.Join();
+
+  ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds());
+}
+
+TEST_P(PrivateAndSharedFutexTest, PITryLockConcurrency_NoRandomSave) {
+  DisableSave ds;  // Too many syscalls.
+
+  std::atomic<int> a = ATOMIC_VAR_INIT(0);
+  const bool is_priv = IsPrivate();
+
+  std::unique_ptr<ScopedThread> threads[100];
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) {
+    threads[i] = absl::make_unique<ScopedThread>([is_priv, &a] {
+      for (size_t j = 0; j < 10;) {
+        if (futex_trylock_pi(is_priv, &a) >= 0) {
+          ++j;
+          EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid());
+          SleepSafe(absl::Milliseconds(5));
+          ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+        }
+      }
+    });
+  }
+}
+
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor