gvisor/pkg/sentry/fs/lock/lock.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package lock is the API for POSIX-style advisory regional file locks and
// BSD-style full file locks.
//
// Callers needing to enforce these types of locks, like sys_fcntl, can call
// LockRegion and UnlockRegion on a thread-safe set of Locks.  Locks are
// specific to a unique file (unique device/inode pair) and for this reason
// should not be shared between files.
//
// A Lock has a set of holders identified by UniqueID.  Normally this is the
// pid of the thread attempting to acquire the lock.
//
// Since these are advisory locks, they do not need to be integrated into
// Reads/Writes and for this reason there is no way to *check* if a lock is
// held.  One can only attempt to take a lock or unlock an existing lock.
//
// A Lock in a set of Locks is typed: it is either a read lock with any number
// of readers and no writer, or a write lock with no readers.
//
// As expected from POSIX, any attempt to acquire a write lock on a file region
// when there already exits a write lock held by a different uid will fail. Any
// attempt to acquire a write lock on a file region when there is more than one
// reader will fail.  Any attempt to acquire a read lock on a file region when
// there is already a writer will fail.
//
// In special cases, a read lock may be upgraded to a write lock and a write lock
// can be downgraded to a read lock.  This can only happen if:
//
//  * read lock upgrade to write lock: There can be only one reader and the reader
//    must be the same as the requested write lock holder.
//
//  * write lock downgrade to read lock: The writer must be the same as the requested
//    read lock holder.
//
// UnlockRegion always succeeds.  If LockRegion fails the caller should normally
// interpret this as "try again later".
package lock

import (
	"fmt"
	"math"
	"sync"
	"syscall"

	"gvisor.dev/gvisor/pkg/waiter"
)

// LockType is a type of regional file lock.
type LockType int

// UniqueID is a unique identifier of the holder of a regional file lock.
type UniqueID uint64

const (
	// ReadLock describes a POSIX regional file lock to be taken
	// read only.  There may be multiple of these locks on a single
	// file region as long as there is no writer lock on the same
	// region.
	ReadLock LockType = iota

	// WriteLock describes a POSIX regional file lock to be taken
	// write only.  There may be only a single holder of this lock
	// and no read locks.
	WriteLock
)

// LockEOF is the maximal possible end of a regional file lock.
const LockEOF = math.MaxUint64

// Lock is a regional file lock.  It consists of either a single writer
// or a set of readers.
//
// A Lock may be upgraded from a read lock to a write lock only if there
// is a single reader and that reader has the same uid as the write lock.
//
// A Lock may be downgraded from a write lock to a read lock only if
// the write lock's uid is the same as the read lock.
//
// +stateify savable
type Lock struct {
	// Readers are the set of read lock holders identified by UniqueID.
	// If len(Readers) > 0 then HasWriter must be false.
	Readers map[UniqueID]bool

	// HasWriter indicates that this is a write lock held by a single
	// UniqueID.
	HasWriter bool

	// Writer is only valid if HasWriter is true.  It identifies a
	// single write lock holder.
	Writer UniqueID
}

// Locks is a thread-safe wrapper around a LockSet.
//
// +stateify savable
type Locks struct {
	// mu protects locks below.
	mu sync.Mutex `state:"nosave"`

	// locks is the set of region locks currently held on an Inode.
	locks LockSet

	// blockedQueue is the queue of waiters that are waiting on a lock.
	blockedQueue waiter.Queue `state:"zerovalue"`
}

// Blocker is the interface used for blocking locks. Passing a nil Blocker
// will be treated as non-blocking.
type Blocker interface {
	Block(C <-chan struct{}) error
}

const (
	// EventMaskAll is the mask we will always use for locks, by using the
	// same mask all the time we can wake up everyone anytime the lock
	// changes state.
	EventMaskAll waiter.EventMask = 0xFFFF
)

// LockRegion attempts to acquire a typed lock for the uid on a region
// of a file. Returns true if successful in locking the region. If false
// is returned, the caller should normally interpret this as "try again later" if
// acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode.
// Blocker is the interface used to provide blocking behavior, passing a nil Blocker
// will result in non-blocking behavior.
func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool {
	for {
		l.mu.Lock()

		// Blocking locks must run in a loop because we'll be woken up whenever an unlock event
		// happens for this lock. We will then attempt to take the lock again and if it fails
		// continue blocking.
		res := l.locks.lock(uid, t, r)
		if !res && block != nil {
			e, ch := waiter.NewChannelEntry(nil)
			l.blockedQueue.EventRegister(&e, EventMaskAll)
			l.mu.Unlock()
			if err := block.Block(ch); err != nil {
				// We were interrupted, the caller can translate this to EINTR if applicable.
				l.blockedQueue.EventUnregister(&e)
				return false
			}
			l.blockedQueue.EventUnregister(&e)
			continue // Try again now that someone has unlocked.
		}

		l.mu.Unlock()
		return res
	}
}

// UnlockRegion attempts to release a lock for the uid on a region of a file.
// This operation is always successful, even if there did not exist a lock on
// the requested region held by uid in the first place.
func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.locks.unlock(uid, r)

	// Now that we've released the lock, we need to wake up any waiters.
	l.blockedQueue.Notify(EventMaskAll)
}

// makeLock returns a new typed Lock that has either uid as its only reader
// or uid as its only writer.
func makeLock(uid UniqueID, t LockType) Lock {
	value := Lock{Readers: make(map[UniqueID]bool)}
	switch t {
	case ReadLock:
		value.Readers[uid] = true
	case WriteLock:
		value.HasWriter = true
		value.Writer = uid
	default:
		panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
	}
	return value
}

// isHeld returns true if uid is a holder of Lock.
func (l Lock) isHeld(uid UniqueID) bool {
	if l.HasWriter && l.Writer == uid {
		return true
	}
	return l.Readers[uid]
}

// lock sets uid as a holder of a typed lock on Lock.
//
// Preconditions: canLock is true for the range containing this Lock.
func (l *Lock) lock(uid UniqueID, t LockType) {
	switch t {
	case ReadLock:
		// If we are already a reader, then this is a no-op.
		if l.Readers[uid] {
			return
		}
		// We cannot downgrade a write lock to a read lock unless the
		// uid is the same.
		if l.HasWriter {
			if l.Writer != uid {
				panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
			}
			// Ensure that there is only one reader if upgrading.
			l.Readers = make(map[UniqueID]bool)
			// Ensure that there is no longer a writer.
			l.HasWriter = false
		}
		l.Readers[uid] = true
		return
	case WriteLock:
		// If we are already the writer, then this is a no-op.
		if l.HasWriter && l.Writer == uid {
			return
		}
		// We can only upgrade a read lock to a write lock if there
		// is only one reader and that reader has the same uid as
		// the write lock.
		if readers := len(l.Readers); readers > 0 {
			if readers != 1 {
				panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers))
			}
			if !l.Readers[uid] {
				panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers))
			}
		}
		// Ensure that there is only a writer.
		l.Readers = make(map[UniqueID]bool)
		l.HasWriter = true
		l.Writer = uid
	default:
		panic(fmt.Sprintf("lock: invalid lock type %d", t))
	}
}

// lockable returns true if check returns true for every Lock in LockRange.
// Further, check should return true if Lock meets the callers requirements
// for locking Lock.
func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool {
	// Get our starting point.
	seg := l.LowerBoundSegment(r.Start)
	for seg.Ok() && seg.Start() < r.End {
		// Note that we don't care about overruning the end of the
		// last segment because if everything checks out we'll just
		// split the last segment.
		if !check(seg.Value()) {
			return false
		}
		// Jump to the next segment, ignoring gaps, for the same
		// reason we ignored the first gap.
		seg = seg.NextSegment()
	}
	// No conflict, we can get a lock for uid over the entire range.
	return true
}

// canLock returns true if uid will be able to take a Lock of type t on the
// entire range specified by LockRange.
func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
	switch t {
	case ReadLock:
		return l.lockable(r, func(value Lock) bool {
			// If there is no writer, there's no problem adding
			// another reader.
			if !value.HasWriter {
				return true
			}
			// If there is a writer, then it must be the same uid
			// in order to downgrade the lock to a read lock.
			return value.Writer == uid
		})
	case WriteLock:
		return l.lockable(r, func(value Lock) bool {
			// If there are only readers.
			if !value.HasWriter {
				// Then this uid can only take a write lock if
				// this is a private upgrade, meaning that the
				// only reader is uid.
				return len(value.Readers) == 1 && value.Readers[uid]
			}
			// If the uid is already a writer on this region, then
			// adding a write lock would be a no-op.
			return value.Writer == uid
		})
	default:
		panic(fmt.Sprintf("canLock: invalid lock type %d", t))
	}
}

// lock returns true if uid took a lock of type t on the entire range of LockRange.
//
// Preconditions: r.Start <= r.End (will panic otherwise).
func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
	if r.Start > r.End {
		panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End))
	}

	// Don't attempt to insert anything with a range of 0 and treat this
	// as a successful no-op.
	if r.Length() == 0 {
		return true
	}

	// Do a first-pass check.  We *could* hold onto the segments we
	// checked if canLock would return true, but traversing the segment
	// set should be fast and this keeps things simple.
	if !l.canLock(uid, t, r) {
		return false
	}
	// Get our starting point.
	seg, gap := l.Find(r.Start)
	if gap.Ok() {
		// Fill in the gap and get the next segment to modify.
		seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment()
	} else if seg.Start() < r.Start {
		// Get our first segment to modify.
		_, seg = l.Split(seg, r.Start)
	}
	for seg.Ok() && seg.Start() < r.End {
		// Split the last one if necessary.
		if seg.End() > r.End {
			seg, _ = l.SplitUnchecked(seg, r.End)
		}

		// Set the lock on the segment.  This is guaranteed to
		// always be safe, given canLock above.
		value := seg.ValuePtr()
		value.lock(uid, t)

		// Fill subsequent gaps.
		gap = seg.NextGap()
		if gr := gap.Range().Intersect(r); gr.Length() > 0 {
			seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment()
		} else {
			seg = gap.NextSegment()
		}
	}
	return true
}

// unlock is always successful.  If uid has no locks held for the range LockRange,
// unlock is a no-op.
//
// Preconditions: same as lock.
func (l *LockSet) unlock(uid UniqueID, r LockRange) {
	if r.Start > r.End {
		panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End))
	}

	// Same as setlock.
	if r.Length() == 0 {
		return
	}

	// Get our starting point.
	seg := l.LowerBoundSegment(r.Start)
	for seg.Ok() && seg.Start() < r.End {
		// If this segment doesn't have a lock from uid then
		// there is no need to fragment the set with Isolate (below).
		// In this case just move on to the next segment.
		if !seg.Value().isHeld(uid) {
			seg = seg.NextSegment()
			continue
		}

		// Ensure that if we need to unlock a sub-segment that
		// we don't unlock/remove that entire segment.
		seg = l.Isolate(seg, r)

		value := seg.Value()
		var remove bool
		if value.HasWriter && value.Writer == uid {
			// If we are unlocking a writer, then since there can
			// only ever be one writer and no readers, then this
			// lock should always be removed from the set.
			remove = true
		} else if value.Readers[uid] {
			// If uid is the last reader, then just remove the entire
			// segment.
			if len(value.Readers) == 1 {
				remove = true
			} else {
				// Otherwise we need to remove this reader without
				// affecting any other segment's readers.  To do
				// this, we need to make a copy of the Readers map
				// and not add this uid.
				newValue := Lock{Readers: make(map[UniqueID]bool)}
				for k, v := range value.Readers {
					if k != uid {
						newValue.Readers[k] = v
					}
				}
				seg.SetValue(newValue)
			}
		}
		if remove {
			seg = l.Remove(seg).NextSegment()
		} else {
			seg = seg.NextSegment()
		}
	}
}

// ComputeRange takes a positive file offset and computes the start of a LockRange
// using start (relative to offset) and the end of the LockRange using length. The
// values of start and length may be negative but the resulting LockRange must
// preserve that LockRange.Start < LockRange.End and LockRange.Start > 0.
func ComputeRange(start, length, offset int64) (LockRange, error) {
	offset += start
	// fcntl(2): "l_start can be a negative number provided the offset
	// does not lie before the start of the file"
	if offset < 0 {
		return LockRange{}, syscall.EINVAL
	}

	// fcntl(2): Specifying 0 for l_len has the  special meaning: lock all
	// bytes starting at the location specified by l_whence and l_start
	// through to the end of file, no matter how large the file grows.
	end := uint64(LockEOF)
	if length > 0 {
		// fcntl(2): If l_len is positive, then the range to be locked
		// covers bytes l_start up to and including l_start+l_len-1.
		//
		// Since LockRange.End is exclusive we need not -1 from length..
		end = uint64(offset + length)
	} else if length < 0 {
		// fcntl(2): If l_len is negative, the interval described by
		// lock covers bytes l_start+l_len up to and including l_start-1.
		//
		// Since LockRange.End is exclusive we need not -1 from offset.
		signedEnd := offset
		// Add to offset using a negative length (subtract).
		offset += length
		if offset < 0 {
			return LockRange{}, syscall.EINVAL
		}
		if signedEnd < offset {
			return LockRange{}, syscall.EOVERFLOW
		}
		// At this point signedEnd cannot be negative,
		// since we asserted that offset is not negative
		// and it is not less than offset.
		end = uint64(signedEnd)
	}
	// Offset is guaranteed to be positive at this point.
	return LockRange{Start: uint64(offset), End: end}, nil
}