gvisor/pkg/sentry/fs/lock/lock.go

// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package lock is the API for POSIX-style advisory regional file locks and
// BSD-style full file locks.
//
// Callers needing to enforce these types of locks, like sys_fcntl, can call
// LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are
// specific to a unique file (unique device/inode pair) and for this reason
// should not be shared between files.
//
// A Lock has a set of holders identified by UniqueID. Normally this is the
// pid of the thread attempting to acquire the lock.
//
// Since these are advisory locks, they do not need to be integrated into
// Reads/Writes and for this reason there is no way to *check* if a lock is
// held. One can only attempt to take a lock or unlock an existing lock.
//
// A Lock in a set of Locks is typed: it is either a read lock with any number
// of readers and no writer, or a write lock with no readers.
//
// As expected from POSIX, any attempt to acquire a write lock on a file region
// when there already exists a write lock held by a different uid will fail. Any
// attempt to acquire a write lock on a file region when there is more than one
// reader will fail. Any attempt to acquire a read lock on a file region when
// there is already a writer will fail.
//
// In special cases, a read lock may be upgraded to a write lock and a write lock
// can be downgraded to a read lock. This can only happen if:
//
// * read lock upgrade to write lock: There can be only one reader and the reader
// must be the same as the requested write lock holder.
//
// * write lock downgrade to read lock: The writer must be the same as the requested
// read lock holder.
//
// UnlockRegion always succeeds. If LockRegion fails the caller should normally
// interpret this as "try again later".
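//
// A minimal usage sketch (illustrative only; the uid value and the locked
// range below are assumptions, not part of this package):
//
//	var locks Locks
//	uid := UniqueID(42)
//
//	// Lock the first 4 KiB of the file for writing, without blocking.
//	r, err := ComputeRange(0, 4096, 0)
//	if err != nil {
//		return err // EINVAL or EOVERFLOW for an invalid range
//	}
//	if locks.LockRegion(uid, WriteLock, r, nil) {
//		// ... write to the region ...
//		locks.UnlockRegion(uid, r)
//	}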
package lock

import (
	"fmt"
	"math"
	"sync"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

// LockType is a type of regional file lock.
type LockType int

// UniqueID is a unique identifier of the holder of a regional file lock.
type UniqueID uint64

const (
	// ReadLock describes a POSIX regional file lock to be taken
	// read only. There may be multiple of these locks on a single
	// file region as long as there is no writer lock on the same
	// region.
	ReadLock LockType = iota

	// WriteLock describes a POSIX regional file lock to be taken
	// write only. There may be only a single holder of this lock
	// and no read locks.
	WriteLock
)

// LockEOF is the maximal possible end of a regional file lock.
const LockEOF = math.MaxUint64

// Lock is a regional file lock. It consists of either a single writer
// or a set of readers.
//
// A Lock may be upgraded from a read lock to a write lock only if there
// is a single reader and that reader has the same uid as the write lock.
//
// A Lock may be downgraded from a write lock to a read lock only if
// the write lock's uid is the same as the read lock.
//
// +stateify savable
type Lock struct {
	// Readers are the set of read lock holders identified by UniqueID.
	// If len(Readers) > 0 then HasWriter must be false.
	Readers map[UniqueID]bool

	// HasWriter indicates that this is a write lock held by a single
	// UniqueID.
	HasWriter bool

	// Writer is only valid if HasWriter is true. It identifies a
	// single write lock holder.
	Writer UniqueID
}

// Locks is a thread-safe wrapper around a LockSet.
//
// +stateify savable
type Locks struct {
	// mu protects locks below.
	mu sync.Mutex `state:"nosave"`

	// locks is the set of region locks currently held on an Inode.
	locks LockSet

	// blockedQueue is the queue of waiters that are waiting on a lock.
	blockedQueue waiter.Queue `state:"zerovalue"`
}

// Blocker is the interface used for blocking locks. Passing a nil Blocker
// will be treated as non-blocking.
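//
// A minimal implementation sketch (hypothetical; inside the sentry a task
// would normally provide Block, including support for interruption):
//
//	type waitBlocker struct{}
//
//	// Block waits until the lock state changes, which is signaled on C.
//	func (waitBlocker) Block(C chan struct{}) error {
//		<-C
//		return nil
//	}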
type Blocker interface {
Block(C chan struct{}) error
}

const (
	// EventMaskAll is the mask we will always use for locks. By using the
	// same mask every time, we can wake up all waiters whenever the lock
	// changes state.
EventMaskAll waiter.EventMask = 0xFFFF
)

// LockRegion attempts to acquire a typed lock for the uid on a region
// of a file. It returns true if the region was locked. If it returns false,
// the caller should normally interpret this as "try again later" when
// acquiring the lock in non-blocking mode, or "interrupted" in blocking mode.
// Blocker is the interface used to provide blocking behavior; passing a nil
// Blocker results in non-blocking behavior.
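//
// For example (sketch; "task" is an assumed Blocker implementation and the
// EINTR translation is up to the caller):
//
//	if !locks.LockRegion(uid, ReadLock, r, task) {
//		return syscall.EINTR
//	}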
func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool {
for {
l.mu.Lock()
// Blocking locks must run in a loop because we'll be woken up whenever an unlock event
// happens for this lock. We will then attempt to take the lock again and if it fails
// continue blocking.
res := l.locks.lock(uid, t, r)
if !res && block != nil {
e, ch := waiter.NewChannelEntry(nil)
l.blockedQueue.EventRegister(&e, EventMaskAll)
l.mu.Unlock()
if err := block.Block(ch); err != nil {
// We were interrupted, the caller can translate this to EINTR if applicable.
l.blockedQueue.EventUnregister(&e)
return false
}
l.blockedQueue.EventUnregister(&e)
continue // Try again now that someone has unlocked.
}
l.mu.Unlock()
return res
}
}

// UnlockRegion attempts to release a lock for the uid on a region of a file.
// This operation is always successful, even if uid held no lock on the
// requested region in the first place.
func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) {
l.mu.Lock()
defer l.mu.Unlock()
l.locks.unlock(uid, r)
// Now that we've released the lock, we need to wake up any waiters.
l.blockedQueue.Notify(EventMaskAll)
}

// makeLock returns a new typed Lock that has either uid as its only reader
// or uid as its only writer.
func makeLock(uid UniqueID, t LockType) Lock {
value := Lock{Readers: make(map[UniqueID]bool)}
switch t {
case ReadLock:
value.Readers[uid] = true
case WriteLock:
value.HasWriter = true
value.Writer = uid
default:
panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
}
return value
}

// isHeld returns true if uid is a holder of Lock.
func (l Lock) isHeld(uid UniqueID) bool {
if l.HasWriter && l.Writer == uid {
return true
}
return l.Readers[uid]
}

// lock sets uid as a holder of a typed lock on Lock.
//
// Preconditions: canLock is true for the range containing this Lock.
func (l *Lock) lock(uid UniqueID, t LockType) {
switch t {
case ReadLock:
// If we are already a reader, then this is a no-op.
if l.Readers[uid] {
return
}
// We cannot downgrade a write lock to a read lock unless the
// uid is the same.
if l.HasWriter {
if l.Writer != uid {
panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
}
			// Ensure that the writer becomes the sole reader when downgrading.
l.Readers = make(map[UniqueID]bool)
// Ensure that there is no longer a writer.
l.HasWriter = false
}
l.Readers[uid] = true
return
case WriteLock:
// If we are already the writer, then this is a no-op.
if l.HasWriter && l.Writer == uid {
return
}
// We can only upgrade a read lock to a write lock if there
// is only one reader and that reader has the same uid as
// the write lock.
if readers := len(l.Readers); readers > 0 {
if readers != 1 {
panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers))
}
if !l.Readers[uid] {
panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers))
}
}
// Ensure that there is only a writer.
l.Readers = make(map[UniqueID]bool)
l.HasWriter = true
l.Writer = uid
default:
panic(fmt.Sprintf("lock: invalid lock type %d", t))
}
}

// lockable returns true if check returns true for every Lock in LockRange.
// Further, check should return true if Lock meets the caller's requirements
// for locking Lock.
func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool {
// Get our starting point.
seg := l.LowerBoundSegment(r.Start)
for seg.Ok() && seg.Start() < r.End {
		// Note that we don't care about overrunning the end of the
// last segment because if everything checks out we'll just
// split the last segment.
if !check(seg.Value()) {
return false
}
// Jump to the next segment, ignoring gaps, for the same
// reason we ignored the first gap.
seg = seg.NextSegment()
}
// No conflict, we can get a lock for uid over the entire range.
return true
}

// canLock returns true if uid will be able to take a Lock of type t on the
// entire range specified by LockRange.
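//
// For example (illustrative): if every Lock in r is a read lock whose only
// reader is uid 1, then canLock(1, WriteLock, r) returns true (a private
// upgrade), while canLock(2, WriteLock, r) returns false.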
func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
switch t {
case ReadLock:
return l.lockable(r, func(value Lock) bool {
// If there is no writer, there's no problem adding
// another reader.
if !value.HasWriter {
return true
}
// If there is a writer, then it must be the same uid
// in order to downgrade the lock to a read lock.
return value.Writer == uid
})
case WriteLock:
return l.lockable(r, func(value Lock) bool {
// If there are only readers.
if !value.HasWriter {
// Then this uid can only take a write lock if
// this is a private upgrade, meaning that the
// only reader is uid.
return len(value.Readers) == 1 && value.Readers[uid]
}
// If the uid is already a writer on this region, then
// adding a write lock would be a no-op.
return value.Writer == uid
})
default:
panic(fmt.Sprintf("canLock: invalid lock type %d", t))
}
}

// lock returns true if uid took a lock of type t on the entire range of LockRange.
//
// Preconditions: r.Start <= r.End (will panic otherwise).
func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
if r.Start > r.End {
panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End))
}
// Don't attempt to insert anything with a range of 0 and treat this
// as a successful no-op.
if r.Length() == 0 {
return true
}
// Do a first-pass check. We *could* hold onto the segments we
// checked if canLock would return true, but traversing the segment
// set should be fast and this keeps things simple.
if !l.canLock(uid, t, r) {
return false
}
// Get our starting point.
seg, gap := l.Find(r.Start)
if gap.Ok() {
// Fill in the gap and get the next segment to modify.
seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment()
} else if seg.Start() < r.Start {
// Get our first segment to modify.
_, seg = l.Split(seg, r.Start)
}
for seg.Ok() && seg.Start() < r.End {
// Split the last one if necessary.
if seg.End() > r.End {
seg, _ = l.SplitUnchecked(seg, r.End)
}
// Set the lock on the segment. This is guaranteed to
// always be safe, given canLock above.
value := seg.ValuePtr()
value.lock(uid, t)
// Fill subsequent gaps.
gap = seg.NextGap()
if gr := gap.Range().Intersect(r); gr.Length() > 0 {
seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment()
} else {
seg = gap.NextSegment()
}
}
return true
}

// unlock is always successful. If uid has no locks held for the range LockRange,
// unlock is a no-op.
//
// Preconditions: same as lock.
func (l *LockSet) unlock(uid UniqueID, r LockRange) {
if r.Start > r.End {
panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End))
}
	// As in lock, treat a zero-length range as a no-op.
if r.Length() == 0 {
return
}
// Get our starting point.
seg := l.LowerBoundSegment(r.Start)
for seg.Ok() && seg.Start() < r.End {
// If this segment doesn't have a lock from uid then
// there is no need to fragment the set with Isolate (below).
// In this case just move on to the next segment.
if !seg.Value().isHeld(uid) {
seg = seg.NextSegment()
continue
}
// Ensure that if we need to unlock a sub-segment that
// we don't unlock/remove that entire segment.
seg = l.Isolate(seg, r)
value := seg.Value()
var remove bool
if value.HasWriter && value.Writer == uid {
// If we are unlocking a writer, then since there can
// only ever be one writer and no readers, then this
// lock should always be removed from the set.
remove = true
} else if value.Readers[uid] {
// If uid is the last reader, then just remove the entire
// segment.
if len(value.Readers) == 1 {
remove = true
} else {
// Otherwise we need to remove this reader without
// affecting any other segment's readers. To do
// this, we need to make a copy of the Readers map
// and not add this uid.
newValue := Lock{Readers: make(map[UniqueID]bool)}
for k, v := range value.Readers {
if k != uid {
newValue.Readers[k] = v
}
}
seg.SetValue(newValue)
}
}
if remove {
seg = l.Remove(seg).NextSegment()
} else {
seg = seg.NextSegment()
}
}
}

// ComputeRange takes a non-negative file offset and computes the start of a
// LockRange using start (relative to offset) and the end of the LockRange
// using length. The values of start and length may be negative, but the
// resulting LockRange must preserve LockRange.Start < LockRange.End and
// LockRange.Start >= 0.
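//
// For example (illustrative values): ComputeRange(-50, -25, 100) first moves
// offset to 100 + (-50) = 50; since length is negative, the locked interval
// is [50-25, 50) = [25, 50), i.e. LockRange{Start: 25, End: 50}.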
func ComputeRange(start, length, offset int64) (LockRange, error) {
offset += start
// fcntl(2): "l_start can be a negative number provided the offset
// does not lie before the start of the file"
if offset < 0 {
return LockRange{}, syscall.EINVAL
}
// fcntl(2): Specifying 0 for l_len has the special meaning: lock all
// bytes starting at the location specified by l_whence and l_start
// through to the end of file, no matter how large the file grows.
end := uint64(LockEOF)
if length > 0 {
		// fcntl(2): If l_len is positive, then the range to be locked
		// covers bytes l_start up to and including l_start+l_len-1.
		//
		// Since LockRange.End is exclusive, we need not subtract 1
		// from offset+length.
end = uint64(offset + length)
} else if length < 0 {
		// fcntl(2): If l_len is negative, the interval described by
		// the lock covers bytes l_start+l_len up to and including
		// l_start-1.
		//
		// Since LockRange.End is exclusive, we need not subtract 1
		// from offset.
signedEnd := offset
// Add to offset using a negative length (subtract).
offset += length
if offset < 0 {
return LockRange{}, syscall.EINVAL
}
if signedEnd < offset {
return LockRange{}, syscall.EOVERFLOW
}
// At this point signedEnd cannot be negative,
// since we asserted that offset is not negative
// and it is not less than offset.
end = uint64(signedEnd)
}
	// Offset is guaranteed to be non-negative at this point.
return LockRange{Start: uint64(offset), End: end}, nil
}