462 lines
15 KiB
Go
462 lines
15 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package lock is the API for POSIX-style advisory regional file locks and
|
|
// BSD-style full file locks.
|
|
//
|
|
// Callers needing to enforce these types of locks, like sys_fcntl, can call
|
|
// LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are
|
|
// specific to a unique file (unique device/inode pair) and for this reason
|
|
// should not be shared between files.
|
|
//
|
|
// A Lock has a set of holders identified by UniqueID. Normally this is the
|
|
// pid of the thread attempting to acquire the lock.
|
|
//
|
|
// Since these are advisory locks, they do not need to be integrated into
|
|
// Reads/Writes and for this reason there is no way to *check* if a lock is
|
|
// held. One can only attempt to take a lock or unlock an existing lock.
|
|
//
|
|
// A Lock in a set of Locks is typed: it is either a read lock with any number
|
|
// of readers and no writer, or a write lock with no readers.
|
|
//
|
|
// As expected from POSIX, any attempt to acquire a write lock on a file region
|
|
// when there already exits a write lock held by a different uid will fail. Any
|
|
// attempt to acquire a write lock on a file region when there is more than one
|
|
// reader will fail. Any attempt to acquire a read lock on a file region when
|
|
// there is already a writer will fail.
|
|
//
|
|
// In special cases, a read lock may be upgraded to a write lock and a write lock
|
|
// can be downgraded to a read lock. This can only happen if:
|
|
//
|
|
// * read lock upgrade to write lock: There can be only one reader and the reader
|
|
// must be the same as the requested write lock holder.
|
|
//
|
|
// * write lock downgrade to read lock: The writer must be the same as the requested
|
|
// read lock holder.
|
|
//
|
|
// UnlockRegion always succeeds. If LockRegion fails the caller should normally
|
|
// interpret this as "try again later".
|
|
package lock
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"sync"
|
|
"syscall"
|
|
|
|
"gvisor.dev/gvisor/pkg/waiter"
|
|
)
|
|
|
|
// LockType is a type of regional file lock.
|
|
type LockType int
|
|
|
|
// UniqueID is a unique identifier of the holder of a regional file lock.
|
|
type UniqueID uint64
|
|
|
|
const (
|
|
// ReadLock describes a POSIX regional file lock to be taken
|
|
// read only. There may be multiple of these locks on a single
|
|
// file region as long as there is no writer lock on the same
|
|
// region.
|
|
ReadLock LockType = iota
|
|
|
|
// WriteLock describes a POSIX regional file lock to be taken
|
|
// write only. There may be only a single holder of this lock
|
|
// and no read locks.
|
|
WriteLock
|
|
)
|
|
|
|
// LockEOF is the maximal possible end of a regional file lock.
|
|
const LockEOF = math.MaxUint64
|
|
|
|
// Lock is a regional file lock. It consists of either a single writer
|
|
// or a set of readers.
|
|
//
|
|
// A Lock may be upgraded from a read lock to a write lock only if there
|
|
// is a single reader and that reader has the same uid as the write lock.
|
|
//
|
|
// A Lock may be downgraded from a write lock to a read lock only if
|
|
// the write lock's uid is the same as the read lock.
|
|
//
|
|
// +stateify savable
|
|
type Lock struct {
|
|
// Readers are the set of read lock holders identified by UniqueID.
|
|
// If len(Readers) > 0 then HasWriter must be false.
|
|
Readers map[UniqueID]bool
|
|
|
|
// HasWriter indicates that this is a write lock held by a single
|
|
// UniqueID.
|
|
HasWriter bool
|
|
|
|
// Writer is only valid if HasWriter is true. It identifies a
|
|
// single write lock holder.
|
|
Writer UniqueID
|
|
}
|
|
|
|
// Locks is a thread-safe wrapper around a LockSet.
|
|
//
|
|
// +stateify savable
|
|
type Locks struct {
|
|
// mu protects locks below.
|
|
mu sync.Mutex `state:"nosave"`
|
|
|
|
// locks is the set of region locks currently held on an Inode.
|
|
locks LockSet
|
|
|
|
// blockedQueue is the queue of waiters that are waiting on a lock.
|
|
blockedQueue waiter.Queue `state:"zerovalue"`
|
|
}
|
|
|
|
// Blocker is the interface used for blocking locks. Passing a nil Blocker
|
|
// will be treated as non-blocking.
|
|
type Blocker interface {
|
|
Block(C <-chan struct{}) error
|
|
}
|
|
|
|
const (
|
|
// EventMaskAll is the mask we will always use for locks, by using the
|
|
// same mask all the time we can wake up everyone anytime the lock
|
|
// changes state.
|
|
EventMaskAll waiter.EventMask = 0xFFFF
|
|
)
|
|
|
|
// LockRegion attempts to acquire a typed lock for the uid on a region
|
|
// of a file. Returns true if successful in locking the region. If false
|
|
// is returned, the caller should normally interpret this as "try again later" if
|
|
// acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode.
|
|
// Blocker is the interface used to provide blocking behavior, passing a nil Blocker
|
|
// will result in non-blocking behavior.
|
|
func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool {
|
|
for {
|
|
l.mu.Lock()
|
|
|
|
// Blocking locks must run in a loop because we'll be woken up whenever an unlock event
|
|
// happens for this lock. We will then attempt to take the lock again and if it fails
|
|
// continue blocking.
|
|
res := l.locks.lock(uid, t, r)
|
|
if !res && block != nil {
|
|
e, ch := waiter.NewChannelEntry(nil)
|
|
l.blockedQueue.EventRegister(&e, EventMaskAll)
|
|
l.mu.Unlock()
|
|
if err := block.Block(ch); err != nil {
|
|
// We were interrupted, the caller can translate this to EINTR if applicable.
|
|
l.blockedQueue.EventUnregister(&e)
|
|
return false
|
|
}
|
|
l.blockedQueue.EventUnregister(&e)
|
|
continue // Try again now that someone has unlocked.
|
|
}
|
|
|
|
l.mu.Unlock()
|
|
return res
|
|
}
|
|
}
|
|
|
|
// UnlockRegion attempts to release a lock for the uid on a region of a file.
|
|
// This operation is always successful, even if there did not exist a lock on
|
|
// the requested region held by uid in the first place.
|
|
func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) {
|
|
l.mu.Lock()
|
|
defer l.mu.Unlock()
|
|
l.locks.unlock(uid, r)
|
|
|
|
// Now that we've released the lock, we need to wake up any waiters.
|
|
l.blockedQueue.Notify(EventMaskAll)
|
|
}
|
|
|
|
// makeLock returns a new typed Lock that has either uid as its only reader
|
|
// or uid as its only writer.
|
|
func makeLock(uid UniqueID, t LockType) Lock {
|
|
value := Lock{Readers: make(map[UniqueID]bool)}
|
|
switch t {
|
|
case ReadLock:
|
|
value.Readers[uid] = true
|
|
case WriteLock:
|
|
value.HasWriter = true
|
|
value.Writer = uid
|
|
default:
|
|
panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
|
|
}
|
|
return value
|
|
}
|
|
|
|
// isHeld returns true if uid is a holder of Lock.
|
|
func (l Lock) isHeld(uid UniqueID) bool {
|
|
if l.HasWriter && l.Writer == uid {
|
|
return true
|
|
}
|
|
return l.Readers[uid]
|
|
}
|
|
|
|
// lock sets uid as a holder of a typed lock on Lock.
|
|
//
|
|
// Preconditions: canLock is true for the range containing this Lock.
|
|
func (l *Lock) lock(uid UniqueID, t LockType) {
|
|
switch t {
|
|
case ReadLock:
|
|
// If we are already a reader, then this is a no-op.
|
|
if l.Readers[uid] {
|
|
return
|
|
}
|
|
// We cannot downgrade a write lock to a read lock unless the
|
|
// uid is the same.
|
|
if l.HasWriter {
|
|
if l.Writer != uid {
|
|
panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
|
|
}
|
|
// Ensure that there is only one reader if upgrading.
|
|
l.Readers = make(map[UniqueID]bool)
|
|
// Ensure that there is no longer a writer.
|
|
l.HasWriter = false
|
|
}
|
|
l.Readers[uid] = true
|
|
return
|
|
case WriteLock:
|
|
// If we are already the writer, then this is a no-op.
|
|
if l.HasWriter && l.Writer == uid {
|
|
return
|
|
}
|
|
// We can only upgrade a read lock to a write lock if there
|
|
// is only one reader and that reader has the same uid as
|
|
// the write lock.
|
|
if readers := len(l.Readers); readers > 0 {
|
|
if readers != 1 {
|
|
panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers))
|
|
}
|
|
if !l.Readers[uid] {
|
|
panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers))
|
|
}
|
|
}
|
|
// Ensure that there is only a writer.
|
|
l.Readers = make(map[UniqueID]bool)
|
|
l.HasWriter = true
|
|
l.Writer = uid
|
|
default:
|
|
panic(fmt.Sprintf("lock: invalid lock type %d", t))
|
|
}
|
|
}
|
|
|
|
// lockable returns true if check returns true for every Lock in LockRange.
|
|
// Further, check should return true if Lock meets the callers requirements
|
|
// for locking Lock.
|
|
func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool {
|
|
// Get our starting point.
|
|
seg := l.LowerBoundSegment(r.Start)
|
|
for seg.Ok() && seg.Start() < r.End {
|
|
// Note that we don't care about overruning the end of the
|
|
// last segment because if everything checks out we'll just
|
|
// split the last segment.
|
|
if !check(seg.Value()) {
|
|
return false
|
|
}
|
|
// Jump to the next segment, ignoring gaps, for the same
|
|
// reason we ignored the first gap.
|
|
seg = seg.NextSegment()
|
|
}
|
|
// No conflict, we can get a lock for uid over the entire range.
|
|
return true
|
|
}
|
|
|
|
// canLock returns true if uid will be able to take a Lock of type t on the
|
|
// entire range specified by LockRange.
|
|
func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
|
|
switch t {
|
|
case ReadLock:
|
|
return l.lockable(r, func(value Lock) bool {
|
|
// If there is no writer, there's no problem adding
|
|
// another reader.
|
|
if !value.HasWriter {
|
|
return true
|
|
}
|
|
// If there is a writer, then it must be the same uid
|
|
// in order to downgrade the lock to a read lock.
|
|
return value.Writer == uid
|
|
})
|
|
case WriteLock:
|
|
return l.lockable(r, func(value Lock) bool {
|
|
// If there are only readers.
|
|
if !value.HasWriter {
|
|
// Then this uid can only take a write lock if
|
|
// this is a private upgrade, meaning that the
|
|
// only reader is uid.
|
|
return len(value.Readers) == 1 && value.Readers[uid]
|
|
}
|
|
// If the uid is already a writer on this region, then
|
|
// adding a write lock would be a no-op.
|
|
return value.Writer == uid
|
|
})
|
|
default:
|
|
panic(fmt.Sprintf("canLock: invalid lock type %d", t))
|
|
}
|
|
}
|
|
|
|
// lock returns true if uid took a lock of type t on the entire range of LockRange.
|
|
//
|
|
// Preconditions: r.Start <= r.End (will panic otherwise).
|
|
func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
|
|
if r.Start > r.End {
|
|
panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End))
|
|
}
|
|
|
|
// Don't attempt to insert anything with a range of 0 and treat this
|
|
// as a successful no-op.
|
|
if r.Length() == 0 {
|
|
return true
|
|
}
|
|
|
|
// Do a first-pass check. We *could* hold onto the segments we
|
|
// checked if canLock would return true, but traversing the segment
|
|
// set should be fast and this keeps things simple.
|
|
if !l.canLock(uid, t, r) {
|
|
return false
|
|
}
|
|
// Get our starting point.
|
|
seg, gap := l.Find(r.Start)
|
|
if gap.Ok() {
|
|
// Fill in the gap and get the next segment to modify.
|
|
seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment()
|
|
} else if seg.Start() < r.Start {
|
|
// Get our first segment to modify.
|
|
_, seg = l.Split(seg, r.Start)
|
|
}
|
|
for seg.Ok() && seg.Start() < r.End {
|
|
// Split the last one if necessary.
|
|
if seg.End() > r.End {
|
|
seg, _ = l.SplitUnchecked(seg, r.End)
|
|
}
|
|
|
|
// Set the lock on the segment. This is guaranteed to
|
|
// always be safe, given canLock above.
|
|
value := seg.ValuePtr()
|
|
value.lock(uid, t)
|
|
|
|
// Fill subsequent gaps.
|
|
gap = seg.NextGap()
|
|
if gr := gap.Range().Intersect(r); gr.Length() > 0 {
|
|
seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment()
|
|
} else {
|
|
seg = gap.NextSegment()
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// unlock is always successful. If uid has no locks held for the range LockRange,
|
|
// unlock is a no-op.
|
|
//
|
|
// Preconditions: same as lock.
|
|
func (l *LockSet) unlock(uid UniqueID, r LockRange) {
|
|
if r.Start > r.End {
|
|
panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End))
|
|
}
|
|
|
|
// Same as setlock.
|
|
if r.Length() == 0 {
|
|
return
|
|
}
|
|
|
|
// Get our starting point.
|
|
seg := l.LowerBoundSegment(r.Start)
|
|
for seg.Ok() && seg.Start() < r.End {
|
|
// If this segment doesn't have a lock from uid then
|
|
// there is no need to fragment the set with Isolate (below).
|
|
// In this case just move on to the next segment.
|
|
if !seg.Value().isHeld(uid) {
|
|
seg = seg.NextSegment()
|
|
continue
|
|
}
|
|
|
|
// Ensure that if we need to unlock a sub-segment that
|
|
// we don't unlock/remove that entire segment.
|
|
seg = l.Isolate(seg, r)
|
|
|
|
value := seg.Value()
|
|
var remove bool
|
|
if value.HasWriter && value.Writer == uid {
|
|
// If we are unlocking a writer, then since there can
|
|
// only ever be one writer and no readers, then this
|
|
// lock should always be removed from the set.
|
|
remove = true
|
|
} else if value.Readers[uid] {
|
|
// If uid is the last reader, then just remove the entire
|
|
// segment.
|
|
if len(value.Readers) == 1 {
|
|
remove = true
|
|
} else {
|
|
// Otherwise we need to remove this reader without
|
|
// affecting any other segment's readers. To do
|
|
// this, we need to make a copy of the Readers map
|
|
// and not add this uid.
|
|
newValue := Lock{Readers: make(map[UniqueID]bool)}
|
|
for k, v := range value.Readers {
|
|
if k != uid {
|
|
newValue.Readers[k] = v
|
|
}
|
|
}
|
|
seg.SetValue(newValue)
|
|
}
|
|
}
|
|
if remove {
|
|
seg = l.Remove(seg).NextSegment()
|
|
} else {
|
|
seg = seg.NextSegment()
|
|
}
|
|
}
|
|
}
|
|
|
|
// ComputeRange takes a positive file offset and computes the start of a LockRange
|
|
// using start (relative to offset) and the end of the LockRange using length. The
|
|
// values of start and length may be negative but the resulting LockRange must
|
|
// preserve that LockRange.Start < LockRange.End and LockRange.Start > 0.
|
|
func ComputeRange(start, length, offset int64) (LockRange, error) {
|
|
offset += start
|
|
// fcntl(2): "l_start can be a negative number provided the offset
|
|
// does not lie before the start of the file"
|
|
if offset < 0 {
|
|
return LockRange{}, syscall.EINVAL
|
|
}
|
|
|
|
// fcntl(2): Specifying 0 for l_len has the special meaning: lock all
|
|
// bytes starting at the location specified by l_whence and l_start
|
|
// through to the end of file, no matter how large the file grows.
|
|
end := uint64(LockEOF)
|
|
if length > 0 {
|
|
// fcntl(2): If l_len is positive, then the range to be locked
|
|
// covers bytes l_start up to and including l_start+l_len-1.
|
|
//
|
|
// Since LockRange.End is exclusive we need not -1 from length..
|
|
end = uint64(offset + length)
|
|
} else if length < 0 {
|
|
// fcntl(2): If l_len is negative, the interval described by
|
|
// lock covers bytes l_start+l_len up to and including l_start-1.
|
|
//
|
|
// Since LockRange.End is exclusive we need not -1 from offset.
|
|
signedEnd := offset
|
|
// Add to offset using a negative length (subtract).
|
|
offset += length
|
|
if offset < 0 {
|
|
return LockRange{}, syscall.EINVAL
|
|
}
|
|
if signedEnd < offset {
|
|
return LockRange{}, syscall.EOVERFLOW
|
|
}
|
|
// At this point signedEnd cannot be negative,
|
|
// since we asserted that offset is not negative
|
|
// and it is not less than offset.
|
|
end = uint64(signedEnd)
|
|
}
|
|
// Offset is guaranteed to be positive at this point.
|
|
return LockRange{Start: uint64(offset), End: end}, nil
|
|
}
|