gvisor/pkg/sentry/kernel/semaphore/semaphore.go

572 lines
14 KiB
Go

// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package semaphore implements System V semaphores.
package semaphore
import (
"fmt"
"sync"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
const (
	// valueMax is the largest value a single semaphore may hold (SEMVMX).
	valueMax = 32767

	// semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL).
	semaphoresMax = 32000

	// setsMax is "system-wide limit on the number of semaphore sets" (SEMMNI).
	setsMax = 32000

	// semaphoresTotalMax is "system-wide limit on the number of semaphores"
	// (SEMMNS = SEMMNI*SEMMSL).
	semaphoresTotalMax = setsMax * semaphoresMax
)
// Registry maintains a set of semaphores that can be found by key or ID.
//
// +stateify savable
type Registry struct {
// userNS owning the ipc name this registry belongs to. Immutable.
userNS *auth.UserNamespace
// mu protects all fields below.
mu sync.Mutex `state:"nosave"`
// semaphores maps a set ID to the semaphore set registered under it.
semaphores map[int32]*Set
// lastIDUsed is the ID most recently handed out by newSet; the search for
// the next free ID starts right after it.
lastIDUsed int32
}
// Set represents a set of semaphores that can be operated atomically.
//
// +stateify savable
type Set struct {
// registry owning this sem set. Immutable.
registry *Registry
// ID is a handle that identifies the set.
ID int32
// key is a user provided key that can be shared between processes.
key int32
// creator is the user that created the set. Immutable.
creator fs.FileOwner
// mu protects all fields below.
mu sync.Mutex `state:"nosave"`
// owner is the current owner of the set; it can be replaced via Change.
owner fs.FileOwner
// perms are the access permissions consulted by checkPerms; they can be
// replaced via Change.
perms fs.FilePermissions
// opTime is updated every time a list of operations is applied to the set.
opTime ktime.Time
// changeTime is set on creation and updated by Change, SetVal and
// SetValAll.
changeTime ktime.Time
// sems holds all semaphores in the set. The slice itself is immutable after
// it's been set, however each 'sem' object in the slice requires 'mu' lock.
sems []sem
// dead is set to true when the set is removed and can't be reached anymore.
// All waiters must wake up and fail when set is dead.
dead bool
}
// sem represents a single semaphore from a set.
//
// +stateify savable
type sem struct {
// value is the current value of the semaphore.
value int16
// waiters lists the callers currently blocked on this semaphore.
waiters waiterList `state:"zerovalue"`
// pid is the PID recorded by the last SetVal, SetValAll or successfully
// applied list of operations.
pid int32
}
// waiter represents a caller that is waiting for the semaphore value to
// become positive or zero.
//
// +stateify savable
type waiter struct {
// waiterEntry is embedded so the waiter can be linked into a waiterList.
// NOTE(review): presumably the generated intrusive-list entry; it is
// declared outside this file, confirm there.
waiterEntry
// value represents how much resource the waiter needs to wake up.
// Callers in this file initialize it with the SemOp of the blocked
// operation, i.e. zero or a negative number.
value int16
// ch is signalled when the waiter should retry its operations. newWaiter
// creates it with room for one buffered notification.
ch chan struct{}
}
// NewRegistry returns an empty semaphore set registry that belongs to the
// given user namespace.
func NewRegistry(userNS *auth.UserNamespace) *Registry {
	reg := new(Registry)
	reg.userNS = userNS
	reg.semaphores = map[int32]*Set{}
	return reg
}
// FindOrCreate searches for a semaphore set that matches 'key'. If not found,
// it may create a new one if requested. If private is true, key is ignored and
// a new set is always created. If create is false, it fails if a set cannot
// be found. If exclusive is true, it fails if a set with the same key already
// exists.
//
// Errors:
//   - EINVAL: nsems is negative or above semaphoresMax, nsems is larger than
//     the size of the existing set, or nsems is zero and a new set has to be
//     created.
//   - EACCES: the set exists but the caller lacks the permissions in mode.
//   - EEXIST: the set exists and both create and exclusive were requested.
//   - ENOENT: the set does not exist and create is false.
//   - ENOSPC: creating the set would exceed setsMax (SEMMNI) or
//     semaphoresTotalMax (SEMMNS), as specified by semget(2).
//   - any error returned by newSet.
func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
	if nsems < 0 || nsems > semaphoresMax {
		return nil, syserror.EINVAL
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	if !private {
		// Look up an existing semaphore.
		if set := r.findByKey(key); set != nil {
			set.mu.Lock()
			defer set.mu.Unlock()

			// Check that caller can access semaphore set.
			creds := auth.CredentialsFromContext(ctx)
			if !set.checkPerms(creds, fs.PermsFromMode(mode)) {
				return nil, syserror.EACCES
			}

			// Validate parameters.
			if nsems > int32(set.Size()) {
				return nil, syserror.EINVAL
			}
			if create && exclusive {
				return nil, syserror.EEXIST
			}
			return set, nil
		}

		if !create {
			// Semaphore not found and should not be created.
			return nil, syserror.ENOENT
		}
	}

	// Zero is only valid if an existing set is found.
	if nsems == 0 {
		return nil, syserror.EINVAL
	}

	// Apply system limits. Running out of room is reported as ENOSPC rather
	// than EINVAL: the arguments are valid, the system-wide limits are not
	// able to accommodate them.
	if len(r.semaphores) >= setsMax {
		return nil, syserror.ENOSPC
	}
	if r.totalSems() > int(semaphoresTotalMax-nsems) {
		return nil, syserror.ENOSPC
	}

	// Finally create a new set. The caller is both owner and creator.
	owner := fs.FileOwnerFromContext(ctx)
	perms := fs.FilePermsFromMode(mode)
	return r.newSet(ctx, key, owner, owner, perms, nsems)
}
// RemoveID removes the set with the given 'id' from the registry and marks
// the set as dead. All waiters will be awakened and fail.
//
// It returns EINVAL when no set is registered under 'id' and EACCES when the
// caller neither passes the ownership check nor holds the required
// capability.
//
// NOTE(review): semctl(2) documents EPERM for the ownership failure; EACCES
// is what this code has always returned — confirm before changing.
func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	target := r.semaphores[id]
	if target == nil {
		return syserror.EINVAL
	}

	target.mu.Lock()
	defer target.mu.Unlock()

	// "The effective user ID of the calling process must match the creator or
	// owner of the semaphore set, or the caller must be privileged."
	if allowed := target.checkCredentials(creds) || target.checkCapability(creds); !allowed {
		return syserror.EACCES
	}

	// Unregister first, then wake everybody blocked on the set.
	delete(r.semaphores, target.ID)
	target.destroy()
	return nil
}
// newSet creates a set with 'nsems' semaphores, registers it under the next
// free ID and returns it. Caller must hold 'r.mu'.
//
// 'owner' and 'creator' are recorded separately on the set. The search for a
// free ID starts right after the last ID handed out and wraps around past the
// largest int32; ID 0 is never assigned. ENOMEM is returned when every ID is
// taken.
func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) {
	set := &Set{
		registry: r,
		key:      key,
		owner:    owner,
		// Honor the 'creator' argument instead of silently reusing 'owner'.
		creator:    creator,
		perms:      perms,
		changeTime: ktime.NowFromContext(ctx),
		sems:       make([]sem, nsems),
	}

	// Find the next available ID.
	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
		// Handle wrap around.
		if id < 0 {
			// The loop's post statement bumps this to 1 before the next check.
			id = 0
			continue
		}
		if r.semaphores[id] == nil {
			r.lastIDUsed = id
			r.semaphores[id] = set
			set.ID = id
			return set, nil
		}
	}

	log.Warningf("Semaphore map is full, they must be leaking")
	return nil, syserror.ENOMEM
}
// FindByID returns the set registered under 'id', or nil when there is none.
func (r *Registry) FindByID(id int32) *Set {
	r.mu.Lock()
	found := r.semaphores[id]
	r.mu.Unlock()
	return found
}
// findByKey returns a registered set whose key equals 'key', or nil when no
// set matches. It takes no lock itself; FindOrCreate calls it with 'r.mu'
// held.
func (r *Registry) findByKey(key int32) *Set {
	for _, candidate := range r.semaphores {
		if candidate.key != key {
			continue
		}
		return candidate
	}
	return nil
}
// totalSems returns the number of semaphores summed over every set in the
// registry. It takes no lock itself; FindOrCreate calls it with 'r.mu' held.
func (r *Registry) totalSems() int {
	var count int
	for _, set := range r.semaphores {
		count += len(set.sems)
	}
	return count
}
// findSem returns a pointer to semaphore number 'num' of the set, or nil when
// 'num' is outside [0, s.Size()).
func (s *Set) findSem(num int32) *sem {
	if idx := int(num); idx >= 0 && idx < len(s.sems) {
		return &s.sems[idx]
	}
	return nil
}
// Size returns the number of semaphores in the set. Size is immutable, so it
// is read here without taking 's.mu'.
func (s *Set) Size() int {
return len(s.sems)
}
// Change atomically replaces the owner and permissions of the set and
// refreshes its change time. It returns EACCES when the caller neither passes
// the ownership check nor holds the required capability.
//
// NOTE(review): semctl(2) documents EPERM for this failure on IPC_SET; EACCES
// is what this code has always returned — confirm before changing.
func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The effective UID of the calling process must match the owner or creator
	// of the semaphore set, or the caller must be privileged."
	if allowed := s.checkCredentials(creds) || s.checkCapability(creds); !allowed {
		return syserror.EACCES
	}

	s.owner, s.perms = owner, perms
	s.changeTime = ktime.NowFromContext(ctx)
	return nil
}
// SetVal overrides the value of semaphore 'num', records 'pid' on it and
// wakes up waiters as needed.
//
// It returns ERANGE when 'val' is outside [0, valueMax] or 'num' does not
// name a semaphore of the set, and EACCES when the caller lacks write
// permission.
func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error {
	if outOfRange := val < 0 || val > valueMax; outOfRange {
		return syserror.ERANGE
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have alter permission on the semaphore set."
	if canAlter := s.checkPerms(creds, fs.PermMask{Write: true}); !canAlter {
		return syserror.EACCES
	}

	target := s.findSem(num)
	if target == nil {
		return syserror.ERANGE
	}

	// TODO(b/29354920): Clear undo entries in all processes
	target.value, target.pid = val, pid
	s.changeTime = ktime.NowFromContext(ctx)
	target.wakeWaiters()
	return nil
}
// SetValAll overrides all semaphores values, waking up waiters as needed. It also
// sets semaphore's PID which was fixed in Linux 4.6.
//
// 'len(vals)' must be equal to 's.Size()'; a mismatch is a caller bug and
// panics.
//
// It returns ERANGE when any value exceeds valueMax (nothing is modified in
// that case) and EACCES when the caller lacks write permission.
func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error {
	if len(vals) != s.Size() {
		panic(fmt.Sprintf("vals length (%d) different that Set.Size() (%d)", len(vals), s.Size()))
	}

	// Validate everything up front so that a bad value leaves the set
	// untouched. Values are unsigned, only the upper bound can be violated.
	for _, val := range vals {
		if val > valueMax {
			return syserror.ERANGE
		}
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have alter permission on the semaphore set."
	if !s.checkPerms(creds, fs.PermMask{Write: true}) {
		return syserror.EACCES
	}

	for i, val := range vals {
		sem := &s.sems[i]

		// TODO(b/29354920): Clear undo entries in all processes
		sem.value = int16(val)
		sem.pid = pid
		sem.wakeWaiters()
	}
	s.changeTime = ktime.NowFromContext(ctx)
	return nil
}
// GetVal returns the current value of semaphore 'num'.
//
// It returns EACCES when the caller lacks read permission and ERANGE when
// 'num' does not name a semaphore of the set.
func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have read permission on the semaphore set."
	if canRead := s.checkPerms(creds, fs.PermMask{Read: true}); !canRead {
		return 0, syserror.EACCES
	}

	if target := s.findSem(num); target != nil {
		return target.value, nil
	}
	return 0, syserror.ERANGE
}
// GetValAll returns a freshly allocated slice with the value of every
// semaphore in the set, in semaphore order. It returns EACCES when the caller
// lacks read permission.
func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have read permission on the semaphore set."
	if canRead := s.checkPerms(creds, fs.PermMask{Read: true}); !canRead {
		return nil, syserror.EACCES
	}

	snapshot := make([]uint16, len(s.sems))
	for i := range s.sems {
		snapshot[i] = uint16(s.sems[i].value)
	}
	return snapshot, nil
}
// GetPID returns the PID recorded on semaphore 'num' by the last operation
// that modified it.
//
// It returns EACCES when the caller lacks read permission and ERANGE when
// 'num' does not name a semaphore of the set.
func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have read permission on the semaphore set."
	if canRead := s.checkPerms(creds, fs.PermMask{Read: true}); !canRead {
		return 0, syserror.EACCES
	}

	if target := s.findSem(num); target != nil {
		return target.pid, nil
	}
	return 0, syserror.ERANGE
}
// ExecuteOps attempts to apply a list of operations to the set. Either every
// operation is applied or none is.
//
// On failure it either returns an error (retrying is hopeless) or a channel
// to wait on before trying again, together with the number of the semaphore
// the caller is queued on. On success the channel is nil.
//
// It returns EIDRM when the set has been removed, EFBIG when an operation
// names a semaphore outside the set, EACCES when the caller lacks the needed
// permission, plus any error produced while applying the operations.
func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Did it race with a removal operation?
	if s.dead {
		return nil, 0, syserror.EIDRM
	}

	// Validate the operations. Any non-zero operation alters the set and thus
	// needs write permission; a list of pure 'wait for zero' only needs read.
	alters := false
	for i := range ops {
		if s.findSem(int32(ops[i].SemNum)) == nil {
			return nil, 0, syserror.EFBIG
		}
		if ops[i].SemOp != 0 {
			alters = true
		}
	}

	required := fs.PermMask{Read: !alters, Write: alters}
	if !s.checkPerms(creds, required) {
		return nil, 0, syserror.EACCES
	}

	waitCh, semNum, err := s.executeOps(ctx, ops, pid)
	if err != nil {
		return nil, 0, err
	}
	return waitCh, semNum, nil
}
// executeOps applies 'ops' to the set atomically. Caller must hold 's.mu' and
// must have checked that every op.SemNum names a semaphore of the set, since
// the slice is indexed directly.
//
// The operations are first played against a scratch copy of the values:
//   - If one of them cannot proceed and IPC_NOWAIT is set on it,
//     ErrWouldBlock is returned.
//   - If one of them cannot proceed otherwise, a waiter is queued on the
//     offending semaphore and its channel and the semaphore number are
//     returned with a nil error; nothing is modified.
//   - If an operation would push a value outside [0, valueMax], ERANGE is
//     returned.
//
// Only when every operation succeeds are the new values stored, waiters
// notified, 'pid' recorded on every semaphore of the set and the operation
// time refreshed; the result is then (nil, 0, nil).
func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) {
	// Changes to semaphores go to this slice temporarily until they all succeed.
	tmpVals := make([]int16, len(s.sems))
	for i := range s.sems {
		tmpVals[i] = s.sems[i].value
	}

	for _, op := range ops {
		sem := &s.sems[op.SemNum]
		switch {
		case op.SemOp == 0:
			// Handle 'wait for zero' operation.
			if tmpVals[op.SemNum] != 0 {
				// Semaphore isn't 0, must wait.
				if op.SemFlg&linux.IPC_NOWAIT != 0 {
					return nil, 0, syserror.ErrWouldBlock
				}

				w := newWaiter(op.SemOp)
				sem.waiters.PushBack(w)
				return w.ch, int32(op.SemNum), nil
			}

		case op.SemOp < 0:
			// Handle 'wait' operation. Negate in 32 bits: -SemOp overflows
			// int16 when SemOp is -32768, which would slip past both checks
			// below and drive the semaphore value negative.
			need := -int32(op.SemOp)
			if need > valueMax {
				return nil, 0, syserror.ERANGE
			}
			if need > int32(tmpVals[op.SemNum]) {
				// Not enough resources, must wait.
				if op.SemFlg&linux.IPC_NOWAIT != 0 {
					return nil, 0, syserror.ErrWouldBlock
				}

				w := newWaiter(op.SemOp)
				sem.waiters.PushBack(w)
				return w.ch, int32(op.SemNum), nil
			}

		default:
			// op.SemOp > 0: Handle 'signal' operation.
			if tmpVals[op.SemNum] > valueMax-op.SemOp {
				return nil, 0, syserror.ERANGE
			}
		}

		// No-op for 'wait for zero'; stays within [0, valueMax] otherwise
		// thanks to the checks above.
		tmpVals[op.SemNum] += op.SemOp
	}

	// All operations succeeded, apply them.
	// TODO(b/29354920): handle undo operations.
	for i, v := range tmpVals {
		s.sems[i].value = v
		s.sems[i].wakeWaiters()
		s.sems[i].pid = pid
	}
	s.opTime = ktime.NowFromContext(ctx)
	return nil, 0, nil
}
// AbortWait tells the set that the caller gives up waiting on 'ch', which was
// queued on semaphore 'num'. The matching waiter, if still queued, is dropped
// from that semaphore's waiter list.
func (s *Set) AbortWait(num int32, ch chan struct{}) {
	s.mu.Lock()
	defer s.mu.Unlock()

	queue := &s.sems[num].waiters
	for w := queue.Front(); w != nil; w = w.Next() {
		if w.ch != ch {
			continue
		}
		queue.Remove(w)
		return
	}
	// Waiter may not be found in case it raced with wakeWaiters().
}
// checkCredentials reports whether the caller's effective UID matches the
// owner or the creator of the set. Caller must hold 's.mu'.
//
// Only user IDs are compared: sharing a group with the owner or creator does
// not make the caller an owner. Group membership is taken into account by
// checkPerms through the group permission bits instead.
func (s *Set) checkCredentials(creds *auth.Credentials) bool {
	return s.owner.UID == creds.EffectiveKUID ||
		s.creator.UID == creds.EffectiveKUID
}
// checkCapability reports whether the caller holds CAP_IPC_OWNER in the user
// namespace of the owning registry and the set owner's UID can be mapped into
// the caller's user namespace.
func (s *Set) checkCapability(creds *auth.Credentials) bool {
	if !creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) {
		return false
	}
	return creds.UserNamespace.MapFromKUID(s.owner.UID).Ok()
}
// checkPerms reports whether the caller is granted 'reqPerms' on the set.
// The user, group or other permission bits are selected depending on how the
// caller relates to the set owner; when those bits are not enough the caller
// can still pass through checkCapability.
func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool {
	// Are we owner, or in group, or other?
	granted := s.perms.Other
	switch {
	case s.owner.UID == creds.EffectiveKUID:
		granted = s.perms.User
	case creds.InGroup(s.owner.GID):
		granted = s.perms.Group
	}

	// Capabilities are only consulted when the plain permission bits fail.
	return granted.SupersetOf(reqPerms) || s.checkCapability(creds)
}
// destroy marks the set as dead, notifies every queued waiter and empties the
// waiter lists. Caller must hold 's.mu'.
func (s *Set) destroy() {
	// Notify all waiters. They will fail on the next attempt to execute
	// operations and return error.
	s.dead = true
	for i := range s.sems {
		// Work on the semaphore in place: ranging by value would hand us a
		// copy, and resetting the copy's list leaves the real one populated.
		waiters := &s.sems[i].waiters
		for w := waiters.Front(); w != nil; w = w.Next() {
			w.ch <- struct{}{}
		}
		waiters.Reset()
	}
}
// wakeWaiters goes over all waiters and checks which of them can be notified.
// A waiter is notified and removed from the list when the semaphore value is
// at least the waiter's value; other waiters stay queued.
func (s *sem) wakeWaiters() {
	// Note that this will release all waiters waiting for 0 too.
	for w := s.waiters.Front(); w != nil; {
		// Grab the successor up front: it is needed both when w is skipped
		// (otherwise the loop would spin on the same waiter forever) and
		// when w is removed from the list.
		next := w.Next()
		if s.value >= w.value {
			w.ch <- struct{}{}
			s.waiters.Remove(w)
		}
		w = next
	}
}
// newWaiter returns a waiter for 'val' whose channel can buffer a single
// notification, so that signalling it does not block the notifier.
func newWaiter(val int16) *waiter {
	w := &waiter{ch: make(chan struct{}, 1)}
	w.value = val
	return w
}