2019-04-29 21:25:05 +00:00
|
|
|
// Copyright 2018 The gVisor Authors.
|
2018-05-17 22:05:15 +00:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
// Package shm implements sysv shared memory segments.
|
|
|
|
//
|
|
|
|
// Known missing features:
|
|
|
|
//
|
|
|
|
// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement
|
|
|
|
// memory locking in general.
|
|
|
|
//
|
|
|
|
// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy
|
|
|
|
// way to implement hugetlb support on a per-map basis, and it has no impact
|
|
|
|
// on correctness.
|
|
|
|
//
|
|
|
|
// - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap
|
|
|
|
// so it's meaningless to reserve space for swap.
|
|
|
|
//
|
|
|
|
// - No per-process segment size enforcement. This feature probably isn't used
|
|
|
|
// much anyways, since Linux sets the per-process limits to the system-wide
|
|
|
|
// limits by default.
|
|
|
|
//
|
|
|
|
// Lock ordering: mm.mappingMu -> shm registry lock -> shm lock
|
|
|
|
package shm
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
|
2019-06-13 23:49:09 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
|
|
"gvisor.dev/gvisor/pkg/log"
|
|
|
|
"gvisor.dev/gvisor/pkg/refs"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/context"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fs"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
|
|
|
|
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/memmap"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/platform"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/usage"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/usermem"
|
2020-01-10 06:00:42 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sync"
|
2019-06-13 23:49:09 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/syserror"
|
2018-05-17 22:05:15 +00:00
|
|
|
)
|
|
|
|
|
2018-12-10 20:47:20 +00:00
|
|
|
// Key is the user-visible key of a shm segment. It plays the role a file name
// plays for a file.
type Key int32
|
|
|
|
|
|
|
|
// ID is the opaque handle of a shm segment. It plays the role a file
// descriptor plays for an open file.
type ID int32
|
|
|
|
|
2018-05-17 22:05:15 +00:00
|
|
|
// Registry tracks all shared memory segments in an IPC namespace. The registry
|
|
|
|
// provides the mechanisms for creating and finding segments, and reporting
|
|
|
|
// global shm parameters.
|
2018-08-02 17:41:44 +00:00
|
|
|
//
|
|
|
|
// +stateify savable
|
2018-05-17 22:05:15 +00:00
|
|
|
type Registry struct {
|
|
|
|
// userNS owns the IPC namespace this registry belong to. Immutable.
|
|
|
|
userNS *auth.UserNamespace
|
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
// mu protects all fields below.
|
2018-05-17 22:05:15 +00:00
|
|
|
mu sync.Mutex `state:"nosave"`
|
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
// shms maps segment ids to segments.
|
2020-01-06 17:27:35 +00:00
|
|
|
//
|
|
|
|
// shms holds all referenced segments, which are removed on the last
|
|
|
|
// DecRef. Thus, it cannot itself hold a reference on the Shm.
|
|
|
|
//
|
|
|
|
// Since removal only occurs after the last (unlocked) DecRef, there
|
|
|
|
// exists a short window during which a Shm still exists in Shm, but is
|
|
|
|
// unreferenced. Users must use TryIncRef to determine if the Shm is
|
|
|
|
// still valid.
|
2018-12-10 20:47:20 +00:00
|
|
|
shms map[ID]*Shm
|
2018-05-17 22:05:15 +00:00
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
// keysToShms maps segment keys to segments.
|
2020-01-06 17:27:35 +00:00
|
|
|
//
|
|
|
|
// Shms in keysToShms are guaranteed to be referenced, as they are
|
|
|
|
// removed by disassociateKey before the last DecRef.
|
2018-12-12 21:17:46 +00:00
|
|
|
keysToShms map[Key]*Shm
|
|
|
|
|
2018-05-17 22:05:15 +00:00
|
|
|
// Sum of the sizes of all existing segments rounded up to page size, in
|
2018-12-12 21:17:46 +00:00
|
|
|
// units of page size.
|
2018-05-17 22:05:15 +00:00
|
|
|
totalPages uint64
|
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
// ID assigned to the last created segment. Used to quickly find the next
|
|
|
|
// unused ID.
|
2018-12-10 20:47:20 +00:00
|
|
|
lastIDUsed ID
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewRegistry creates a new shm registry.
|
|
|
|
func NewRegistry(userNS *auth.UserNamespace) *Registry {
|
|
|
|
return &Registry{
|
2018-12-12 21:17:46 +00:00
|
|
|
userNS: userNS,
|
|
|
|
shms: make(map[ID]*Shm),
|
|
|
|
keysToShms: make(map[Key]*Shm),
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// FindByID looks up a segment given an ID.
|
2020-01-06 17:27:35 +00:00
|
|
|
//
|
|
|
|
// FindByID returns a reference on Shm.
|
2018-12-10 20:47:20 +00:00
|
|
|
func (r *Registry) FindByID(id ID) *Shm {
|
2018-05-17 22:05:15 +00:00
|
|
|
r.mu.Lock()
|
|
|
|
defer r.mu.Unlock()
|
2020-01-06 17:27:35 +00:00
|
|
|
s := r.shms[id]
|
|
|
|
// Take a reference on s. If TryIncRef fails, s has reached the last
|
|
|
|
// DecRef, but hasn't quite been removed from r.shms yet.
|
|
|
|
if s != nil && s.TryIncRef() {
|
|
|
|
return s
|
|
|
|
}
|
|
|
|
return nil
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
// dissociateKey removes the association between a segment and its key,
|
|
|
|
// preventing it from being discovered in the registry. This doesn't necessarily
|
|
|
|
// mean the segment is about to be destroyed. This is analogous to unlinking a
|
|
|
|
// file; the segment can still be used by a process already referencing it, but
|
|
|
|
// cannot be discovered by a new process.
|
|
|
|
func (r *Registry) dissociateKey(s *Shm) {
|
|
|
|
r.mu.Lock()
|
|
|
|
defer r.mu.Unlock()
|
|
|
|
s.mu.Lock()
|
|
|
|
defer s.mu.Unlock()
|
|
|
|
if s.key != linux.IPC_PRIVATE {
|
|
|
|
delete(r.keysToShms, s.key)
|
|
|
|
s.key = linux.IPC_PRIVATE
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// FindOrCreate looks up or creates a segment in the registry. It's functionally
|
|
|
|
// analogous to open(2).
|
2020-01-06 17:27:35 +00:00
|
|
|
//
|
|
|
|
// FindOrCreate returns a reference on Shm.
|
2018-12-10 20:47:20 +00:00
|
|
|
func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
|
2018-10-23 21:17:47 +00:00
|
|
|
if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
|
2018-05-17 22:05:15 +00:00
|
|
|
// "A new segment was to be created and size is less than SHMMIN or
|
|
|
|
// greater than SHMMAX." - man shmget(2)
|
2018-10-23 21:17:47 +00:00
|
|
|
//
|
|
|
|
// Note that 'private' always implies the creation of a new segment
|
|
|
|
// whether IPC_CREAT is specified or not.
|
2018-05-17 22:05:15 +00:00
|
|
|
return nil, syserror.EINVAL
|
|
|
|
}
|
|
|
|
|
|
|
|
r.mu.Lock()
|
|
|
|
defer r.mu.Unlock()
|
|
|
|
|
2018-08-28 00:20:36 +00:00
|
|
|
if len(r.shms) >= linux.SHMMNI {
|
2018-05-17 22:05:15 +00:00
|
|
|
// "All possible shared memory IDs have been taken (SHMMNI) ..."
|
|
|
|
// - man shmget(2)
|
|
|
|
return nil, syserror.ENOSPC
|
|
|
|
}
|
|
|
|
|
|
|
|
if !private {
|
|
|
|
// Look up an existing segment.
|
2018-12-12 21:17:46 +00:00
|
|
|
if shm := r.keysToShms[key]; shm != nil {
|
2018-05-17 22:05:15 +00:00
|
|
|
shm.mu.Lock()
|
|
|
|
defer shm.mu.Unlock()
|
|
|
|
|
|
|
|
// Check that caller can access the segment.
|
|
|
|
if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) {
|
|
|
|
// "The user does not have permission to access the shared
|
|
|
|
// memory segment, and does not have the CAP_IPC_OWNER
|
|
|
|
// capability in the user namespace that governs its IPC
|
|
|
|
// namespace." - man shmget(2)
|
|
|
|
return nil, syserror.EACCES
|
|
|
|
}
|
|
|
|
|
|
|
|
if size > shm.size {
|
|
|
|
// "A segment for the given key exists, but size is greater than
|
|
|
|
// the size of that segment." - man shmget(2)
|
|
|
|
return nil, syserror.EINVAL
|
|
|
|
}
|
|
|
|
|
|
|
|
if create && exclusive {
|
|
|
|
// "IPC_CREAT and IPC_EXCL were specified in shmflg, but a
|
|
|
|
// shared memory segment already exists for key."
|
|
|
|
// - man shmget(2)
|
|
|
|
return nil, syserror.EEXIST
|
|
|
|
}
|
|
|
|
|
2020-01-06 17:27:35 +00:00
|
|
|
shm.IncRef()
|
2018-05-17 22:05:15 +00:00
|
|
|
return shm, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if !create {
|
|
|
|
// "No segment exists for the given key, and IPC_CREAT was not
|
|
|
|
// specified." - man shmget(2)
|
|
|
|
return nil, syserror.ENOENT
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
var sizeAligned uint64
|
|
|
|
if val, ok := usermem.Addr(size).RoundUp(); ok {
|
|
|
|
sizeAligned = uint64(val)
|
|
|
|
} else {
|
|
|
|
return nil, syserror.EINVAL
|
|
|
|
}
|
|
|
|
|
2018-08-28 00:20:36 +00:00
|
|
|
if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL {
|
2018-05-17 22:05:15 +00:00
|
|
|
// "... allocating a segment of the requested size would cause the
|
|
|
|
// system to exceed the system-wide limit on shared memory (SHMALL)."
|
|
|
|
// - man shmget(2)
|
|
|
|
return nil, syserror.ENOSPC
|
|
|
|
}
|
|
|
|
|
|
|
|
// Need to create a new segment.
|
|
|
|
creator := fs.FileOwnerFromContext(ctx)
|
|
|
|
perms := fs.FilePermsFromMode(mode)
|
2020-01-06 17:27:35 +00:00
|
|
|
s, err := r.newShm(ctx, pid, key, creator, perms, size)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
// The initial reference is held by s itself. Take another to return to
|
|
|
|
// the caller.
|
|
|
|
s.IncRef()
|
|
|
|
return s, nil
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// newShm creates a new segment in the registry.
|
2018-12-12 21:17:46 +00:00
|
|
|
//
|
|
|
|
// Precondition: Caller must hold r.mu.
|
2018-12-10 20:47:20 +00:00
|
|
|
func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) {
|
2019-03-14 15:11:36 +00:00
|
|
|
mfp := pgalloc.MemoryFileProviderFromContext(ctx)
|
|
|
|
if mfp == nil {
|
|
|
|
panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
effectiveSize := uint64(usermem.Addr(size).MustRoundUp())
|
2019-03-14 15:11:36 +00:00
|
|
|
fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous)
|
2018-05-17 22:05:15 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
shm := &Shm{
|
2019-03-14 15:11:36 +00:00
|
|
|
mfp: mfp,
|
2018-05-17 22:05:15 +00:00
|
|
|
registry: r,
|
|
|
|
creator: creator,
|
|
|
|
size: size,
|
|
|
|
effectiveSize: effectiveSize,
|
|
|
|
fr: fr,
|
|
|
|
key: key,
|
|
|
|
perms: perms,
|
|
|
|
owner: creator,
|
|
|
|
creatorPID: pid,
|
|
|
|
changeTime: ktime.NowFromContext(ctx),
|
|
|
|
}
|
2019-06-29 03:06:33 +00:00
|
|
|
shm.EnableLeakCheck("kernel.Shm")
|
2018-05-17 22:05:15 +00:00
|
|
|
|
|
|
|
// Find the next available ID.
|
|
|
|
for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
|
|
|
|
// Handle wrap around.
|
|
|
|
if id < 0 {
|
|
|
|
id = 0
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if r.shms[id] == nil {
|
|
|
|
r.lastIDUsed = id
|
2018-12-12 21:17:46 +00:00
|
|
|
|
2018-05-17 22:05:15 +00:00
|
|
|
shm.ID = id
|
2018-12-12 21:17:46 +00:00
|
|
|
r.shms[id] = shm
|
|
|
|
r.keysToShms[key] = shm
|
2018-05-17 22:05:15 +00:00
|
|
|
|
|
|
|
r.totalPages += effectiveSize / usermem.PageSize
|
|
|
|
|
|
|
|
return shm, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Warningf("Shm ids exhuasted, they may be leaking")
|
|
|
|
return nil, syserror.ENOSPC
|
|
|
|
}
|
|
|
|
|
|
|
|
// IPCInfo reports global parameters for sysv shared memory segments on this
|
|
|
|
// system. See shmctl(IPC_INFO).
|
|
|
|
func (r *Registry) IPCInfo() *linux.ShmParams {
|
|
|
|
return &linux.ShmParams{
|
2018-08-28 00:20:36 +00:00
|
|
|
ShmMax: linux.SHMMAX,
|
|
|
|
ShmMin: linux.SHMMIN,
|
|
|
|
ShmMni: linux.SHMMNI,
|
|
|
|
ShmSeg: linux.SHMSEG,
|
|
|
|
ShmAll: linux.SHMALL,
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// ShmInfo reports linux-specific global parameters for sysv shared memory
|
|
|
|
// segments on this system. See shmctl(SHM_INFO).
|
|
|
|
func (r *Registry) ShmInfo() *linux.ShmInfo {
|
|
|
|
r.mu.Lock()
|
|
|
|
defer r.mu.Unlock()
|
|
|
|
|
|
|
|
return &linux.ShmInfo{
|
|
|
|
UsedIDs: int32(r.lastIDUsed),
|
|
|
|
ShmTot: r.totalPages,
|
|
|
|
ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting.
|
|
|
|
ShmSwp: 0, // No reclaim at the moment.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
// remove deletes a segment from this registry, deaccounting the memory used by
|
|
|
|
// the segment.
|
2018-05-17 22:05:15 +00:00
|
|
|
//
|
2018-12-12 21:17:46 +00:00
|
|
|
// Precondition: Must follow a call to r.dissociateKey(s).
|
2018-05-17 22:05:15 +00:00
|
|
|
func (r *Registry) remove(s *Shm) {
|
|
|
|
r.mu.Lock()
|
|
|
|
defer r.mu.Unlock()
|
2018-12-12 21:17:46 +00:00
|
|
|
s.mu.Lock()
|
|
|
|
defer s.mu.Unlock()
|
|
|
|
|
|
|
|
if s.key != linux.IPC_PRIVATE {
|
2019-04-02 23:45:27 +00:00
|
|
|
panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked()))
|
2018-12-12 21:17:46 +00:00
|
|
|
}
|
|
|
|
|
2018-05-17 22:05:15 +00:00
|
|
|
delete(r.shms, s.ID)
|
|
|
|
r.totalPages -= s.effectiveSize / usermem.PageSize
|
|
|
|
}
|
|
|
|
|
|
|
|
// Shm represents a single shared memory segment.
|
|
|
|
//
|
2020-01-06 17:27:35 +00:00
|
|
|
// Shm segment are backed directly by an allocation from platform memory.
|
|
|
|
// Segments are always mapped as a whole, greatly simplifying how mappings are
|
|
|
|
// tracked. However note that mremap and munmap calls may cause the vma for a
|
|
|
|
// segment to become fragmented; which requires special care when unmapping a
|
|
|
|
// segment. See mm/shm.go.
|
2018-05-17 22:05:15 +00:00
|
|
|
//
|
|
|
|
// Segments persist until they are explicitly marked for destruction via
|
2020-01-06 17:27:35 +00:00
|
|
|
// MarkDestroyed().
|
2018-05-17 22:05:15 +00:00
|
|
|
//
|
|
|
|
// Shm implements memmap.Mappable and memmap.MappingIdentity.
|
2018-08-02 17:41:44 +00:00
|
|
|
//
|
|
|
|
// +stateify savable
|
2018-05-17 22:05:15 +00:00
|
|
|
type Shm struct {
|
2020-01-06 17:27:35 +00:00
|
|
|
// AtomicRefCount tracks the number of references to this segment.
|
|
|
|
//
|
|
|
|
// A segment holds a reference to itself until it is marked for
|
2018-05-17 22:05:15 +00:00
|
|
|
// destruction.
|
2020-01-06 17:27:35 +00:00
|
|
|
//
|
|
|
|
// In addition to direct users, the MemoryManager will hold references
|
|
|
|
// via MappingIdentity.
|
2018-05-17 22:05:15 +00:00
|
|
|
refs.AtomicRefCount
|
|
|
|
|
2019-03-14 15:11:36 +00:00
|
|
|
mfp pgalloc.MemoryFileProvider
|
2018-05-17 22:05:15 +00:00
|
|
|
|
|
|
|
// registry points to the shm registry containing this segment. Immutable.
|
|
|
|
registry *Registry
|
|
|
|
|
|
|
|
// ID is the kernel identifier for this segment. Immutable.
|
2018-12-10 20:47:20 +00:00
|
|
|
ID ID
|
2018-05-17 22:05:15 +00:00
|
|
|
|
|
|
|
// creator is the user that created the segment. Immutable.
|
|
|
|
creator fs.FileOwner
|
|
|
|
|
|
|
|
// size is the requested size of the segment at creation, in
|
|
|
|
// bytes. Immutable.
|
|
|
|
size uint64
|
|
|
|
|
|
|
|
// effectiveSize of the segment, rounding up to the next page
|
|
|
|
// boundary. Immutable.
|
|
|
|
//
|
|
|
|
// Invariant: effectiveSize must be a multiple of usermem.PageSize.
|
|
|
|
effectiveSize uint64
|
|
|
|
|
2019-03-14 15:11:36 +00:00
|
|
|
// fr is the offset into mfp.MemoryFile() that backs this contents of this
|
2018-05-17 22:05:15 +00:00
|
|
|
// segment. Immutable.
|
|
|
|
fr platform.FileRange
|
|
|
|
|
|
|
|
// mu protects all fields below.
|
|
|
|
mu sync.Mutex `state:"nosave"`
|
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
// key is the public identifier for this segment.
|
|
|
|
key Key
|
|
|
|
|
2018-05-17 22:05:15 +00:00
|
|
|
// perms is the access permissions for the segment.
|
|
|
|
perms fs.FilePermissions
|
|
|
|
|
|
|
|
// owner of this segment.
|
|
|
|
owner fs.FileOwner
|
|
|
|
// attachTime is updated on every successful shmat.
|
|
|
|
attachTime ktime.Time
|
|
|
|
// detachTime is updated on every successful shmdt.
|
|
|
|
detachTime ktime.Time
|
|
|
|
// changeTime is updated on every successful changes to the segment via
|
|
|
|
// shmctl(IPC_SET).
|
|
|
|
changeTime ktime.Time
|
|
|
|
|
|
|
|
// creatorPID is the PID of the process that created the segment.
|
|
|
|
creatorPID int32
|
|
|
|
// lastAttachDetachPID is the pid of the process that issued the last shmat
|
|
|
|
// or shmdt syscall.
|
|
|
|
lastAttachDetachPID int32
|
|
|
|
|
|
|
|
// pendingDestruction indicates the segment was marked as destroyed through
|
|
|
|
// shmctl(IPC_RMID). When marked as destroyed, the segment will not be found
|
|
|
|
// in the registry and can no longer be attached. When the last user
|
2018-12-12 21:17:46 +00:00
|
|
|
// detaches from the segment, it is destroyed.
|
2018-05-17 22:05:15 +00:00
|
|
|
pendingDestruction bool
|
|
|
|
}
|
|
|
|
|
2019-04-02 23:45:27 +00:00
|
|
|
// Precondition: Caller must hold s.mu.
|
|
|
|
func (s *Shm) debugLocked() string {
|
|
|
|
return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}",
|
|
|
|
s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction)
|
|
|
|
}
|
|
|
|
|
2018-05-17 22:05:15 +00:00
|
|
|
// MappedName implements memmap.MappingIdentity.MappedName.
|
|
|
|
func (s *Shm) MappedName(ctx context.Context) string {
|
2018-12-12 21:17:46 +00:00
|
|
|
s.mu.Lock()
|
|
|
|
defer s.mu.Unlock()
|
2018-05-17 22:05:15 +00:00
|
|
|
return fmt.Sprintf("SYSV%08d", s.key)
|
|
|
|
}
|
|
|
|
|
|
|
|
// DeviceID implements memmap.MappingIdentity.DeviceID.
|
|
|
|
func (s *Shm) DeviceID() uint64 {
|
|
|
|
return shmDevice.DeviceID()
|
|
|
|
}
|
|
|
|
|
|
|
|
// InodeID implements memmap.MappingIdentity.InodeID.
|
|
|
|
func (s *Shm) InodeID() uint64 {
|
|
|
|
// "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use
|
|
|
|
// this. Changing this will break them." -- Linux, ipc/shm.c:newseg()
|
|
|
|
return uint64(s.ID)
|
|
|
|
}
|
|
|
|
|
|
|
|
// DecRef overrides refs.RefCount.DecRef with a destructor.
|
2018-12-12 21:17:46 +00:00
|
|
|
//
|
|
|
|
// Precondition: Caller must not hold s.mu.
|
2018-05-17 22:05:15 +00:00
|
|
|
func (s *Shm) DecRef() {
|
|
|
|
s.DecRefWithDestructor(s.destroy)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
|
|
|
|
// segments.
|
|
|
|
func (s *Shm) Msync(context.Context, memmap.MappableRange) error {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddMapping implements memmap.Mappable.AddMapping.
|
2018-12-12 21:09:10 +00:00
|
|
|
func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error {
|
2018-05-17 22:05:15 +00:00
|
|
|
s.mu.Lock()
|
|
|
|
defer s.mu.Unlock()
|
|
|
|
s.attachTime = ktime.NowFromContext(ctx)
|
|
|
|
if pid, ok := context.ThreadGroupIDFromContext(ctx); ok {
|
|
|
|
s.lastAttachDetachPID = pid
|
|
|
|
} else {
|
|
|
|
// AddMapping is called during a syscall, so ctx should always be a task
|
|
|
|
// context.
|
2019-04-02 23:45:27 +00:00
|
|
|
log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked())
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// RemoveMapping implements memmap.Mappable.RemoveMapping.
|
2018-12-12 21:09:10 +00:00
|
|
|
func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) {
|
2018-05-17 22:05:15 +00:00
|
|
|
s.mu.Lock()
|
|
|
|
defer s.mu.Unlock()
|
2019-04-29 21:03:04 +00:00
|
|
|
// TODO(b/38173783): RemoveMapping may be called during task exit, when ctx
|
2018-05-17 22:05:15 +00:00
|
|
|
// is context.Background. Gracefully handle missing clocks. Failing to
|
|
|
|
// update the detach time in these cases is ok, since no one can observe the
|
|
|
|
// omission.
|
|
|
|
if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
|
|
|
|
s.detachTime = clock.Now()
|
|
|
|
}
|
|
|
|
|
|
|
|
// If called from a non-task context we also won't have a threadgroup
|
|
|
|
// id. Silently skip updating the lastAttachDetachPid in that case.
|
|
|
|
if pid, ok := context.ThreadGroupIDFromContext(ctx); ok {
|
|
|
|
s.lastAttachDetachPID = pid
|
|
|
|
} else {
|
2019-04-02 23:45:27 +00:00
|
|
|
log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked())
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// CopyMapping implements memmap.Mappable.CopyMapping.
|
2018-12-12 21:09:10 +00:00
|
|
|
func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error {
|
2018-05-17 22:05:15 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Translate implements memmap.Mappable.Translate.
|
|
|
|
func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
|
|
|
|
var err error
|
|
|
|
if required.End > s.fr.Length() {
|
|
|
|
err = &memmap.BusError{syserror.EFAULT}
|
|
|
|
}
|
|
|
|
if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 {
|
|
|
|
return []memmap.Translation{
|
|
|
|
{
|
|
|
|
Source: source,
|
2019-03-14 15:11:36 +00:00
|
|
|
File: s.mfp.MemoryFile(),
|
2018-05-17 22:05:15 +00:00
|
|
|
Offset: s.fr.Start + source.Start,
|
2019-03-25 19:41:36 +00:00
|
|
|
Perms: usermem.AnyAccess,
|
2018-05-17 22:05:15 +00:00
|
|
|
},
|
|
|
|
}, err
|
|
|
|
}
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
|
|
|
|
func (s *Shm) InvalidateUnsavable(ctx context.Context) error {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// AttachOpts holds the options of a shmat(2) call, as consumed by
// ConfigureAttach.
type AttachOpts struct {
	// Execute requests execute permission on the mapping.
	Execute bool
	// Readonly requests a mapping without write permission.
	Readonly bool
	// Remap requests that the mapping be placed at a fixed address.
	Remap bool
}
|
|
|
|
|
|
|
|
// ConfigureAttach creates an mmap configuration for the segment with the
|
|
|
|
// requested attach options.
|
|
|
|
//
|
2020-01-06 17:27:35 +00:00
|
|
|
// Postconditions: The returned MMapOpts are valid only as long as a reference
|
|
|
|
// continues to be held on s.
|
2018-05-17 22:05:15 +00:00
|
|
|
func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
|
|
|
|
s.mu.Lock()
|
|
|
|
defer s.mu.Unlock()
|
|
|
|
if s.pendingDestruction && s.ReadRefs() == 0 {
|
|
|
|
return memmap.MMapOpts{}, syserror.EIDRM
|
|
|
|
}
|
|
|
|
|
|
|
|
if !s.checkPermissions(ctx, fs.PermMask{
|
|
|
|
Read: true,
|
|
|
|
Write: !opts.Readonly,
|
|
|
|
Execute: opts.Execute,
|
|
|
|
}) {
|
|
|
|
// "The calling process does not have the required permissions for the
|
|
|
|
// requested attach type, and does not have the CAP_IPC_OWNER capability
|
|
|
|
// in the user namespace that governs its IPC namespace." - man shmat(2)
|
|
|
|
return memmap.MMapOpts{}, syserror.EACCES
|
|
|
|
}
|
|
|
|
return memmap.MMapOpts{
|
|
|
|
Length: s.size,
|
|
|
|
Offset: 0,
|
|
|
|
Addr: addr,
|
|
|
|
Fixed: opts.Remap,
|
|
|
|
Perms: usermem.AccessType{
|
|
|
|
Read: true,
|
|
|
|
Write: !opts.Readonly,
|
|
|
|
Execute: opts.Execute,
|
|
|
|
},
|
|
|
|
MaxPerms: usermem.AnyAccess,
|
|
|
|
Mappable: s,
|
|
|
|
MappingIdentity: s,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// EffectiveSize returns the size of the underlying shared memory segment. This
|
|
|
|
// may be larger than the requested size at creation, due to rounding to page
|
|
|
|
// boundaries.
|
|
|
|
func (s *Shm) EffectiveSize() uint64 {
|
|
|
|
return s.effectiveSize
|
|
|
|
}
|
|
|
|
|
|
|
|
// IPCStat returns information about a shm. See shmctl(IPC_STAT).
|
|
|
|
func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
|
|
|
|
s.mu.Lock()
|
|
|
|
defer s.mu.Unlock()
|
|
|
|
|
|
|
|
// "The caller must have read permission on the shared memory segment."
|
|
|
|
// - man shmctl(2)
|
|
|
|
if !s.checkPermissions(ctx, fs.PermMask{Read: true}) {
|
|
|
|
// "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow
|
|
|
|
// read access for shmid, and the calling process does not have the
|
|
|
|
// CAP_IPC_OWNER capability in the user namespace that governs its IPC
|
|
|
|
// namespace." - man shmctl(2)
|
|
|
|
return nil, syserror.EACCES
|
|
|
|
}
|
|
|
|
|
|
|
|
var mode uint16
|
|
|
|
if s.pendingDestruction {
|
|
|
|
mode |= linux.SHM_DEST
|
|
|
|
}
|
|
|
|
creds := auth.CredentialsFromContext(ctx)
|
|
|
|
|
2020-01-06 17:27:35 +00:00
|
|
|
// Use the reference count as a rudimentary count of the number of
|
|
|
|
// attaches. We exclude:
|
|
|
|
//
|
|
|
|
// 1. The reference the caller holds.
|
|
|
|
// 2. The self-reference held by s prior to destruction.
|
|
|
|
//
|
|
|
|
// Note that this may still overcount by including transient references
|
|
|
|
// used in concurrent calls.
|
|
|
|
nattach := uint64(s.ReadRefs()) - 1
|
2018-05-17 22:05:15 +00:00
|
|
|
if !s.pendingDestruction {
|
|
|
|
nattach--
|
|
|
|
}
|
|
|
|
|
|
|
|
ds := &linux.ShmidDS{
|
|
|
|
ShmPerm: linux.IPCPerm{
|
|
|
|
Key: uint32(s.key),
|
|
|
|
UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
|
|
|
|
GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
|
|
|
|
CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
|
|
|
|
CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
|
|
|
|
Mode: mode | uint16(s.perms.LinuxMode()),
|
|
|
|
Seq: 0, // IPC sequences not supported.
|
|
|
|
},
|
|
|
|
ShmSegsz: s.size,
|
|
|
|
ShmAtime: s.attachTime.TimeT(),
|
|
|
|
ShmDtime: s.detachTime.TimeT(),
|
|
|
|
ShmCtime: s.changeTime.TimeT(),
|
|
|
|
ShmCpid: s.creatorPID,
|
|
|
|
ShmLpid: s.lastAttachDetachPID,
|
|
|
|
ShmNattach: nattach,
|
|
|
|
}
|
|
|
|
|
|
|
|
return ds, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set modifies attributes for a segment. See shmctl(IPC_SET).
|
|
|
|
func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
|
|
|
|
s.mu.Lock()
|
|
|
|
defer s.mu.Unlock()
|
|
|
|
|
|
|
|
if !s.checkOwnership(ctx) {
|
|
|
|
return syserror.EPERM
|
|
|
|
}
|
|
|
|
|
|
|
|
creds := auth.CredentialsFromContext(ctx)
|
|
|
|
uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID))
|
|
|
|
gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID))
|
|
|
|
if !uid.Ok() || !gid.Ok() {
|
|
|
|
return syserror.EINVAL
|
|
|
|
}
|
|
|
|
|
|
|
|
// User may only modify the lower 9 bits of the mode. All the other bits are
|
|
|
|
// always 0 for the underlying inode.
|
|
|
|
mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff)
|
|
|
|
s.perms = fs.FilePermsFromMode(mode)
|
|
|
|
|
|
|
|
s.owner.UID = uid
|
|
|
|
s.owner.GID = gid
|
|
|
|
|
|
|
|
s.changeTime = ktime.NowFromContext(ctx)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *Shm) destroy() {
|
2019-03-14 15:11:36 +00:00
|
|
|
s.mfp.MemoryFile().DecRef(s.fr)
|
2018-12-12 21:17:46 +00:00
|
|
|
s.registry.remove(s)
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
// MarkDestroyed marks a segment for destruction. The segment is actually
|
|
|
|
// destroyed once it has no references. MarkDestroyed may be called multiple
|
|
|
|
// times, and is safe to call after a segment has already been destroyed. See
|
|
|
|
// shmctl(IPC_RMID).
|
2018-05-17 22:05:15 +00:00
|
|
|
func (s *Shm) MarkDestroyed() {
|
2018-12-12 21:17:46 +00:00
|
|
|
s.registry.dissociateKey(s)
|
2018-11-01 22:53:25 +00:00
|
|
|
|
2018-12-12 21:17:46 +00:00
|
|
|
s.mu.Lock()
|
2020-01-06 17:27:35 +00:00
|
|
|
defer s.mu.Unlock()
|
2018-11-01 22:53:25 +00:00
|
|
|
if !s.pendingDestruction {
|
|
|
|
s.pendingDestruction = true
|
2020-01-06 17:27:35 +00:00
|
|
|
// Drop the self-reference so destruction occurs when all
|
|
|
|
// external references are gone.
|
|
|
|
//
|
|
|
|
// N.B. This cannot be the final DecRef, as the caller also
|
|
|
|
// holds a reference.
|
2018-11-01 22:53:25 +00:00
|
|
|
s.DecRef()
|
2018-12-12 21:17:46 +00:00
|
|
|
return
|
2018-11-01 22:53:25 +00:00
|
|
|
}
|
2018-05-17 22:05:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// checkOwnership verifies whether a segment may be accessed by ctx as an
|
|
|
|
// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux.
|
|
|
|
//
|
|
|
|
// Precondition: Caller must hold s.mu.
|
|
|
|
func (s *Shm) checkOwnership(ctx context.Context) bool {
|
|
|
|
creds := auth.CredentialsFromContext(ctx)
|
|
|
|
if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux
|
|
|
|
// doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented
|
|
|
|
// for use to "override IPC ownership checks".
|
|
|
|
return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS)
|
|
|
|
}
|
|
|
|
|
|
|
|
// checkPermissions verifies whether a segment is accessible by ctx for access
|
|
|
|
// described by req. See ipc/util.c:ipcperms() in Linux.
|
|
|
|
//
|
|
|
|
// Precondition: Caller must hold s.mu.
|
|
|
|
func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool {
|
|
|
|
creds := auth.CredentialsFromContext(ctx)
|
|
|
|
|
|
|
|
p := s.perms.Other
|
|
|
|
if s.owner.UID == creds.EffectiveKUID {
|
|
|
|
p = s.perms.User
|
|
|
|
} else if creds.InGroup(s.owner.GID) {
|
|
|
|
p = s.perms.Group
|
|
|
|
}
|
|
|
|
if p.SupersetOf(req) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tasks with CAP_IPC_OWNER may bypass permission checks.
|
|
|
|
return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS)
|
|
|
|
}
|