Automated rollback of changelist 225861605

PiperOrigin-RevId: 226224230
Change-Id: Id24c7d3733722fd41d5fe74ef64e0ce8c68f0b12
Authored by Googler on 2018-12-19 13:29:10 -08:00; committed by Shentubot
parent ff7178a4d1
commit 86c9bd2547
18 changed files with 134 additions and 946 deletions

View File

@ -60,7 +60,7 @@ const (
DefaultNofileHardLimit = 4096
// DefaultMemlockLimit is called MLOCK_LIMIT in Linux.
DefaultMemlockLimit = 64 * 1024
DefaultMemlockLimit = 64 * 1094
// DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux.
DefaultMsgqueueLimit = 819200

View File

@ -49,18 +49,6 @@ const (
MREMAP_FIXED = 1 << 1
)
// Flags for mlock2(2).
const (
MLOCK_ONFAULT = 0x01
)
// Flags for mlockall(2).
const (
MCL_CURRENT = 1
MCL_FUTURE = 2
MCL_ONFAULT = 4
)
// Advice for madvise(2).
const (
MADV_NORMAL = 0

View File

@ -33,7 +33,7 @@ const (
Rss
ProcessCount
NumberOfFiles
MemoryLocked
MemoryPagesLocked
AS
Locks
SignalsPending

View File

@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{
linux.RLIMIT_RSS: Rss,
linux.RLIMIT_NPROC: ProcessCount,
linux.RLIMIT_NOFILE: NumberOfFiles,
linux.RLIMIT_MEMLOCK: MemoryLocked,
linux.RLIMIT_MEMLOCK: MemoryPagesLocked,
linux.RLIMIT_AS: AS,
linux.RLIMIT_LOCKS: Locks,
linux.RLIMIT_SIGPENDING: SignalsPending,
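For illustration only, not part of this diff: the hunks above rename the sentry-internal limit type, while the Linux-facing RLIMIT_MEMLOCK number is unchanged. A minimal Go sketch of how a setrlimit-style handler could translate a Linux resource number through a map of this shape (the type and helper below are stand-ins, not the sentry's real API):

package main

import "fmt"

// LimitType stands in for limits.LimitType; MemoryPagesLocked is the
// post-rollback name of the RLIMIT_MEMLOCK limit.
type LimitType int

const (
    NumberOfFiles LimitType = iota
    MemoryPagesLocked
)

// fromLinuxResource mirrors the shape of the map above, keyed by the standard
// amd64 resource numbers (7 = RLIMIT_NOFILE, 8 = RLIMIT_MEMLOCK).
var fromLinuxResource = map[int]LimitType{
    7: NumberOfFiles,
    8: MemoryPagesLocked,
}

func main() {
    // A getrlimit/setrlimit handler would translate the resource number
    // before looking up the task's LimitSet.
    if lt, ok := fromLinuxResource[8]; ok {
        fmt.Println("RLIMIT_MEMLOCK maps to LimitType", lt)
    }
}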

View File

@ -243,40 +243,6 @@ type MappingIdentity interface {
Msync(ctx context.Context, mr MappableRange) error
}
// MLockMode specifies the memory locking behavior of a memory mapping.
type MLockMode int
// Note that the ordering of MLockModes is significant; see
// mm.MemoryManager.defMLockMode.
const (
// MLockNone specifies that a mapping has no memory locking behavior.
//
// This must be the zero value for MLockMode.
MLockNone MLockMode = iota
// MLockEager specifies that a mapping is memory-locked, as by mlock() or
// similar. Pages in the mapping should be made, and kept, resident in
// physical memory as soon as possible.
//
// As of this writing, MLockEager does not cause memory-locking to be
// requested from the host; it only affects the sentry's memory management
// behavior.
//
// MLockEager is analogous to Linux's VM_LOCKED.
MLockEager
// MLockLazy specifies that a mapping is memory-locked, as by mlock() or
// similar. Pages in the mapping should be kept resident in physical memory
// once they have been made resident due to e.g. a page fault.
//
// As of this writing, MLockLazy does not cause memory-locking to be
// requested from the host; in fact, it has virtually no effect, except for
// interactions between mlocked pages and other syscalls.
//
// MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT.
MLockLazy
)
// MMapOpts specifies a request to create a memory mapping.
type MMapOpts struct {
// Length is the length of the mapping.
@ -337,9 +303,6 @@ type MMapOpts struct {
// mapping (see platform.AddressSpace.MapFile).
Precommit bool
// MLockMode specifies the memory locking behavior of the mapping.
MLockMode MLockMode
// Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is
// empty, MappingIdentity.MappedName() will be used instead.
//
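For reference, separate from the change itself: the MLockMode values documented above were chosen from the mlock2(2) flags (also removed here) by the deleted syscall handlers further down in this diff. A standalone Go sketch of that selection logic:

package main

import "fmt"

// MLockMode mirrors the removed memmap.MLockMode enum documented above.
type MLockMode int

const (
    MLockNone MLockMode = iota
    MLockEager
    MLockLazy
)

// Flag values match the removed linux package constants.
const (
    MLOCK_ONFAULT = 0x01
    MCL_CURRENT   = 1
    MCL_FUTURE    = 2
    MCL_ONFAULT   = 4
)

// modeForMlock2 reproduces the selection the deleted Mlock2 handler made:
// plain mlock2() is eager, mlock2(MLOCK_ONFAULT) is lazy, anything else is
// rejected with EINVAL.
func modeForMlock2(flags int) (MLockMode, error) {
    if flags&^MLOCK_ONFAULT != 0 {
        return MLockNone, fmt.Errorf("EINVAL: unknown flags %#x", flags)
    }
    if flags&MLOCK_ONFAULT != 0 {
        return MLockLazy, nil
    }
    return MLockEager, nil
}

func main() {
    fmt.Println(modeForMlock2(0))             // MLockEager
    fmt.Println(modeForMlock2(MLOCK_ONFAULT)) // MLockLazy
}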

View File

@ -106,7 +106,6 @@ go_library(
"//pkg/sentry/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/proc/seqfile",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/futex",
"//pkg/sentry/kernel/shm",
"//pkg/sentry/limits",

View File

@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() {
// for all addresses in ar should be precommitted.
//
// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0.
// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start).
// ar must be page-aligned. pseg.Range().Contains(ar.Start).
func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error {
// By default, map entire pmas at a time, under the assumption that there
// is no cost to mapping more of a pma than necessary.
@ -173,9 +173,7 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre
}
}
// Since this checks ar.End and not mapAR.End, we will never map a pma that
// is not required.
for pseg.Ok() && pseg.Start() < ar.End {
for {
pma := pseg.ValuePtr()
pmaAR := pseg.Range()
pmaMapAR := pmaAR.Intersect(mapAR)
@ -186,9 +184,13 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre
if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
return err
}
// Since this checks ar.End and not mapAR.End, we will never map a pma
// that is not required.
if ar.End <= pmaAR.End {
return nil
}
pseg = pseg.NextSegment()
}
return nil
}
// unmapASLocked removes all AddressSpace mappings for addresses in ar.
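As an aside (illustrative, not from the tree): the restructured loop above maps whole pmas but returns as soon as ar.End is covered, so it never maps a pma the request does not need. A generic sketch of that termination pattern over plain integer ranges, with all names invented for the example:

package main

import "fmt"

type addrRange struct{ start, end uint64 }

// mapCovering walks sorted, non-overlapping pma-like ranges, "maps" each whole
// range that overlaps ar, and stops once ar.end is covered -- mirroring the
// loop structure in mapASLocked above.
func mapCovering(pmas []addrRange, ar addrRange) []addrRange {
    var mapped []addrRange
    for _, p := range pmas {
        if p.end <= ar.start {
            continue // entirely before the requested range
        }
        mapped = append(mapped, p)
        // Checking ar.end (not the end of the enlarged map range) means we
        // never touch a pma the request does not require.
        if ar.end <= p.end {
            break
        }
    }
    return mapped
}

func main() {
    pmas := []addrRange{{0, 4096}, {4096, 8192}, {8192, 12288}}
    fmt.Println(mapCovering(pmas, addrRange{4096, 8000})) // only the middle pma
}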

View File

@ -22,7 +22,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
@ -59,17 +58,13 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
mm.mappingMu.RLock()
defer mm.mappingMu.RUnlock()
mm2 := &MemoryManager{
p: mm.p,
haveASIO: mm.haveASIO,
layout: mm.layout,
privateRefs: mm.privateRefs,
users: 1,
brk: mm.brk,
usageAS: mm.usageAS,
// "The child does not inherit its parent's memory locks (mlock(2),
// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
// MLockNone, both of which are zero values. vma.mlockMode is reset
// when copied below.
p: mm.p,
haveASIO: mm.haveASIO,
layout: mm.layout,
privateRefs: mm.privateRefs,
users: 1,
usageAS: mm.usageAS,
brk: mm.brk,
captureInvalidations: true,
argv: mm.argv,
envv: mm.envv,
@ -82,7 +77,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
// Copy vmas.
dstvgap := mm2.vmas.FirstGap()
for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
vma := srcvseg.Value() // makes a copy of the vma
vma := srcvseg.ValuePtr()
vmaAR := srcvseg.Range()
// Inform the Mappable, if any, of the new mapping.
if vma.mappable != nil {
@ -94,8 +89,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
if vma.id != nil {
vma.id.IncRef()
}
vma.mlockMode = memmap.MLockNone
dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap()
// We don't need to update mm2.usageAS since we copied it from mm
// above.
}
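A stripped-down sketch, not from the tree: before the rollback, Fork took a value copy of each parent vma and cleared its mlockMode, because per fork(2) the child does not inherit memory locks. The struct below is a stand-in for the sentry's vma, kept only to show the copy-and-reset step:

package main

import "fmt"

// vma keeps only the field relevant to fork's "no inherited memory locks" rule.
type vma struct {
    mlockMode int // 0 == MLockNone
    hint      string
}

// forkCopy shows the pattern the pre-rollback Fork used: take a value copy of
// the parent's vma, then clear the lock mode before inserting it into the
// child's vma set.
func forkCopy(parent vma) vma {
    child := parent     // value copy, like srcvseg.Value()
    child.mlockMode = 0 // vma.mlockMode = memmap.MLockNone
    return child
}

func main() {
    p := vma{mlockMode: 1, hint: "[heap]"}
    fmt.Printf("parent=%+v child=%+v\n", p, forkCopy(p))
}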

View File

@ -95,6 +95,11 @@ type MemoryManager struct {
// vmas is protected by mappingMu.
vmas vmaSet
// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
//
// usageAS is protected by mappingMu.
usageAS uint64
// brk is the mm's brk, which is manipulated using the brk(2) system call.
// The brk is initially set up by the loader which maps an executable
// binary into the mm.
@ -102,23 +107,6 @@ type MemoryManager struct {
// brk is protected by mappingMu.
brk usermem.AddrRange
// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
//
// usageAS is protected by mappingMu.
usageAS uint64
// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
// memmap.MLockNone.
//
// lockedAS is protected by mappingMu.
lockedAS uint64
// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
// defMLockMode is greater.
//
// defMLockMode is protected by mappingMu.
defMLockMode memmap.MLockMode
// activeMu is loosely analogous to Linux's struct
// mm_struct::page_table_lock.
activeMu ssync.DowngradableRWMutex `state:"nosave"`
@ -264,8 +252,6 @@ type vma struct {
// metag, none of which we currently support.
growsDown bool `state:"manual"`
mlockMode memmap.MLockMode
// If id is not nil, it controls the lifecycle of mappable and provides vma
// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
id memmap.MappingIdentity

View File

@ -20,7 +20,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
@ -129,24 +128,16 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
// Get the new vma.
mm.mappingMu.Lock()
if opts.MLockMode < mm.defMLockMode {
opts.MLockMode = mm.defMLockMode
}
vseg, ar, err := mm.createVMALocked(ctx, opts)
if err != nil {
mm.mappingMu.Unlock()
return 0, err
}
// TODO: In Linux, VM_LOCKONFAULT (which may be set on the new
// vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears
// to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in
// mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() =>
// populate_vma_page_range(). Confirm this behavior.
switch {
case opts.Precommit || opts.MLockMode == memmap.MLockEager:
case opts.Precommit:
// Get pmas and map with precommit as requested.
mm.populateVMAAndUnlock(ctx, vseg, ar, true)
mm.populateAndUnlock(ctx, vseg, ar, true)
case opts.Mappable == nil && length <= privateAllocUnit:
// NOTE: Get pmas and map eagerly in the hope
@ -155,7 +146,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
// memmap.Mappable.Translate is unknown; and only for small mappings,
// to avoid needing to allocate large amounts of memory that we may
// subsequently need to checkpoint.
mm.populateVMAAndUnlock(ctx, vseg, ar, false)
mm.populateAndUnlock(ctx, vseg, ar, false)
default:
mm.mappingMu.Unlock()
@ -164,29 +155,31 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
return ar.Start, nil
}
// populateVMA obtains pmas for addresses in ar in the given vma, and maps them
// into mm.as if it is active.
// Preconditions: mm.mappingMu must be locked for writing.
//
// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar).
func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
// Postconditions: mm.mappingMu will be unlocked.
func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
if !vseg.ValuePtr().effectivePerms.Any() {
// Linux doesn't populate inaccessible pages. See
// mm/gup.c:populate_vma_page_range.
mm.mappingMu.Unlock()
return
}
mm.activeMu.Lock()
// Can't defer mm.activeMu.Unlock(); see below.
// Even if we get new pmas, we can't actually map them if we don't have an
// Even if we get a new pma, we can't actually map it if we don't have an
// AddressSpace.
if mm.as == nil {
mm.activeMu.Unlock()
mm.mappingMu.Unlock()
return
}
// Ensure that we have usable pmas.
mm.mappingMu.DowngradeLock()
pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{})
mm.mappingMu.RUnlock()
if err != nil {
// mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
// mm/gup.c:mm_populate(). If it matters, we'll get it again when
@ -204,45 +197,6 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u
mm.activeMu.RUnlock()
}
// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally
// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is
// preferable to populateVMA since it unlocks mm.mappingMu before performing
// expensive operations that don't require it to be locked.
//
// Preconditions: mm.mappingMu must be locked for writing.
// vseg.Range().IsSupersetOf(ar).
//
// Postconditions: mm.mappingMu will be unlocked.
func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
// See populateVMA above for commentary.
if !vseg.ValuePtr().effectivePerms.Any() {
mm.mappingMu.Unlock()
return
}
mm.activeMu.Lock()
if mm.as == nil {
mm.activeMu.Unlock()
mm.mappingMu.Unlock()
return
}
// mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
// isn't needed at all for mapASLocked.
mm.mappingMu.DowngradeLock()
pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{})
mm.mappingMu.RUnlock()
if err != nil {
mm.activeMu.Unlock()
return
}
mm.activeMu.DowngradeLock()
mm.mapASLocked(pseg, ar, precommit)
mm.activeMu.RUnlock()
}
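Separate from the change: both populate helpers follow the same lock ordering -- lock activeMu for writing, downgrade mappingMu to a read lock before the pma lookup, then downgrade activeMu before mapping into the AddressSpace. The sketch below approximates that sequence with sync.RWMutex; gVisor's DowngradableRWMutex downgrades in place, which sync.RWMutex cannot, so the release-and-reacquire here is only an approximation:

package main

import (
    "fmt"
    "sync"
)

type mm struct {
    mappingMu sync.RWMutex // protects vmas
    activeMu  sync.RWMutex // protects pmas and the AddressSpace
    hasAS     bool
}

// populateAndUnlock sketches the locking order of the function above.
// Precondition: mappingMu is write-locked by the caller.
func (m *mm) populateAndUnlock() {
    m.activeMu.Lock()
    if !m.hasAS {
        m.activeMu.Unlock()
        m.mappingMu.Unlock()
        return
    }
    // "Downgrade" mappingMu: getting pmas only needs to read the vma set.
    m.mappingMu.Unlock()
    m.mappingMu.RLock()
    fmt.Println("getPMAsLocked: mappingMu read-locked, activeMu write-locked")
    m.mappingMu.RUnlock()
    // "Downgrade" activeMu: mapping into the AddressSpace only reads pmas.
    m.activeMu.Unlock()
    m.activeMu.RLock()
    fmt.Println("mapASLocked: activeMu read-locked")
    m.activeMu.RUnlock()
}

func main() {
    m := &mm{hasAS: true}
    m.mappingMu.Lock()
    m.populateAndUnlock()
}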
// MapStack allocates the initial process stack.
func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
// maxStackSize is the maximum supported process stack size in bytes.
@ -282,7 +236,6 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error
MaxPerms: usermem.AnyAccess,
Private: true,
GrowsDown: true,
MLockMode: mm.defMLockMode,
Hint: "[stack]",
})
return ar, err
@ -381,19 +334,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
// occupies at least part of the destination. Thus the NoMove case always
// fails and the MayMove case always falls back to copying.
if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone {
// Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall,
// mremap in Linux does not check mm/mlock.c:can_do_mlock() and
// therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and
// !CAP_IPC_LOCK.
mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit {
return 0, syserror.EAGAIN
}
}
}
if opts.Move != MRemapMustMove {
// Handle no-ops and in-place shrinking. These cases don't care if
// [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
@ -420,7 +360,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
if vma.mappable != nil {
newOffset = vseg.mappableRange().End
}
vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
_, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{
Length: newSize - oldSize,
MappingIdentity: vma.id,
Mappable: vma.mappable,
@ -431,13 +371,9 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
MaxPerms: vma.maxPerms,
Private: vma.private,
GrowsDown: vma.growsDown,
MLockMode: vma.mlockMode,
Hint: vma.hint,
})
if err == nil {
if vma.mlockMode == memmap.MLockEager {
mm.populateVMA(ctx, vseg, ar, true)
}
return oldAddr, nil
}
// In-place growth failed. In the MRemapMayMove case, fall through to
@ -526,14 +462,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
if vma.id != nil {
vma.id.IncRef()
}
vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
mm.vmas.Add(newAR, vma)
mm.usageAS += uint64(newAR.Length())
if vma.mlockMode != memmap.MLockNone {
mm.lockedAS += uint64(newAR.Length())
if vma.mlockMode == memmap.MLockEager {
mm.populateVMA(ctx, vseg, newAR, true)
}
}
return newAR.Start, nil
}
@ -555,11 +485,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
vseg = mm.vmas.Isolate(vseg, oldAR)
vma := vseg.Value()
mm.vmas.Remove(vseg)
vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
mm.vmas.Add(newAR, vma)
mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
if vma.mlockMode != memmap.MLockNone {
mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length())
}
// Move pmas. This is technically optional for non-private pmas, which
// could just go through memmap.Mappable.Translate again, but it's required
@ -574,10 +501,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable())
}
if vma.mlockMode == memmap.MLockEager {
mm.populateVMA(ctx, vseg, newAR, true)
}
return newAR.Start, nil
}
@ -688,10 +611,9 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) {
// error on failure.
func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) {
mm.mappingMu.Lock()
// Can't defer mm.mappingMu.Unlock(); see below.
defer mm.mappingMu.Unlock()
if addr < mm.brk.Start {
mm.mappingMu.Unlock()
return mm.brk.End, syserror.EINVAL
}
@ -701,24 +623,21 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad
// heap + data + bss. The segment sizes need to be plumbed from the
// loader package to fully enforce RLIMIT_DATA.
if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
mm.mappingMu.Unlock()
return mm.brk.End, syserror.ENOMEM
}
oldbrkpg, _ := mm.brk.End.RoundUp()
newbrkpg, ok := addr.RoundUp()
if !ok {
mm.mappingMu.Unlock()
return mm.brk.End, syserror.EFAULT
}
switch {
case newbrkpg < oldbrkpg:
mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg})
mm.mappingMu.Unlock()
case oldbrkpg < newbrkpg:
vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
_, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{
Length: uint64(newbrkpg - oldbrkpg),
Addr: oldbrkpg,
Fixed: true,
@ -727,221 +646,17 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad
Perms: usermem.ReadWrite,
MaxPerms: usermem.AnyAccess,
Private: true,
// Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes
// mm->def_flags.
MLockMode: mm.defMLockMode,
Hint: "[heap]",
Hint: "[heap]",
})
if err != nil {
mm.mappingMu.Unlock()
return mm.brk.End, err
}
if mm.defMLockMode == memmap.MLockEager {
mm.populateVMAAndUnlock(ctx, vseg, ar, true)
} else {
mm.mappingMu.Unlock()
}
default:
// Nothing to do.
mm.mappingMu.Unlock()
}
mm.brk.End = addr
return addr, nil
}
// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(),
// depending on mode.
func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error {
// Linux allows this to overflow.
la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp()
ar, ok := addr.RoundDown().ToRange(uint64(la))
if !ok {
return syserror.EINVAL
}
mm.mappingMu.Lock()
// Can't defer mm.mappingMu.Unlock(); see below.
if mode != memmap.MLockNone {
// Check against RLIMIT_MEMLOCK.
if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
if mlockLimit == 0 {
mm.mappingMu.Unlock()
return syserror.EPERM
}
if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit {
mm.mappingMu.Unlock()
return syserror.ENOMEM
}
}
}
// Check this after RLIMIT_MEMLOCK for consistency with Linux.
if ar.Length() == 0 {
mm.mappingMu.Unlock()
return nil
}
// Apply the new mlock mode to vmas.
var unmapped bool
vseg := mm.vmas.FindSegment(ar.Start)
for {
if !vseg.Ok() {
unmapped = true
break
}
vseg = mm.vmas.Isolate(vseg, ar)
vma := vseg.ValuePtr()
prevMode := vma.mlockMode
vma.mlockMode = mode
if mode != memmap.MLockNone && prevMode == memmap.MLockNone {
mm.lockedAS += uint64(vseg.Range().Length())
} else if mode == memmap.MLockNone && prevMode != memmap.MLockNone {
mm.lockedAS -= uint64(vseg.Range().Length())
}
if ar.End <= vseg.End() {
break
}
vseg, _ = vseg.NextNonEmpty()
}
mm.vmas.MergeRange(ar)
mm.vmas.MergeAdjacent(ar)
if unmapped {
mm.mappingMu.Unlock()
return syserror.ENOMEM
}
if mode == memmap.MLockEager {
// Ensure that we have usable pmas. Since we didn't return ENOMEM
// above, ar must be fully covered by vmas, so we can just use
// NextSegment below.
mm.activeMu.Lock()
mm.mappingMu.DowngradeLock()
for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
if !vseg.ValuePtr().effectivePerms.Any() {
// Linux: mm/gup.c:__get_user_pages() returns EFAULT in this
// case, which is converted to ENOMEM by mlock.
mm.activeMu.Unlock()
mm.mappingMu.RUnlock()
return syserror.ENOMEM
}
_, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{})
if err != nil {
mm.activeMu.Unlock()
mm.mappingMu.RUnlock()
// Linux: mm/mlock.c:__mlock_posix_error_return()
if err == syserror.EFAULT {
return syserror.ENOMEM
}
if err == syserror.ENOMEM {
return syserror.EAGAIN
}
return err
}
}
// Map pmas into the active AddressSpace, if we have one.
mm.mappingMu.RUnlock()
if mm.as != nil {
mm.activeMu.DowngradeLock()
err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */)
mm.activeMu.RUnlock()
if err != nil {
return err
}
} else {
mm.activeMu.Unlock()
}
} else {
mm.mappingMu.Unlock()
}
return nil
}
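For reference, outside the diff proper: the removed MLock and MLockAll gate locking on RLIMIT_MEMLOCK unless the task holds CAP_IPC_LOCK in the root user namespace -- a zero limit yields EPERM and exceeding the limit yields ENOMEM. A condensed sketch of just that check, with plain errors standing in for syserror values:

package main

import (
    "errors"
    "fmt"
)

var (
    errEPERM  = errors.New("EPERM")
    errENOMEM = errors.New("ENOMEM")
)

// checkMemlockLimit condenses the RLIMIT_MEMLOCK check from the removed MLock:
// tasks with CAP_IPC_LOCK (in the root user namespace) bypass the limit;
// otherwise a zero limit is EPERM and exceeding the limit is ENOMEM.
func checkMemlockLimit(hasCapIPCLock bool, mlockLimit, lockedAS, newlyLocked uint64) error {
    if hasCapIPCLock {
        return nil
    }
    if mlockLimit == 0 {
        return errEPERM
    }
    if lockedAS+newlyLocked > mlockLimit {
        return errENOMEM
    }
    return nil
}

func main() {
    fmt.Println(checkMemlockLimit(false, 0, 0, 4096))           // EPERM
    fmt.Println(checkMemlockLimit(false, 64*1024, 0, 128*1024)) // ENOMEM
    fmt.Println(checkMemlockLimit(true, 0, 0, 1<<30))           // nil
}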
// MLockAllOpts holds options to MLockAll.
type MLockAllOpts struct {
// If Current is true, change the memory-locking behavior of all mappings
// to Mode. If Future is true, upgrade the memory-locking behavior of all
// future mappings to Mode. At least one of Current or Future must be true.
Current bool
Future bool
Mode memmap.MLockMode
}
// MLockAll implements the semantics of Linux's mlockall()/munlockall(),
// depending on opts.
func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error {
if !opts.Current && !opts.Future {
return syserror.EINVAL
}
mm.mappingMu.Lock()
// Can't defer mm.mappingMu.Unlock(); see below.
if opts.Current {
if opts.Mode != memmap.MLockNone {
// Check against RLIMIT_MEMLOCK.
if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
if mlockLimit == 0 {
mm.mappingMu.Unlock()
return syserror.EPERM
}
if uint64(mm.vmas.Span()) > mlockLimit {
mm.mappingMu.Unlock()
return syserror.ENOMEM
}
}
}
for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
vma := vseg.ValuePtr()
prevMode := vma.mlockMode
vma.mlockMode = opts.Mode
if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone {
mm.lockedAS += uint64(vseg.Range().Length())
} else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone {
mm.lockedAS -= uint64(vseg.Range().Length())
}
}
}
if opts.Future {
mm.defMLockMode = opts.Mode
}
if opts.Current && opts.Mode == memmap.MLockEager {
// Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate()
// ignores the return value of __mm_populate(), so all errors below are
// ignored.
//
// Try to get usable pmas.
mm.activeMu.Lock()
mm.mappingMu.DowngradeLock()
for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
if vseg.ValuePtr().effectivePerms.Any() {
mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{})
}
}
// Map all pmas into the active AddressSpace, if we have one.
mm.mappingMu.RUnlock()
if mm.as != nil {
mm.activeMu.DowngradeLock()
mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */)
mm.activeMu.RUnlock()
} else {
mm.activeMu.Unlock()
}
} else {
mm.mappingMu.Unlock()
}
return nil
}
// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
ar, ok := addr.ToRange(length)
@ -965,49 +680,46 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
// ensures that Decommit immediately reduces host memory usage.
var didUnmapAS bool
pseg := mm.pmas.LowerBoundSegment(ar.Start)
vseg := mm.vmas.LowerBoundSegment(ar.Start)
mem := mm.p.Memory()
for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
vma := vseg.ValuePtr()
if vma.mlockMode != memmap.MLockNone {
return syserror.EINVAL
}
vsegAR := vseg.Range().Intersect(ar)
// pseg should already correspond to either this vma or a later one,
// since there can't be a pma without a corresponding vma.
if checkInvariants {
if pseg.Ok() && pseg.End() <= vsegAR.Start {
panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR))
}
}
for pseg.Ok() && pseg.Start() < vsegAR.End {
pma := pseg.ValuePtr()
if pma.private && !mm.isPMACopyOnWriteLocked(pseg) {
psegAR := pseg.Range().Intersect(ar)
if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil {
if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
pseg = pseg.NextSegment()
continue
}
// If an error occurs, fall through to the general
// invalidation case below.
for pseg.Ok() && pseg.Start() < ar.End {
pma := pseg.ValuePtr()
if pma.private && !mm.isPMACopyOnWriteLocked(pseg) {
psegAR := pseg.Range().Intersect(ar)
vseg = vseg.seekNextLowerBound(psegAR.Start)
if checkInvariants {
if !vseg.Ok() {
panic(fmt.Sprintf("no vma after %#x", psegAR.Start))
}
if psegAR.Start < vseg.Start() {
panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start()))
}
}
pseg = mm.pmas.Isolate(pseg, vsegAR)
pma = pseg.ValuePtr()
if !didUnmapAS {
// Unmap all of ar, not just pseg.Range(), to minimize host
// syscalls. AddressSpace mappings must be removed before
// mm.decPrivateRef().
mm.unmapASLocked(ar)
didUnmapAS = true
if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil {
if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
pseg = pseg.NextSegment()
continue
}
// If an error occurs, fall through to the general
// invalidation case below.
}
if pma.private {
mm.decPrivateRef(pseg.fileRange())
}
pma.file.DecRef(pseg.fileRange())
mm.removeRSSLocked(pseg.Range())
pseg = mm.pmas.Remove(pseg).NextSegment()
}
pseg = mm.pmas.Isolate(pseg, ar)
pma = pseg.ValuePtr()
if !didUnmapAS {
// Unmap all of ar, not just pseg.Range(), to minimize host
// syscalls. AddressSpace mappings must be removed before
// mm.decPrivateRef().
mm.unmapASLocked(ar)
didUnmapAS = true
}
if pma.private {
mm.decPrivateRef(pseg.fileRange())
}
pma.file.DecRef(pseg.fileRange())
mm.removeRSSLocked(pseg.Range())
pseg = mm.pmas.Remove(pseg).NextSegment()
}
// "If there are some parts of the specified address space that are not
@ -1020,28 +732,9 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
return nil
}
// MSyncOpts holds options to MSync.
type MSyncOpts struct {
// Sync has the semantics of MS_SYNC.
Sync bool
// Invalidate has the semantics of MS_INVALIDATE.
Invalidate bool
}
// MSync implements the semantics of Linux's msync().
func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error {
if addr != addr.RoundDown() {
return syserror.EINVAL
}
if length == 0 {
return nil
}
la, ok := usermem.Addr(length).RoundUp()
if !ok {
return syserror.ENOMEM
}
ar, ok := addr.ToRange(uint64(la))
// Sync implements the semantics of Linux's msync(MS_SYNC).
func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error {
ar, ok := addr.ToRange(length)
if !ok {
return syserror.ENOMEM
}
@ -1066,14 +759,10 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui
}
lastEnd = vseg.End()
vma := vseg.ValuePtr()
if opts.Invalidate && vma.mlockMode != memmap.MLockNone {
mm.mappingMu.RUnlock()
return syserror.EBUSY
}
// It's only possible to have dirtied the Mappable through a shared
// mapping. Don't check if the mapping is writable, because mprotect
// may have changed this, and also because Linux doesn't.
if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private {
if id := vma.id; id != nil && vma.mappable != nil && !vma.private {
// We can't call memmap.MappingIdentity.Msync while holding
// mm.mappingMu since it may take fs locks that precede it in the
// lock order.

View File

@ -17,10 +17,8 @@ package mm
import (
"fmt"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
@ -55,23 +53,6 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM
}
if opts.MLockMode != memmap.MLockNone {
// Check against RLIMIT_MEMLOCK.
if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
if mlockLimit == 0 {
return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM
}
newLockedAS := mm.lockedAS + opts.Length
if opts.Unmap {
newLockedAS -= mm.mlockedBytesRangeLocked(ar)
}
if newLockedAS > mlockLimit {
return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN
}
}
}
// Remove overwritten mappings. This ordering is consistent with Linux:
// compare Linux's mm/mmap.c:mmap_region() => do_munmap(),
// file->f_op->mmap().
@ -104,14 +85,10 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
maxPerms: opts.MaxPerms,
private: opts.Private,
growsDown: opts.GrowsDown,
mlockMode: opts.MLockMode,
id: opts.MappingIdentity,
hint: opts.Hint,
})
mm.usageAS += opts.Length
if opts.MLockMode != memmap.MLockNone {
mm.lockedAS += opts.Length
}
return vseg, ar, nil
}
@ -224,17 +201,6 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo
return 0, syserror.ENOMEM
}
// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 {
var total uint64
for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
if vseg.ValuePtr().mlockMode != memmap.MLockNone {
total += uint64(vseg.Range().Intersect(ar).Length())
}
}
return total
}
// getVMAsLocked ensures that vmas exist for all addresses in ar, and support
// access of type (at, ignorePermissions). It returns:
//
@ -372,9 +338,6 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa
vma.id.DecRef()
}
mm.usageAS -= uint64(vmaAR.Length())
if vma.mlockMode != memmap.MLockNone {
mm.lockedAS -= uint64(vmaAR.Length())
}
vgap = mm.vmas.Remove(vseg)
vseg = vgap.NextSegment()
}
@ -405,7 +368,6 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa
vma1.maxPerms != vma2.maxPerms ||
vma1.private != vma2.private ||
vma1.growsDown != vma2.growsDown ||
vma1.mlockMode != vma2.mlockMode ||
vma1.id != vma2.id ||
vma1.hint != vma2.hint {
return vma{}, false

View File

@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{
145: SchedGetscheduler,
146: SchedGetPriorityMax,
147: SchedGetPriorityMin,
148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval,
149: Mlock,
150: Munlock,
151: Mlockall,
152: Munlockall,
148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval,
149: syscalls.Error(nil), // Mlock, TODO
150: syscalls.Error(nil), // Munlock, TODO
151: syscalls.Error(nil), // Mlockall, TODO
152: syscalls.Error(nil), // Munlockall, TODO
153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup,
154: syscalls.Error(syscall.EPERM), // ModifyLdt,
155: syscalls.Error(syscall.EPERM), // PivotRoot,
@ -373,9 +373,8 @@ var AMD64 = &kernel.SyscallTable{
// 322: Execveat, TODO
// 323: Userfaultfd, TODO
// 324: Membarrier, TODO
325: Mlock2,
// Syscalls after 325 are "backports" from versions of Linux after 4.4.
// 326: CopyFileRange,
// Syscalls after 325 are backports from 4.6.
325: syscalls.Error(nil), // Mlock2, TODO
327: Preadv2,
328: Pwritev2,
},

View File

@ -69,9 +69,6 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
Precommit: linux.MAP_POPULATE&flags != 0,
}
if linux.MAP_LOCKED&flags != 0 {
opts.MLockMode = memmap.MLockEager
}
defer func() {
if opts.MappingIdentity != nil {
opts.MappingIdentity.DecRef()
@ -387,6 +384,16 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
length := args[1].SizeT()
flags := args[2].Int()
if addr != addr.RoundDown() {
return 0, nil, syserror.EINVAL
}
if length == 0 {
return 0, nil, nil
}
la, ok := usermem.Addr(length).RoundUp()
if !ok {
return 0, nil, syserror.ENOMEM
}
// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
// and may additionally include the MS_INVALIDATE bit. ... However, Linux
// permits a call to msync() that specifies neither of these flags, with
@ -399,72 +406,39 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if sync && flags&linux.MS_ASYNC != 0 {
return 0, nil, syserror.EINVAL
}
err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
Sync: sync,
Invalidate: flags&linux.MS_INVALIDATE != 0,
})
// MSync calls fsync, the same interrupt conversion rules apply, see
// mm/msync.c, fsync POSIX.1-2008.
return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
}
// Mlock implements linux syscall mlock(2).
func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
length := args[1].SizeT()
return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
}
// Mlock2 implements linux syscall mlock2(2).
func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
length := args[1].SizeT()
flags := args[2].Int()
if flags&^(linux.MLOCK_ONFAULT) != 0 {
// MS_INVALIDATE "asks to invalidate other mappings of the same file (so
// that they can be updated with the fresh values just written)". This is a
// no-op given that shared memory exists. However, MS_INVALIDATE can also
// be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags,
// and a memory lock exists for the specified address range." Given that
// mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since
// some user program could be using it for synchronization.
if flags&linux.MS_INVALIDATE != 0 {
return 0, nil, syserror.EINVAL
}
mode := memmap.MLockEager
if flags&linux.MLOCK_ONFAULT != 0 {
mode = memmap.MLockLazy
// MS_SYNC "requests an update and waits for it to complete."
if sync {
err := t.MemoryManager().Sync(t, addr, uint64(la))
// Sync calls fsync, the same interrupt conversion rules apply, see
// mm/msync.c, fsync POSIX.1-2008.
return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
}
return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
}
// Munlock implements linux syscall munlock(2).
func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
length := args[1].SizeT()
return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
}
// Mlockall implements linux syscall mlockall(2).
func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
flags := args[0].Int()
if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
return 0, nil, syserror.EINVAL
// MS_ASYNC "specifies that an update be scheduled, but the call returns
// immediately". As long as dirty pages are tracked and eventually written
// back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC
// is in fact a no-op, since the kernel properly tracks dirty pages and
// flushes them to storage as necessary.")
//
// However: "ENOMEM: The indicated memory (or part of it) was not mapped."
// This applies even for MS_ASYNC.
ar, ok := addr.ToRange(uint64(la))
if !ok {
return 0, nil, syserror.ENOMEM
}
mode := memmap.MLockEager
if flags&linux.MCL_ONFAULT != 0 {
mode = memmap.MLockLazy
mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
if mapped != uint64(la) {
return 0, nil, syserror.ENOMEM
}
return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
Current: flags&linux.MCL_CURRENT != 0,
Future: flags&linux.MCL_FUTURE != 0,
Mode: mode,
})
}
// Munlockall implements linux syscall munlockall(2).
func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
Current: true,
Future: true,
Mode: memmap.MLockNone,
})
return 0, nil, nil
}
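Illustrative, not from the tree: the rewritten Msync above validates flags before deciding whether the call needs a real sync (MS_SYNC) or is effectively a no-op (MS_ASYNC), and rejects MS_INVALIDATE outright. A standalone sketch of that validation, using the standard Linux flag values:

package main

import (
    "errors"
    "fmt"
)

// Standard Linux msync flag values (asm-generic/mman-common.h).
const (
    MS_ASYNC      = 1
    MS_INVALIDATE = 2
    MS_SYNC       = 4
)

var errEINVAL = errors.New("EINVAL")

// classifyMsync condenses the checks in the rewritten Msync above: MS_SYNC and
// MS_ASYNC are mutually exclusive; MS_INVALIDATE is rejected because, with
// mlock stubbed out, silently accepting it would mislead programs that use it
// to detect locked ranges; a call with neither flag behaves like MS_ASYNC.
func classifyMsync(flags int) (needsSync bool, err error) {
    sync := flags&MS_SYNC != 0
    if sync && flags&MS_ASYNC != 0 {
        return false, errEINVAL
    }
    if flags&MS_INVALIDATE != 0 {
        return false, errEINVAL
    }
    return sync, nil
}

func main() {
    fmt.Println(classifyMsync(MS_SYNC))            // true <nil>
    fmt.Println(classifyMsync(MS_ASYNC))           // false <nil>
    fmt.Println(classifyMsync(MS_SYNC | MS_ASYNC)) // EINVAL
    fmt.Println(classifyMsync(MS_INVALIDATE))      // EINVAL
}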

View File

@ -90,7 +90,6 @@ var setableLimits = map[limits.LimitType]struct{}{
limits.CPU: {},
limits.Data: {},
limits.FileSize: {},
limits.MemoryLocked: {},
limits.Stack: {},
// These are not enforced, but we include them here to avoid returning
// EPERM, since some apps expect them to succeed.

View File

@ -29,7 +29,7 @@ var fromLinuxResource = map[string]limits.LimitType{
"RLIMIT_DATA": limits.Data,
"RLIMIT_FSIZE": limits.FileSize,
"RLIMIT_LOCKS": limits.Locks,
"RLIMIT_MEMLOCK": limits.MemoryLocked,
"RLIMIT_MEMLOCK": limits.MemoryPagesLocked,
"RLIMIT_MSGQUEUE": limits.MessageQueueBytes,
"RLIMIT_NICE": limits.Nice,
"RLIMIT_NOFILE": limits.NumberOfFiles,
@ -55,7 +55,7 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536})
ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536})
ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200})
ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0})
ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576})

View File

@ -1019,21 +1019,6 @@ cc_binary(
],
)
cc_binary(
name = "mlock_test",
testonly = 1,
srcs = ["mlock.cc"],
linkstatic = 1,
deps = [
"//test/util:capability_util",
"//test/util:cleanup",
"//test/util:memory_util",
"//test/util:multiprocess_util",
"//test/util:test_util",
"@com_google_googletest//:gtest",
],
)
cc_binary(
name = "mmap_test",
testonly = 1,

View File

@ -1,344 +0,0 @@
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <errno.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "test/util/capability_util.h"
#include "test/util/cleanup.h"
#include "test/util/memory_util.h"
#include "test/util/multiprocess_util.h"
#include "test/util/test_util.h"
using ::testing::_;
namespace gvisor {
namespace testing {
namespace {
PosixErrorOr<bool> CanMlock() {
struct rlimit rlim;
if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
return PosixError(errno, "getrlimit(RLIMIT_MEMLOCK)");
}
if (rlim.rlim_cur != 0) {
return true;
}
return HaveCapability(CAP_IPC_LOCK);
}
// Returns true if the page containing addr is mlocked.
bool IsPageMlocked(uintptr_t addr) {
// This relies on msync(MS_INVALIDATE) interacting correctly with mlocked
// pages, which is tested for by the MsyncInvalidate case below.
int const rv = msync(reinterpret_cast<void*>(addr & ~(kPageSize - 1)),
kPageSize, MS_ASYNC | MS_INVALIDATE);
if (rv == 0) {
return false;
}
// This uses TEST_PCHECK_MSG since it's used in subprocesses.
TEST_PCHECK_MSG(errno == EBUSY, "msync failed with unexpected errno");
return true;
}
PosixErrorOr<Cleanup> ScopedSetSoftRlimit(int resource, rlim_t newval) {
struct rlimit old_rlim;
if (getrlimit(resource, &old_rlim) != 0) {
return PosixError(errno, "getrlimit failed");
}
struct rlimit new_rlim = old_rlim;
new_rlim.rlim_cur = newval;
if (setrlimit(resource, &new_rlim) != 0) {
return PosixError(errno, "setrlimit failed");
}
return Cleanup([resource, old_rlim] {
TEST_PCHECK(setrlimit(resource, &old_rlim) == 0);
});
}
TEST(MlockTest, Basic) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
}
TEST(MlockTest, ProtNone) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping =
ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(mlock(mapping.ptr(), mapping.len()),
SyscallFailsWithErrno(ENOMEM));
// ENOMEM is returned because mlock can't populate the page, but it's still
// considered locked.
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
}
TEST(MlockTest, MadviseDontneed) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
EXPECT_THAT(madvise(mapping.ptr(), mapping.len(), MADV_DONTNEED),
SyscallFailsWithErrno(EINVAL));
}
TEST(MlockTest, MsyncInvalidate) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_ASYNC | MS_INVALIDATE),
SyscallFailsWithErrno(EBUSY));
EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_SYNC | MS_INVALIDATE),
SyscallFailsWithErrno(EBUSY));
}
TEST(MlockTest, Fork) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
EXPECT_THAT(
InForkedProcess([&] { TEST_CHECK(!IsPageMlocked(mapping.addr())); }),
IsPosixErrorOkAndHolds(0));
}
TEST(MlockTest, RlimitMemlockZero) {
if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
}
Cleanup reset_rlimit =
ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(mlock(mapping.ptr(), mapping.len()),
SyscallFailsWithErrno(EPERM));
}
TEST(MlockTest, RlimitMemlockInsufficient) {
if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
}
Cleanup reset_rlimit =
ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(mlock(mapping.ptr(), mapping.len()),
SyscallFailsWithErrno(ENOMEM));
}
TEST(MunlockTest, Basic) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
}
TEST(MunlockTest, NotLocked) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
}
// There is currently no test for mlockall(MCL_CURRENT) because the default
// RLIMIT_MEMLOCK of 64 KB is insufficient to actually invoke
// mlockall(MCL_CURRENT).
TEST(MlockallTest, Future) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
// Run this test in a separate (single-threaded) subprocess to ensure that a
// background thread doesn't try to mmap a large amount of memory, fail due
// to hitting RLIMIT_MEMLOCK, and explode the process violently.
EXPECT_THAT(InForkedProcess([] {
auto const mapping =
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)
.ValueOrDie();
TEST_CHECK(!IsPageMlocked(mapping.addr()));
TEST_PCHECK(mlockall(MCL_FUTURE) == 0);
// Ensure that mlockall(MCL_FUTURE) is turned off before the end
// of the test, as otherwise mmaps may fail unexpectedly.
Cleanup do_munlockall([] { TEST_PCHECK(munlockall() == 0); });
auto const mapping2 = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
TEST_CHECK(IsPageMlocked(mapping2.addr()));
// Fire munlockall() and check that it disables
// mlockall(MCL_FUTURE).
do_munlockall.Release()();
auto const mapping3 = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
TEST_CHECK(!IsPageMlocked(mapping2.addr()));
}),
IsPosixErrorOkAndHolds(0));
}
TEST(MunlockallTest, Basic) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(munlockall(), SyscallSucceeds());
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
}
#ifndef SYS_mlock2
#ifdef __x86_64__
#define SYS_mlock2 325
#endif
#endif
#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01 // Linux: include/uapi/asm-generic/mman-common.h
#endif
#ifdef SYS_mlock2
int mlock2(void const* addr, size_t len, int flags) {
return syscall(SYS_mlock2, addr, len, flags);
}
TEST(Mlock2Test, NoFlags) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), 0), SyscallSucceeds());
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
}
TEST(Mlock2Test, MlockOnfault) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), MLOCK_ONFAULT),
SyscallSucceeds());
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
}
TEST(Mlock2Test, UnknownFlags) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
EXPECT_THAT(mlock2(mapping.ptr(), mapping.len(), ~0),
SyscallFailsWithErrno(EINVAL));
}
#endif // defined(SYS_mlock2)
TEST(MapLockedTest, Basic) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
EXPECT_FALSE(IsPageMlocked(mapping.addr()));
}
TEST(MapLockedTest, RlimitMemlockZero) {
if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
}
Cleanup reset_rlimit =
ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0));
EXPECT_THAT(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED),
PosixErrorIs(EPERM, _));
}
TEST(MapLockedTest, RlimitMemlockInsufficient) {
if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
}
Cleanup reset_rlimit =
ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize));
EXPECT_THAT(
MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED),
PosixErrorIs(EAGAIN, _));
}
TEST(MremapLockedTest, Basic) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(),
MREMAP_MAYMOVE, nullptr);
if (addr == MAP_FAILED) {
FAIL() << "mremap failed: " << errno << " (" << strerror(errno) << ")";
}
mapping.release();
mapping.reset(addr, 2 * mapping.len());
EXPECT_TRUE(IsPageMlocked(reinterpret_cast<uintptr_t>(addr)));
}
TEST(MremapLockedTest, RlimitMemlockZero) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
}
Cleanup reset_rlimit =
ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0));
void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(),
MREMAP_MAYMOVE, nullptr);
EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN)
<< "addr = " << addr << ", errno = " << errno;
}
TEST(MremapLockedTest, RlimitMemlockInsufficient) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
EXPECT_TRUE(IsPageMlocked(mapping.addr()));
if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
}
Cleanup reset_rlimit = ASSERT_NO_ERRNO_AND_VALUE(
ScopedSetSoftRlimit(RLIMIT_MEMLOCK, mapping.len()));
void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(),
MREMAP_MAYMOVE, nullptr);
EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN)
<< "addr = " << addr << ", errno = " << errno;
}
} // namespace
} // namespace testing
} // namespace gvisor
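As a side note, separate from this change: the removed test's IsPageMlocked helper probes for locked pages by calling msync with MS_INVALIDATE and treating EBUSY as "locked". The same probe can be written in Go with golang.org/x/sys/unix; this sketch mirrors the C++ helper and the MlockTest.Basic case, and assumes RLIMIT_MEMLOCK permits locking one page:

package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

// isPageMlocked mirrors the removed C++ IsPageMlocked helper: msync with
// MS_INVALIDATE fails with EBUSY when the range contains mlocked pages.
func isPageMlocked(b []byte) (bool, error) {
    err := unix.Msync(b, unix.MS_ASYNC|unix.MS_INVALIDATE)
    if err == nil {
        return false, nil
    }
    if err == unix.EBUSY {
        return true, nil
    }
    return false, err
}

func main() {
    // Map one anonymous page, lock it, and probe it -- the Go analogue of
    // the MlockTest.Basic case above.
    page, err := unix.Mmap(-1, 0, 4096,
        unix.PROT_READ|unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
    if err != nil {
        panic(err)
    }
    defer unix.Munmap(page)

    fmt.Println(isPageMlocked(page)) // false <nil> before mlock
    if err := unix.Mlock(page); err != nil {
        panic(err) // may fail with EPERM/ENOMEM if RLIMIT_MEMLOCK is 0
    }
    fmt.Println(isPageMlocked(page)) // true <nil> after mlock
}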

View File

@ -43,13 +43,14 @@ class MsyncParameterizedTest : public ::testing::TestWithParam<MsyncTestParam> {
protected:
int msync_flags() const { return std::get<0>(GetParam()); }
PosixErrorOr<Mapping> GetMapping() const { return std::get<1>(GetParam())(); }
PosixErrorOr<Mapping> GetMapping() const {
auto rv = std::get<1>(GetParam())();
return rv;
}
};
// All valid msync(2) flag combinations, not including MS_INVALIDATE. ("Linux
// permits a call to msync() that specifies neither [MS_SYNC or MS_ASYNC], with
// semantics that are (currently) equivalent to specifying MS_ASYNC." -
// msync(2))
// All valid msync(2) flag combinations (not including MS_INVALIDATE, which
// gVisor doesn't implement).
constexpr std::initializer_list<int> kMsyncFlags = {MS_SYNC, MS_ASYNC, 0};
// Returns functions that return mappings that should be successfully
@ -133,15 +134,6 @@ TEST_P(MsyncFullParamTest, UnalignedAddressFails) {
SyscallFailsWithErrno(EINVAL));
}
TEST_P(MsyncFullParamTest, InvalidateUnlockedSucceeds) {
auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping());
EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags() | MS_INVALIDATE),
SyscallSucceeds());
}
// The test for MS_INVALIDATE on mlocked pages is in mlock.cc since it requires
// probing for mlock support.
INSTANTIATE_TEST_CASE_P(
All, MsyncFullParamTest,
::testing::Combine(::testing::ValuesIn(kMsyncFlags),