From 86c9bd254749ebf65270aa60f728d9c847ac02d4 Mon Sep 17 00:00:00 2001 From: Googler Date: Wed, 19 Dec 2018 13:29:10 -0800 Subject: [PATCH] Automated rollback of changelist 225861605 PiperOrigin-RevId: 226224230 Change-Id: Id24c7d3733722fd41d5fe74ef64e0ce8c68f0b12 --- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/mm.go | 12 - pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/memmap/memmap.go | 37 --- pkg/sentry/mm/BUILD | 1 - pkg/sentry/mm/address_space.go | 12 +- pkg/sentry/mm/lifecycle.go | 24 +- pkg/sentry/mm/mm.go | 24 +- pkg/sentry/mm/syscalls.go | 423 ++++-------------------- pkg/sentry/mm/vma.go | 38 --- pkg/sentry/syscalls/linux/linux64.go | 15 +- pkg/sentry/syscalls/linux/sys_mmap.go | 104 +++--- pkg/sentry/syscalls/linux/sys_rlimit.go | 1 - runsc/boot/limits.go | 4 +- test/syscalls/linux/BUILD | 15 - test/syscalls/linux/mlock.cc | 344 ------------------- test/syscalls/linux/msync.cc | 20 +- 18 files changed, 134 insertions(+), 946 deletions(-) delete mode 100644 test/syscalls/linux/mlock.cc diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index e0aa5b31d..b2e51b9bd 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -60,7 +60,7 @@ const ( DefaultNofileHardLimit = 4096 // DefaultMemlockLimit is called MLOCK_LIMIT in Linux. - DefaultMemlockLimit = 64 * 1024 + DefaultMemlockLimit = 64 * 1094 // DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux. DefaultMsgqueueLimit = 819200 diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index eda8d9788..3fcdf8235 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -49,18 +49,6 @@ const ( MREMAP_FIXED = 1 << 1 ) -// Flags for mlock2(2). -const ( - MLOCK_ONFAULT = 0x01 -) - -// Flags for mlockall(2). -const ( - MCL_CURRENT = 1 - MCL_FUTURE = 2 - MCL_ONFAULT = 4 -) - // Advice for madvise(2). const ( MADV_NORMAL = 0 diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index eeca01876..ba0b7d4fd 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryLocked + MemoryPagesLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 295f9c398..511db6733 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryLocked, + linux.RLIMIT_MEMLOCK: MemoryPagesLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index cf20b11e3..28e2bed9b 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -243,40 +243,6 @@ type MappingIdentity interface { Msync(ctx context.Context, mr MappableRange) error } -// MLockMode specifies the memory locking behavior of a memory mapping. -type MLockMode int - -// Note that the ordering of MLockModes is significant; see -// mm.MemoryManager.defMLockMode. -const ( - // MLockNone specifies that a mapping has no memory locking behavior. - // - // This must be the zero value for MLockMode. - MLockNone MLockMode = iota - - // MLockEager specifies that a mapping is memory-locked, as by mlock() or - // similar. Pages in the mapping should be made, and kept, resident in - // physical memory as soon as possible. - // - // As of this writing, MLockEager does not cause memory-locking to be - // requested from the host; it only affects the sentry's memory management - // behavior. - // - // MLockEager is analogous to Linux's VM_LOCKED. - MLockEager - - // MLockLazy specifies that a mapping is memory-locked, as by mlock() or - // similar. Pages in the mapping should be kept resident in physical memory - // once they have been made resident due to e.g. a page fault. - // - // As of this writing, MLockLazy does not cause memory-locking to be - // requested from the host; in fact, it has virtually no effect, except for - // interactions between mlocked pages and other syscalls. - // - // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. - MLockLazy -) - // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. @@ -337,9 +303,6 @@ type MMapOpts struct { // mapping (see platform.AddressSpace.MapFile). Precommit bool - // MLockMode specifies the memory locking behavior of the mapping. - MLockMode MLockMode - // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 5a9185e5d..744e73a39 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,7 +106,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", - "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index e7aa24c69..7488f7c4a 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() { // for all addresses in ar should be precommitted. // // Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). +// ar must be page-aligned. pseg.Range().Contains(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. @@ -173,9 +173,7 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre } } - // Since this checks ar.End and not mapAR.End, we will never map a pma that - // is not required. - for pseg.Ok() && pseg.Start() < ar.End { + for { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) @@ -186,9 +184,13 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } + // Since this checks ar.End and not mapAR.End, we will never map a pma + // that is not required. + if ar.End <= pmaAR.End { + return nil + } pseg = pseg.NextSegment() } - return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a42e32b43..1613ce11d 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -59,17 +58,13 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ - p: mm.p, - haveASIO: mm.haveASIO, - layout: mm.layout, - privateRefs: mm.privateRefs, - users: 1, - brk: mm.brk, - usageAS: mm.usageAS, - // "The child does not inherit its parent's memory locks (mlock(2), - // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is - // MLockNone, both of which are zero values. vma.mlockMode is reset - // when copied below. + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + usageAS: mm.usageAS, + brk: mm.brk, captureInvalidations: true, argv: mm.argv, envv: mm.envv, @@ -82,7 +77,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // Copy vmas. dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { - vma := srcvseg.Value() // makes a copy of the vma + vma := srcvseg.ValuePtr() vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { @@ -94,8 +89,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if vma.id != nil { vma.id.IncRef() } - vma.mlockMode = memmap.MLockNone - dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index c0632d232..b1e39e898 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -95,6 +95,11 @@ type MemoryManager struct { // vmas is protected by mappingMu. vmas vmaSet + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. + usageAS uint64 + // brk is the mm's brk, which is manipulated using the brk(2) system call. // The brk is initially set up by the loader which maps an executable // binary into the mm. @@ -102,23 +107,6 @@ type MemoryManager struct { // brk is protected by mappingMu. brk usermem.AddrRange - // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. - // - // usageAS is protected by mappingMu. - usageAS uint64 - - // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != - // memmap.MLockNone. - // - // lockedAS is protected by mappingMu. - lockedAS uint64 - - // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or - // defMLockMode is greater. - // - // defMLockMode is protected by mappingMu. - defMLockMode memmap.MLockMode - // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. activeMu ssync.DowngradableRWMutex `state:"nosave"` @@ -264,8 +252,6 @@ type vma struct { // metag, none of which we currently support. growsDown bool `state:"manual"` - mlockMode memmap.MLockMode - // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 383703ec3..daaae4da1 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,7 +20,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -129,24 +128,16 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // Get the new vma. mm.mappingMu.Lock() - if opts.MLockMode < mm.defMLockMode { - opts.MLockMode = mm.defMLockMode - } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } - // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new - // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears - // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in - // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => - // populate_vma_page_range(). Confirm this behavior. switch { - case opts.Precommit || opts.MLockMode == memmap.MLockEager: + case opts.Precommit: // Get pmas and map with precommit as requested. - mm.populateVMAAndUnlock(ctx, vseg, ar, true) + mm.populateAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE: Get pmas and map eagerly in the hope @@ -155,7 +146,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. - mm.populateVMAAndUnlock(ctx, vseg, ar, false) + mm.populateAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() @@ -164,29 +155,31 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return ar.Start, nil } -// populateVMA obtains pmas for addresses in ar in the given vma, and maps them -// into mm.as if it is active. +// Preconditions: mm.mappingMu must be locked for writing. // -// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). -func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. + mm.mappingMu.Unlock() return } mm.activeMu.Lock() - // Can't defer mm.activeMu.Unlock(); see below. - // Even if we get new pmas, we can't actually map them if we don't have an + // Even if we get a new pma, we can't actually map it if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() + mm.mappingMu.Unlock() return } // Ensure that we have usable pmas. + mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when @@ -204,45 +197,6 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u mm.activeMu.RUnlock() } -// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally -// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is -// preferable to populateVMA since it unlocks mm.mappingMu before performing -// expensive operations that don't require it to be locked. -// -// Preconditions: mm.mappingMu must be locked for writing. -// vseg.Range().IsSupersetOf(ar). -// -// Postconditions: mm.mappingMu will be unlocked. -func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { - // See populateVMA above for commentary. - if !vseg.ValuePtr().effectivePerms.Any() { - mm.mappingMu.Unlock() - return - } - - mm.activeMu.Lock() - - if mm.as == nil { - mm.activeMu.Unlock() - mm.mappingMu.Unlock() - return - } - - // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it - // isn't needed at all for mapASLocked. - mm.mappingMu.DowngradeLock() - pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) - mm.mappingMu.RUnlock() - if err != nil { - mm.activeMu.Unlock() - return - } - - mm.activeMu.DowngradeLock() - mm.mapASLocked(pseg, ar, precommit) - mm.activeMu.RUnlock() -} - // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. @@ -282,7 +236,6 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error MaxPerms: usermem.AnyAccess, Private: true, GrowsDown: true, - MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err @@ -381,19 +334,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. - if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, - // mremap in Linux does not check mm/mlock.c:can_do_mlock() and - // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and - // !CAP_IPC_LOCK. - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { - return 0, syserror.EAGAIN - } - } - } - if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all @@ -420,7 +360,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.mappable != nil { newOffset = vseg.mappableRange().End } - vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, @@ -431,13 +371,9 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, - MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, ar, true) - } return oldAddr, nil } // In-place growth failed. In the MRemapMayMove case, fall through to @@ -526,14 +462,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.id != nil { vma.id.IncRef() } - vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.vmas.Add(newAR, vma) mm.usageAS += uint64(newAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS += uint64(newAR.Length()) - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, newAR, true) - } - } return newAR.Start, nil } @@ -555,11 +485,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.vmas.Add(newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) - } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required @@ -574,10 +501,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, newAR, true) - } - return newAR.Start, nil } @@ -688,10 +611,9 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. + defer mm.mappingMu.Unlock() if addr < mm.brk.Start { - mm.mappingMu.Unlock() return mm.brk.End, syserror.EINVAL } @@ -701,24 +623,21 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // heap + data + bss. The segment sizes need to be plumbed from the // loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { - mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { - mm.mappingMu.Unlock() return mm.brk.End, syserror.EFAULT } switch { case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) - mm.mappingMu.Unlock() case oldbrkpg < newbrkpg: - vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, @@ -727,221 +646,17 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, Private: true, - // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes - // mm->def_flags. - MLockMode: mm.defMLockMode, - Hint: "[heap]", + Hint: "[heap]", }) if err != nil { - mm.mappingMu.Unlock() return mm.brk.End, err } - if mm.defMLockMode == memmap.MLockEager { - mm.populateVMAAndUnlock(ctx, vseg, ar, true) - } else { - mm.mappingMu.Unlock() - } - - default: - // Nothing to do. - mm.mappingMu.Unlock() } mm.brk.End = addr return addr, nil } -// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), -// depending on mode. -func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { - // Linux allows this to overflow. - la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() - ar, ok := addr.RoundDown().ToRange(uint64(la)) - if !ok { - return syserror.EINVAL - } - - mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. - - if mode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - mm.mappingMu.Unlock() - return syserror.EPERM - } - if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - } - } - - // Check this after RLIMIT_MEMLOCK for consistency with Linux. - if ar.Length() == 0 { - mm.mappingMu.Unlock() - return nil - } - - // Apply the new mlock mode to vmas. - var unmapped bool - vseg := mm.vmas.FindSegment(ar.Start) - for { - if !vseg.Ok() { - unmapped = true - break - } - vseg = mm.vmas.Isolate(vseg, ar) - vma := vseg.ValuePtr() - prevMode := vma.mlockMode - vma.mlockMode = mode - if mode != memmap.MLockNone && prevMode == memmap.MLockNone { - mm.lockedAS += uint64(vseg.Range().Length()) - } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { - mm.lockedAS -= uint64(vseg.Range().Length()) - } - if ar.End <= vseg.End() { - break - } - vseg, _ = vseg.NextNonEmpty() - } - mm.vmas.MergeRange(ar) - mm.vmas.MergeAdjacent(ar) - if unmapped { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - - if mode == memmap.MLockEager { - // Ensure that we have usable pmas. Since we didn't return ENOMEM - // above, ar must be fully covered by vmas, so we can just use - // NextSegment below. - mm.activeMu.Lock() - mm.mappingMu.DowngradeLock() - for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - if !vseg.ValuePtr().effectivePerms.Any() { - // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this - // case, which is converted to ENOMEM by mlock. - mm.activeMu.Unlock() - mm.mappingMu.RUnlock() - return syserror.ENOMEM - } - _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) - if err != nil { - mm.activeMu.Unlock() - mm.mappingMu.RUnlock() - // Linux: mm/mlock.c:__mlock_posix_error_return() - if err == syserror.EFAULT { - return syserror.ENOMEM - } - if err == syserror.ENOMEM { - return syserror.EAGAIN - } - return err - } - } - - // Map pmas into the active AddressSpace, if we have one. - mm.mappingMu.RUnlock() - if mm.as != nil { - mm.activeMu.DowngradeLock() - err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) - mm.activeMu.RUnlock() - if err != nil { - return err - } - } else { - mm.activeMu.Unlock() - } - } else { - mm.mappingMu.Unlock() - } - - return nil -} - -// MLockAllOpts holds options to MLockAll. -type MLockAllOpts struct { - // If Current is true, change the memory-locking behavior of all mappings - // to Mode. If Future is true, upgrade the memory-locking behavior of all - // future mappings to Mode. At least one of Current or Future must be true. - Current bool - Future bool - Mode memmap.MLockMode -} - -// MLockAll implements the semantics of Linux's mlockall()/munlockall(), -// depending on opts. -func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { - if !opts.Current && !opts.Future { - return syserror.EINVAL - } - - mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. - - if opts.Current { - if opts.Mode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - mm.mappingMu.Unlock() - return syserror.EPERM - } - if uint64(mm.vmas.Span()) > mlockLimit { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - } - } - for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { - vma := vseg.ValuePtr() - prevMode := vma.mlockMode - vma.mlockMode = opts.Mode - if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { - mm.lockedAS += uint64(vseg.Range().Length()) - } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { - mm.lockedAS -= uint64(vseg.Range().Length()) - } - } - } - - if opts.Future { - mm.defMLockMode = opts.Mode - } - - if opts.Current && opts.Mode == memmap.MLockEager { - // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() - // ignores the return value of __mm_populate(), so all errors below are - // ignored. - // - // Try to get usable pmas. - mm.activeMu.Lock() - mm.mappingMu.DowngradeLock() - for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { - if vseg.ValuePtr().effectivePerms.Any() { - mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) - } - } - - // Map all pmas into the active AddressSpace, if we have one. - mm.mappingMu.RUnlock() - if mm.as != nil { - mm.activeMu.DowngradeLock() - mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) - mm.activeMu.RUnlock() - } else { - mm.activeMu.Unlock() - } - } else { - mm.mappingMu.Unlock() - } - return nil -} - // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) @@ -965,49 +680,46 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) + vseg := mm.vmas.LowerBoundSegment(ar.Start) mem := mm.p.Memory() - for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - vma := vseg.ValuePtr() - if vma.mlockMode != memmap.MLockNone { - return syserror.EINVAL - } - vsegAR := vseg.Range().Intersect(ar) - // pseg should already correspond to either this vma or a later one, - // since there can't be a pma without a corresponding vma. - if checkInvariants { - if pseg.Ok() && pseg.End() <= vsegAR.Start { - panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) - } - } - for pseg.Ok() && pseg.Start() < vsegAR.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { - psegAR := pseg.Range().Intersect(ar) - if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue - } - // If an error occurs, fall through to the general - // invalidation case below. + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(psegAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) + } + if psegAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) } } - pseg = mm.pmas.Isolate(pseg, vsegAR) - pma = pseg.ValuePtr() - if !didUnmapAS { - // Unmap all of ar, not just pseg.Range(), to minimize host - // syscalls. AddressSpace mappings must be removed before - // mm.decPrivateRef(). - mm.unmapASLocked(ar) - didUnmapAS = true + if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. } - if pma.private { - mm.decPrivateRef(pseg.fileRange()) - } - pma.file.DecRef(pseg.fileRange()) - mm.removeRSSLocked(pseg.Range()) - pseg = mm.pmas.Remove(pseg).NextSegment() } + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + + pseg = mm.pmas.Remove(pseg).NextSegment() } // "If there are some parts of the specified address space that are not @@ -1020,28 +732,9 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { return nil } -// MSyncOpts holds options to MSync. -type MSyncOpts struct { - // Sync has the semantics of MS_SYNC. - Sync bool - - // Invalidate has the semantics of MS_INVALIDATE. - Invalidate bool -} - -// MSync implements the semantics of Linux's msync(). -func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { - if addr != addr.RoundDown() { - return syserror.EINVAL - } - if length == 0 { - return nil - } - la, ok := usermem.Addr(length).RoundUp() - if !ok { - return syserror.ENOMEM - } - ar, ok := addr.ToRange(uint64(la)) +// Sync implements the semantics of Linux's msync(MS_SYNC). +func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) if !ok { return syserror.ENOMEM } @@ -1066,14 +759,10 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui } lastEnd = vseg.End() vma := vseg.ValuePtr() - if opts.Invalidate && vma.mlockMode != memmap.MLockNone { - mm.mappingMu.RUnlock() - return syserror.EBUSY - } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. - if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { + if id := vma.id; id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 28ba9f2f5..5c2c802f6 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -17,10 +17,8 @@ package mm import ( "fmt" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -55,23 +53,6 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM } - if opts.MLockMode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM - } - newLockedAS := mm.lockedAS + opts.Length - if opts.Unmap { - newLockedAS -= mm.mlockedBytesRangeLocked(ar) - } - if newLockedAS > mlockLimit { - return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN - } - } - } - // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). @@ -104,14 +85,10 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, - mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, }) mm.usageAS += opts.Length - if opts.MLockMode != memmap.MLockNone { - mm.lockedAS += opts.Length - } return vseg, ar, nil } @@ -224,17 +201,6 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return 0, syserror.ENOMEM } -// Preconditions: mm.mappingMu must be locked. -func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { - var total uint64 - for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - if vseg.ValuePtr().mlockMode != memmap.MLockNone { - total += uint64(vseg.Range().Intersect(ar).Length()) - } - } - return total -} - // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). It returns: // @@ -372,9 +338,6 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS -= uint64(vmaAR.Length()) - } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } @@ -405,7 +368,6 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || - vma1.mlockMode != vma2.mlockMode || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index e855590e6..7a5c93f9b 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{ 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, - 149: Mlock, - 150: Munlock, - 151: Mlockall, - 152: Munlockall, + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: syscalls.Error(nil), // Mlock, TODO + 150: syscalls.Error(nil), // Munlock, TODO + 151: syscalls.Error(nil), // Mlockall, TODO + 152: syscalls.Error(nil), // Munlockall, TODO 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, 154: syscalls.Error(syscall.EPERM), // ModifyLdt, 155: syscalls.Error(syscall.EPERM), // PivotRoot, @@ -373,9 +373,8 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO - 325: Mlock2, - // Syscalls after 325 are "backports" from versions of Linux after 4.4. - // 326: CopyFileRange, + // Syscalls after 325 are backports from 4.6. + 325: syscalls.Error(nil), // Mlock2, TODO 327: Preadv2, 328: Pwritev2, }, diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 8732861e0..145f7846c 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -69,9 +69,6 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } - if linux.MAP_LOCKED&flags != 0 { - opts.MLockMode = memmap.MLockEager - } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef() @@ -387,6 +384,16 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall length := args[1].SizeT() flags := args[2].Int() + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with @@ -399,72 +406,39 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, syserror.EINVAL } - err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ - Sync: sync, - Invalidate: flags&linux.MS_INVALIDATE != 0, - }) - // MSync calls fsync, the same interrupt conversion rules apply, see - // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) -} -// Mlock implements linux syscall mlock(2). -func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) -} - -// Mlock2 implements linux syscall mlock2(2). -func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - flags := args[2].Int() - - if flags&^(linux.MLOCK_ONFAULT) != 0 { + // MS_INVALIDATE "asks to invalidate other mappings of the same file (so + // that they can be updated with the fresh values just written)". This is a + // no-op given that shared memory exists. However, MS_INVALIDATE can also + // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, + // and a memory lock exists for the specified address range." Given that + // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since + // some user program could be using it for synchronization. + if flags&linux.MS_INVALIDATE != 0 { return 0, nil, syserror.EINVAL } - - mode := memmap.MLockEager - if flags&linux.MLOCK_ONFAULT != 0 { - mode = memmap.MLockLazy + // MS_SYNC "requests an update and waits for it to complete." + if sync { + err := t.MemoryManager().Sync(t, addr, uint64(la)) + // Sync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) } - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) -} - -// Munlock implements linux syscall munlock(2). -func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) -} - -// Mlockall implements linux syscall mlockall(2). -func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - flags := args[0].Int() - - if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { - return 0, nil, syserror.EINVAL + // MS_ASYNC "specifies that an update be scheduled, but the call returns + // immediately". As long as dirty pages are tracked and eventually written + // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC + // is in fact a no-op, since the kernel properly tracks dirty pages and + // flushes them to storage as necessary.") + // + // However: "ENOMEM: The indicated memory (or part of it) was not mapped." + // This applies even for MS_ASYNC. + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM } - - mode := memmap.MLockEager - if flags&linux.MCL_ONFAULT != 0 { - mode = memmap.MLockLazy + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM } - return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ - Current: flags&linux.MCL_CURRENT != 0, - Future: flags&linux.MCL_FUTURE != 0, - Mode: mode, - }) -} - -// Munlockall implements linux syscall munlockall(2). -func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ - Current: true, - Future: true, - Mode: memmap.MLockNone, - }) + return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index b0b216045..2f16e1791 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,7 +90,6 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, - limits.MemoryLocked: {}, limits.Stack: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index e3e716bf9..8ecda6d0e 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -29,7 +29,7 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_DATA": limits.Data, "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_LOCKS": limits.Locks, - "RLIMIT_MEMLOCK": limits.MemoryLocked, + "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, "RLIMIT_NICE": limits.Nice, "RLIMIT_NOFILE": limits.NumberOfFiles, @@ -55,7 +55,7 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) - ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536}) ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index f13e32daa..c0b8246b5 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1019,21 +1019,6 @@ cc_binary( ], ) -cc_binary( - name = "mlock_test", - testonly = 1, - srcs = ["mlock.cc"], - linkstatic = 1, - deps = [ - "//test/util:capability_util", - "//test/util:cleanup", - "//test/util:memory_util", - "//test/util:multiprocess_util", - "//test/util:test_util", - "@com_google_googletest//:gtest", - ], -) - cc_binary( name = "mmap_test", testonly = 1, diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc deleted file mode 100644 index a0d876c2e..000000000 --- a/test/syscalls/linux/mlock.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include - -#include "test/util/capability_util.h" -#include "test/util/cleanup.h" -#include "test/util/memory_util.h" -#include "test/util/multiprocess_util.h" -#include "test/util/test_util.h" - -using ::testing::_; - -namespace gvisor { -namespace testing { - -namespace { - -PosixErrorOr CanMlock() { - struct rlimit rlim; - if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { - return PosixError(errno, "getrlimit(RLIMIT_MEMLOCK)"); - } - if (rlim.rlim_cur != 0) { - return true; - } - return HaveCapability(CAP_IPC_LOCK); -} - -// Returns true if the page containing addr is mlocked. -bool IsPageMlocked(uintptr_t addr) { - // This relies on msync(MS_INVALIDATE) interacting correctly with mlocked - // pages, which is tested for by the MsyncInvalidate case below. - int const rv = msync(reinterpret_cast(addr & ~(kPageSize - 1)), - kPageSize, MS_ASYNC | MS_INVALIDATE); - if (rv == 0) { - return false; - } - // This uses TEST_PCHECK_MSG since it's used in subprocesses. - TEST_PCHECK_MSG(errno == EBUSY, "msync failed with unexpected errno"); - return true; -} - -PosixErrorOr ScopedSetSoftRlimit(int resource, rlim_t newval) { - struct rlimit old_rlim; - if (getrlimit(resource, &old_rlim) != 0) { - return PosixError(errno, "getrlimit failed"); - } - struct rlimit new_rlim = old_rlim; - new_rlim.rlim_cur = newval; - if (setrlimit(resource, &new_rlim) != 0) { - return PosixError(errno, "setrlimit failed"); - } - return Cleanup([resource, old_rlim] { - TEST_PCHECK(setrlimit(resource, &old_rlim) == 0); - }); -} - -TEST(MlockTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(MlockTest, ProtNone) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = - ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(ENOMEM)); - // ENOMEM is returned because mlock can't populate the page, but it's still - // considered locked. - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(MlockTest, MadviseDontneed) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_THAT(madvise(mapping.ptr(), mapping.len(), MADV_DONTNEED), - SyscallFailsWithErrno(EINVAL)); -} - -TEST(MlockTest, MsyncInvalidate) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_ASYNC | MS_INVALIDATE), - SyscallFailsWithErrno(EBUSY)); - EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_SYNC | MS_INVALIDATE), - SyscallFailsWithErrno(EBUSY)); -} - -TEST(MlockTest, Fork) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - EXPECT_THAT( - InForkedProcess([&] { TEST_CHECK(!IsPageMlocked(mapping.addr())); }), - IsPosixErrorOkAndHolds(0)); -} - -TEST(MlockTest, RlimitMemlockZero) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(EPERM)); -} - -TEST(MlockTest, RlimitMemlockInsufficient) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(ENOMEM)); -} - -TEST(MunlockTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -TEST(MunlockTest, NotLocked) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -// There is currently no test for mlockall(MCL_CURRENT) because the default -// RLIMIT_MEMLOCK of 64 KB is insufficient to actually invoke -// mlockall(MCL_CURRENT). - -TEST(MlockallTest, Future) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - - // Run this test in a separate (single-threaded) subprocess to ensure that a - // background thread doesn't try to mmap a large amount of memory, fail due - // to hitting RLIMIT_MEMLOCK, and explode the process violently. - EXPECT_THAT(InForkedProcess([] { - auto const mapping = - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE) - .ValueOrDie(); - TEST_CHECK(!IsPageMlocked(mapping.addr())); - TEST_PCHECK(mlockall(MCL_FUTURE) == 0); - // Ensure that mlockall(MCL_FUTURE) is turned off before the end - // of the test, as otherwise mmaps may fail unexpectedly. - Cleanup do_munlockall([] { TEST_PCHECK(munlockall() == 0); }); - auto const mapping2 = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - TEST_CHECK(IsPageMlocked(mapping2.addr())); - // Fire munlockall() and check that it disables - // mlockall(MCL_FUTURE). - do_munlockall.Release()(); - auto const mapping3 = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - TEST_CHECK(!IsPageMlocked(mapping2.addr())); - }), - IsPosixErrorOkAndHolds(0)); -} - -TEST(MunlockallTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(munlockall(), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -#ifndef SYS_mlock2 -#ifdef __x86_64__ -#define SYS_mlock2 325 -#endif -#endif - -#ifndef MLOCK_ONFAULT -#define MLOCK_ONFAULT 0x01 // Linux: include/uapi/asm-generic/mman-common.h -#endif - -#ifdef SYS_mlock2 - -int mlock2(void const* addr, size_t len, int flags) { - return syscall(SYS_mlock2, addr, len, flags); -} - -TEST(Mlock2Test, NoFlags) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), 0), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(Mlock2Test, MlockOnfault) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), MLOCK_ONFAULT), - SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(Mlock2Test, UnknownFlags) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_THAT(mlock2(mapping.ptr(), mapping.len(), ~0), - SyscallFailsWithErrno(EINVAL)); -} - -#endif // defined(SYS_mlock2) - -TEST(MapLockedTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -TEST(MapLockedTest, RlimitMemlockZero) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - EXPECT_THAT( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), - PosixErrorIs(EPERM, _)); -} - -TEST(MapLockedTest, RlimitMemlockInsufficient) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); - EXPECT_THAT( - MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), - PosixErrorIs(EAGAIN, _)); -} - -TEST(MremapLockedTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - if (addr == MAP_FAILED) { - FAIL() << "mremap failed: " << errno << " (" << strerror(errno) << ")"; - } - mapping.release(); - mapping.reset(addr, 2 * mapping.len()); - EXPECT_TRUE(IsPageMlocked(reinterpret_cast(addr))); -} - -TEST(MremapLockedTest, RlimitMemlockZero) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) - << "addr = " << addr << ", errno = " << errno; -} - -TEST(MremapLockedTest, RlimitMemlockInsufficient) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = ASSERT_NO_ERRNO_AND_VALUE( - ScopedSetSoftRlimit(RLIMIT_MEMLOCK, mapping.len())); - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) - << "addr = " << addr << ", errno = " << errno; -} - -} // namespace - -} // namespace testing -} // namespace gvisor diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc index 72d90dc78..0ddc621aa 100644 --- a/test/syscalls/linux/msync.cc +++ b/test/syscalls/linux/msync.cc @@ -43,13 +43,14 @@ class MsyncParameterizedTest : public ::testing::TestWithParam { protected: int msync_flags() const { return std::get<0>(GetParam()); } - PosixErrorOr GetMapping() const { return std::get<1>(GetParam())(); } + PosixErrorOr GetMapping() const { + auto rv = std::get<1>(GetParam())(); + return rv; + } }; -// All valid msync(2) flag combinations, not including MS_INVALIDATE. ("Linux -// permits a call to msync() that specifies neither [MS_SYNC or MS_ASYNC], with -// semantics that are (currently) equivalent to specifying MS_ASYNC." - -// msync(2)) +// All valid msync(2) flag combinations (not including MS_INVALIDATE, which +// gVisor doesn't implement). constexpr std::initializer_list kMsyncFlags = {MS_SYNC, MS_ASYNC, 0}; // Returns functions that return mappings that should be successfully @@ -133,15 +134,6 @@ TEST_P(MsyncFullParamTest, UnalignedAddressFails) { SyscallFailsWithErrno(EINVAL)); } -TEST_P(MsyncFullParamTest, InvalidateUnlockedSucceeds) { - auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); - EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags() | MS_INVALIDATE), - SyscallSucceeds()); -} - -// The test for MS_INVALIDATE on mlocked pages is in mlock.cc since it requires -// probing for mlock support. - INSTANTIATE_TEST_CASE_P( All, MsyncFullParamTest, ::testing::Combine(::testing::ValuesIn(kMsyncFlags),