gvisor/pkg/sentry/mm/syscalls.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	mrand "math/rand"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// HandleUserFault handles an application page fault. sp is the faulting
// application thread's stack pointer.
//
// Preconditions: mm.as != nil.
func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error {
ar, ok := addr.RoundDown().ToRange(usermem.PageSize)
if !ok {
return syserror.EFAULT
}
// Don't bother trying existingPMAsLocked; in most cases, if we did have
// existing pmas, we wouldn't have faulted.
// Ensure that we have a usable vma. Here and below, since we are only
// asking for a single page, there is no possibility of partial success,
// and any error is immediately fatal.
mm.mappingMu.RLock()
vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
if err != nil {
mm.mappingMu.RUnlock()
return err
}
// Ensure that we have a usable pma.
mm.activeMu.Lock()
pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at)
mm.mappingMu.RUnlock()
if err != nil {
mm.activeMu.Unlock()
return err
}
// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
// anymore.
mm.activeMu.DowngradeLock()
// Map the faulted page into the active AddressSpace.
err = mm.mapASLocked(pseg, ar, false)
mm.activeMu.RUnlock()
return err
}
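
// Illustrative usage sketch (not part of the original file): a platform-level
// fault handler would typically forward an application page fault to
// HandleUserFault and deliver a signal on failure. ctx, faultAddr, accessType,
// and sp are assumed to be supplied by the caller.
//
//	if err := mm.HandleUserFault(ctx, faultAddr, accessType, sp); err != nil {
//		// Translate err (e.g. EFAULT) into SIGSEGV/SIGBUS for the faulting task.
//	}
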
// MMap establishes a memory mapping.
func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) {
if opts.Length == 0 {
return 0, syserror.EINVAL
}
length, ok := usermem.Addr(opts.Length).RoundUp()
if !ok {
return 0, syserror.ENOMEM
}
opts.Length = uint64(length)
if opts.Mappable != nil {
// Offset must be aligned.
if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) {
return 0, syserror.EINVAL
}
// Offset + length must not overflow.
if end := opts.Offset + opts.Length; end < opts.Offset {
return 0, syserror.ENOMEM
}
} else {
opts.Offset = 0
if !opts.Private {
if opts.MappingIdentity != nil {
return 0, syserror.EINVAL
}
m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
if err != nil {
return 0, err
}
defer m.DecRef()
opts.MappingIdentity = m
opts.Mappable = m
}
}
if opts.Addr.RoundDown() != opts.Addr {
// MAP_FIXED requires addr to be page-aligned; non-fixed mappings
// don't.
if opts.Fixed {
return 0, syserror.EINVAL
}
opts.Addr = opts.Addr.RoundDown()
}
if !opts.MaxPerms.SupersetOf(opts.Perms) {
return 0, syserror.EACCES
}
if opts.Unmap && !opts.Fixed {
return 0, syserror.EINVAL
}
if opts.GrowsDown && opts.Mappable != nil {
return 0, syserror.EINVAL
}
// Get the new vma.
mm.mappingMu.Lock()
if opts.MLockMode < mm.defMLockMode {
opts.MLockMode = mm.defMLockMode
}
vseg, ar, err := mm.createVMALocked(ctx, opts)
if err != nil {
mm.mappingMu.Unlock()
return 0, err
}
// TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new
// vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears
// to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in
// mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() =>
// populate_vma_page_range(). Confirm this behavior.
switch {
case opts.Precommit || opts.MLockMode == memmap.MLockEager:
// Get pmas and map with precommit as requested.
mm.populateVMAAndUnlock(ctx, vseg, ar, true)
case opts.Mappable == nil && length <= privateAllocUnit:
// NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope
// that doing so will save on future page faults. We only do this for
// anonymous mappings, since otherwise the cost of
// memmap.Mappable.Translate is unknown; and only for small mappings,
// to avoid needing to allocate large amounts of memory that we may
// subsequently need to checkpoint.
mm.populateVMAAndUnlock(ctx, vseg, ar, false)
default:
mm.mappingMu.Unlock()
}
return ar.Start, nil
}
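
// Illustrative usage sketch (hypothetical caller, not part of the original
// file): an mmap(2) front end creating a private anonymous read/write mapping
// would build memmap.MMapOpts roughly as follows; MMap itself rounds Length
// up to a page boundary.
//
//	addr, err := mm.MMap(ctx, memmap.MMapOpts{
//		Length:   length,            // MAP_ANONYMOUS|MAP_PRIVATE mapping size
//		Private:  true,              // MAP_PRIVATE
//		Perms:    usermem.ReadWrite, // PROT_READ|PROT_WRITE
//		MaxPerms: usermem.AnyAccess,
//	})
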
// populateVMA obtains pmas for addresses in ar in the given vma, and maps them
// into mm.as if it is active.
//
// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar).
func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
if !vseg.ValuePtr().effectivePerms.Any() {
// Linux doesn't populate inaccessible pages. See
// mm/gup.c:populate_vma_page_range.
return
}
mm.activeMu.Lock()
// Can't defer mm.activeMu.Unlock(); see below.
// Even if we get new pmas, we can't actually map them if we don't have an
// AddressSpace.
if mm.as == nil {
mm.activeMu.Unlock()
return
}
// Ensure that we have usable pmas.
pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess)
if err != nil {
// mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
// mm/gup.c:mm_populate(). If it matters, we'll get it again when
// userspace actually tries to use the failing page.
mm.activeMu.Unlock()
return
}
// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
// anymore.
mm.activeMu.DowngradeLock()
// As above, errors are silently ignored.
mm.mapASLocked(pseg, ar, precommit)
mm.activeMu.RUnlock()
}
// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally
// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is
// preferable to populateVMA since it unlocks mm.mappingMu before performing
// expensive operations that don't require it to be locked.
//
// Preconditions: mm.mappingMu must be locked for writing.
// vseg.Range().IsSupersetOf(ar).
//
// Postconditions: mm.mappingMu will be unlocked.
func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
// See populateVMA above for commentary.
if !vseg.ValuePtr().effectivePerms.Any() {
mm.mappingMu.Unlock()
return
}
mm.activeMu.Lock()
if mm.as == nil {
mm.activeMu.Unlock()
mm.mappingMu.Unlock()
return
}
// mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
// isn't needed at all for mapASLocked.
mm.mappingMu.DowngradeLock()
pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess)
mm.mappingMu.RUnlock()
if err != nil {
mm.activeMu.Unlock()
return
}
mm.activeMu.DowngradeLock()
mm.mapASLocked(pseg, ar, precommit)
mm.activeMu.RUnlock()
}
// MapStack allocates the initial process stack.
func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
// maxStackSize is the maximum supported process stack size in bytes.
//
// This limit exists because stack growing isn't implemented, so the entire
// process stack must be mapped up-front.
const maxStackSize = 128 << 20
stackSize := limits.FromContext(ctx).Get(limits.Stack)
r, ok := usermem.Addr(stackSize.Cur).RoundUp()
sz := uint64(r)
if !ok {
// RLIM_INFINITY rounds up to 0.
sz = linux.DefaultStackSoftLimit
} else if sz > maxStackSize {
ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
sz = maxStackSize
} else if sz == 0 {
return usermem.AddrRange{}, syserror.ENOMEM
}
szaddr := usermem.Addr(sz)
ctx.Debugf("Allocating stack with size of %v bytes", sz)
// Determine the stack's desired location. Unlike Linux, address
// randomization can't be disabled.
stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
if stackEnd < szaddr {
return usermem.AddrRange{}, syserror.ENOMEM
}
stackStart := stackEnd - szaddr
mm.mappingMu.Lock()
defer mm.mappingMu.Unlock()
_, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
Length: sz,
Addr: stackStart,
Perms: usermem.ReadWrite,
MaxPerms: usermem.AnyAccess,
Private: true,
GrowsDown: true,
MLockMode: mm.defMLockMode,
Hint: "[stack]",
})
return ar, err
}
// MUnmap implements the semantics of Linux's munmap(2).
func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error {
if addr != addr.RoundDown() {
return syserror.EINVAL
}
if length == 0 {
return syserror.EINVAL
}
la, ok := usermem.Addr(length).RoundUp()
if !ok {
return syserror.EINVAL
}
ar, ok := addr.ToRange(uint64(la))
if !ok {
return syserror.EINVAL
}
mm.mappingMu.Lock()
defer mm.mappingMu.Unlock()
mm.unmapLocked(ctx, ar)
return nil
}
// MRemapOpts specifies options to MRemap.
type MRemapOpts struct {
// Move controls whether MRemap moves the remapped mapping to a new address.
Move MRemapMoveMode
// NewAddr is the new address for the remapping. NewAddr is ignored unless
// Move is MRemapMustMove.
NewAddr usermem.Addr
}
// MRemapMoveMode controls MRemap's moving behavior.
type MRemapMoveMode int
const (
// MRemapNoMove prevents MRemap from moving the remapped mapping.
MRemapNoMove MRemapMoveMode = iota
// MRemapMayMove allows MRemap to move the remapped mapping.
MRemapMayMove
// MRemapMustMove requires MRemap to move the remapped mapping to
// MRemapOpts.NewAddr, replacing any existing mappings in the remapped
// range.
MRemapMustMove
)
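
// Illustrative sketch (assumed flag decoding, not part of the original file):
// an mremap(2) front end would typically derive MRemapOpts from the syscall
// flags, assuming the MREMAP_* constants from the abi/linux package and
// caller-supplied flags and newAddr values.
//
//	opts := MRemapOpts{Move: MRemapNoMove}
//	if flags&linux.MREMAP_MAYMOVE != 0 {
//		opts.Move = MRemapMayMove
//	}
//	if flags&linux.MREMAP_FIXED != 0 {
//		opts.Move = MRemapMustMove
//		opts.NewAddr = newAddr
//	}
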
// MRemap implements the semantics of Linux's mremap(2).
func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) {
// "Note that old_address has to be page aligned." - mremap(2)
if oldAddr.RoundDown() != oldAddr {
return 0, syserror.EINVAL
}
// Linux treats an old_size that rounds up to 0 as 0, which is otherwise a
// valid size. However, new_size can't be 0 after rounding.
oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp()
oldSize = uint64(oldSizeAddr)
newSizeAddr, ok := usermem.Addr(newSize).RoundUp()
if !ok || newSizeAddr == 0 {
return 0, syserror.EINVAL
}
newSize = uint64(newSizeAddr)
oldEnd, ok := oldAddr.AddLength(oldSize)
if !ok {
return 0, syserror.EINVAL
}
mm.mappingMu.Lock()
defer mm.mappingMu.Unlock()
// All cases require that a vma exists at oldAddr.
vseg := mm.vmas.FindSegment(oldAddr)
if !vseg.Ok() {
return 0, syserror.EFAULT
}
// Behavior matrix:
//
// Move | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize
// ---------+-------------+-------------------+-------------------+------------------
// NoMove | ENOMEM [1] | Grow in-place | No-op | Shrink in-place
// MayMove | Copy [1] | Grow in-place or | No-op | Shrink in-place
// | | move | |
// MustMove | Copy | Move and grow | Move | Shrink and move
//
// [1] In-place growth is impossible because the vma at oldAddr already
// occupies at least part of the destination. Thus the NoMove case always
// fails and the MayMove case always falls back to copying.
if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone {
// Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall,
// mremap in Linux does not check mm/mlock.c:can_do_mlock() and
// therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and
// !CAP_IPC_LOCK.
mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit {
return 0, syserror.EAGAIN
}
}
}
if opts.Move != MRemapMustMove {
// Handle no-ops and in-place shrinking. These cases don't care if
// [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
// (aside from oldAddr).
if newSize <= oldSize {
if newSize < oldSize {
// If oldAddr+oldSize didn't overflow, oldAddr+newSize can't
// either.
newEnd := oldAddr + usermem.Addr(newSize)
mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd})
}
return oldAddr, nil
}
// Handle in-place growing.
// Check that oldEnd maps to the same vma as oldAddr.
if vseg.End() < oldEnd {
return 0, syserror.EFAULT
}
// "Grow" the existing vma by creating a new mergeable one.
vma := vseg.ValuePtr()
var newOffset uint64
if vma.mappable != nil {
newOffset = vseg.mappableRange().End
}
vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
Length: newSize - oldSize,
MappingIdentity: vma.id,
Mappable: vma.mappable,
Offset: newOffset,
Addr: oldEnd,
Fixed: true,
Perms: vma.realPerms,
MaxPerms: vma.maxPerms,
Private: vma.private,
GrowsDown: vma.growsDown,
MLockMode: vma.mlockMode,
Hint: vma.hint,
})
if err == nil {
if vma.mlockMode == memmap.MLockEager {
mm.populateVMA(ctx, vseg, ar, true)
}
return oldAddr, nil
}
// In-place growth failed. In the MRemapMayMove case, fall through to
// copying/moving below.
if opts.Move == MRemapNoMove {
return 0, err
}
}
// Find a location for the new mapping.
var newAR usermem.AddrRange
switch opts.Move {
case MRemapMayMove:
newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{})
if err != nil {
return 0, err
}
newAR, _ = newAddr.ToRange(newSize)
case MRemapMustMove:
newAddr := opts.NewAddr
if newAddr.RoundDown() != newAddr {
return 0, syserror.EINVAL
}
var ok bool
newAR, ok = newAddr.ToRange(newSize)
if !ok {
return 0, syserror.EINVAL
}
if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) {
return 0, syserror.EINVAL
}
// Unmap any mappings at the destination.
mm.unmapLocked(ctx, newAR)
// If the sizes specify shrinking, unmap everything between the new and
// old sizes at the source. Unmapping before the following checks is
// correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(),
// vma_to_resize().
if newSize < oldSize {
oldNewEnd := oldAddr + usermem.Addr(newSize)
mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd})
oldEnd = oldNewEnd
}
// unmapLocked may have invalidated vseg; look it up again.
vseg = mm.vmas.FindSegment(oldAddr)
}
oldAR := usermem.AddrRange{oldAddr, oldEnd}
// Check that oldEnd maps to the same vma as oldAddr.
if vseg.End() < oldEnd {
return 0, syserror.EFAULT
}
// Check against RLIMIT_AS.
newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
return 0, syserror.ENOMEM
}
if vma := vseg.ValuePtr(); vma.mappable != nil {
// Check that offset+length does not overflow.
if vma.off+uint64(newAR.Length()) < vma.off {
return 0, syserror.EINVAL
}
// Inform the Mappable, if any, of the new mapping.
if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil {
return 0, err
}
}
if oldSize == 0 {
// Handle copying.
//
// We can't use createVMALocked because it calls Mappable.AddMapping,
// whereas we've already called Mappable.CopyMapping (which is
// consistent with Linux). Call vseg.Value() (rather than
// vseg.ValuePtr()) to make a copy of the vma.
vma := vseg.Value()
if vma.mappable != nil {
vma.off = vseg.mappableOffsetAt(oldAR.Start)
}
if vma.id != nil {
vma.id.IncRef()
}
vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
mm.usageAS += uint64(newAR.Length())
if vma.mlockMode != memmap.MLockNone {
mm.lockedAS += uint64(newAR.Length())
if vma.mlockMode == memmap.MLockEager {
mm.populateVMA(ctx, vseg, newAR, true)
}
}
return newAR.Start, nil
}
// Handle moving.
//
// Remove the existing vma before inserting the new one to minimize
// iterator invalidation. We do this directly (instead of calling
// removeVMAsLocked) because:
//
// 1. We can't drop the reference on vma.id, which will be transferred to
// the new vma.
//
// 2. We can't call vma.mappable.RemoveMapping, because pmas are still at
// oldAR, so calling RemoveMapping could cause us to miss an invalidation
// overlapping oldAR.
//
// Call vseg.Value() (rather than vseg.ValuePtr()) to make a copy of the
// vma.
vseg = mm.vmas.Isolate(vseg, oldAR)
vma := vseg.Value()
mm.vmas.Remove(vseg)
vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
if vma.mlockMode != memmap.MLockNone {
mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length())
}
// Move pmas. This is technically optional for non-private pmas, which
// could just go through memmap.Mappable.Translate again, but it's required
// for private pmas.
mm.activeMu.Lock()
mm.movePMAsLocked(oldAR, newAR)
mm.activeMu.Unlock()
// Now that pmas have been moved to newAR, we can notify vma.mappable that
// oldAR is no longer mapped.
if vma.mappable != nil {
vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked())
}
if vma.mlockMode == memmap.MLockEager {
mm.populateVMA(ctx, vseg, newAR, true)
}
return newAR.Start, nil
}
// MProtect implements the semantics of Linux's mprotect(2).
func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error {
if addr.RoundDown() != addr {
return syserror.EINVAL
}
if length == 0 {
return nil
}
rlength, ok := usermem.Addr(length).RoundUp()
if !ok {
return syserror.ENOMEM
}
ar, ok := addr.ToRange(uint64(rlength))
if !ok {
return syserror.ENOMEM
}
effectivePerms := realPerms.Effective()
mm.mappingMu.Lock()
defer mm.mappingMu.Unlock()
// Non-growsDown mprotect requires that all of ar is mapped, and stops at
// the first non-empty gap. growsDown mprotect requires that the first vma
// be growsDown, but does not require it to extend all the way to ar.Start;
// vmas after the first must be contiguous but need not be growsDown, like
// the non-growsDown case.
vseg := mm.vmas.LowerBoundSegment(ar.Start)
if !vseg.Ok() {
return syserror.ENOMEM
}
if growsDown {
if !vseg.ValuePtr().growsDown {
return syserror.EINVAL
}
if ar.End <= vseg.Start() {
return syserror.ENOMEM
}
ar.Start = vseg.Start()
} else {
if ar.Start < vseg.Start() {
return syserror.ENOMEM
}
}
mm.activeMu.Lock()
defer mm.activeMu.Unlock()
defer func() {
mm.vmas.MergeRange(ar)
mm.vmas.MergeAdjacent(ar)
mm.pmas.MergeRange(ar)
mm.pmas.MergeAdjacent(ar)
}()
pseg := mm.pmas.LowerBoundSegment(ar.Start)
var didUnmapAS bool
for {
// Check for permission validity before splitting vmas, for consistency
// with Linux.
if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) {
return syserror.EACCES
}
vseg = mm.vmas.Isolate(vseg, ar)
// Update vma permissions.
vma := vseg.ValuePtr()
vma.realPerms = realPerms
vma.effectivePerms = effectivePerms
// Propagate vma permission changes to pmas.
for pseg.Ok() && pseg.Start() < vseg.End() {
if pseg.Range().Overlaps(vseg.Range()) {
pseg = mm.pmas.Isolate(pseg, vseg.Range())
pma := pseg.ValuePtr()
if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS {
// Unmap all of ar, not just vseg.Range(), to minimize host
// syscalls.
mm.unmapASLocked(ar)
didUnmapAS = true
}
pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms)
if pma.needCOW {
pma.effectivePerms.Write = false
}
}
pseg = pseg.NextSegment()
}
// Continue to the next vma.
if ar.End <= vseg.End() {
return nil
}
vseg, _ = vseg.NextNonEmpty()
if !vseg.Ok() {
return syserror.ENOMEM
}
}
}
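
// Illustrative sketch (assumed flag decoding, not part of the original file):
// an mprotect(2) front end would decode PROT_* bits into a usermem.AccessType
// and pass PROT_GROWSDOWN through as growsDown, assuming the PROT_* constants
// from the abi/linux package and a caller-supplied prot value.
//
//	return mm.MProtect(addr, length, usermem.AccessType{
//		Read:    prot&linux.PROT_READ != 0,
//		Write:   prot&linux.PROT_WRITE != 0,
//		Execute: prot&linux.PROT_EXEC != 0,
//	}, prot&linux.PROT_GROWSDOWN != 0)
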
// BrkSetup sets mm's brk address to addr and its brk size to 0.
func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) {
mm.mappingMu.Lock()
defer mm.mappingMu.Unlock()
// Unmap the existing brk.
if mm.brk.Length() != 0 {
mm.unmapLocked(ctx, mm.brk)
}
mm.brk = usermem.AddrRange{addr, addr}
}
// Brk implements the semantics of Linux's brk(2), except that it returns an
// error on failure.
func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) {
mm.mappingMu.Lock()
// Can't defer mm.mappingMu.Unlock(); see below.
if addr < mm.brk.Start {
addr = mm.brk.End
mm.mappingMu.Unlock()
return addr, syserror.EINVAL
}
// TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is
// slightly more permissive than the usual data limit. In particular,
// this only limits the size of the heap; a true RLIMIT_DATA limits the
// size of heap + data + bss. The segment sizes need to be plumbed from
// the loader package to fully enforce RLIMIT_DATA.
if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
addr = mm.brk.End
mm.mappingMu.Unlock()
return addr, syserror.ENOMEM
}
oldbrkpg, _ := mm.brk.End.RoundUp()
newbrkpg, ok := addr.RoundUp()
if !ok {
addr = mm.brk.End
mm.mappingMu.Unlock()
return addr, syserror.EFAULT
}
switch {
case oldbrkpg < newbrkpg:
vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
Length: uint64(newbrkpg - oldbrkpg),
Addr: oldbrkpg,
Fixed: true,
// Compare Linux's
// arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS.
Perms: usermem.ReadWrite,
MaxPerms: usermem.AnyAccess,
Private: true,
// Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes
// mm->def_flags.
MLockMode: mm.defMLockMode,
Hint: "[heap]",
})
if err != nil {
addr = mm.brk.End
mm.mappingMu.Unlock()
return addr, err
}
mm.brk.End = addr
if mm.defMLockMode == memmap.MLockEager {
mm.populateVMAAndUnlock(ctx, vseg, ar, true)
} else {
mm.mappingMu.Unlock()
}
case newbrkpg < oldbrkpg:
mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg})
fallthrough
default:
mm.brk.End = addr
mm.mappingMu.Unlock()
}
return addr, nil
}
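
// Illustrative sketch (hypothetical caller, not part of the original file):
// the raw brk(2) syscall reports failure by returning the unchanged break
// rather than an error, so a syscall front end can discard the error and
// return whatever break Brk reports.
//
//	addr, _ := mm.Brk(ctx, usermem.Addr(newBrk))
//	return uintptr(addr), nil
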
// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(),
// depending on mode.
func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error {
// Linux allows this to overflow.
la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp()
ar, ok := addr.RoundDown().ToRange(uint64(la))
if !ok {
return syserror.EINVAL
}
mm.mappingMu.Lock()
// Can't defer mm.mappingMu.Unlock(); see below.
if mode != memmap.MLockNone {
// Check against RLIMIT_MEMLOCK.
if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
if mlockLimit == 0 {
mm.mappingMu.Unlock()
return syserror.EPERM
}
if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit {
mm.mappingMu.Unlock()
return syserror.ENOMEM
}
}
}
// Check this after RLIMIT_MEMLOCK for consistency with Linux.
if ar.Length() == 0 {
mm.mappingMu.Unlock()
return nil
}
// Apply the new mlock mode to vmas.
var unmapped bool
vseg := mm.vmas.FindSegment(ar.Start)
for {
if !vseg.Ok() {
unmapped = true
break
}
vseg = mm.vmas.Isolate(vseg, ar)
vma := vseg.ValuePtr()
prevMode := vma.mlockMode
vma.mlockMode = mode
if mode != memmap.MLockNone && prevMode == memmap.MLockNone {
mm.lockedAS += uint64(vseg.Range().Length())
} else if mode == memmap.MLockNone && prevMode != memmap.MLockNone {
mm.lockedAS -= uint64(vseg.Range().Length())
}
if ar.End <= vseg.End() {
break
}
vseg, _ = vseg.NextNonEmpty()
}
mm.vmas.MergeRange(ar)
mm.vmas.MergeAdjacent(ar)
if unmapped {
mm.mappingMu.Unlock()
return syserror.ENOMEM
}
if mode == memmap.MLockEager {
// Ensure that we have usable pmas. Since we didn't return ENOMEM
// above, ar must be fully covered by vmas, so we can just use
// NextSegment below.
mm.activeMu.Lock()
mm.mappingMu.DowngradeLock()
for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
if !vseg.ValuePtr().effectivePerms.Any() {
// Linux: mm/gup.c:__get_user_pages() returns EFAULT in this
// case, which is converted to ENOMEM by mlock.
mm.activeMu.Unlock()
mm.mappingMu.RUnlock()
return syserror.ENOMEM
}
_, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), usermem.NoAccess)
if err != nil {
mm.activeMu.Unlock()
mm.mappingMu.RUnlock()
// Linux: mm/mlock.c:__mlock_posix_error_return()
if err == syserror.EFAULT {
return syserror.ENOMEM
}
if err == syserror.ENOMEM {
return syserror.EAGAIN
}
return err
}
}
// Map pmas into the active AddressSpace, if we have one.
mm.mappingMu.RUnlock()
if mm.as != nil {
mm.activeMu.DowngradeLock()
err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */)
mm.activeMu.RUnlock()
if err != nil {
return err
}
} else {
mm.activeMu.Unlock()
}
} else {
mm.mappingMu.Unlock()
}
return nil
}
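
// Illustrative sketch (assumed flag decoding, not part of the original file):
// mlock(2) and mlock2(2) front ends would typically choose the
// memmap.MLockMode like this, assuming MLOCK_ONFAULT from the abi/linux
// package and a caller-supplied flags value (always 0 for plain mlock).
//
//	mode := memmap.MLockEager
//	if flags&linux.MLOCK_ONFAULT != 0 {
//		mode = memmap.MLockLazy
//	}
//	return mm.MLock(ctx, addr, length, mode)
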
// MLockAllOpts holds options to MLockAll.
type MLockAllOpts struct {
// If Current is true, change the memory-locking behavior of all mappings
// to Mode. If Future is true, upgrade the memory-locking behavior of all
// future mappings to Mode. At least one of Current or Future must be true.
Current bool
Future bool
Mode memmap.MLockMode
}
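
// Illustrative sketch (assumed flag decoding, not part of the original file):
// an mlockall(2) front end would typically build MLockAllOpts from the MCL_*
// constants in the abi/linux package.
//
//	opts := MLockAllOpts{
//		Current: flags&linux.MCL_CURRENT != 0,
//		Future:  flags&linux.MCL_FUTURE != 0,
//		Mode:    memmap.MLockEager,
//	}
//	if flags&linux.MCL_ONFAULT != 0 {
//		opts.Mode = memmap.MLockLazy
//	}
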
// MLockAll implements the semantics of Linux's mlockall()/munlockall(),
// depending on opts.
func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error {
if !opts.Current && !opts.Future {
return syserror.EINVAL
}
mm.mappingMu.Lock()
// Can't defer mm.mappingMu.Unlock(); see below.
if opts.Current {
if opts.Mode != memmap.MLockNone {
// Check against RLIMIT_MEMLOCK.
if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
if mlockLimit == 0 {
mm.mappingMu.Unlock()
return syserror.EPERM
}
if uint64(mm.vmas.Span()) > mlockLimit {
mm.mappingMu.Unlock()
return syserror.ENOMEM
}
}
}
for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
vma := vseg.ValuePtr()
prevMode := vma.mlockMode
vma.mlockMode = opts.Mode
if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone {
mm.lockedAS += uint64(vseg.Range().Length())
} else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone {
mm.lockedAS -= uint64(vseg.Range().Length())
}
}
}
if opts.Future {
mm.defMLockMode = opts.Mode
}
if opts.Current && opts.Mode == memmap.MLockEager {
// Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate()
// ignores the return value of __mm_populate(), so all errors below are
// ignored.
//
// Try to get usable pmas.
mm.activeMu.Lock()
mm.mappingMu.DowngradeLock()
for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
if vseg.ValuePtr().effectivePerms.Any() {
mm.getPMAsLocked(ctx, vseg, vseg.Range(), usermem.NoAccess)
}
}
// Map all pmas into the active AddressSpace, if we have one.
mm.mappingMu.RUnlock()
if mm.as != nil {
mm.activeMu.DowngradeLock()
mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */)
mm.activeMu.RUnlock()
} else {
mm.activeMu.Unlock()
}
} else {
mm.mappingMu.Unlock()
}
return nil
}
// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
ar, ok := addr.ToRange(length)
if !ok {
return syserror.EINVAL
}
mm.mappingMu.RLock()
defer mm.mappingMu.RUnlock()
mm.activeMu.Lock()
defer mm.activeMu.Unlock()
// Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range()
// is analogous to our mm.invalidateLocked(ar, true, true). We inline this
// here, with the special case that we synchronously decommit
// uniquely-owned (non-copy-on-write) pages for private anonymous vmas,
// which is the common case for MADV_DONTNEED. Invalidating these pmas, and
// allowing them to be reallocated when touched again, increases pma
// fragmentation, which may significantly reduce performance for
// non-vectored I/O implementations. Also, decommitting synchronously
// ensures that Decommit immediately reduces host memory usage.
var didUnmapAS bool
pseg := mm.pmas.LowerBoundSegment(ar.Start)
mf := mm.mfp.MemoryFile()
for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
vma := vseg.ValuePtr()
if vma.mlockMode != memmap.MLockNone {
return syserror.EINVAL
}
vsegAR := vseg.Range().Intersect(ar)
// pseg should already correspond to either this vma or a later one,
// since there can't be a pma without a corresponding vma.
if checkInvariants {
if pseg.Ok() && pseg.End() <= vsegAR.Start {
panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR))
}
}
for pseg.Ok() && pseg.Start() < vsegAR.End {
pma := pseg.ValuePtr()
if pma.private && !mm.isPMACopyOnWriteLocked(vseg, pseg) {
psegAR := pseg.Range().Intersect(ar)
if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil {
if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
pseg = pseg.NextSegment()
continue
}
// If an error occurs, fall through to the general
// invalidation case below.
}
}
pseg = mm.pmas.Isolate(pseg, vsegAR)
pma = pseg.ValuePtr()
if !didUnmapAS {
// Unmap all of ar, not just pseg.Range(), to minimize host
// syscalls. AddressSpace mappings must be removed before
// mm.decPrivateRef().
mm.unmapASLocked(ar)
didUnmapAS = true
}
if pma.private {
mm.decPrivateRef(pseg.fileRange())
}
pma.file.DecRef(pseg.fileRange())
mm.removeRSSLocked(pseg.Range())
pseg = mm.pmas.Remove(pseg).NextSegment()
}
}
// "If there are some parts of the specified address space that are not
// mapped, the Linux version of madvise() ignores them and applies the call
// to the rest (but returns ENOMEM from the system call, as it should)." -
// madvise(2)
if mm.vmas.SpanRange(ar) != ar.Length() {
return syserror.ENOMEM
}
return nil
}
// MSyncOpts holds options to MSync.
type MSyncOpts struct {
// Sync has the semantics of MS_SYNC.
Sync bool
// Invalidate has the semantics of MS_INVALIDATE.
Invalidate bool
}
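
// Illustrative sketch (assumed flag decoding, not part of the original file):
// an msync(2) front end would typically derive MSyncOpts from the MS_*
// constants in the abi/linux package.
//
//	opts := MSyncOpts{
//		Sync:       flags&linux.MS_SYNC != 0,
//		Invalidate: flags&linux.MS_INVALIDATE != 0,
//	}
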
// MSync implements the semantics of Linux's msync().
func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error {
if addr != addr.RoundDown() {
return syserror.EINVAL
}
if length == 0 {
return nil
}
la, ok := usermem.Addr(length).RoundUp()
if !ok {
return syserror.ENOMEM
}
ar, ok := addr.ToRange(uint64(la))
if !ok {
return syserror.ENOMEM
}
mm.mappingMu.RLock()
// Can't defer mm.mappingMu.RUnlock(); see below.
vseg := mm.vmas.LowerBoundSegment(ar.Start)
if !vseg.Ok() {
mm.mappingMu.RUnlock()
return syserror.ENOMEM
}
var unmapped bool
lastEnd := ar.Start
for {
if !vseg.Ok() {
mm.mappingMu.RUnlock()
unmapped = true
break
}
if lastEnd < vseg.Start() {
unmapped = true
}
lastEnd = vseg.End()
vma := vseg.ValuePtr()
if opts.Invalidate && vma.mlockMode != memmap.MLockNone {
mm.mappingMu.RUnlock()
return syserror.EBUSY
}
// It's only possible to have dirtied the Mappable through a shared
// mapping. Don't check if the mapping is writable, because mprotect
// may have changed this, and also because Linux doesn't.
if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private {
// We can't call memmap.MappingIdentity.Msync while holding
// mm.mappingMu since it may take fs locks that precede it in the
// lock order.
id.IncRef()
mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
mm.mappingMu.RUnlock()
err := id.Msync(ctx, mr)
id.DecRef()
if err != nil {
return err
}
if lastEnd >= ar.End {
break
}
mm.mappingMu.RLock()
vseg = mm.vmas.LowerBoundSegment(lastEnd)
} else {
if lastEnd >= ar.End {
mm.mappingMu.RUnlock()
break
}
vseg = vseg.NextSegment()
}
}
if unmapped {
return syserror.ENOMEM
}
return nil
}
// GetSharedFutexKey is used by kernel.Task.GetSharedKey.
func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr usermem.Addr) (futex.Key, error) {
ar, ok := addr.ToRange(4) // sizeof(int32).
if !ok {
return futex.Key{}, syserror.EFAULT
}
mm.mappingMu.RLock()
defer mm.mappingMu.RUnlock()
vseg, _, err := mm.getVMAsLocked(ctx, ar, usermem.Read, false)
if err != nil {
return futex.Key{}, err
}
vma := vseg.ValuePtr()
if vma.private {
return futex.Key{
Kind: futex.KindSharedPrivate,
Offset: uint64(addr),
}, nil
}
if vma.id != nil {
vma.id.IncRef()
}
return futex.Key{
Kind: futex.KindSharedMappable,
Mappable: vma.mappable,
MappingIdentity: vma.id,
Offset: vseg.mappableOffsetAt(addr),
}, nil
}
// VirtualMemorySize returns the combined length in bytes of all mappings in
// mm.
func (mm *MemoryManager) VirtualMemorySize() uint64 {
mm.mappingMu.RLock()
defer mm.mappingMu.RUnlock()
return uint64(mm.usageAS)
}
// VirtualMemorySizeRange returns the combined length in bytes of all mappings
// in ar in mm.
func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 {
mm.mappingMu.RLock()
defer mm.mappingMu.RUnlock()
return uint64(mm.vmas.SpanRange(ar))
}
// ResidentSetSize returns the value advertised as mm's RSS in bytes.
func (mm *MemoryManager) ResidentSetSize() uint64 {
mm.activeMu.RLock()
defer mm.activeMu.RUnlock()
return uint64(mm.curRSS)
}
// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes.
func (mm *MemoryManager) MaxResidentSetSize() uint64 {
mm.activeMu.RLock()
defer mm.activeMu.RUnlock()
return uint64(mm.maxRSS)
}