// Source: gvisor/pkg/sentry/mm/mm.go (472 lines, 15 KiB, Go)

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package mm provides a memory management subsystem. See README.md for a
// detailed overview.
//
// Lock order:
//
// fs locks, except for memmap.Mappable locks
// mm.MemoryManager.metadataMu
// mm.MemoryManager.mappingMu
// Locks taken by memmap.Mappable methods other than Translate
// mm.MemoryManager.activeMu
// Locks taken by memmap.Mappable.Translate
// mm.privateRefs.mu
// platform.AddressSpace locks
// platform.File locks
// mm.aioManager.mu
// mm.AIOContext.mu
//
// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
// multiple mm.MemoryManagers, as it does so in a well-defined order (forked
// child first).
package mm
import (
"sync"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/third_party/gvsync"
)
// MemoryManager implements a virtual address space.
//
// Fields are grouped by the lock that guards them: mappingMu (vmas and
// address-space accounting), activeMu (pmas, RSS accounting and the
// platform.AddressSpace) and metadataMu (argv, envv, auxv, executable and
// dumpability). Individual field comments state the applicable lock or note
// that the field is immutable or accessed atomically.
//
// +stateify savable
type MemoryManager struct {
// p and mfp are immutable.
p platform.Platform
mfp pgalloc.MemoryFileProvider
// haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
// eliminating an indirect call in the hot I/O path, this makes
// MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
//
// haveASIO is immutable.
haveASIO bool `state:"nosave"`
// layout is the memory layout.
//
// layout is set by the binary loader before the MemoryManager can be used.
layout arch.MmapLayout
// privateRefs stores reference counts for private memory (memory whose
// ownership is shared by one or more pmas instead of being owned by a
// memmap.Mappable).
//
// privateRefs is immutable.
privateRefs *privateRefs
// users is the number of dependencies on the mappings in the MemoryManager.
// When the number of references in users reaches zero, all mappings are
// unmapped.
//
// users is accessed using atomic memory operations.
users int32
// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
mappingMu gvsync.DowngradableRWMutex `state:"nosave"`
// vmas stores virtual memory areas. Since vmas are stored by value,
// clients should usually use vmaIterator.ValuePtr() instead of
// vmaIterator.Value() to get a pointer to the vma rather than a copy.
//
// Invariants: vmas are always page-aligned.
//
// vmas is protected by mappingMu.
vmas vmaSet
// brk is the mm's brk, which is manipulated using the brk(2) system call.
// The brk is initially set up by the loader which maps an executable
// binary into the mm.
//
// brk is protected by mappingMu.
brk usermem.AddrRange
// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
//
// usageAS is protected by mappingMu.
usageAS uint64
// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
// memmap.MLockNone.
//
// lockedAS is protected by mappingMu.
lockedAS uint64
// dataAS is the size of private data segments, like mm_struct->data_vm.
// It counts vmas that are private, writable, and not stack.
//
// dataAS is protected by mappingMu.
dataAS uint64
// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
// defMLockMode is greater.
//
// defMLockMode is protected by mappingMu.
defMLockMode memmap.MLockMode
// activeMu is loosely analogous to Linux's struct
// mm_struct::page_table_lock.
activeMu gvsync.DowngradableRWMutex `state:"nosave"`
// pmas stores platform mapping areas used to implement vmas. Since pmas
// are stored by value, clients should usually use pmaIterator.ValuePtr()
// instead of pmaIterator.Value() to get a pointer to the pma rather than
// a copy.
//
// Inserting or removing segments from pmas should happen along with a
// call to mm.insertRSS or mm.removeRSS.
//
// Invariants: pmas are always page-aligned. If a pma exists for a given
// address, a vma must also exist for that address.
//
// pmas is protected by activeMu.
pmas pmaSet
// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
// reported as the MemoryManager's RSS.
//
// curRSS should be modified only via insertRSS and removeRSS, not
// directly.
//
// curRSS is protected by activeMu.
curRSS uint64
// maxRSS is the maximum resident set size in bytes of a MemoryManager.
// It is tracked as the application adds and removes mappings to pmas.
//
// maxRSS should be modified only via insertRSS, not directly.
//
// maxRSS is protected by activeMu.
maxRSS uint64
// as is the platform.AddressSpace that pmas are mapped into. active is the
// number of contexts that require as to be non-nil; if active == 0, as may
// be nil.
//
// as is protected by activeMu. active is manipulated with atomic memory
// operations; transitions to and from zero are additionally protected by
// activeMu. (This is because such transitions may need to be atomic with
// changes to as.)
as platform.AddressSpace `state:"nosave"`
active int32 `state:"zerovalue"`
// unmapAllOnActivate indicates that the next Activate call should activate
// an empty AddressSpace.
//
// This is used to ensure that an AddressSpace cached in
// NewAddressSpace is not used after some change in the MemoryManager
// or VMAs has made that AddressSpace stale.
//
// unmapAllOnActivate is protected by activeMu. It must only be set when
// there is no active or cached AddressSpace. If as != nil, then
// invalidations should be propagated immediately.
unmapAllOnActivate bool `state:"nosave"`
// If captureInvalidations is true, calls to MM.Invalidate() are recorded
// in capturedInvalidations rather than being applied immediately to pmas.
// This is to avoid a race condition in MM.Fork(); see that function for
// details.
//
// Both captureInvalidations and capturedInvalidations are protected by
// activeMu. Neither needs to be saved since captureInvalidations is only
// enabled during MM.Fork(), during which saving can't occur.
captureInvalidations bool `state:"zerovalue"`
capturedInvalidations []invalidateArgs `state:"nosave"`
// metadataMu protects the process metadata fields that follow: argv,
// envv, auxv, executable and dumpability.
metadataMu sync.Mutex `state:"nosave"`
// argv is the application argv. This is set up by the loader and may be
// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
// requirements apply to argv; we do not require that argv.WellFormed().
//
// argv is protected by metadataMu.
argv usermem.AddrRange
// envv is the application envv. This is set up by the loader and may be
// modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
// requirements apply to envv; we do not require that envv.WellFormed().
//
// envv is protected by metadataMu.
envv usermem.AddrRange
// auxv is the ELF's auxiliary vector.
//
// auxv is protected by metadataMu.
auxv arch.Auxv
// executable is the executable for this MemoryManager. If executable
// is not nil, it holds a reference on the Dirent.
//
// executable is protected by metadataMu.
executable *fs.Dirent
// dumpability describes if and how this MemoryManager may be dumped to
// userspace.
//
// dumpability is protected by metadataMu.
dumpability Dumpability
// aioManager keeps track of AIOContexts used for async IOs. AIOManager
// must be cloned when CLONE_VM is used.
aioManager aioManager
}
// vma represents a virtual memory area.
//
// +stateify savable
type vma struct {
// mappable is the virtual memory object mapped by this vma. If mappable is
// nil, the vma represents a private anonymous mapping.
mappable memmap.Mappable
// off is the offset into mappable at which this vma begins. If mappable is
// nil, off is meaningless.
off uint64
// To speed up VMA save/restore, the boolean-valued fields that follow
// (the Read/Write/Execute bits of realPerms, effectivePerms and maxPerms,
// plus private and growsDown) are saved together as a single integer.
// saveRealPerms produces that integer and loadRealPerms consumes it, which
// is why only realPerms carries a ".(int)" state tag and the others are
// tagged "manual".
// realPerms are the memory permissions on this vma, as defined by the
// application.
realPerms usermem.AccessType `state:".(int)"`
// effectivePerms are the memory permissions on this vma which are
// actually used to control access.
//
// Invariant: effectivePerms == realPerms.Effective().
effectivePerms usermem.AccessType `state:"manual"`
// maxPerms limits the set of permissions that may ever apply to this
// memory, as well as accesses for which usermem.IOOpts.IgnorePermissions
// is true (e.g. ptrace(PTRACE_POKEDATA)).
//
// Invariant: maxPerms == maxPerms.Effective().
maxPerms usermem.AccessType `state:"manual"`
// private is true if this is a MAP_PRIVATE mapping, such that writes to
// the mapping are propagated to a copy.
private bool `state:"manual"`
// growsDown is true if the mapping may be automatically extended downward
// under certain conditions. If growsDown is true, mappable must be nil.
//
// There is currently no corresponding growsUp flag; in Linux, the only
// architectures that can have VM_GROWSUP mappings are ia64, parisc, and
// metag, none of which we currently support.
growsDown bool `state:"manual"`
// dontfork is the MADV_DONTFORK setting for this vma configured by madvise().
dontfork bool
// mlockMode is the memory-locking mode of this vma. vmas whose mlockMode
// is not memmap.MLockNone are counted in MemoryManager.lockedAS.
mlockMode memmap.MLockMode
// numaPolicy is the NUMA policy for this vma set by mbind().
numaPolicy int32
// numaNodemask is the NUMA nodemask for this vma set by mbind().
numaNodemask uint64
// If id is not nil, it controls the lifecycle of mappable and provides vma
// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
id memmap.MappingIdentity
// If hint is non-empty, it is a description of the vma printed in
// /proc/[pid]/maps. hint takes priority over id.MappedName().
hint string
}
// Bit assignments for the integer produced by vma.saveRealPerms and consumed
// by vma.loadRealPerms. Each constant marks one boolean of the vma. The
// shift amounts are written out explicitly because these values are part of
// the saved representation: reordering the list must not renumber the bits.
const (
	// Bits 0-2: vma.realPerms.
	vmaRealPermsRead    = 1 << 0
	vmaRealPermsWrite   = 1 << 1
	vmaRealPermsExecute = 1 << 2

	// Bits 3-5: vma.effectivePerms.
	vmaEffectivePermsRead    = 1 << 3
	vmaEffectivePermsWrite   = 1 << 4
	vmaEffectivePermsExecute = 1 << 5

	// Bits 6-8: vma.maxPerms.
	vmaMaxPermsRead    = 1 << 6
	vmaMaxPermsWrite   = 1 << 7
	vmaMaxPermsExecute = 1 << 8

	// Bits 9-10: vma.private and vma.growsDown.
	vmaPrivate   = 1 << 9
	vmaGrowsDown = 1 << 10
)
// saveRealPerms packs the boolean state of v into a single integer.
//
// Although named for realPerms, the result also encodes effectivePerms,
// maxPerms, private and growsDown: every boolean that is true contributes its
// vma* bit constant. loadRealPerms performs the reverse mapping.
func (v *vma) saveRealPerms() int {
	// Pair each boolean with the bit that represents it.
	flags := [...]struct {
		set bool
		bit int
	}{
		{v.realPerms.Read, vmaRealPermsRead},
		{v.realPerms.Write, vmaRealPermsWrite},
		{v.realPerms.Execute, vmaRealPermsExecute},
		{v.effectivePerms.Read, vmaEffectivePermsRead},
		{v.effectivePerms.Write, vmaEffectivePermsWrite},
		{v.effectivePerms.Execute, vmaEffectivePermsExecute},
		{v.maxPerms.Read, vmaMaxPermsRead},
		{v.maxPerms.Write, vmaMaxPermsWrite},
		{v.maxPerms.Execute, vmaMaxPermsExecute},
		{v.private, vmaPrivate},
		{v.growsDown, vmaGrowsDown},
	}

	packed := 0
	for _, f := range flags {
		if f.set {
			packed |= f.bit
		}
	}
	return packed
}
// loadRealPerms unpacks an integer produced by saveRealPerms into v.
//
// For every vma* bit present in b the corresponding boolean of v (in
// realPerms, effectivePerms, maxPerms, private or growsDown) is set to true.
// Booleans whose bit is absent are left untouched rather than cleared, so the
// result mirrors b exactly only when v starts with all of them false.
func (v *vma) loadRealPerms(b int) {
	// Pair each destination boolean with the bit that represents it.
	targets := [...]struct {
		dst *bool
		bit int
	}{
		{&v.realPerms.Read, vmaRealPermsRead},
		{&v.realPerms.Write, vmaRealPermsWrite},
		{&v.realPerms.Execute, vmaRealPermsExecute},
		{&v.effectivePerms.Read, vmaEffectivePermsRead},
		{&v.effectivePerms.Write, vmaEffectivePermsWrite},
		{&v.effectivePerms.Execute, vmaEffectivePermsExecute},
		{&v.maxPerms.Read, vmaMaxPermsRead},
		{&v.maxPerms.Write, vmaMaxPermsWrite},
		{&v.maxPerms.Execute, vmaMaxPermsExecute},
		{&v.private, vmaPrivate},
		{&v.growsDown, vmaGrowsDown},
	}

	for _, t := range targets {
		if b&t.bit != 0 {
			*t.dst = true
		}
	}
}
// pma represents a platform mapping area.
//
// +stateify savable
type pma struct {
// file is the file mapped by this pma. Only pmas for which file ==
// MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
// the corresponding file range while they exist.
//
// NOTE(review): the statement above that pmas hold a reference to the file
// range appears to conflict with the note on off below, which says pmas do
// not hold references on offsets in file. Confirm which one describes the
// current behavior.
file platform.File `state:"nosave"`
// off is the offset into file at which this pma begins.
//
// Note that pmas do *not* hold references on offsets in file! If private
// is true, MemoryManager.privateRefs holds the reference instead. If
// private is false, the corresponding memmap.Mappable holds the reference
// instead (per memmap.Mappable.Translate requirement).
off uint64
// translatePerms is the permissions returned by memmap.Mappable.Translate.
// If private is true, translatePerms is usermem.AnyAccess.
translatePerms usermem.AccessType
// effectivePerms is the permissions allowed for non-ignorePermissions
// accesses. maxPerms is the permissions allowed for ignorePermissions
// accesses. These are vma.effectivePerms and vma.maxPerms respectively,
// masked by pma.translatePerms and with Write disallowed if pma.needCOW is
// true.
//
// These are stored in the pma so that the IO implementation can avoid
// iterating mm.vmas when pmas already exist.
effectivePerms usermem.AccessType
maxPerms usermem.AccessType
// needCOW is true if writes to the mapping must be propagated to a copy.
needCOW bool
// private is true if this pma represents private memory.
//
// If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma
// holds a reference on the mapped memory that is tracked in privateRefs,
// and calls to Invalidate for which
// memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma.
//
// If private is false, this pma caches a translation from the
// corresponding vma's memmap.Mappable.Translate.
private bool
// If internalMappings is not empty, it is the cached return value of
// file.MapInternal for the platform.FileRange mapped by this pma.
internalMappings safemem.BlockSeq `state:"nosave"`
}
// privateRefs holds the reference counts for private memory, i.e. memory
// whose ownership is shared by one or more pmas rather than being owned by a
// memmap.Mappable.
//
// +stateify savable
type privateRefs struct {
// mu protects refs.
// NOTE(review): inferred from mu's position directly above refs and from
// mm.privateRefs.mu appearing in the package lock order — confirm.
mu sync.Mutex `state:"nosave"`
// refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of
// pmas (or, equivalently, MemoryManagers) that share ownership of the
// memory at that offset.
refs fileRefcountSet
}
// invalidateArgs is the element type of MemoryManager.capturedInvalidations,
// which records calls to MM.Invalidate() while captureInvalidations is true.
type invalidateArgs struct {
// ar is an address range.
// NOTE(review): presumably the range passed to Invalidate — confirm
// against the code that appends to capturedInvalidations.
ar usermem.AddrRange
// opts is a set of invalidation options.
// NOTE(review): presumably the options passed to the same Invalidate
// call — confirm.
opts memmap.InvalidateOpts
}
// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet.
// It carries no state; all behavior lives in its methods.
type fileRefcountSetFunctions struct{}

// MinKey returns the smallest key a fileRefcountSet can hold, which is 0.
func (fileRefcountSetFunctions) MinKey() uint64 {
	var lowest uint64 // the zero value is the minimum
	return lowest
}

// MaxKey returns the largest key a fileRefcountSet can hold: the maximum
// uint64 value, with every bit set.
func (fileRefcountSetFunctions) MaxKey() uint64 {
	const highest = 1<<64 - 1
	return highest
}

// ClearValue is a no-op: it leaves the pointed-to reference count unchanged.
func (fileRefcountSetFunctions) ClearValue(*int32) {}
// Merge reports whether two segments may be coalesced. They may be only when
// their reference counts are equal. The first result is always rc1; the
// second is true exactly when rc1 == rc2. The ranges themselves are ignored.
func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) {
	if rc1 != rc2 {
		return rc1, false
	}
	return rc1, true
}
// Split returns the reference counts for the two halves produced by splitting
// a segment. Both halves keep the original count rc; the range and the split
// point are ignored.
func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) {
	first, second := rc, rc
	return first, second
}