// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package mm provides a memory management subsystem. See README.md for a
// detailed overview.
//
// Lock order:
//
// fs locks, except for memmap.Mappable locks
// mm.MemoryManager.metadataMu
// mm.MemoryManager.mappingMu
// Locks taken by memmap.Mappable methods other than Translate
// mm.MemoryManager.activeMu
// Locks taken by memmap.Mappable.Translate
// mm.privateRefs.mu
// platform.AddressSpace locks
// platform.File locks
// mm.aioManager.mu
// mm.AIOContext.mu
//
// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
// multiple mm.MemoryManagers, as it does so in a well-defined order (forked
// child first).
package mm

import (
	"sync"

	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/safemem"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
	"gvisor.dev/gvisor/third_party/gvsync"
)

// MemoryManager implements a virtual address space.
|
|
//
|
|
// +stateify savable
|
|
type MemoryManager struct {
|
|
// p and mfp are immutable.
|
|
p platform.Platform
|
|
mfp pgalloc.MemoryFileProvider
|
|
|
|
// haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
|
|
// eliminating an indirect call in the hot I/O path, this makes
|
|
// MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
|
|
//
|
|
// haveASIO is immutable.
|
|
haveASIO bool `state:"nosave"`
|
|
|
|
// layout is the memory layout.
|
|
//
|
|
// layout is set by the binary loader before the MemoryManager can be used.
|
|
layout arch.MmapLayout
|
|
|
|
// privateRefs stores reference counts for private memory (memory whose
|
|
// ownership is shared by one or more pmas instead of being owned by a
|
|
// memmap.Mappable).
|
|
//
|
|
// privateRefs is immutable.
|
|
privateRefs *privateRefs
|
|
|
|
// users is the number of dependencies on the mappings in the MemoryManager.
|
|
// When the number of references in users reaches zero, all mappings are
|
|
// unmapped.
|
|
//
|
|
// users is accessed using atomic memory operations.
|
|
users int32
|
|
|
|
// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
|
|
mappingMu gvsync.DowngradableRWMutex `state:"nosave"`
|
|
|
|
// vmas stores virtual memory areas. Since vmas are stored by value,
|
|
// clients should usually use vmaIterator.ValuePtr() instead of
|
|
// vmaIterator.Value() to get a pointer to the vma rather than a copy.
|
|
//
|
|
// Invariants: vmas are always page-aligned.
|
|
//
|
|
// vmas is protected by mappingMu.
|
|
vmas vmaSet
|
|
|
|
// brk is the mm's brk, which is manipulated using the brk(2) system call.
|
|
// The brk is initially set up by the loader which maps an executable
|
|
// binary into the mm.
|
|
//
|
|
// brk is protected by mappingMu.
|
|
brk usermem.AddrRange
|
|
|
|
// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
|
|
//
|
|
// usageAS is protected by mappingMu.
|
|
usageAS uint64
|
|
|
|
// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
|
|
// memmap.MLockNone.
|
|
//
|
|
// lockedAS is protected by mappingMu.
|
|
lockedAS uint64
|
|
|
|
// dataAS is the size of private data segments, like mm_struct->data_vm.
|
|
// It means the vma which is private, writable, not stack.
|
|
//
|
|
// dataAS is protected by mappingMu.
|
|
dataAS uint64
|
|
|
|
// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
|
|
// defMLockMode is greater.
|
|
//
|
|
// defMLockMode is protected by mappingMu.
|
|
defMLockMode memmap.MLockMode
|
|
|
|
// activeMu is loosely analogous to Linux's struct
|
|
// mm_struct::page_table_lock.
|
|
activeMu gvsync.DowngradableRWMutex `state:"nosave"`
|
|
|
|
// pmas stores platform mapping areas used to implement vmas. Since pmas
|
|
// are stored by value, clients should usually use pmaIterator.ValuePtr()
|
|
// instead of pmaIterator.Value() to get a pointer to the pma rather than
|
|
// a copy.
|
|
//
|
|
// Inserting or removing segments from pmas should happen along with a
|
|
// call to mm.insertRSS or mm.removeRSS.
|
|
//
|
|
// Invariants: pmas are always page-aligned. If a pma exists for a given
|
|
// address, a vma must also exist for that address.
|
|
//
|
|
// pmas is protected by activeMu.
|
|
pmas pmaSet
|
|
|
|
// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
|
|
// reported as the MemoryManager's RSS.
|
|
//
|
|
// maxRSS should be modified only via insertRSS and removeRSS, not
|
|
// directly.
|
|
//
|
|
// maxRSS is protected by activeMu.
|
|
curRSS uint64
|
|
|
|
// maxRSS is the maximum resident set size in bytes of a MemoryManager.
|
|
// It is tracked as the application adds and removes mappings to pmas.
|
|
//
|
|
// maxRSS should be modified only via insertRSS, not directly.
|
|
//
|
|
// maxRSS is protected by activeMu.
|
|
maxRSS uint64
|
|
|
|
// as is the platform.AddressSpace that pmas are mapped into. active is the
|
|
// number of contexts that require as to be non-nil; if active == 0, as may
|
|
// be nil.
|
|
//
|
|
// as is protected by activeMu. active is manipulated with atomic memory
|
|
// operations; transitions to and from zero are additionally protected by
|
|
// activeMu. (This is because such transitions may need to be atomic with
|
|
// changes to as.)
|
|
as platform.AddressSpace `state:"nosave"`
|
|
active int32 `state:"zerovalue"`
|
|
|
|
// unmapAllOnActivate indicates that the next Activate call should activate
|
|
// an empty AddressSpace.
|
|
//
|
|
// This is used to ensure that an AddressSpace cached in
|
|
// NewAddressSpace is not used after some change in the MemoryManager
|
|
// or VMAs has made that AddressSpace stale.
|
|
//
|
|
// unmapAllOnActivate is protected by activeMu. It must only be set when
|
|
// there is no active or cached AddressSpace. If as != nil, then
|
|
// invalidations should be propagated immediately.
|
|
unmapAllOnActivate bool `state:"nosave"`
|
|
|
|
// If captureInvalidations is true, calls to MM.Invalidate() are recorded
|
|
// in capturedInvalidations rather than being applied immediately to pmas.
|
|
// This is to avoid a race condition in MM.Fork(); see that function for
|
|
// details.
|
|
//
|
|
// Both captureInvalidations and capturedInvalidations are protected by
|
|
// activeMu. Neither need to be saved since captureInvalidations is only
|
|
// enabled during MM.Fork(), during which saving can't occur.
|
|
captureInvalidations bool `state:"zerovalue"`
|
|
capturedInvalidations []invalidateArgs `state:"nosave"`
|
|
|
|
metadataMu sync.Mutex `state:"nosave"`
|
|
|
|
// argv is the application argv. This is set up by the loader and may be
|
|
// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
|
|
// requirements apply to argv; we do not require that argv.WellFormed().
|
|
//
|
|
// argv is protected by metadataMu.
|
|
argv usermem.AddrRange
|
|
|
|
// envv is the application envv. This is set up by the loader and may be
|
|
// modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
|
|
// requirements apply to envv; we do not require that envv.WellFormed().
|
|
//
|
|
// envv is protected by metadataMu.
|
|
envv usermem.AddrRange
|
|
|
|
// auxv is the ELF's auxiliary vector.
|
|
//
|
|
// auxv is protected by metadataMu.
|
|
auxv arch.Auxv
|
|
|
|
// executable is the executable for this MemoryManager. If executable
|
|
// is not nil, it holds a reference on the Dirent.
|
|
//
|
|
// executable is protected by metadataMu.
|
|
executable *fs.Dirent
|
|
|
|
// dumpability describes if and how this MemoryManager may be dumped to
|
|
// userspace.
|
|
//
|
|
// dumpability is protected by metadataMu.
|
|
dumpability Dumpability
|
|
|
|
// aioManager keeps track of AIOContexts used for async IOs. AIOManager
|
|
// must be cloned when CLONE_VM is used.
|
|
aioManager aioManager
|
|
}
|
|
|
|
// vma represents a virtual memory area.
|
|
//
|
|
// +stateify savable
|
|
type vma struct {
|
|
// mappable is the virtual memory object mapped by this vma. If mappable is
|
|
// nil, the vma represents a private anonymous mapping.
|
|
mappable memmap.Mappable
|
|
|
|
// off is the offset into mappable at which this vma begins. If mappable is
|
|
// nil, off is meaningless.
|
|
off uint64
|
|
|
|
// To speedup VMA save/restore, we group and save the following booleans
|
|
// as a single integer.
|
|
|
|
// realPerms are the memory permissions on this vma, as defined by the
|
|
// application.
|
|
realPerms usermem.AccessType `state:".(int)"`
|
|
|
|
// effectivePerms are the memory permissions on this vma which are
|
|
// actually used to control access.
|
|
//
|
|
// Invariant: effectivePerms == realPerms.Effective().
|
|
effectivePerms usermem.AccessType `state:"manual"`
|
|
|
|
// maxPerms limits the set of permissions that may ever apply to this
|
|
// memory, as well as accesses for which usermem.IOOpts.IgnorePermissions
|
|
// is true (e.g. ptrace(PTRACE_POKEDATA)).
|
|
//
|
|
// Invariant: maxPerms == maxPerms.Effective().
|
|
maxPerms usermem.AccessType `state:"manual"`
|
|
|
|
// private is true if this is a MAP_PRIVATE mapping, such that writes to
|
|
// the mapping are propagated to a copy.
|
|
private bool `state:"manual"`
|
|
|
|
// growsDown is true if the mapping may be automatically extended downward
|
|
// under certain conditions. If growsDown is true, mappable must be nil.
|
|
//
|
|
// There is currently no corresponding growsUp flag; in Linux, the only
|
|
// architectures that can have VM_GROWSUP mappings are ia64, parisc, and
|
|
// metag, none of which we currently support.
|
|
growsDown bool `state:"manual"`
|
|
|
|
// dontfork is the MADV_DONTFORK setting for this vma configured by madvise().
|
|
dontfork bool
|
|
|
|
mlockMode memmap.MLockMode
|
|
|
|
// numaPolicy is the NUMA policy for this vma set by mbind().
|
|
numaPolicy int32
|
|
|
|
// numaNodemask is the NUMA nodemask for this vma set by mbind().
|
|
numaNodemask uint64
|
|
|
|
// If id is not nil, it controls the lifecycle of mappable and provides vma
|
|
// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
|
|
id memmap.MappingIdentity
|
|
|
|
// If hint is non-empty, it is a description of the vma printed in
|
|
// /proc/[pid]/maps. hint takes priority over id.MappedName().
|
|
hint string
|
|
}
|
|
|
|
// Bit flags used by vma.saveRealPerms and vma.loadRealPerms to pack a vma's
// permission sets and boolean attributes into one int. Each constant occupies
// a distinct bit, starting at bit 0 for vmaRealPermsRead and ending at bit 10
// for vmaGrowsDown. The values are part of the saved encoding, so the order
// of the constants must not change.
const (
	vmaRealPermsRead = 1 << iota
	vmaRealPermsWrite
	vmaRealPermsExecute
	vmaEffectivePermsRead
	vmaEffectivePermsWrite
	vmaEffectivePermsExecute
	vmaMaxPermsRead
	vmaMaxPermsWrite
	vmaMaxPermsExecute
	vmaPrivate
	vmaGrowsDown
)

func (v *vma) saveRealPerms() int {
|
|
var b int
|
|
if v.realPerms.Read {
|
|
b |= vmaRealPermsRead
|
|
}
|
|
if v.realPerms.Write {
|
|
b |= vmaRealPermsWrite
|
|
}
|
|
if v.realPerms.Execute {
|
|
b |= vmaRealPermsExecute
|
|
}
|
|
if v.effectivePerms.Read {
|
|
b |= vmaEffectivePermsRead
|
|
}
|
|
if v.effectivePerms.Write {
|
|
b |= vmaEffectivePermsWrite
|
|
}
|
|
if v.effectivePerms.Execute {
|
|
b |= vmaEffectivePermsExecute
|
|
}
|
|
if v.maxPerms.Read {
|
|
b |= vmaMaxPermsRead
|
|
}
|
|
if v.maxPerms.Write {
|
|
b |= vmaMaxPermsWrite
|
|
}
|
|
if v.maxPerms.Execute {
|
|
b |= vmaMaxPermsExecute
|
|
}
|
|
if v.private {
|
|
b |= vmaPrivate
|
|
}
|
|
if v.growsDown {
|
|
b |= vmaGrowsDown
|
|
}
|
|
return b
|
|
}
|
|
|
|
func (v *vma) loadRealPerms(b int) {
|
|
if b&vmaRealPermsRead > 0 {
|
|
v.realPerms.Read = true
|
|
}
|
|
if b&vmaRealPermsWrite > 0 {
|
|
v.realPerms.Write = true
|
|
}
|
|
if b&vmaRealPermsExecute > 0 {
|
|
v.realPerms.Execute = true
|
|
}
|
|
if b&vmaEffectivePermsRead > 0 {
|
|
v.effectivePerms.Read = true
|
|
}
|
|
if b&vmaEffectivePermsWrite > 0 {
|
|
v.effectivePerms.Write = true
|
|
}
|
|
if b&vmaEffectivePermsExecute > 0 {
|
|
v.effectivePerms.Execute = true
|
|
}
|
|
if b&vmaMaxPermsRead > 0 {
|
|
v.maxPerms.Read = true
|
|
}
|
|
if b&vmaMaxPermsWrite > 0 {
|
|
v.maxPerms.Write = true
|
|
}
|
|
if b&vmaMaxPermsExecute > 0 {
|
|
v.maxPerms.Execute = true
|
|
}
|
|
if b&vmaPrivate > 0 {
|
|
v.private = true
|
|
}
|
|
if b&vmaGrowsDown > 0 {
|
|
v.growsDown = true
|
|
}
|
|
}
|
|
|
|
// pma represents a platform mapping area.
|
|
//
|
|
// +stateify savable
|
|
type pma struct {
|
|
// file is the file mapped by this pma. Only pmas for which file ==
|
|
// MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
|
|
// the corresponding file range while they exist.
|
|
file platform.File `state:"nosave"`
|
|
|
|
// off is the offset into file at which this pma begins.
|
|
//
|
|
// Note that pmas do *not* hold references on offsets in file! If private
|
|
// is true, MemoryManager.privateRefs holds the reference instead. If
|
|
// private is false, the corresponding memmap.Mappable holds the reference
|
|
// instead (per memmap.Mappable.Translate requirement).
|
|
off uint64
|
|
|
|
// translatePerms is the permissions returned by memmap.Mappable.Translate.
|
|
// If private is true, translatePerms is usermem.AnyAccess.
|
|
translatePerms usermem.AccessType
|
|
|
|
// effectivePerms is the permissions allowed for non-ignorePermissions
|
|
// accesses. maxPerms is the permissions allowed for ignorePermissions
|
|
// accesses. These are vma.effectivePerms and vma.maxPerms respectively,
|
|
// masked by pma.translatePerms and with Write disallowed if pma.needCOW is
|
|
// true.
|
|
//
|
|
// These are stored in the pma so that the IO implementation can avoid
|
|
// iterating mm.vmas when pmas already exist.
|
|
effectivePerms usermem.AccessType
|
|
maxPerms usermem.AccessType
|
|
|
|
// needCOW is true if writes to the mapping must be propagated to a copy.
|
|
needCOW bool
|
|
|
|
// private is true if this pma represents private memory.
|
|
//
|
|
// If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma
|
|
// holds a reference on the mapped memory that is tracked in privateRefs,
|
|
// and calls to Invalidate for which
|
|
// memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma.
|
|
//
|
|
// If private is false, this pma caches a translation from the
|
|
// corresponding vma's memmap.Mappable.Translate.
|
|
private bool
|
|
|
|
// If internalMappings is not empty, it is the cached return value of
|
|
// file.MapInternal for the platform.FileRange mapped by this pma.
|
|
internalMappings safemem.BlockSeq `state:"nosave"`
|
|
}
|
|
|
|
// +stateify savable
|
|
type privateRefs struct {
|
|
mu sync.Mutex `state:"nosave"`
|
|
|
|
// refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of
|
|
// pmas (or, equivalently, MemoryManagers) that share ownership of the
|
|
// memory at that offset.
|
|
refs fileRefcountSet
|
|
}
|
|
|
|
type invalidateArgs struct {
|
|
ar usermem.AddrRange
|
|
opts memmap.InvalidateOpts
|
|
}
|
|
|
|
// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet.
// It carries no state; all behavior lives in its methods.
type fileRefcountSetFunctions struct{}

// MinKey returns the smallest key a fileRefcountSet may contain, which is 0.
func (fileRefcountSetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey returns the largest key a fileRefcountSet may contain, which is the
// maximum uint64 value.
func (fileRefcountSetFunctions) MaxKey() uint64 {
	return ^uint64(0)
}

// ClearValue is intentionally a no-op: the pointed-to reference count is left
// untouched.
func (fileRefcountSetFunctions) ClearValue(_ *int32) {
}

func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) {
|
|
return rc1, rc1 == rc2
|
|
}
|
|
|
|
func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) {
|
|
return rc, rc
|
|
}
|