// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
)

// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager {
	return &MemoryManager{
		p:           p,
		mfp:         mfp,
		haveASIO:    p.SupportsAddressSpaceIO(),
		privateRefs: &privateRefs{},
		users:       1,
		auxv:        arch.Auxv{},
		dumpability: UserDumpable,
		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
	}
}
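
// Illustrative construction (a sketch, not the exact sentry call site): the
// Kernel provides both the platform.Platform and the
// pgalloc.MemoryFileProvider, so a caller holding a *kernel.Kernel k can
// write:
//
//	mm := NewMemoryManager(k, k)
//	defer mm.DecUsers(ctx) // drop the initial user when ownership ends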

// SetMmapLayout initializes mm's layout from the given arch.Context.
//
// Preconditions: mm contains no mappings and is not used concurrently.
func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) {
	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
	if err != nil {
		return arch.MmapLayout{}, err
	}
	mm.layout = layout
	return layout, nil
}
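
// Illustrative exec-time usage (a sketch; the caller is assumed to hold the
// task's arch.Context and resource limits):
//
//	layout, err := mm.SetMmapLayout(ac, limits.FromContext(ctx))
//	if err != nil {
//		return err
//	}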

// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
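	// Hold mm.metadataMu to pin the metadata copied below (argv, envv, auxv,
	// executable, dumpability), and take mm.mappingMu for reading so that the
	// vma set and the address space accounting cannot change during the copy.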
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:           mm.p,
		mfp:         mm.mfp,
		haveASIO:    mm.haveASIO,
		layout:      mm.layout,
		privateRefs: mm.privateRefs,
		users:       1,
		brk:         mm.brk,
		usageAS:     mm.usageAS,
		dataAS:      mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
		// MLockNone, both of which are zero values. vma.mlockMode is reset
		// when copied below.
		captureInvalidations: true,
		argv:                 mm.argv,
		envv:                 mm.envv,
		auxv:                 append(arch.Auxv(nil), mm.auxv...),
		// IncRef'd below, once we know that there isn't an error.
		executable:  mm.executable,
		dumpability: mm.dumpability,
		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
	}

	// Copy vmas.
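	// vmas marked with MADV_DONTFORK (vma.dontfork) are omitted from the
	// child; dontforks records whether we saw any, so that the pma copy below
	// knows it must consult the vma set.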
	dontforks := false
	dstvgap := mm2.vmas.FirstGap()
	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
		vma := srcvseg.Value() // makes a copy of the vma
		vmaAR := srcvseg.Range()

		if vma.dontfork {
			length := uint64(vmaAR.Length())
			mm2.usageAS -= length
			if vma.isPrivateDataLocked() {
				mm2.dataAS -= length
			}
			dontforks = true
			continue
		}

		// Inform the Mappable, if any, of the new mapping.
		if vma.mappable != nil {
			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
				mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange())
				return nil, err
			}
		}
		if vma.id != nil {
			vma.id.IncRef()
		}
		vma.mlockMode = memmap.MLockNone
		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
		// We don't need to update mm2.usageAS since we copied it from mm
		// above.
	}

	// Copy pmas. We have to lock mm.activeMu for writing to make existing
	// private pmas copy-on-write. We also have to lock mm2.activeMu since
	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
	// only copy private pmas, since in the common case where fork(2) is
	// immediately followed by execve(2), copying non-private pmas that can be
	// regenerated by calling memmap.Mappable.Translate is a waste of time.
	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
	// mm/memory.c:copy_page_range().)
	mm2.activeMu.Lock()
	defer mm2.activeMu.Unlock()
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
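	// If any vmas were skipped above, the Isolate() calls below may split
	// mm's pma segments at vma boundaries; re-merge them once the copy is
	// done.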
	if dontforks {
		defer mm.pmas.MergeRange(mm.applicationAddrRange())
	}
	srcvseg := mm.vmas.FirstSegment()
	dstpgap := mm2.pmas.FirstGap()
	var unmapAR usermem.AddrRange
	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
		pma := srcpseg.ValuePtr()
		if !pma.private {
			continue
		}

		if dontforks {
			// Find the 'vma' that contains the starting address
			// associated with the 'pma' (there must be one).
			srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
			if checkInvariants {
				if !srcvseg.Ok() {
					panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
				}
				if srcpseg.Start() < srcvseg.Start() {
					panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
				}
			}

			srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
			if srcvseg.ValuePtr().dontfork {
				continue
			}
			pma = srcpseg.ValuePtr()
		}

		if !pma.needCOW {
			pma.needCOW = true
			if pma.effectivePerms.Write {
				// We don't want to unmap the whole address space, even though
				// doing so would reduce calls to unmapASLocked(), because mm
				// will most likely continue to be used after the fork, so
				// unmapping pmas unnecessarily will result in extra page
				// faults. But we do want to merge consecutive AddrRanges
				// across pma boundaries.
				if unmapAR.End == srcpseg.Start() {
					unmapAR.End = srcpseg.End()
				} else {
					if unmapAR.Length() != 0 {
						mm.unmapASLocked(unmapAR)
					}
					unmapAR = srcpseg.Range()
				}
				pma.effectivePerms.Write = false
			}
			pma.maxPerms.Write = false
		}
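		// mm and mm2 now share this pma's frames: take a private (COW)
		// reference and a reference on the backing file for the range, and
		// charge the range to mm2's RSS.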
		fr := srcpseg.fileRange()
		mm2.incPrivateRef(fr)
		srcpseg.ValuePtr().file.IncRef(fr)
		addrRange := srcpseg.Range()
		mm2.addRSSLocked(addrRange)
		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
	}
	if unmapAR.Length() != 0 {
		mm.unmapASLocked(unmapAR)
	}

	// Between when we call memmap.Mappable.AddMapping while copying vmas and
	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
	// ineffective because the pmas they invalidate haven't yet been copied,
	// possibly allowing mm2 to get invalidated translations:
	//
	// Invalidating Mappable            mm.Fork
	// ---------------------            -------
	//
	// mm2.Invalidate()
	//                                  mm.activeMu.Lock()
	// mm.Invalidate() /* blocks */
	//                                  mm2.activeMu.Lock()
	//                                  (mm copies invalidated pma to mm2)
	//
	// This would technically be both safe (since we only copy private pmas,
	// which will still hold a reference on their memory) and consistent with
	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
	// construction, causing calls to mm2.Invalidate() to be captured in
	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
	// here.
	mm2.captureInvalidations = false
	for _, invArgs := range mm2.capturedInvalidations {
		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
	}
	mm2.capturedInvalidations = nil

	if mm2.executable != nil {
		mm2.executable.IncRef()
	}
	return mm2, nil
}

// IncUsers increments mm's user count and returns true. If the user count is
// already 0, IncUsers does nothing and returns false.
func (mm *MemoryManager) IncUsers() bool {
	return atomicbitops.IncUnlessZeroInt32(&mm.users)
}
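
// Sharing an mm, as for clone(2) with CLONE_VM, typically pairs the two
// calls (a sketch; the error handling shown is illustrative):
//
//	if !mm.IncUsers() {
//		return syserror.EINVAL // mm has already been torn down
//	}
//	defer mm.DecUsers(ctx)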

// DecUsers decrements mm's user count. If the user count reaches 0, all
// mappings in mm are unmapped.
func (mm *MemoryManager) DecUsers(ctx context.Context) {
	if users := atomic.AddInt32(&mm.users, -1); users > 0 {
		return
	} else if users < 0 {
		panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
	}
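
	// Last user: tear down in dependency order - pending AIO contexts first,
	// then the executable reference, then the platform.AddressSpace, and
	// finally any remaining mappings.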
	mm.aioManager.destroy()

	mm.metadataMu.Lock()
	exe := mm.executable
	mm.executable = nil
	mm.metadataMu.Unlock()
	if exe != nil {
		exe.DecRef()
	}

	mm.activeMu.Lock()
	// Sanity check.
	if atomic.LoadInt32(&mm.active) != 0 {
		panic("active address space lost?")
	}
	// Make sure the AddressSpace is returned.
	if mm.as != nil {
		mm.as.Release()
		mm.as = nil
	}
	mm.activeMu.Unlock()

	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()
	// If mm is being dropped before mm.SetMmapLayout was called,
	// mm.applicationAddrRange() will be empty.
	if ar := mm.applicationAddrRange(); ar.Length() != 0 {
		mm.unmapLocked(ctx, ar)
	}
}