2019-04-29 21:25:05 +00:00
|
|
|
// Copyright 2018 The gVisor Authors.
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package kvm
|
|
|
|
|
|
|
|
import (
	"math/bits"
	"sync"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
)
|
|
|
|
|
2018-06-19 23:59:25 +00:00
|
|
|
// dirtySet tracks vCPUs for invalidation.
type dirtySet struct {
	// vCPUs is a bitmap of dirty vCPUs: bit (id%64) of word (id/64) is
	// set when the vCPU with that id is dirty (see mark and forEach).
	// All accesses must use atomic operations.
	vCPUs []uint64
}
|
|
|
|
|
|
|
|
// forEach iterates over all CPUs in the dirty set.
|
|
|
|
func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
|
|
|
|
m.mu.RLock()
|
|
|
|
defer m.mu.RUnlock()
|
|
|
|
|
2018-09-14 04:46:03 +00:00
|
|
|
for index := range ds.vCPUs {
|
|
|
|
mask := atomic.SwapUint64(&ds.vCPUs[index], 0)
|
|
|
|
if mask != 0 {
|
|
|
|
for bit := 0; bit < 64; bit++ {
|
|
|
|
if mask&(1<<uint64(bit)) == 0 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
id := 64*index + bit
|
|
|
|
fn(m.vCPUsByID[id])
|
|
|
|
}
|
2018-06-19 23:59:25 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// mark marks the given vCPU as dirty and returns whether it was previously
|
|
|
|
// clean. Being previously clean implies that a flush is needed on entry.
|
|
|
|
func (ds *dirtySet) mark(c *vCPU) bool {
|
|
|
|
index := uint64(c.id) / 64
|
|
|
|
bit := uint64(1) << uint(c.id%64)
|
|
|
|
|
|
|
|
oldValue := atomic.LoadUint64(&ds.vCPUs[index])
|
|
|
|
if oldValue&bit != 0 {
|
|
|
|
return false // Not clean.
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the bit unilaterally, and ensure that a flush takes place. Note
|
|
|
|
// that it's possible for races to occur here, but since the flush is
|
|
|
|
// taking place long after these lines there's no race in practice.
|
|
|
|
atomicbitops.OrUint64(&ds.vCPUs[index], bit)
|
|
|
|
return true // Previously clean.
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// addressSpace is a wrapper for PageTables.
type addressSpace struct {
	// NoAddressSpaceIO provides default (unsupported) implementations of
	// the AddressSpace I/O methods.
	platform.NoAddressSpaceIO

	// mu is the lock for modifications to the address space.
	//
	// Note that the page tables themselves are not locked.
	mu sync.Mutex

	// machine is the underlying machine.
	machine *machine

	// pageTables are for this particular address space.
	pageTables *pagetables.PageTables

	// dirtySet is the set of dirty vCPUs; it is populated via Touch and
	// drained during invalidation (see invalidate).
	dirtySet *dirtySet
}
|
|
|
|
|
2018-06-01 20:50:17 +00:00
|
|
|
// invalidate is the implementation for Invalidate.
|
|
|
|
func (as *addressSpace) invalidate() {
|
2018-06-19 23:59:25 +00:00
|
|
|
as.dirtySet.forEach(as.machine, func(c *vCPU) {
|
|
|
|
if c.active.get() == as { // If this happens to be active,
|
|
|
|
c.BounceToKernel() // ... force a kernel transition.
|
2018-05-16 05:20:36 +00:00
|
|
|
}
|
2018-06-19 23:59:25 +00:00
|
|
|
})
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2018-06-01 20:50:17 +00:00
|
|
|
// Invalidate interrupts all dirty contexts.
//
// This is the exported, lock-acquiring variant of invalidate.
func (as *addressSpace) Invalidate() {
	as.mu.Lock()
	defer as.mu.Unlock()
	as.invalidate()
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Touch adds the given vCPU to the dirty list.
//
// The return value indicates whether a flush is required.
func (as *addressSpace) Touch(c *vCPU) bool {
	return as.dirtySet.mark(c) // True iff the vCPU was previously clean.
}
|
|
|
|
|
2019-03-12 17:28:23 +00:00
|
|
|
// hostMapEntry describes a single contiguous host virtual memory region to
// be installed into an addressSpace (see mapHost).
type hostMapEntry struct {
	// addr is the host virtual start address of the region.
	addr uintptr

	// length is the length of the region in bytes.
	length uintptr
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// mapHost installs the host mapping described by m into this address
// space's page tables at the given guest virtual address, one physical
// segment at a time. It returns whether any existing mapping was replaced
// (inv), in which case the caller is responsible for invalidation (see
// MapFile, which calls invalidate when this returns true).
//
// NOTE(review): callers in this file hold as.mu around this call — treat
// that as a precondition.
func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
	for m.length > 0 {
		// Translate the host virtual address to a physical segment.
		physical, length, ok := translateToPhysical(m.addr)
		if !ok {
			panic("unable to translate segment")
		}
		if length > m.length {
			// Clamp to the remainder of this entry.
			length = m.length
		}

		// Ensure that this map has physical mappings. If the page does
		// not have physical mappings, the KVM module may inject
		// spurious exceptions when emulation fails (i.e. it tries to
		// emulate because the RIP is pointed at those pages).
		as.machine.mapPhysical(physical, length, physicalRegions, _KVM_MEM_FLAGS_NONE)

		// Install the page table mappings. Note that the ordering is
		// important; if the pagetable mappings were installed before
		// ensuring the physical pages were available, then some other
		// thread could theoretically access them.
		//
		// Due to the way KVM's shadow paging implementation works,
		// modifications to the page tables while in host mode may not
		// be trapped, leading to the shadow pages being out of sync.
		// Therefore, we need to ensure that we are in guest mode for
		// page table modifications. See the call to bluepill, below.
		as.machine.retryInGuest(func() {
			inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
				AccessType: at,
				User:       true,
			}, physical) || inv
		})

		// Advance past the segment just mapped.
		m.addr += length
		m.length -= length
		addr += usermem.Addr(length)
	}

	return inv
}
|
|
|
|
|
2019-03-12 17:28:23 +00:00
|
|
|
// MapFile implements platform.AddressSpace.MapFile.
|
|
|
|
func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
|
|
|
|
as.mu.Lock()
|
|
|
|
defer as.mu.Unlock()
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2019-03-12 17:28:23 +00:00
|
|
|
// Get mappings in the sentry's address space, which are guaranteed to be
|
|
|
|
// valid as long as a reference is held on the mapped pages (which is in
|
|
|
|
// turn required by AddressSpace.MapFile precondition).
|
|
|
|
//
|
|
|
|
// If precommit is true, we will touch mappings to commit them, so ensure
|
|
|
|
// that mappings are readable from sentry context.
|
|
|
|
//
|
|
|
|
// We don't execute from application file-mapped memory, and guest page
|
|
|
|
// tables don't care if we have execute permission (but they do need pages
|
|
|
|
// to be readable).
|
|
|
|
bs, err := f.MapInternal(fr, usermem.AccessType{
|
|
|
|
Read: at.Read || at.Execute || precommit,
|
|
|
|
Write: at.Write,
|
2018-04-27 17:37:02 +00:00
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2019-03-12 17:28:23 +00:00
|
|
|
// Map the mappings in the sentry's address space (guest physical memory)
|
|
|
|
// into the application's address space (guest virtual memory).
|
2018-04-27 17:37:02 +00:00
|
|
|
inv := false
|
|
|
|
for !bs.IsEmpty() {
|
|
|
|
b := bs.Head()
|
|
|
|
bs = bs.Tail()
|
|
|
|
// Since fr was page-aligned, b should also be page-aligned. We do the
|
|
|
|
// lookup in our host page tables for this translation.
|
|
|
|
if precommit {
|
2019-03-12 17:28:23 +00:00
|
|
|
s := b.ToSlice()
|
2018-04-27 17:37:02 +00:00
|
|
|
for i := 0; i < len(s); i += usermem.PageSize {
|
|
|
|
_ = s[i] // Touch to commit.
|
|
|
|
}
|
|
|
|
}
|
2018-05-15 03:44:56 +00:00
|
|
|
prev := as.mapHost(addr, hostMapEntry{
|
2019-03-12 17:28:23 +00:00
|
|
|
addr: b.Addr(),
|
|
|
|
length: uintptr(b.Len()),
|
2018-04-27 17:37:02 +00:00
|
|
|
}, at)
|
2018-05-15 03:44:56 +00:00
|
|
|
inv = inv || prev
|
2019-03-12 17:28:23 +00:00
|
|
|
addr += usermem.Addr(b.Len())
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
if inv {
|
2018-06-01 20:50:17 +00:00
|
|
|
as.invalidate()
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
|
|
|
|
func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
|
2018-06-01 20:50:17 +00:00
|
|
|
as.mu.Lock()
|
|
|
|
defer as.mu.Unlock()
|
|
|
|
|
2018-06-07 06:25:26 +00:00
|
|
|
// See above re: retryInGuest.
|
|
|
|
var prev bool
|
|
|
|
as.machine.retryInGuest(func() {
|
|
|
|
prev = as.pageTables.Unmap(addr, uintptr(length)) || prev
|
|
|
|
})
|
|
|
|
if prev {
|
2018-06-01 20:50:17 +00:00
|
|
|
as.invalidate()
|
2018-08-22 21:14:32 +00:00
|
|
|
|
|
|
|
// Recycle any freed intermediate pages.
|
|
|
|
as.pageTables.Allocator.Recycle()
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Release releases the page tables.
//
// The ordering below matters: everything is unmapped (which also performs
// any required invalidation) before the allocator's pages are drained and
// the machine's cached references are dropped.
func (as *addressSpace) Release() {
	// Unmap the full address range.
	as.Unmap(0, ^uint64(0))

	// Free all pages from the allocator.
	as.pageTables.Allocator.(allocator).base.Drain()

	// Drop all cached machine references.
	as.machine.dropPageTables(as.pageTables)
}
|