kvm: trap mmap syscalls to map new regions to the guest

We install seccomp rules so that SIGSYS is generated for each mmap
system call. Our signal handler then executes the real mmap syscall
and, if a new region has been created, maps it into the guest.

Signed-off-by: Andrei Vagin <avagin@google.com>
Author: Andrei Vagin (committed by Andrei Vagin)
Date: 2021-09-13 16:52:47 -07:00
Commit: 0bdd79ccd4, parent 981111a9ee
12 changed files with 244 additions and 60 deletions
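For readers new to the mechanism: a seccomp filter that returns SECCOMP_RET_TRAP makes the kernel skip the trapped syscall and deliver SIGSYS to the calling thread instead. Below is a minimal, hypothetical sketch of that idea in plain Go, not code from this commit: gVisor builds its filter with pkg/seccomp and installs the handler beneath the Go runtime via safecopy.ReplaceSignalHandler, as the hunks below show. A production filter would also validate seccomp_data.arch.

package main

import (
	"fmt"
	"unsafe"

	"golang.org/x/sys/unix"
)

func main() {
	// BPF: trap mmap(2) calls that lack MAP_DENYWRITE, allow everything else.
	// Offsets are into struct seccomp_data: nr at 0, args[3] (flags) at 40.
	filter := []unix.SockFilter{
		{Code: unix.BPF_LD | unix.BPF_W | unix.BPF_ABS, K: 0}, // A = nr
		{Code: unix.BPF_JMP | unix.BPF_JEQ | unix.BPF_K, K: unix.SYS_MMAP, Jt: 0, Jf: 3},
		{Code: unix.BPF_LD | unix.BPF_W | unix.BPF_ABS, K: 40}, // A = args[3], low 32 bits
		{Code: unix.BPF_JMP | unix.BPF_JSET | unix.BPF_K, K: unix.MAP_DENYWRITE, Jt: 1, Jf: 0},
		{Code: unix.BPF_RET | unix.BPF_K, K: unix.SECCOMP_RET_TRAP},  // deliver SIGSYS
		{Code: unix.BPF_RET | unix.BPF_K, K: unix.SECCOMP_RET_ALLOW}, // pass through
	}
	prog := unix.SockFprog{Len: uint16(len(filter)), Filter: &filter[0]}

	fmt.Println("installing filter: mmap(2) without MAP_DENYWRITE will raise SIGSYS")
	if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
		panic(err)
	}
	if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER,
		uintptr(unsafe.Pointer(&prog)), 0, 0); err != nil {
		panic(err)
	}
	// Exit immediately: with no SIGSYS handler installed, the next runtime
	// mmap (e.g. from an allocation) would kill this process.
	unix.Exit(0)
}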


@@ -322,12 +322,3 @@ func (p *PageTables) Lookup(addr hostarch.Addr, findFirst bool) (virtual hostarc
func (p *PageTables) MarkReadOnlyShared() {
p.readOnlyShared = true
}
// PrefaultRootTable touches the root table page to be sure that its physical
// pages are mapped.
//
//go:nosplit
//go:noinline
func (p *PageTables) PrefaultRootTable() PTE {
return p.root[0]
}


@@ -1,13 +1,26 @@
load("//tools:defs.bzl", "go_library", "go_test")
load("//tools/go_generics:defs.bzl", "go_template_instance")
package(licenses = ["notice"])
go_template_instance(
name = "atomicptr_machine",
out = "atomicptr_machine_unsafe.go",
package = "kvm",
prefix = "machine",
template = "//pkg/sync/atomicptr:generic_atomicptr",
types = {
"Value": "machine",
},
)
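For context, this instantiation generates an atomic *machine pointer type used by the machinePool below. Roughly, the generated atomicptr_machine_unsafe.go looks like the following sketch (based on the generic_atomicptr template, whose Load method is shown in the last hunk of this commit):

package kvm

import (
	"sync/atomic"
	"unsafe"
)

// machineAtomicPtr is an atomic *machine. Load is nosplit so that it is
// safe to call from the SIGSYS handler.
type machineAtomicPtr struct {
	ptr unsafe.Pointer
}

//go:nosplit
func (p *machineAtomicPtr) Load() *machine {
	return (*machine)(atomic.LoadPointer(&p.ptr))
}

func (p *machineAtomicPtr) Store(x *machine) {
	atomic.StorePointer(&p.ptr, unsafe.Pointer(x))
}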
go_library(
name = "kvm",
srcs = [
"address_space.go",
"address_space_amd64.go",
"address_space_arm64.go",
"atomicptr_machine_unsafe.go",
"bluepill.go",
"bluepill_allocator.go",
"bluepill_amd64.go",
@@ -90,6 +103,12 @@ go_test(
"//pkg/sentry/time",
"@org_golang_x_sys//unix:go_default_library",
],
# FIXME(gvisor.dev/issue/3374): Not working with all build systems.
nogo = False,
# cgo has to be disabled. We have seen libc implementations that block
# all signals and call mmap from pthread_create, which would break our
# SIGSYS-based trapping of mmap system calls.
pure = True,
)
genrule(


@@ -61,6 +61,9 @@ var (
// This is called by bluepillHandler.
savedHandler uintptr
// savedSigsysHandler is a pointer to the previous handler of the SIGSYS signal.
savedSigsysHandler uintptr
// dieTrampolineAddr is the address of dieTrampoline.
dieTrampolineAddr uintptr
)


@@ -32,6 +32,8 @@
// This is checked as the source of the fault.
#define CLI $0xfa
#define SYS_MMAP 9
// See bluepill.go.
TEXT ·bluepill(SB),NOSPLIT,$0
begin:
@@ -95,6 +97,31 @@ TEXT ·addrOfSighandler(SB), $0-8
MOVQ AX, ret+0(FP)
RET
TEXT ·sigsysHandler(SB),NOSPLIT,$0
// Check that si_code is SYS_SECCOMP (1), i.e. the signal was raised by seccomp.
MOVQ $1, CX
CMPL CX, 0x8(SI)
JNE fallback
MOVL CONTEXT_RAX(DX), CX
CMPL CX, $SYS_MMAP
JNE fallback
PUSHQ DX // First argument (context).
CALL ·seccompMmapHandler(SB) // Call the handler.
POPQ DX // Discard the argument.
RET
fallback:
// Jump to the previous signal handler.
XORQ CX, CX
MOVQ ·savedSigsysHandler(SB), AX
JMP AX
// func addrOfSigsysHandler() uintptr
TEXT ·addrOfSigsysHandler(SB), $0-8
MOVQ $·sigsysHandler(SB), AX
MOVQ AX, ret+0(FP)
RET
// dieTrampoline: see bluepill.go, bluepill_amd64_unsafe.go for documentation.
TEXT ·dieTrampoline(SB),NOSPLIT,$0
PUSHQ BX // First argument (vCPU).
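The 0x8(SI) load in sigsysHandler above reads si_code from the siginfo_t that the kernel passes in SI. In Go terms, the head of the layout the stub depends on is the following (linux/amd64, offsets in bytes; only these fields are touched):

// Sketch of the start of linux's siginfo_t as seen by the stub.
type siginfoHead struct {
	Signo int32 // 0x0: signal number (SIGINFO_SIGNO in the arm64 file)
	Errno int32 // 0x4: unused here
	Code  int32 // 0x8: si_code; SYS_SECCOMP == 1 for seccomp traps
}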


@@ -29,9 +29,12 @@
// Only limited use of the context is done in the assembly stub below, most is
// done in the Go handlers.
#define SIGINFO_SIGNO 0x0
#define SIGINFO_CODE 0x8
#define CONTEXT_PC 0x1B8
#define CONTEXT_R0 0xB8
#define SYS_MMAP 222
// getTLS returns the value of TPIDR_EL0 register.
TEXT ·getTLS(SB),NOSPLIT,$0-8
MRS TPIDR_EL0, R1
@@ -98,6 +101,37 @@ TEXT ·addrOfSighandler(SB), $0-8
MOVD R0, ret+0(FP)
RET
// The arguments are the following:
//
// R0 - The signal number.
// R1 - Pointer to siginfo_t structure.
// R2 - Pointer to ucontext structure.
//
TEXT ·sigsysHandler(SB),NOSPLIT,$0
// si_code should be SYS_SECCOMP (== 1).
MOVD SIGINFO_CODE(R1), R7
CMPW $1, R7
BNE fallback
// The trapped syscall's number is still in R8: x8 is the arm64 syscall
// number register and is not clobbered by signal delivery.
CMPW $SYS_MMAP, R8
BNE fallback
MOVD R2, 8(RSP) // First argument (context).
BL ·seccompMmapHandler(SB) // Call the handler.
RET
fallback:
// Jump to the previous signal handler.
MOVD ·savedSigsysHandler(SB), R7
B (R7)
// func addrOfSigsysHandler() uintptr
TEXT ·addrOfSigsysHandler(SB), $0-8
MOVD $·sigsysHandler(SB), R0
MOVD R0, ret+0(FP)
RET
// dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation.
TEXT ·dieTrampoline(SB),NOSPLIT,$0
// R0: Fake the old PC as caller


@@ -193,36 +193,8 @@ func bluepillHandler(context unsafe.Pointer) {
return
}
// Increment the fault count.
atomic.AddUint32(&c.faults, 1)
// For MMIO, the physical address is the first data item.
physical = uintptr(c.runData.data[0])
virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE)
if !ok {
c.die(bluepillArchContext(context), "invalid physical address")
return
}
// We now need to fill in the data appropriately. KVM
// expects us to provide the result of the given MMIO
// operation in the runData struct. This is safe
// because, if a fault occurs here, the same fault
// would have occurred in guest mode. The kernel should
// not create invalid page table mappings.
data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1]))
length := (uintptr)((uint32)(c.runData.data[2]))
write := (uint8)(((c.runData.data[2] >> 32) & 0xff)) != 0
for i := uintptr(0); i < length; i++ {
b := bytePtr(uintptr(virtual) + i)
if write {
// Write to the given address.
*b = data[i]
} else {
// Read from the given address.
data[i] = *b
}
}
c.die(bluepillArchContext(context), "exit_mmio")
return
case _KVM_EXIT_IRQ_WINDOW_OPEN:
bluepillStopGuest(c)
case _KVM_EXIT_SHUTDOWN:


@@ -17,15 +17,19 @@ package kvm
import (
"fmt"
"runtime"
gosync "sync"
"sync/atomic"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/procid"
"gvisor.dev/gvisor/pkg/ring0"
"gvisor.dev/gvisor/pkg/ring0/pagetables"
"gvisor.dev/gvisor/pkg/safecopy"
"gvisor.dev/gvisor/pkg/seccomp"
ktime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -35,6 +39,9 @@ type machine struct {
// fd is the vm fd.
fd int
// machinePoolIndex is the index in the machinePool array.
machinePoolIndex uint32
// nextSlot is the next slot for setMemoryRegion.
//
// This must be accessed atomically. If nextSlot is ^uint32(0), then
@@ -231,6 +238,10 @@ func newMachine(vm int) (*machine, error) {
m.upperSharedPageTables.MarkReadOnlyShared()
m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)
// Install seccomp rules to trap runtime mmap system calls. They will
// be handled by seccompMmapHandler.
seccompMmapRules(m)
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -281,6 +292,12 @@ func newMachine(vm int) (*machine, error) {
return
}
}
// Take into account that the stack can grow down.
if vr.filename == "[stack]" {
vr.virtual -= 1 << 20
vr.length += 1 << 20
}
mapRegion(vr.region, 0)
})
@@ -352,6 +369,10 @@ func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalReg
func (m *machine) Destroy() {
runtime.SetFinalizer(m, nil)
machinePoolMu.Lock()
machinePool[m.machinePoolIndex].Store(nil)
machinePoolMu.Unlock()
// Destroy vCPUs.
for _, c := range m.vCPUsByID {
if c == nil {
@@ -683,3 +704,72 @@ func (c *vCPU) setSystemTimeLegacy() error {
}
}
}
const machinePoolSize = 16
// machinePool is enumerated by the seccompMmapHandler signal handler.
var (
machinePool [machinePoolSize]machineAtomicPtr
machinePoolLen uint32
machinePoolMu sync.Mutex
seccompMmapRulesOnce gosync.Once
)
func sigsysHandler()
func addrOfSigsysHandler() uintptr
// seccompMmapRules adds seccomp rules to trap mmap system calls that will be
// handled in seccompMmapHandler.
func seccompMmapRules(m *machine) {
seccompMmapRulesOnce.Do(func() {
// Install the handler.
if err := safecopy.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil {
panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
}
rules := []seccomp.RuleSet{}
rules = append(rules, []seccomp.RuleSet{
// Trap mmap system calls and handle them in seccompMmapHandler.
{
Rules: seccomp.SyscallRules{
unix.SYS_MMAP: {
{
seccomp.MatchAny{},
seccomp.MatchAny{},
seccomp.MatchAny{},
/* MAP_DENYWRITE is ignored and used only for filtering. */
seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0),
},
},
},
Action: linux.SECCOMP_RET_TRAP,
},
}...)
instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW)
if err != nil {
panic(fmt.Sprintf("failed to build rules: %v", err))
}
// Perform the actual installation.
if err := seccomp.SetFilter(instrs); err != nil {
panic(fmt.Sprintf("failed to set filter: %v", err))
}
})
machinePoolMu.Lock()
n := atomic.LoadUint32(&machinePoolLen)
i := uint32(0)
for ; i < n; i++ {
if machinePool[i].Load() == nil {
break
}
}
if i == n {
if i == machinePoolSize {
machinePoolMu.Unlock()
panic("machinePool is full")
}
atomic.AddUint32(&machinePoolLen, 1)
}
machinePool[i].Store(m)
m.machinePoolIndex = i
machinePoolMu.Unlock()
}
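A note on the MaskedEqual(unix.MAP_DENYWRITE, 0) rule above: it matches only mmap calls whose flags have MAP_DENYWRITE clear. seccompMmapSyscall (below) re-issues the trapped mmap with MAP_DENYWRITE set, so the re-issued call falls through the filter rather than trapping recursively. Restated as a plain Go helper (illustrative only, assuming golang.org/x/sys/unix):

// wouldTrap mirrors the filter's decision: trap mmap(2) only while
// MAP_DENYWRITE is clear, so the handler's own re-issued mmap passes.
func wouldTrap(sysno, flags uintptr) bool {
	return sysno == unix.SYS_MMAP && flags&unix.MAP_DENYWRITE == 0
}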


@@ -309,22 +309,6 @@ func loadByte(ptr *byte) byte {
return *ptr
}
// prefaultFloatingPointState touches each page of the floating point state to
// be sure that its physical pages are mapped.
//
// Otherwise the kernel can trigger KVM_EXIT_MMIO and an instruction that
// triggered a fault will be emulated by the kvm kernel code, but it can't
// emulate instructions like xsave and xrstor.
//
//go:nosplit
func prefaultFloatingPointState(data *fpu.State) {
size := len(*data)
for i := 0; i < size; i += hostarch.PageSize {
loadByte(&(*data)[i])
}
loadByte(&(*data)[size-1])
}
// SwitchToUser unpacks architectural-details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
// Check for canonical addresses.
@@ -355,11 +339,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo)
// allocations occur.
entersyscall()
bluepill(c)
// The root table physical page has to be mapped to not fault in iret
// or sysret after switching into a user address space. sysret and
// iret are in the upper half that is global and already mapped.
switchOpts.PageTables.PrefaultRootTable()
prefaultFloatingPointState(switchOpts.FloatingPointState)
vector = c.CPU.SwitchToUser(switchOpts)
exitsyscall()


@@ -161,3 +161,15 @@ func (c *vCPU) getSystemRegisters(sregs *systemRegs) unix.Errno {
}
return 0
}
//go:nosplit
func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) {
ctx := bluepillArchContext(context)
// MAP_DENYWRITE is deprecated and ignored by the kernel. It is set here
// only so the seccomp filter does not trap this re-issued call.
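// amd64 syscall convention: RAX holds the syscall number and RDI, RSI,
// RDX, R10, R8, R9 hold arguments 1-6.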
addr, _, e := unix.RawSyscall6(uintptr(ctx.Rax), uintptr(ctx.Rdi), uintptr(ctx.Rsi),
uintptr(ctx.Rdx), uintptr(ctx.R10)|unix.MAP_DENYWRITE, uintptr(ctx.R8), uintptr(ctx.R9))
ctx.Rax = uint64(addr)
return addr, uintptr(ctx.Rsi), e
}


@@ -333,3 +333,15 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo)
}
}
//go:nosplit
func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) {
ctx := bluepillArchContext(context)
// MAP_DENYWRITE is deprecated and ignored by the kernel. It is set here
// only so the seccomp filter does not trap this re-issued call.
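// arm64 syscall convention: x8 (Regs[8]) holds the syscall number and
// x0-x5 (Regs[0:6]) hold arguments 1-6.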
addr, _, e := unix.RawSyscall6(uintptr(ctx.Regs[8]), uintptr(ctx.Regs[0]), uintptr(ctx.Regs[1]),
uintptr(ctx.Regs[2]), uintptr(ctx.Regs[3])|unix.MAP_DENYWRITE, uintptr(ctx.Regs[4]), uintptr(ctx.Regs[5]))
ctx.Regs[0] = uint64(addr)
return addr, uintptr(ctx.Regs[1]), e
}


@@ -171,3 +171,46 @@ func (c *vCPU) setSignalMask() error {
return nil
}
// seccompMmapHandler is a signal handler for runtime mmap system calls
// that are trapped by seccomp.
//
// It executes the mmap syscall with the specified arguments and maps the
// new region into all live guests.
//
//go:nosplit
func seccompMmapHandler(context unsafe.Pointer) {
addr, length, errno := seccompMmapSyscall(context)
if errno != 0 {
return
}
for i := uint32(0); i < atomic.LoadUint32(&machinePoolLen); i++ {
m := machinePool[i].Load()
if m == nil {
continue
}
// Map the new region to the guest.
vr := region{
virtual: addr,
length: length,
}
for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
physical, length, ok := translateToPhysical(virtual)
if !ok {
// This must be an invalid region that was
// knocked out by creation of the physical map.
return
}
if virtual+length > vr.virtual+vr.length {
// Cap the length to the end of the area.
length = vr.virtual + vr.length - virtual
}
// Ensure the physical range is mapped.
m.mapPhysical(physical, length, physicalRegions, _KVM_MEM_FLAGS_NONE)
virtual += length
}
}
}
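The loop above walks the new virtual range in physically contiguous chunks, capping the final chunk at the end of the range. The same walk with the kvm-specific types stripped away (an illustrative sketch; translate stands in for translateToPhysical):

// chunkedWalk visits [virtual, virtual+length) piece by piece, where each
// piece is the physically contiguous run reported by translate.
func chunkedWalk(virtual, length uintptr,
	translate func(v uintptr) (physical, size uintptr, ok bool),
	visit func(physical, size uintptr)) bool {
	for v := virtual; v < virtual+length; {
		p, s, ok := translate(v)
		if !ok {
			return false // a hole in the physical map
		}
		if v+s > virtual+length {
			s = virtual + length - v // cap at the end of the range
		}
		visit(p, s)
		v += s
	}
	return true
}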


@@ -37,6 +37,8 @@ func (p *AtomicPtr) loadPtr(v *Value) {
// Load returns the value set by the most recent Store. It returns nil if there
// has been no previous call to Store.
//
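// Load is nosplit because it is called from seccompMmapHandler while a
// SIGSYS is being handled, where growing the stack is not safe.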
//go:nosplit
func (p *AtomicPtr) Load() *Value {
return (*Value)(atomic.LoadPointer(&p.ptr))
}