Implement membarrier(2) commands other than *_SYNC_CORE.
Updates #267 PiperOrigin-RevId: 335713923
This commit is contained in:
parent
3dc3fb2375
commit
1336af78d5
|
@ -38,6 +38,7 @@ go_library(
|
||||||
"ipc.go",
|
"ipc.go",
|
||||||
"limits.go",
|
"limits.go",
|
||||||
"linux.go",
|
"linux.go",
|
||||||
|
"membarrier.go",
|
||||||
"mm.go",
|
"mm.go",
|
||||||
"netdevice.go",
|
"netdevice.go",
|
||||||
"netfilter.go",
|
"netfilter.go",
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
// Copyright 2020 The gVisor Authors.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package linux
|
||||||
|
|
||||||
|
// membarrier(2) commands, from include/uapi/linux/membarrier.h
|
||||||
|
const (
|
||||||
|
MEMBARRIER_CMD_QUERY = 0
|
||||||
|
MEMBARRIER_CMD_GLOBAL = (1 << 0)
|
||||||
|
MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1)
|
||||||
|
MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2)
|
||||||
|
MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3)
|
||||||
|
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4)
|
||||||
|
MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5)
|
||||||
|
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6)
|
||||||
|
)
|
|
@ -7,11 +7,14 @@ go_library(
|
||||||
srcs = [
|
srcs = [
|
||||||
"cgroup.go",
|
"cgroup.go",
|
||||||
"hostmm.go",
|
"hostmm.go",
|
||||||
|
"membarrier.go",
|
||||||
],
|
],
|
||||||
visibility = ["//pkg/sentry:internal"],
|
visibility = ["//pkg/sentry:internal"],
|
||||||
deps = [
|
deps = [
|
||||||
|
"//pkg/abi/linux",
|
||||||
"//pkg/fd",
|
"//pkg/fd",
|
||||||
"//pkg/log",
|
"//pkg/log",
|
||||||
"//pkg/usermem",
|
"//pkg/usermem",
|
||||||
|
"@org_golang_x_sys//unix:go_default_library",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@ -0,0 +1,90 @@
|
||||||
|
// Copyright 2020 The gVisor Authors.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package hostmm
|
||||||
|
|
||||||
|
import (
|
||||||
|
"syscall"
|
||||||
|
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
||||||
|
"gvisor.dev/gvisor/pkg/log"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
haveMembarrierGlobal = false
|
||||||
|
haveMembarrierPrivateExpedited = false
|
||||||
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
supported, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_QUERY, 0 /* flags */, 0 /* unused */)
|
||||||
|
if e != 0 {
|
||||||
|
if e != syscall.ENOSYS {
|
||||||
|
log.Warningf("membarrier(MEMBARRIER_CMD_QUERY) failed: %s", e.Error())
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// We don't use MEMBARRIER_CMD_GLOBAL_EXPEDITED because this sends IPIs to
|
||||||
|
// all CPUs running tasks that have previously invoked
|
||||||
|
// MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, which presents a DOS risk.
|
||||||
|
// (MEMBARRIER_CMD_GLOBAL is synchronize_rcu(), i.e. it waits for an RCU
|
||||||
|
// grace period to elapse without bothering other CPUs.
|
||||||
|
// MEMBARRIER_CMD_PRIVATE_EXPEDITED sends IPIs only to CPUs running tasks
|
||||||
|
// sharing the caller's MM.)
|
||||||
|
if supported&linux.MEMBARRIER_CMD_GLOBAL != 0 {
|
||||||
|
haveMembarrierGlobal = true
|
||||||
|
}
|
||||||
|
if req := uintptr(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED); supported&req == req {
|
||||||
|
if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
|
||||||
|
log.Warningf("membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) failed: %s", e.Error())
|
||||||
|
} else {
|
||||||
|
haveMembarrierPrivateExpedited = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// HaveGlobalMemoryBarrier returns true if GlobalMemoryBarrier is supported.
|
||||||
|
func HaveGlobalMemoryBarrier() bool {
|
||||||
|
return haveMembarrierGlobal
|
||||||
|
}
|
||||||
|
|
||||||
|
// GlobalMemoryBarrier blocks until "all running threads [in the host OS] have
|
||||||
|
// passed through a state where all memory accesses to user-space addresses
|
||||||
|
// match program order between entry to and return from [GlobalMemoryBarrier]",
|
||||||
|
// as for membarrier(2).
|
||||||
|
//
|
||||||
|
// Preconditions: HaveGlobalMemoryBarrier() == true.
|
||||||
|
func GlobalMemoryBarrier() error {
|
||||||
|
if _, _, e := syscall.Syscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_GLOBAL, 0 /* flags */, 0 /* unused */); e != 0 {
|
||||||
|
return e
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// HaveProcessMemoryBarrier returns true if ProcessMemoryBarrier is supported.
|
||||||
|
func HaveProcessMemoryBarrier() bool {
|
||||||
|
return haveMembarrierPrivateExpedited
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProcessMemoryBarrier is equivalent to GlobalMemoryBarrier, but only
|
||||||
|
// synchronizes with threads sharing a virtual address space (from the host OS'
|
||||||
|
// perspective) with the calling thread.
|
||||||
|
//
|
||||||
|
// Preconditions: HaveProcessMemoryBarrier() == true.
|
||||||
|
func ProcessMemoryBarrier() error {
|
||||||
|
if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
|
||||||
|
return e
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -235,6 +235,14 @@ type MemoryManager struct {
|
||||||
|
|
||||||
// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
|
// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
|
||||||
vdsoSigReturnAddr uint64
|
vdsoSigReturnAddr uint64
|
||||||
|
|
||||||
|
// membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has
|
||||||
|
// previously been called. Since, as of this writing,
|
||||||
|
// MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory
|
||||||
|
// barrier, membarrierPrivateEnabled has no other effect.
|
||||||
|
//
|
||||||
|
// membarrierPrivateEnabled is accessed using atomic memory operations.
|
||||||
|
membarrierPrivateEnabled uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
// vma represents a virtual memory area.
|
// vma represents a virtual memory area.
|
||||||
|
|
|
@ -17,6 +17,7 @@ package mm
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
mrand "math/rand"
|
mrand "math/rand"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
"gvisor.dev/gvisor/pkg/abi/linux"
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
||||||
"gvisor.dev/gvisor/pkg/context"
|
"gvisor.dev/gvisor/pkg/context"
|
||||||
|
@ -1274,3 +1275,15 @@ func (mm *MemoryManager) VirtualDataSize() uint64 {
|
||||||
defer mm.mappingMu.RUnlock()
|
defer mm.mappingMu.RUnlock()
|
||||||
return mm.dataAS
|
return mm.dataAS
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to
|
||||||
|
// return true.
|
||||||
|
func (mm *MemoryManager) EnableMembarrierPrivate() {
|
||||||
|
atomic.StoreUint32(&mm.membarrierPrivateEnabled, 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has
|
||||||
|
// previously been called.
|
||||||
|
func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool {
|
||||||
|
return atomic.LoadUint32(&mm.membarrierPrivateEnabled) != 0
|
||||||
|
}
|
||||||
|
|
|
@ -15,6 +15,7 @@ go_library(
|
||||||
"//pkg/context",
|
"//pkg/context",
|
||||||
"//pkg/seccomp",
|
"//pkg/seccomp",
|
||||||
"//pkg/sentry/arch",
|
"//pkg/sentry/arch",
|
||||||
|
"//pkg/sentry/hostmm",
|
||||||
"//pkg/sentry/memmap",
|
"//pkg/sentry/memmap",
|
||||||
"//pkg/usermem",
|
"//pkg/usermem",
|
||||||
],
|
],
|
||||||
|
|
|
@ -56,6 +56,7 @@ go_library(
|
||||||
"//pkg/sentry/time",
|
"//pkg/sentry/time",
|
||||||
"//pkg/sync",
|
"//pkg/sync",
|
||||||
"//pkg/usermem",
|
"//pkg/usermem",
|
||||||
|
"@org_golang_x_sys//unix:go_default_library",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -17,14 +17,23 @@ package kvm
|
||||||
import (
|
import (
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
|
||||||
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
||||||
"gvisor.dev/gvisor/pkg/seccomp"
|
"gvisor.dev/gvisor/pkg/seccomp"
|
||||||
)
|
)
|
||||||
|
|
||||||
// SyscallFilters returns syscalls made exclusively by the KVM platform.
|
// SyscallFilters returns syscalls made exclusively by the KVM platform.
|
||||||
func (*KVM) SyscallFilters() seccomp.SyscallRules {
|
func (*KVM) SyscallFilters() seccomp.SyscallRules {
|
||||||
return seccomp.SyscallRules{
|
return seccomp.SyscallRules{
|
||||||
syscall.SYS_ARCH_PRCTL: {},
|
syscall.SYS_ARCH_PRCTL: {},
|
||||||
syscall.SYS_IOCTL: {},
|
syscall.SYS_IOCTL: {},
|
||||||
|
unix.SYS_MEMBARRIER: []seccomp.Rule{
|
||||||
|
{
|
||||||
|
seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED),
|
||||||
|
seccomp.EqualTo(0),
|
||||||
|
},
|
||||||
|
},
|
||||||
syscall.SYS_MMAP: {},
|
syscall.SYS_MMAP: {},
|
||||||
syscall.SYS_RT_SIGSUSPEND: {},
|
syscall.SYS_RT_SIGSUSPEND: {},
|
||||||
syscall.SYS_RT_SIGTIMEDWAIT: {},
|
syscall.SYS_RT_SIGTIMEDWAIT: {},
|
||||||
|
|
|
@ -17,13 +17,22 @@ package kvm
|
||||||
import (
|
import (
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
|
||||||
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
||||||
"gvisor.dev/gvisor/pkg/seccomp"
|
"gvisor.dev/gvisor/pkg/seccomp"
|
||||||
)
|
)
|
||||||
|
|
||||||
// SyscallFilters returns syscalls made exclusively by the KVM platform.
|
// SyscallFilters returns syscalls made exclusively by the KVM platform.
|
||||||
func (*KVM) SyscallFilters() seccomp.SyscallRules {
|
func (*KVM) SyscallFilters() seccomp.SyscallRules {
|
||||||
return seccomp.SyscallRules{
|
return seccomp.SyscallRules{
|
||||||
syscall.SYS_IOCTL: {},
|
syscall.SYS_IOCTL: {},
|
||||||
|
unix.SYS_MEMBARRIER: []seccomp.Rule{
|
||||||
|
{
|
||||||
|
seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED),
|
||||||
|
seccomp.EqualTo(0),
|
||||||
|
},
|
||||||
|
},
|
||||||
syscall.SYS_MMAP: {},
|
syscall.SYS_MMAP: {},
|
||||||
syscall.SYS_RT_SIGSUSPEND: {},
|
syscall.SYS_RT_SIGSUSPEND: {},
|
||||||
syscall.SYS_RT_SIGTIMEDWAIT: {},
|
syscall.SYS_RT_SIGTIMEDWAIT: {},
|
||||||
|
|
|
@ -63,6 +63,9 @@ type runData struct {
|
||||||
type KVM struct {
|
type KVM struct {
|
||||||
platform.NoCPUPreemptionDetection
|
platform.NoCPUPreemptionDetection
|
||||||
|
|
||||||
|
// KVM never changes mm_structs.
|
||||||
|
platform.UseHostProcessMemoryBarrier
|
||||||
|
|
||||||
// machine is the backing VM.
|
// machine is the backing VM.
|
||||||
machine *machine
|
machine *machine
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ import (
|
||||||
"gvisor.dev/gvisor/pkg/context"
|
"gvisor.dev/gvisor/pkg/context"
|
||||||
"gvisor.dev/gvisor/pkg/seccomp"
|
"gvisor.dev/gvisor/pkg/seccomp"
|
||||||
"gvisor.dev/gvisor/pkg/sentry/arch"
|
"gvisor.dev/gvisor/pkg/sentry/arch"
|
||||||
|
"gvisor.dev/gvisor/pkg/sentry/hostmm"
|
||||||
"gvisor.dev/gvisor/pkg/sentry/memmap"
|
"gvisor.dev/gvisor/pkg/sentry/memmap"
|
||||||
"gvisor.dev/gvisor/pkg/usermem"
|
"gvisor.dev/gvisor/pkg/usermem"
|
||||||
)
|
)
|
||||||
|
@ -52,6 +53,10 @@ type Platform interface {
|
||||||
// can reliably return ErrContextCPUPreempted.
|
// can reliably return ErrContextCPUPreempted.
|
||||||
DetectsCPUPreemption() bool
|
DetectsCPUPreemption() bool
|
||||||
|
|
||||||
|
// HaveGlobalMemoryBarrier returns true if the GlobalMemoryBarrier method
|
||||||
|
// is supported.
|
||||||
|
HaveGlobalMemoryBarrier() bool
|
||||||
|
|
||||||
// MapUnit returns the alignment used for optional mappings into this
|
// MapUnit returns the alignment used for optional mappings into this
|
||||||
// platform's AddressSpaces. Higher values indicate lower per-page costs
|
// platform's AddressSpaces. Higher values indicate lower per-page costs
|
||||||
// for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates
|
// for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates
|
||||||
|
@ -97,6 +102,15 @@ type Platform interface {
|
||||||
// called.
|
// called.
|
||||||
PreemptAllCPUs() error
|
PreemptAllCPUs() error
|
||||||
|
|
||||||
|
// GlobalMemoryBarrier blocks until all threads running application code
|
||||||
|
// (via Context.Switch) and all task goroutines "have passed through a
|
||||||
|
// state where all memory accesses to user-space addresses match program
|
||||||
|
// order between entry to and return from [GlobalMemoryBarrier]", as for
|
||||||
|
// membarrier(2).
|
||||||
|
//
|
||||||
|
// Preconditions: HaveGlobalMemoryBarrier() == true.
|
||||||
|
GlobalMemoryBarrier() error
|
||||||
|
|
||||||
// SyscallFilters returns syscalls made exclusively by this platform.
|
// SyscallFilters returns syscalls made exclusively by this platform.
|
||||||
SyscallFilters() seccomp.SyscallRules
|
SyscallFilters() seccomp.SyscallRules
|
||||||
}
|
}
|
||||||
|
@ -115,6 +129,43 @@ func (NoCPUPreemptionDetection) PreemptAllCPUs() error {
|
||||||
panic("This platform does not support CPU preemption detection")
|
panic("This platform does not support CPU preemption detection")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// UseHostGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
|
||||||
|
// Platform.GlobalMemoryBarrier by invoking equivalent functionality on the
|
||||||
|
// host.
|
||||||
|
type UseHostGlobalMemoryBarrier struct{}
|
||||||
|
|
||||||
|
// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
|
||||||
|
func (UseHostGlobalMemoryBarrier) HaveGlobalMemoryBarrier() bool {
|
||||||
|
return hostmm.HaveGlobalMemoryBarrier()
|
||||||
|
}
|
||||||
|
|
||||||
|
// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
|
||||||
|
func (UseHostGlobalMemoryBarrier) GlobalMemoryBarrier() error {
|
||||||
|
return hostmm.GlobalMemoryBarrier()
|
||||||
|
}
|
||||||
|
|
||||||
|
// UseHostProcessMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
|
||||||
|
// Platform.GlobalMemoryBarrier by invoking a process-local memory barrier.
|
||||||
|
// This is faster than UseHostGlobalMemoryBarrier, but is only appropriate for
|
||||||
|
// platforms for which application code executes while using the sentry's
|
||||||
|
// mm_struct.
|
||||||
|
type UseHostProcessMemoryBarrier struct{}
|
||||||
|
|
||||||
|
// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
|
||||||
|
func (UseHostProcessMemoryBarrier) HaveGlobalMemoryBarrier() bool {
|
||||||
|
// Fall back to a global memory barrier if a process-local one isn't
|
||||||
|
// available.
|
||||||
|
return hostmm.HaveProcessMemoryBarrier() || hostmm.HaveGlobalMemoryBarrier()
|
||||||
|
}
|
||||||
|
|
||||||
|
// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
|
||||||
|
func (UseHostProcessMemoryBarrier) GlobalMemoryBarrier() error {
|
||||||
|
if hostmm.HaveProcessMemoryBarrier() {
|
||||||
|
return hostmm.ProcessMemoryBarrier()
|
||||||
|
}
|
||||||
|
return hostmm.GlobalMemoryBarrier()
|
||||||
|
}
|
||||||
|
|
||||||
// MemoryManager represents an abstraction above the platform address space
|
// MemoryManager represents an abstraction above the platform address space
|
||||||
// which manages memory mappings and their contents.
|
// which manages memory mappings and their contents.
|
||||||
type MemoryManager interface {
|
type MemoryManager interface {
|
||||||
|
|
|
@ -192,6 +192,7 @@ func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {}
|
||||||
type PTrace struct {
|
type PTrace struct {
|
||||||
platform.MMapMinAddr
|
platform.MMapMinAddr
|
||||||
platform.NoCPUPreemptionDetection
|
platform.NoCPUPreemptionDetection
|
||||||
|
platform.UseHostGlobalMemoryBarrier
|
||||||
}
|
}
|
||||||
|
|
||||||
// New returns a new ptrace-based implementation of the platform interface.
|
// New returns a new ptrace-based implementation of the platform interface.
|
||||||
|
|
|
@ -21,6 +21,7 @@ go_library(
|
||||||
"sys_identity.go",
|
"sys_identity.go",
|
||||||
"sys_inotify.go",
|
"sys_inotify.go",
|
||||||
"sys_lseek.go",
|
"sys_lseek.go",
|
||||||
|
"sys_membarrier.go",
|
||||||
"sys_mempolicy.go",
|
"sys_mempolicy.go",
|
||||||
"sys_mmap.go",
|
"sys_mmap.go",
|
||||||
"sys_mount.go",
|
"sys_mount.go",
|
||||||
|
|
|
@ -376,7 +376,7 @@ var AMD64 = &kernel.SyscallTable{
|
||||||
321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
|
321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
|
||||||
322: syscalls.Supported("execveat", Execveat),
|
322: syscalls.Supported("execveat", Execveat),
|
||||||
323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
|
323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
|
||||||
324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
|
324: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil),
|
||||||
325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
|
325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
|
||||||
|
|
||||||
// Syscalls implemented after 325 are "backports" from versions
|
// Syscalls implemented after 325 are "backports" from versions
|
||||||
|
@ -695,7 +695,7 @@ var ARM64 = &kernel.SyscallTable{
|
||||||
280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
|
280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
|
||||||
281: syscalls.Supported("execveat", Execveat),
|
281: syscalls.Supported("execveat", Execveat),
|
||||||
282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
|
282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
|
||||||
283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
|
283: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil),
|
||||||
284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
|
284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
|
||||||
|
|
||||||
// Syscalls after 284 are "backports" from versions of Linux after 4.4.
|
// Syscalls after 284 are "backports" from versions of Linux after 4.4.
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright 2020 The gVisor Authors.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package linux
|
||||||
|
|
||||||
|
import (
|
||||||
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
||||||
|
"gvisor.dev/gvisor/pkg/sentry/arch"
|
||||||
|
"gvisor.dev/gvisor/pkg/sentry/kernel"
|
||||||
|
"gvisor.dev/gvisor/pkg/syserror"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Membarrier implements syscall membarrier(2).
|
||||||
|
func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
|
||||||
|
cmd := args[0].Int()
|
||||||
|
flags := args[1].Int()
|
||||||
|
|
||||||
|
p := t.Kernel().Platform
|
||||||
|
if !p.HaveGlobalMemoryBarrier() {
|
||||||
|
// Event for applications that want membarrier on a configuration that
|
||||||
|
// doesn't support them.
|
||||||
|
t.Kernel().EmitUnimplementedEvent(t)
|
||||||
|
return 0, nil, syserror.ENOSYS
|
||||||
|
}
|
||||||
|
|
||||||
|
if flags != 0 {
|
||||||
|
return 0, nil, syserror.EINVAL
|
||||||
|
}
|
||||||
|
|
||||||
|
switch cmd {
|
||||||
|
case linux.MEMBARRIER_CMD_QUERY:
|
||||||
|
const supportedCommands = linux.MEMBARRIER_CMD_GLOBAL |
|
||||||
|
linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED |
|
||||||
|
linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |
|
||||||
|
linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED |
|
||||||
|
linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED
|
||||||
|
return supportedCommands, nil, nil
|
||||||
|
case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED:
|
||||||
|
if !t.MemoryManager().IsMembarrierPrivateEnabled() {
|
||||||
|
return 0, nil, syserror.EPERM
|
||||||
|
}
|
||||||
|
fallthrough
|
||||||
|
case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED:
|
||||||
|
return 0, nil, p.GlobalMemoryBarrier()
|
||||||
|
case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
|
||||||
|
// no-op
|
||||||
|
return 0, nil, nil
|
||||||
|
case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
|
||||||
|
t.MemoryManager().EnableMembarrierPrivate()
|
||||||
|
return 0, nil, nil
|
||||||
|
case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
|
||||||
|
// We're aware of these, but they aren't implemented since no platform
|
||||||
|
// supports them yet.
|
||||||
|
t.Kernel().EmitUnimplementedEvent(t)
|
||||||
|
fallthrough
|
||||||
|
default:
|
||||||
|
return 0, nil, syserror.EINVAL
|
||||||
|
}
|
||||||
|
}
|
|
@ -162,6 +162,12 @@ var allowedSyscalls = seccomp.SyscallRules{
|
||||||
},
|
},
|
||||||
syscall.SYS_LSEEK: {},
|
syscall.SYS_LSEEK: {},
|
||||||
syscall.SYS_MADVISE: {},
|
syscall.SYS_MADVISE: {},
|
||||||
|
unix.SYS_MEMBARRIER: []seccomp.Rule{
|
||||||
|
{
|
||||||
|
seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL),
|
||||||
|
seccomp.EqualTo(0),
|
||||||
|
},
|
||||||
|
},
|
||||||
syscall.SYS_MINCORE: {},
|
syscall.SYS_MINCORE: {},
|
||||||
// Used by the Go runtime as a temporarily workaround for a Linux
|
// Used by the Go runtime as a temporarily workaround for a Linux
|
||||||
// 5.2-5.4 bug.
|
// 5.2-5.4 bug.
|
||||||
|
|
|
@ -285,6 +285,10 @@ syscall_test(
|
||||||
test = "//test/syscalls/linux:madvise_test",
|
test = "//test/syscalls/linux:madvise_test",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
syscall_test(
|
||||||
|
test = "//test/syscalls/linux:membarrier_test",
|
||||||
|
)
|
||||||
|
|
||||||
syscall_test(
|
syscall_test(
|
||||||
test = "//test/syscalls/linux:memory_accounting_test",
|
test = "//test/syscalls/linux:memory_accounting_test",
|
||||||
)
|
)
|
||||||
|
|
|
@ -1155,6 +1155,24 @@ cc_binary(
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
cc_binary(
|
||||||
|
name = "membarrier_test",
|
||||||
|
testonly = 1,
|
||||||
|
srcs = ["membarrier.cc"],
|
||||||
|
linkstatic = 1,
|
||||||
|
deps = [
|
||||||
|
"@com_google_absl//absl/time",
|
||||||
|
gtest,
|
||||||
|
"//test/util:cleanup",
|
||||||
|
"//test/util:logging",
|
||||||
|
"//test/util:memory_util",
|
||||||
|
"//test/util:posix_error",
|
||||||
|
"//test/util:test_main",
|
||||||
|
"//test/util:test_util",
|
||||||
|
"//test/util:thread_util",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
cc_binary(
|
cc_binary(
|
||||||
name = "mempolicy_test",
|
name = "mempolicy_test",
|
||||||
testonly = 1,
|
testonly = 1,
|
||||||
|
|
|
@ -0,0 +1,268 @@
|
||||||
|
// Copyright 2020 The gVisor Authors.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include <errno.h>
|
||||||
|
#include <signal.h>
|
||||||
|
#include <sys/syscall.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
|
#include "absl/time/clock.h"
|
||||||
|
#include "absl/time/time.h"
|
||||||
|
#include "test/util/cleanup.h"
|
||||||
|
#include "test/util/logging.h"
|
||||||
|
#include "test/util/memory_util.h"
|
||||||
|
#include "test/util/posix_error.h"
|
||||||
|
#include "test/util/test_util.h"
|
||||||
|
#include "test/util/thread_util.h"
|
||||||
|
|
||||||
|
namespace gvisor {
|
||||||
|
namespace testing {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// This is the classic test case for memory fences on architectures with total
|
||||||
|
// store ordering; see e.g. Intel SDM Vol. 3A Sec. 8.2.3.4 "Loads May Be
|
||||||
|
// Reordered with Earlier Stores to Different Locations". In each iteration of
|
||||||
|
// the test, given two variables X and Y initially set to 0
|
||||||
|
// (MembarrierTestSharedState::local_var and remote_var in the code), two
|
||||||
|
// threads execute as follows:
|
||||||
|
//
|
||||||
|
// T1 T2
|
||||||
|
// -- --
|
||||||
|
//
|
||||||
|
// X = 1 Y = 1
|
||||||
|
// T1fence() T2fence()
|
||||||
|
// read Y read X
|
||||||
|
//
|
||||||
|
// On architectures where memory writes may be locally buffered by each CPU
|
||||||
|
// (essentially all architectures), if T1fence() and T2fence() are omitted or
|
||||||
|
// ineffective, it is possible for both T1 and T2 to read 0 because the memory
|
||||||
|
// write from the other CPU is not yet visible outside that CPU. T1fence() and
|
||||||
|
// T2fence() are expected to perform the necessary synchronization to restore
|
||||||
|
// sequential consistency: both threads agree on a order of memory accesses that
|
||||||
|
// is consistent with program order in each thread, such that at least one
|
||||||
|
// thread reads 1.
|
||||||
|
//
|
||||||
|
// In the NoMembarrier test, T1fence() and T2fence() are both ordinary memory
|
||||||
|
// fences establishing ordering between memory accesses before and after the
|
||||||
|
// fence (std::atomic_thread_fence). In all other test cases, T1fence() is not a
|
||||||
|
// memory fence at all, but only prevents compiler reordering of memory accesses
|
||||||
|
// (std::atomic_signal_fence); T2fence() is an invocation of the membarrier()
|
||||||
|
// syscall, which establishes ordering of memory accesses before and after the
|
||||||
|
// syscall on both threads.
|
||||||
|
|
||||||
|
template <typename F>
|
||||||
|
int DoMembarrierTestSide(std::atomic<int>* our_var,
|
||||||
|
std::atomic<int> const& their_var,
|
||||||
|
F const& test_fence) {
|
||||||
|
our_var->store(1, std::memory_order_relaxed);
|
||||||
|
test_fence();
|
||||||
|
return their_var.load(std::memory_order_relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct MembarrierTestSharedState {
|
||||||
|
std::atomic<int64_t> remote_iter_cur;
|
||||||
|
std::atomic<int64_t> remote_iter_done;
|
||||||
|
std::atomic<int> local_var;
|
||||||
|
std::atomic<int> remote_var;
|
||||||
|
int remote_obs_of_local_var;
|
||||||
|
|
||||||
|
void Init() {
|
||||||
|
remote_iter_cur.store(-1, std::memory_order_relaxed);
|
||||||
|
remote_iter_done.store(-1, std::memory_order_relaxed);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Special value for MembarrierTestSharedState::remote_iter_cur indicating that
|
||||||
|
// the remote thread should terminate.
|
||||||
|
constexpr int64_t kRemoteIterStop = -2;
|
||||||
|
|
||||||
|
// Must be async-signal-safe.
|
||||||
|
template <typename F>
|
||||||
|
void RunMembarrierTestRemoteSide(MembarrierTestSharedState* state,
|
||||||
|
F const& test_fence) {
|
||||||
|
int64_t i = 0;
|
||||||
|
int64_t cur;
|
||||||
|
while (true) {
|
||||||
|
while ((cur = state->remote_iter_cur.load(std::memory_order_acquire)) < i) {
|
||||||
|
if (cur == kRemoteIterStop) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// spin
|
||||||
|
}
|
||||||
|
state->remote_obs_of_local_var =
|
||||||
|
DoMembarrierTestSide(&state->remote_var, state->local_var, test_fence);
|
||||||
|
state->remote_iter_done.store(i, std::memory_order_release);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename F>
|
||||||
|
void RunMembarrierTestLocalSide(MembarrierTestSharedState* state,
|
||||||
|
F const& test_fence) {
|
||||||
|
// On test completion, instruct the remote thread to terminate.
|
||||||
|
Cleanup cleanup_remote([&] {
|
||||||
|
state->remote_iter_cur.store(kRemoteIterStop, std::memory_order_relaxed);
|
||||||
|
});
|
||||||
|
|
||||||
|
int64_t i = 0;
|
||||||
|
absl::Time end = absl::Now() + absl::Seconds(5); // arbitrary test duration
|
||||||
|
while (absl::Now() < end) {
|
||||||
|
// Reset both vars to 0.
|
||||||
|
state->local_var.store(0, std::memory_order_relaxed);
|
||||||
|
state->remote_var.store(0, std::memory_order_relaxed);
|
||||||
|
// Instruct the remote thread to begin this iteration.
|
||||||
|
state->remote_iter_cur.store(i, std::memory_order_release);
|
||||||
|
// Perform our side of the test.
|
||||||
|
auto local_obs_of_remote_var =
|
||||||
|
DoMembarrierTestSide(&state->local_var, state->remote_var, test_fence);
|
||||||
|
// Wait for the remote thread to finish this iteration.
|
||||||
|
while (state->remote_iter_done.load(std::memory_order_acquire) < i) {
|
||||||
|
// spin
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(local_obs_of_remote_var != 0 ||
|
||||||
|
state->remote_obs_of_local_var != 0);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(MembarrierTest, NoMembarrier) {
|
||||||
|
MembarrierTestSharedState state;
|
||||||
|
state.Init();
|
||||||
|
|
||||||
|
ScopedThread remote_thread([&] {
|
||||||
|
RunMembarrierTestRemoteSide(
|
||||||
|
&state, [] { std::atomic_thread_fence(std::memory_order_seq_cst); });
|
||||||
|
});
|
||||||
|
RunMembarrierTestLocalSide(
|
||||||
|
&state, [] { std::atomic_thread_fence(std::memory_order_seq_cst); });
|
||||||
|
}
|
||||||
|
|
||||||
|
enum membarrier_cmd {
|
||||||
|
MEMBARRIER_CMD_QUERY = 0,
|
||||||
|
MEMBARRIER_CMD_GLOBAL = (1 << 0),
|
||||||
|
MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1),
|
||||||
|
MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2),
|
||||||
|
MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
|
||||||
|
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
|
||||||
|
};
|
||||||
|
|
||||||
|
int membarrier(membarrier_cmd cmd, int flags) {
|
||||||
|
return syscall(SYS_membarrier, cmd, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
PosixErrorOr<int> SupportedMembarrierCommands() {
|
||||||
|
int cmds = membarrier(MEMBARRIER_CMD_QUERY, 0);
|
||||||
|
if (cmds < 0) {
|
||||||
|
if (errno == ENOSYS) {
|
||||||
|
// No commands are supported.
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return PosixError(errno, "membarrier(MEMBARRIER_CMD_QUERY) failed");
|
||||||
|
}
|
||||||
|
return cmds;
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(MembarrierTest, Global) {
|
||||||
|
SKIP_IF((ASSERT_NO_ERRNO_AND_VALUE(SupportedMembarrierCommands()) &
|
||||||
|
MEMBARRIER_CMD_GLOBAL) == 0);
|
||||||
|
|
||||||
|
Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
|
||||||
|
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED));
|
||||||
|
auto state = static_cast<MembarrierTestSharedState*>(m.ptr());
|
||||||
|
state->Init();
|
||||||
|
|
||||||
|
pid_t const child_pid = fork();
|
||||||
|
if (child_pid == 0) {
|
||||||
|
// In child process.
|
||||||
|
RunMembarrierTestRemoteSide(
|
||||||
|
state, [] { TEST_PCHECK(membarrier(MEMBARRIER_CMD_GLOBAL, 0) == 0); });
|
||||||
|
_exit(0);
|
||||||
|
}
|
||||||
|
// In parent process.
|
||||||
|
ASSERT_THAT(child_pid, SyscallSucceeds());
|
||||||
|
Cleanup cleanup_child([&] {
|
||||||
|
int status;
|
||||||
|
ASSERT_THAT(waitpid(child_pid, &status, 0),
|
||||||
|
SyscallSucceedsWithValue(child_pid));
|
||||||
|
EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
|
||||||
|
<< " status " << status;
|
||||||
|
});
|
||||||
|
RunMembarrierTestLocalSide(
|
||||||
|
state, [] { std::atomic_signal_fence(std::memory_order_seq_cst); });
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(MembarrierTest, GlobalExpedited) {
|
||||||
|
constexpr int kRequiredCommands = MEMBARRIER_CMD_GLOBAL_EXPEDITED |
|
||||||
|
MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED;
|
||||||
|
SKIP_IF((ASSERT_NO_ERRNO_AND_VALUE(SupportedMembarrierCommands()) &
|
||||||
|
kRequiredCommands) != kRequiredCommands);
|
||||||
|
|
||||||
|
ASSERT_THAT(membarrier(MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, 0),
|
||||||
|
SyscallSucceeds());
|
||||||
|
|
||||||
|
Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
|
||||||
|
MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED));
|
||||||
|
auto state = static_cast<MembarrierTestSharedState*>(m.ptr());
|
||||||
|
state->Init();
|
||||||
|
|
||||||
|
pid_t const child_pid = fork();
|
||||||
|
if (child_pid == 0) {
|
||||||
|
// In child process.
|
||||||
|
RunMembarrierTestRemoteSide(state, [] {
|
||||||
|
TEST_PCHECK(membarrier(MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0) == 0);
|
||||||
|
});
|
||||||
|
_exit(0);
|
||||||
|
}
|
||||||
|
// In parent process.
|
||||||
|
ASSERT_THAT(child_pid, SyscallSucceeds());
|
||||||
|
Cleanup cleanup_child([&] {
|
||||||
|
int status;
|
||||||
|
ASSERT_THAT(waitpid(child_pid, &status, 0),
|
||||||
|
SyscallSucceedsWithValue(child_pid));
|
||||||
|
EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
|
||||||
|
<< " status " << status;
|
||||||
|
});
|
||||||
|
RunMembarrierTestLocalSide(
|
||||||
|
state, [] { std::atomic_signal_fence(std::memory_order_seq_cst); });
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(MembarrierTest, PrivateExpedited) {
|
||||||
|
constexpr int kRequiredCommands = MEMBARRIER_CMD_PRIVATE_EXPEDITED |
|
||||||
|
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED;
|
||||||
|
SKIP_IF((ASSERT_NO_ERRNO_AND_VALUE(SupportedMembarrierCommands()) &
|
||||||
|
kRequiredCommands) != kRequiredCommands);
|
||||||
|
|
||||||
|
ASSERT_THAT(membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0),
|
||||||
|
SyscallSucceeds());
|
||||||
|
|
||||||
|
MembarrierTestSharedState state;
|
||||||
|
state.Init();
|
||||||
|
|
||||||
|
ScopedThread remote_thread([&] {
|
||||||
|
RunMembarrierTestRemoteSide(&state, [] {
|
||||||
|
TEST_PCHECK(membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0) == 0);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
RunMembarrierTestLocalSide(
|
||||||
|
&state, [] { std::atomic_signal_fence(std::memory_order_seq_cst); });
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
} // namespace testing
|
||||||
|
} // namespace gvisor
|
Loading…
Reference in New Issue