gvisor/pkg/seccomp/seccomp.go

393 lines
13 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package seccomp provides basic seccomp filters for x86_64 (little endian).
package seccomp
import (
"fmt"
"reflect"
"sort"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/log"
)
const (
// skipOneInst is the offset to take for skipping one instruction.
skipOneInst = 1
// defaultLabel is the label for the default action.
defaultLabel = "default_action"
)
// Install generates BPF code based on the set of syscalls provided. It only
// allows syscalls that conform to the specification. Syscalls that violate the
// specification will trigger RET_KILL_PROCESS, except for the cases below.
//
// RET_TRAP is used in violations, instead of RET_KILL_PROCESS, in the
// following cases:
// 1. Kernel doesn't support RET_KILL_PROCESS: RET_KILL_THREAD only kills the
// offending thread and often keeps the sentry hanging.
// 2. Debug: RET_TRAP generates a panic followed by a stack trace which is
// much easier to debug then RET_KILL_PROCESS which can't be caught.
//
// Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored,
// making it possible for the process to continue running after a violation.
// However, it will leave a SECCOMP audit event trail behind. In any case, the
// syscall is still blocked from executing.
func Install(rules SyscallRules) error {
defaultAction, err := defaultAction()
if err != nil {
return err
}
// Uncomment to get stack trace when there is a violation.
// defaultAction = linux.BPFAction(linux.SECCOMP_RET_TRAP)
log.Infof("Installing seccomp filters for %d syscalls (action=%v)", len(rules), defaultAction)
instrs, err := BuildProgram([]RuleSet{
RuleSet{
Rules: rules,
Action: linux.SECCOMP_RET_ALLOW,
},
}, defaultAction)
if log.IsLogging(log.Debug) {
programStr, errDecode := bpf.DecodeProgram(instrs)
if errDecode != nil {
programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr)
}
log.Debugf("Seccomp program dump:\n%s", programStr)
}
if err != nil {
return err
}
// Perform the actual installation.
if errno := SetFilter(instrs); errno != 0 {
return fmt.Errorf("Failed to set filter: %v", errno)
}
log.Infof("Seccomp filters installed.")
return nil
}
func defaultAction() (linux.BPFAction, error) {
available, err := isKillProcessAvailable()
if err != nil {
return 0, err
}
if available {
return linux.SECCOMP_RET_KILL_PROCESS, nil
}
return linux.SECCOMP_RET_TRAP, nil
}
// RuleSet is a set of rules and associated action.
type RuleSet struct {
Rules SyscallRules
Action linux.BPFAction
// Vsyscall indicates that a check is made for a function being called
// from kernel mappings. This is where the vsyscall page is located
// (and typically) emulated, so this RuleSet will not match any
// functions not dispatched from the vsyscall page.
Vsyscall bool
}
// SyscallName gives names to system calls. It is used purely for debugging purposes.
//
// An alternate namer can be provided to the package at initialization time.
var SyscallName = func(sysno uintptr) string {
return fmt.Sprintf("syscall_%d", sysno)
}
// BuildProgram builds a BPF program from the given map of actions to matching
// SyscallRules. The single generated program covers all provided RuleSets.
func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFInstruction, error) {
program := bpf.NewProgramBuilder()
// Be paranoid and check that syscall is done in the expected architecture.
//
// A = seccomp_data.arch
// if (A != AUDIT_ARCH) goto defaultAction.
program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch)
// defaultLabel is at the bottom of the program. The size of program
// may exceeds 255 lines, which is the limit of a condition jump.
program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, skipOneInst, 0)
program.AddDirectJumpLabel(defaultLabel)
if err := buildIndex(rules, program); err != nil {
return nil, err
}
// Exhausted: return defaultAction.
if err := program.AddLabel(defaultLabel); err != nil {
return nil, err
}
program.AddStmt(bpf.Ret|bpf.K, uint32(defaultAction))
return program.Instructions()
}
// buildIndex builds a BST to quickly search through all syscalls.
func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error {
// Build a list of all application system calls, across all given rule
// sets. We have a simple BST, but may dispatch individual matchers
// with different actions. The matchers are evaluated linearly.
requiredSyscalls := make(map[uintptr]struct{})
for _, rs := range rules {
for sysno := range rs.Rules {
requiredSyscalls[sysno] = struct{}{}
}
}
syscalls := make([]uintptr, 0, len(requiredSyscalls))
for sysno, _ := range requiredSyscalls {
syscalls = append(syscalls, sysno)
}
sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] })
for _, sysno := range syscalls {
for _, rs := range rules {
// Print only if there is a corresponding set of rules.
if _, ok := rs.Rules[sysno]; ok {
log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action)
}
}
}
root := createBST(syscalls)
root.root = true
// Load syscall number into A and run through BST.
//
// A = seccomp_data.nr
program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR)
return root.traverse(buildBSTProgram, rules, program)
}
// createBST converts sorted syscall slice into a balanced BST.
// Panics if syscalls is empty.
func createBST(syscalls []uintptr) *node {
i := len(syscalls) / 2
parent := node{value: syscalls[i]}
if i > 0 {
parent.left = createBST(syscalls[:i])
}
if i+1 < len(syscalls) {
parent.right = createBST(syscalls[i+1:])
}
return &parent
}
func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string {
return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno)
}
func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string {
return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx)
}
func ruleLabel(ruleSetIdx int, sysno uintptr, idx int, name string) string {
return fmt.Sprintf("rule_%v_%v_%v_%v", ruleSetIdx, sysno, idx, name)
}
func checkArgsLabel(sysno uintptr) string {
return fmt.Sprintf("checkArgs_%v", sysno)
}
// addSyscallArgsCheck adds argument checks for a single system call. It does
// not insert a jump to the default action at the end and it is the
// responsibility of the caller to insert an appropriate jump after calling
// this function.
func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAction, ruleSetIdx int, sysno uintptr) error {
for ruleidx, rule := range rules {
labelled := false
for i, arg := range rule {
if arg != nil {
switch a := arg.(type) {
case AllowAny:
case AllowValue:
high, low := uint32(a>>32), uint32(a)
// assert arg_low == low
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
// assert arg_high == high
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
labelled = true
case GreaterThan:
labelGood := fmt.Sprintf("gt%v", i)
high, low := uint32(a>>32), uint32(a)
// assert arg_high < high
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
// arg_high > high
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
// arg_low < low
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
labelled = true
default:
return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
}
}
}
// Matched, emit the given action.
p.AddStmt(bpf.Ret|bpf.K, uint32(action))
// Label the end of the rule if necessary. This is added for
// the jumps above when the argument check fails.
if labelled {
if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil {
return err
}
}
}
return nil
}
// buildBSTProgram converts a binary tree started in 'root' into BPF code. The outline of the code
// is as follows:
//
// // SYS_PIPE(22), root
// (A == 22) ? goto argument check : continue
// (A > 22) ? goto index_35 : goto index_9
//
// index_9: // SYS_MMAP(9), leaf
// A == 9) ? goto argument check : defaultLabel
//
// index_35: // SYS_NANOSLEEP(35), single child
// (A == 35) ? goto argument check : continue
// (A > 35) ? goto index_50 : goto defaultLabel
//
// index_50: // SYS_LISTEN(50), leaf
// (A == 50) ? goto argument check : goto defaultLabel
//
func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error {
// Root node is never referenced by label, skip it.
if !n.root {
if err := program.AddLabel(n.label()); err != nil {
return err
}
}
sysno := n.value
program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0)
if n.left == nil && n.right == nil {
// Leaf nodes don't require extra check.
program.AddDirectJumpLabel(defaultLabel)
} else {
// Non-leaf node. Check which turn to take otherwise. Using direct jumps
// in case that the offset may exceed the limit of a conditional jump (255)
program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst)
program.AddDirectJumpLabel(n.right.label())
program.AddDirectJumpLabel(n.left.label())
}
if err := program.AddLabel(checkArgsLabel(sysno)); err != nil {
return err
}
emitted := false
for ruleSetIdx, rs := range rules {
if _, ok := rs.Rules[sysno]; ok {
// If there are no rules, then this will always match.
// Remember we've done this so that we can emit a
// sensible error. We can't catch all overlaps, but we
// can catch this one at least.
if emitted {
return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx)
}
// Emit a vsyscall check if this rule requires a
// Vsyscall match. This rule ensures that the top bit
// is set in the instruction pointer, which is where
// the vsyscall page will be mapped.
if rs.Vsyscall {
program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh)
program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno))
}
// Emit matchers.
if len(rs.Rules[sysno]) == 0 {
// This is a blanket action.
program.AddStmt(bpf.Ret|bpf.K, uint32(rs.Action))
emitted = true
} else {
// Add an argument check for these particular
// arguments. This will continue execution and
// check the next rule set. We need to ensure
// that at the very end, we insert a direct
// jump label for the unmatched case.
if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil {
return err
}
}
// If there was a Vsyscall check for this rule, then we
// need to add an appropriate label for the jump above.
if rs.Vsyscall {
if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil {
return err
}
}
}
}
// Not matched? We only need to insert a jump to the default label if
// not default action has been emitted for this call.
if !emitted {
program.AddDirectJumpLabel(defaultLabel)
}
return nil
}
// node represents a tree node.
type node struct {
value uintptr
left *node
right *node
root bool
}
// label returns the label corresponding to this node.
//
// If n is nil, then the defaultLabel is returned.
func (n *node) label() string {
if n == nil {
return defaultLabel
}
return fmt.Sprintf("index_%v", n.value)
}
type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error
func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error {
if n == nil {
return nil
}
if err := fn(n, rules, p); err != nil {
return err
}
if err := n.left.traverse(fn, rules, p); err != nil {
return err
}
return n.right.traverse(fn, rules, p)
}