393 lines
13 KiB
Go
393 lines
13 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package seccomp provides basic seccomp filters for x86_64 (little endian).
|
|
package seccomp
|
|
|
|
import (
|
|
"fmt"
|
|
"reflect"
|
|
"sort"
|
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/bpf"
|
|
"gvisor.dev/gvisor/pkg/log"
|
|
)
|
|
|
|
// skipOneInst is the jump offset that steps over exactly one instruction.
const skipOneInst = 1

// defaultLabel names the location of the default action in the program.
const defaultLabel = "default_action"
|
|
|
|
// Install generates BPF code based on the set of syscalls provided. It only
|
|
// allows syscalls that conform to the specification. Syscalls that violate the
|
|
// specification will trigger RET_KILL_PROCESS, except for the cases below.
|
|
//
|
|
// RET_TRAP is used in violations, instead of RET_KILL_PROCESS, in the
|
|
// following cases:
|
|
// 1. Kernel doesn't support RET_KILL_PROCESS: RET_KILL_THREAD only kills the
|
|
// offending thread and often keeps the sentry hanging.
|
|
// 2. Debug: RET_TRAP generates a panic followed by a stack trace which is
|
|
// much easier to debug then RET_KILL_PROCESS which can't be caught.
|
|
//
|
|
// Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored,
|
|
// making it possible for the process to continue running after a violation.
|
|
// However, it will leave a SECCOMP audit event trail behind. In any case, the
|
|
// syscall is still blocked from executing.
|
|
func Install(rules SyscallRules) error {
|
|
defaultAction, err := defaultAction()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Uncomment to get stack trace when there is a violation.
|
|
// defaultAction = linux.BPFAction(linux.SECCOMP_RET_TRAP)
|
|
|
|
log.Infof("Installing seccomp filters for %d syscalls (action=%v)", len(rules), defaultAction)
|
|
|
|
instrs, err := BuildProgram([]RuleSet{
|
|
RuleSet{
|
|
Rules: rules,
|
|
Action: linux.SECCOMP_RET_ALLOW,
|
|
},
|
|
}, defaultAction)
|
|
if log.IsLogging(log.Debug) {
|
|
programStr, errDecode := bpf.DecodeProgram(instrs)
|
|
if errDecode != nil {
|
|
programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr)
|
|
}
|
|
log.Debugf("Seccomp program dump:\n%s", programStr)
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Perform the actual installation.
|
|
if errno := SetFilter(instrs); errno != 0 {
|
|
return fmt.Errorf("Failed to set filter: %v", errno)
|
|
}
|
|
|
|
log.Infof("Seccomp filters installed.")
|
|
return nil
|
|
}
|
|
|
|
func defaultAction() (linux.BPFAction, error) {
|
|
available, err := isKillProcessAvailable()
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if available {
|
|
return linux.SECCOMP_RET_KILL_PROCESS, nil
|
|
}
|
|
return linux.SECCOMP_RET_TRAP, nil
|
|
}
|
|
|
|
// RuleSet is a set of rules and associated action.
|
|
type RuleSet struct {
|
|
Rules SyscallRules
|
|
Action linux.BPFAction
|
|
|
|
// Vsyscall indicates that a check is made for a function being called
|
|
// from kernel mappings. This is where the vsyscall page is located
|
|
// (and typically) emulated, so this RuleSet will not match any
|
|
// functions not dispatched from the vsyscall page.
|
|
Vsyscall bool
|
|
}
|
|
|
|
// SyscallName maps a system call number to a human-readable name. It is only
// used for debugging output.
//
// The package may be given an alternate namer at initialization time by
// assigning to this variable.
var SyscallName = func(sysno uintptr) string {
	const format = "syscall_%d"
	return fmt.Sprintf(format, sysno)
}
|
|
|
|
// BuildProgram builds a BPF program from the given map of actions to matching
|
|
// SyscallRules. The single generated program covers all provided RuleSets.
|
|
func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFInstruction, error) {
|
|
program := bpf.NewProgramBuilder()
|
|
|
|
// Be paranoid and check that syscall is done in the expected architecture.
|
|
//
|
|
// A = seccomp_data.arch
|
|
// if (A != AUDIT_ARCH) goto defaultAction.
|
|
program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch)
|
|
// defaultLabel is at the bottom of the program. The size of program
|
|
// may exceeds 255 lines, which is the limit of a condition jump.
|
|
program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, skipOneInst, 0)
|
|
program.AddDirectJumpLabel(defaultLabel)
|
|
if err := buildIndex(rules, program); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Exhausted: return defaultAction.
|
|
if err := program.AddLabel(defaultLabel); err != nil {
|
|
return nil, err
|
|
}
|
|
program.AddStmt(bpf.Ret|bpf.K, uint32(defaultAction))
|
|
|
|
return program.Instructions()
|
|
}
|
|
|
|
// buildIndex builds a BST to quickly search through all syscalls.
|
|
func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error {
|
|
// Build a list of all application system calls, across all given rule
|
|
// sets. We have a simple BST, but may dispatch individual matchers
|
|
// with different actions. The matchers are evaluated linearly.
|
|
requiredSyscalls := make(map[uintptr]struct{})
|
|
for _, rs := range rules {
|
|
for sysno := range rs.Rules {
|
|
requiredSyscalls[sysno] = struct{}{}
|
|
}
|
|
}
|
|
syscalls := make([]uintptr, 0, len(requiredSyscalls))
|
|
for sysno, _ := range requiredSyscalls {
|
|
syscalls = append(syscalls, sysno)
|
|
}
|
|
sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] })
|
|
for _, sysno := range syscalls {
|
|
for _, rs := range rules {
|
|
// Print only if there is a corresponding set of rules.
|
|
if _, ok := rs.Rules[sysno]; ok {
|
|
log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action)
|
|
}
|
|
}
|
|
}
|
|
|
|
root := createBST(syscalls)
|
|
root.root = true
|
|
|
|
// Load syscall number into A and run through BST.
|
|
//
|
|
// A = seccomp_data.nr
|
|
program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR)
|
|
return root.traverse(buildBSTProgram, rules, program)
|
|
}
|
|
|
|
// createBST converts sorted syscall slice into a balanced BST.
|
|
// Panics if syscalls is empty.
|
|
func createBST(syscalls []uintptr) *node {
|
|
i := len(syscalls) / 2
|
|
parent := node{value: syscalls[i]}
|
|
if i > 0 {
|
|
parent.left = createBST(syscalls[:i])
|
|
}
|
|
if i+1 < len(syscalls) {
|
|
parent.right = createBST(syscalls[i+1:])
|
|
}
|
|
return &parent
|
|
}
|
|
|
|
// vsyscallViolationLabel returns the label jumped to when the vsyscall check
// of rule set ruleSetIdx fails for syscall sysno.
func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string {
	const format = "vsyscallViolation_%v_%v"
	return fmt.Sprintf(format, ruleSetIdx, sysno)
}
|
|
|
|
// ruleViolationLabel returns the label jumped to when an argument check of
// rule idx (within rule set ruleSetIdx, for syscall sysno) fails.
func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string {
	const format = "ruleViolation_%v_%v_%v"
	return fmt.Sprintf(format, ruleSetIdx, sysno, idx)
}
|
|
|
|
// ruleLabel returns a label local to rule idx (within rule set ruleSetIdx,
// for syscall sysno); name distinguishes several labels of the same rule.
func ruleLabel(ruleSetIdx int, sysno uintptr, idx int, name string) string {
	const format = "rule_%v_%v_%v_%v"
	return fmt.Sprintf(format, ruleSetIdx, sysno, idx, name)
}
|
|
|
|
// checkArgsLabel returns the label that marks the start of the argument
// checks for syscall sysno.
func checkArgsLabel(sysno uintptr) string {
	const format = "checkArgs_%v"
	return fmt.Sprintf(format, sysno)
}
|
|
|
|
// addSyscallArgsCheck adds argument checks for a single system call. It does
|
|
// not insert a jump to the default action at the end and it is the
|
|
// responsibility of the caller to insert an appropriate jump after calling
|
|
// this function.
|
|
func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAction, ruleSetIdx int, sysno uintptr) error {
|
|
for ruleidx, rule := range rules {
|
|
labelled := false
|
|
for i, arg := range rule {
|
|
if arg != nil {
|
|
switch a := arg.(type) {
|
|
case AllowAny:
|
|
case AllowValue:
|
|
high, low := uint32(a>>32), uint32(a)
|
|
// assert arg_low == low
|
|
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
|
|
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
|
|
// assert arg_high == high
|
|
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
|
|
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
|
|
labelled = true
|
|
case GreaterThan:
|
|
labelGood := fmt.Sprintf("gt%v", i)
|
|
high, low := uint32(a>>32), uint32(a)
|
|
// assert arg_high < high
|
|
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
|
|
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
|
|
// arg_high > high
|
|
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
|
|
// arg_low < low
|
|
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
|
|
p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
|
|
p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
|
|
labelled = true
|
|
default:
|
|
return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Matched, emit the given action.
|
|
p.AddStmt(bpf.Ret|bpf.K, uint32(action))
|
|
|
|
// Label the end of the rule if necessary. This is added for
|
|
// the jumps above when the argument check fails.
|
|
if labelled {
|
|
if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// buildBSTProgram converts a binary tree started in 'root' into BPF code. The outline of the code
|
|
// is as follows:
|
|
//
|
|
// // SYS_PIPE(22), root
|
|
// (A == 22) ? goto argument check : continue
|
|
// (A > 22) ? goto index_35 : goto index_9
|
|
//
|
|
// index_9: // SYS_MMAP(9), leaf
|
|
// A == 9) ? goto argument check : defaultLabel
|
|
//
|
|
// index_35: // SYS_NANOSLEEP(35), single child
|
|
// (A == 35) ? goto argument check : continue
|
|
// (A > 35) ? goto index_50 : goto defaultLabel
|
|
//
|
|
// index_50: // SYS_LISTEN(50), leaf
|
|
// (A == 50) ? goto argument check : goto defaultLabel
|
|
//
|
|
func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error {
|
|
// Root node is never referenced by label, skip it.
|
|
if !n.root {
|
|
if err := program.AddLabel(n.label()); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
sysno := n.value
|
|
program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0)
|
|
if n.left == nil && n.right == nil {
|
|
// Leaf nodes don't require extra check.
|
|
program.AddDirectJumpLabel(defaultLabel)
|
|
} else {
|
|
// Non-leaf node. Check which turn to take otherwise. Using direct jumps
|
|
// in case that the offset may exceed the limit of a conditional jump (255)
|
|
program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst)
|
|
program.AddDirectJumpLabel(n.right.label())
|
|
program.AddDirectJumpLabel(n.left.label())
|
|
}
|
|
|
|
if err := program.AddLabel(checkArgsLabel(sysno)); err != nil {
|
|
return err
|
|
}
|
|
|
|
emitted := false
|
|
for ruleSetIdx, rs := range rules {
|
|
if _, ok := rs.Rules[sysno]; ok {
|
|
// If there are no rules, then this will always match.
|
|
// Remember we've done this so that we can emit a
|
|
// sensible error. We can't catch all overlaps, but we
|
|
// can catch this one at least.
|
|
if emitted {
|
|
return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx)
|
|
}
|
|
|
|
// Emit a vsyscall check if this rule requires a
|
|
// Vsyscall match. This rule ensures that the top bit
|
|
// is set in the instruction pointer, which is where
|
|
// the vsyscall page will be mapped.
|
|
if rs.Vsyscall {
|
|
program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh)
|
|
program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno))
|
|
}
|
|
|
|
// Emit matchers.
|
|
if len(rs.Rules[sysno]) == 0 {
|
|
// This is a blanket action.
|
|
program.AddStmt(bpf.Ret|bpf.K, uint32(rs.Action))
|
|
emitted = true
|
|
} else {
|
|
// Add an argument check for these particular
|
|
// arguments. This will continue execution and
|
|
// check the next rule set. We need to ensure
|
|
// that at the very end, we insert a direct
|
|
// jump label for the unmatched case.
|
|
if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// If there was a Vsyscall check for this rule, then we
|
|
// need to add an appropriate label for the jump above.
|
|
if rs.Vsyscall {
|
|
if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Not matched? We only need to insert a jump to the default label if
|
|
// not default action has been emitted for this call.
|
|
if !emitted {
|
|
program.AddDirectJumpLabel(defaultLabel)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// node is a single element of the syscall search tree.
type node struct {
	// value is the syscall number stored in this node.
	value uintptr

	// left and right are the child subtrees; either may be nil.
	left, right *node

	// root marks the root of the tree.
	root bool
}
|
|
|
|
// label returns the label corresponding to this node.
|
|
//
|
|
// If n is nil, then the defaultLabel is returned.
|
|
func (n *node) label() string {
|
|
if n == nil {
|
|
return defaultLabel
|
|
}
|
|
return fmt.Sprintf("index_%v", n.value)
|
|
}
|
|
|
|
// traverseFunc is the callback that (*node).traverse invokes for each node it
// visits, passing along the rule sets and the program builder. Returning a
// non-nil error stops the traversal.
type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error
|
|
|
|
func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error {
|
|
if n == nil {
|
|
return nil
|
|
}
|
|
if err := fn(n, rules, p); err != nil {
|
|
return err
|
|
}
|
|
if err := n.left.traverse(fn, rules, p); err != nil {
|
|
return err
|
|
}
|
|
return n.right.traverse(fn, rules, p)
|
|
}
|