gvisor/runsc/sandbox/network.go

422 lines
14 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package sandbox
import (
"fmt"
"net"
"os"
"path/filepath"
"runtime"
"strconv"
"syscall"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
)
// setupNetwork configures the network stack to mimic the local network
// configuration. Docker uses network namespaces with vnets to configure the
// network for the container. The untrusted app expects to see the same network
// inside the sandbox. Routing and port mapping is handled directly by docker
// with most of network information not even available to the runtime.
//
// Netstack inside the sandbox speaks directly to the device using a raw socket.
// All IP addresses assigned to the NIC, are removed and passed on to netstack's
// device.
//
// If 'conf.Network' is NoNetwork, skips local configuration and creates a
// loopback interface only.
//
// Run the following container to test it:
// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error {
log.Infof("Setting up network")
switch conf.Network {
case config.NetworkNone:
log.Infof("Network is disabled, create loopback interface only")
if err := createDefaultLoopbackInterface(conn); err != nil {
return fmt.Errorf("creating default loopback interface: %v", err)
}
case config.NetworkSandbox:
// Build the path to the net namespace of the sandbox process.
// This is what we will copy.
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil {
return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
}
case config.NetworkHost:
// Nothing to do here.
default:
return fmt.Errorf("invalid network type: %v", conf.Network)
}
return nil
}
func createDefaultLoopbackInterface(conn *urpc.Client) error {
if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink},
}, nil); err != nil {
return fmt.Errorf("creating loopback link and routes: %v", err)
}
return nil
}
func joinNetNS(nsPath string) (func(), error) {
runtime.LockOSThread()
restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{
Type: specs.NetworkNamespace,
Path: nsPath,
})
if err != nil {
runtime.UnlockOSThread()
return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err)
}
return func() {
restoreNS()
runtime.UnlockOSThread()
}, nil
}
// isRootNS determines whether we are running in the root net namespace.
// /proc/sys/net/core/rmem_default only exists in root network namespace.
func isRootNS() (bool, error) {
err := syscall.Access("/proc/sys/net/core/rmem_default", syscall.F_OK)
switch err {
case nil:
return true, nil
case syscall.ENOENT:
return false, nil
default:
return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err)
}
}
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc config.QueueingDiscipline) error {
// Join the network namespace that we will be copying.
restore, err := joinNetNS(nsPath)
if err != nil {
return err
}
defer restore()
// Get all interfaces in the namespace.
ifaces, err := net.Interfaces()
if err != nil {
return fmt.Errorf("querying interfaces: %v", err)
}
isRoot, err := isRootNS()
if err != nil {
return err
}
if isRoot {
return fmt.Errorf("cannot run with network enabled in root network namespace")
}
// Collect addresses and routes from the interfaces.
var args boot.CreateLinksAndRoutesArgs
for _, iface := range ifaces {
if iface.Flags&net.FlagUp == 0 {
log.Infof("Skipping down interface: %+v", iface)
continue
}
allAddrs, err := iface.Addrs()
if err != nil {
return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err)
}
// We build our own loopback device.
if iface.Flags&net.FlagLoopback != 0 {
link, err := loopbackLink(iface, allAddrs)
if err != nil {
return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err)
}
args.LoopbackLinks = append(args.LoopbackLinks, link)
continue
}
var ipAddrs []*net.IPNet
for _, ifaddr := range allAddrs {
ipNet, ok := ifaddr.(*net.IPNet)
if !ok {
return fmt.Errorf("address is not IPNet: %+v", ifaddr)
}
ipAddrs = append(ipAddrs, ipNet)
}
if len(ipAddrs) == 0 {
log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
continue
}
// Scrape the routes before removing the address, since that
// will remove the routes as well.
routes, defv4, defv6, err := routesForIface(iface)
if err != nil {
return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
}
if defv4 != nil {
if !args.Defaultv4Gateway.Route.Empty() {
return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
}
args.Defaultv4Gateway.Route = *defv4
args.Defaultv4Gateway.Name = iface.Name
}
if defv6 != nil {
if !args.Defaultv6Gateway.Route.Empty() {
return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
}
args.Defaultv6Gateway.Route = *defv6
args.Defaultv6Gateway.Name = iface.Name
}
link := boot.FDBasedLink{
Name: iface.Name,
MTU: iface.MTU,
Routes: routes,
TXChecksumOffload: txChecksumOffload,
RXChecksumOffload: rxChecksumOffload,
NumChannels: numNetworkChannels,
QDisc: qDisc,
}
// Get the link for the interface.
ifaceLink, err := netlink.LinkByName(iface.Name)
if err != nil {
return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
}
link.LinkAddress = ifaceLink.Attrs().HardwareAddr
log.Debugf("Setting up network channels")
// Create the socket for the device.
for i := 0; i < link.NumChannels; i++ {
log.Debugf("Creating Channel %d", i)
socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
if err != nil {
return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
}
if i == 0 {
link.GSOMaxSize = socketEntry.gsoMaxSize
} else {
if link.GSOMaxSize != socketEntry.gsoMaxSize {
return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
}
}
args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
}
if link.GSOMaxSize == 0 && softwareGSO {
// Hardware GSO is disabled. Let's enable software GSO.
link.GSOMaxSize = stack.SoftwareGSOMaxSize
link.SoftwareGSOEnabled = true
}
// Collect the addresses for the interface, enable forwarding,
// and remove them from the host.
for _, addr := range ipAddrs {
link.Addresses = append(link.Addresses, addr.IP)
// Steal IP address from NIC.
if err := removeAddress(ifaceLink, addr.String()); err != nil {
return fmt.Errorf("removing address %v from device %q: %v", iface.Name, addr, err)
}
}
args.FDBasedLinks = append(args.FDBasedLinks, link)
}
log.Debugf("Setting up network, config: %+v", args)
if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
return fmt.Errorf("creating links and routes: %v", err)
}
return nil
}
type socketEntry struct {
deviceFile *os.File
gsoMaxSize uint32
}
// createSocket creates an underlying AF_PACKET socket and configures it for use by
// the sentry and returns an *os.File that wraps the underlying socket fd.
func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
// Create the socket.
const protocol = 0x0300 // htons(ETH_P_ALL)
fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
if err != nil {
return nil, fmt.Errorf("unable to create raw socket: %v", err)
}
deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
// Bind to the appropriate device.
ll := syscall.SockaddrLinklayer{
Protocol: protocol,
Ifindex: iface.Index,
Hatype: 0, // No ARP type.
Pkttype: syscall.PACKET_OTHERHOST,
}
if err := syscall.Bind(fd, &ll); err != nil {
return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
}
gsoMaxSize := uint32(0)
if enableGSO {
gso, err := isGSOEnabled(fd, iface.Name)
if err != nil {
return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
}
if gso {
if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
}
gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
} else {
log.Infof("GSO not available in host.")
}
}
// Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer
// for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max".
// wmem_max/rmem_max default to a unusually low value of 208KB. This is too low
// for gVisor to be able to receive packets at high throughputs without
// incurring packet drops.
const bufSize = 4 << 20 // 4MB.
if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize)
sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
if sz < bufSize {
log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
}
}
if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize)
sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
if sz < bufSize {
log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err)
}
}
return &socketEntry{deviceFile, gsoMaxSize}, nil
}
// loopbackLink returns the link with addresses and routes for a loopback
// interface.
func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) {
link := boot.LoopbackLink{
Name: iface.Name,
}
for _, addr := range addrs {
ipNet, ok := addr.(*net.IPNet)
if !ok {
return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
}
dst := *ipNet
dst.IP = dst.IP.Mask(dst.Mask)
link.Addresses = append(link.Addresses, ipNet.IP)
link.Routes = append(link.Routes, boot.Route{
Destination: dst,
})
}
return link, nil
}
// routesForIface iterates over all routes for the given interface and converts
// them to boot.Routes. It also returns the a default v4/v6 route if found.
func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
link, err := netlink.LinkByIndex(iface.Index)
if err != nil {
return nil, nil, nil, err
}
rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
if err != nil {
return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
}
var defv4, defv6 *boot.Route
var routes []boot.Route
for _, r := range rs {
// Is it a default route?
if r.Dst == nil {
if r.Gw == nil {
return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
}
// Create a catch all route to the gateway.
switch len(r.Gw) {
case header.IPv4AddressSize:
if defv4 != nil {
return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
}
defv4 = &boot.Route{
Destination: net.IPNet{
IP: net.IPv4zero,
Mask: net.IPMask(net.IPv4zero),
},
Gateway: r.Gw,
}
case header.IPv6AddressSize:
if defv6 != nil {
return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
}
defv6 = &boot.Route{
Destination: net.IPNet{
IP: net.IPv6zero,
Mask: net.IPMask(net.IPv6zero),
},
Gateway: r.Gw,
}
default:
return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
}
continue
}
dst := *r.Dst
dst.IP = dst.IP.Mask(dst.Mask)
routes = append(routes, boot.Route{
Destination: dst,
Gateway: r.Gw,
})
}
return routes, defv4, defv6, nil
}
// removeAddress removes IP address from network device. It's equivalent to:
// ip addr del <ipAndMask> dev <name>
func removeAddress(source netlink.Link, ipAndMask string) error {
addr, err := netlink.ParseAddr(ipAndMask)
if err != nil {
return err
}
return netlink.AddrDel(source, addr)
}