gvisor/pkg/tcpip/link/fdbased/endpoint.go

552 lines
15 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
// Package fdbased provides the implemention of data-link layer endpoints
// backed by boundary-preserving file descriptors (e.g., TUN devices,
// seqpacket/datagram sockets).
//
// FD based endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
package fdbased
import (
"fmt"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
"gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
)
const (
// MaxMsgsPerRecv is the maximum number of packets we want to retrieve
// in a single RecvMMsg call.
MaxMsgsPerRecv = 8
)
// BufConfig defines the shape of the vectorised view used to read packets from the NIC.
var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
// linkDispatcher reads packets from the link FD and dispatches them to the
// NetworkDispatcher.
type linkDispatcher func() (bool, *tcpip.Error)
// PacketDispatchMode are the various supported methods of receiving and
// dispatching packets from the underlying FD.
type PacketDispatchMode int
const (
// Readv is the default dispatch mode and is the least performant of the
// dispatch options but the one that is supported by all underlying FD
// types.
Readv PacketDispatchMode = iota
// RecvMMsg enables use of recvmmsg() syscall instead of readv() to
// read inbound packets. This reduces # of syscalls needed to process
// packets.
//
// NOTE: recvmmsg() is only supported for sockets, so if the underlying
// FD is not a socket then the code will still fall back to the readv()
// path.
RecvMMsg
// PacketMMap enables use of PACKET_RX_RING to receive packets from the
// NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
// primary use-case for this is runsc which uses an AF_PACKET FD to
// receive packets from the veth device.
PacketMMap
)
type endpoint struct {
// fd is the file descriptor used to send and receive packets.
fd int
// mtu (maximum transmission unit) is the maximum size of a packet.
mtu uint32
// hdrSize specifies the link-layer header size. If set to 0, no header
// is added/removed; otherwise an ethernet header is used.
hdrSize int
// addr is the address of the endpoint.
addr tcpip.LinkAddress
// caps holds the endpoint capabilities.
caps stack.LinkEndpointCapabilities
// closed is a function to be called when the FD's peer (if any) closes
// its end of the communication pipe.
closed func(*tcpip.Error)
views [][]buffer.View
iovecs [][]syscall.Iovec
// msgHdrs is only used by the RecvMMsg dispatcher.
msgHdrs []rawfile.MMsgHdr
inboundDispatcher linkDispatcher
dispatcher stack.NetworkDispatcher
// packetDispatchMode controls the packet dispatcher used by this
// endpoint.
packetDispatchMode PacketDispatchMode
// ringBuffer is only used when PacketMMap dispatcher is used and points
// to the start of the mmapped PACKET_RX_RING buffer.
ringBuffer []byte
// ringOffset is the current offset into the ring buffer where the next
// inbound packet will be placed by the kernel.
ringOffset int
// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
// disabled.
gsoMaxSize uint32
}
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
FD int
MTU uint32
EthernetHeader bool
ClosedFunc func(*tcpip.Error)
Address tcpip.LinkAddress
SaveRestore bool
DisconnectOk bool
GSOMaxSize uint32
PacketDispatchMode PacketDispatchMode
TXChecksumOffload bool
RXChecksumOffload bool
}
// New creates a new fd-based endpoint.
//
// Makes fd non-blocking, but does not take ownership of fd, which must remain
// open for the lifetime of the returned endpoint.
func New(opts *Options) (tcpip.LinkEndpointID, error) {
if err := syscall.SetNonblock(opts.FD, true); err != nil {
return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err)
}
caps := stack.LinkEndpointCapabilities(0)
if opts.RXChecksumOffload {
caps |= stack.CapabilityRXChecksumOffload
}
if opts.TXChecksumOffload {
caps |= stack.CapabilityTXChecksumOffload
}
hdrSize := 0
if opts.EthernetHeader {
hdrSize = header.EthernetMinimumSize
caps |= stack.CapabilityResolutionRequired
}
if opts.SaveRestore {
caps |= stack.CapabilitySaveRestore
}
if opts.DisconnectOk {
caps |= stack.CapabilityDisconnectOk
}
e := &endpoint{
fd: opts.FD,
mtu: opts.MTU,
caps: caps,
closed: opts.ClosedFunc,
addr: opts.Address,
hdrSize: hdrSize,
packetDispatchMode: opts.PacketDispatchMode,
}
// For non-socket FDs we read one packet a time (e.g. TAP devices).
msgsPerRecv := 1
e.inboundDispatcher = e.dispatch
isSocket, err := isSocketFD(opts.FD)
if err != nil {
return 0, err
}
if isSocket {
if opts.GSOMaxSize != 0 {
e.caps |= stack.CapabilityGSO
e.gsoMaxSize = opts.GSOMaxSize
}
switch e.packetDispatchMode {
case PacketMMap:
if err := e.setupPacketRXRing(); err != nil {
return 0, fmt.Errorf("e.setupPacketRXRing failed: %v", err)
}
e.inboundDispatcher = e.packetMMapDispatch
return stack.RegisterLinkEndpoint(e), nil
case RecvMMsg:
// If the provided FD is a socket then we optimize
// packet reads by using recvmmsg() instead of read() to
// read packets in a batch.
e.inboundDispatcher = e.recvMMsgDispatch
msgsPerRecv = MaxMsgsPerRecv
}
}
e.views = make([][]buffer.View, msgsPerRecv)
for i := range e.views {
e.views[i] = make([]buffer.View, len(BufConfig))
}
e.iovecs = make([][]syscall.Iovec, msgsPerRecv)
iovLen := len(BufConfig)
if e.Capabilities()&stack.CapabilityGSO != 0 {
// virtioNetHdr is prepended before each packet.
iovLen++
}
for i := range e.iovecs {
e.iovecs[i] = make([]syscall.Iovec, iovLen)
}
e.msgHdrs = make([]rawfile.MMsgHdr, msgsPerRecv)
for i := range e.msgHdrs {
e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0]
e.msgHdrs[i].Msg.Iovlen = uint64(iovLen)
}
return stack.RegisterLinkEndpoint(e), nil
}
func isSocketFD(fd int) (bool, error) {
var stat syscall.Stat_t
if err := syscall.Fstat(fd, &stat); err != nil {
return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err)
}
return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK, nil
}
// Attach launches the goroutine that reads packets from the file descriptor and
// dispatches them via the provided dispatcher.
func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
e.dispatcher = dispatcher
// Link endpoints are not savable. When transportation endpoints are
// saved, they stop sending outgoing packets and all incoming packets
// are rejected.
go e.dispatchLoop() // S/R-SAFE: See above.
}
// IsAttached implements stack.LinkEndpoint.IsAttached.
func (e *endpoint) IsAttached() bool {
return e.dispatcher != nil
}
// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
// during construction.
func (e *endpoint) MTU() uint32 {
return e.mtu
}
// Capabilities implements stack.LinkEndpoint.Capabilities.
func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
return e.caps
}
// MaxHeaderLength returns the maximum size of the link-layer header.
func (e *endpoint) MaxHeaderLength() uint16 {
return uint16(e.hdrSize)
}
// LinkAddress returns the link address of this endpoint.
func (e *endpoint) LinkAddress() tcpip.LinkAddress {
return e.addr
}
// virtioNetHdr is declared in linux/virtio_net.h.
type virtioNetHdr struct {
flags uint8
gsoType uint8
hdrLen uint16
gsoSize uint16
csumStart uint16
csumOffset uint16
}
// These constants are declared in linux/virtio_net.h.
const (
_VIRTIO_NET_HDR_F_NEEDS_CSUM = 1
_VIRTIO_NET_HDR_GSO_TCPV4 = 1
_VIRTIO_NET_HDR_GSO_TCPV6 = 4
)
// WritePacket writes outbound packets to the file descriptor. If it is not
// currently writable, the packet is dropped.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
if e.hdrSize > 0 {
// Add ethernet header if needed.
eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize))
ethHdr := &header.EthernetFields{
DstAddr: r.RemoteLinkAddress,
Type: protocol,
}
// Preserve the src address if it's set in the route.
if r.LocalLinkAddress != "" {
ethHdr.SrcAddr = r.LocalLinkAddress
} else {
ethHdr.SrcAddr = e.addr
}
eth.Encode(ethHdr)
}
if e.Capabilities()&stack.CapabilityGSO != 0 {
vnetHdr := virtioNetHdr{}
vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
if gso != nil {
vnetHdr.hdrLen = uint16(hdr.UsedLength())
if gso.NeedsCsum {
vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen
vnetHdr.csumOffset = gso.CsumOffset
}
if gso.Type != stack.GSONone && uint16(payload.Size()) > gso.MSS {
switch gso.Type {
case stack.GSOTCPv4:
vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
case stack.GSOTCPv6:
vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
default:
panic(fmt.Sprintf("Unknown gso type: %v", gso.Type))
}
vnetHdr.gsoSize = gso.MSS
}
}
return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView())
}
if payload.Size() == 0 {
return rawfile.NonBlockingWrite(e.fd, hdr.View())
}
return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil)
}
// WriteRawPacket writes a raw packet directly to the file descriptor.
func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
return rawfile.NonBlockingWrite(e.fd, packet)
}
func (e *endpoint) capViews(k, n int, buffers []int) int {
c := 0
for i, s := range buffers {
c += s
if c >= n {
e.views[k][i].CapLength(s - (c - n))
return i + 1
}
}
return len(buffers)
}
func (e *endpoint) allocateViews(bufConfig []int) {
for k := 0; k < len(e.views); k++ {
var vnetHdr [virtioNetHdrSize]byte
vnetHdrOff := 0
if e.Capabilities()&stack.CapabilityGSO != 0 {
// The kernel adds virtioNetHdr before each packet, but
// we don't use it, so so we allocate a buffer for it,
// add it in iovecs but don't add it in a view.
e.iovecs[k][0] = syscall.Iovec{
Base: &vnetHdr[0],
Len: uint64(virtioNetHdrSize),
}
vnetHdrOff++
}
for i := 0; i < len(bufConfig); i++ {
if e.views[k][i] != nil {
break
}
b := buffer.NewView(bufConfig[i])
e.views[k][i] = b
e.iovecs[k][i+vnetHdrOff] = syscall.Iovec{
Base: &b[0],
Len: uint64(len(b)),
}
}
}
}
// dispatch reads one packet from the file descriptor and dispatches it.
func (e *endpoint) dispatch() (bool, *tcpip.Error) {
e.allocateViews(BufConfig)
n, err := rawfile.BlockingReadv(e.fd, e.iovecs[0])
if err != nil {
return false, err
}
if e.Capabilities()&stack.CapabilityGSO != 0 {
// Skip virtioNetHdr which is added before each packet, it
// isn't used and it isn't in a view.
n -= virtioNetHdrSize
}
if n <= e.hdrSize {
return false, nil
}
var (
p tcpip.NetworkProtocolNumber
remote, local tcpip.LinkAddress
)
if e.hdrSize > 0 {
eth := header.Ethernet(e.views[0][0])
p = eth.Type()
remote = eth.SourceAddress()
local = eth.DestinationAddress()
} else {
// We don't get any indication of what the packet is, so try to guess
// if it's an IPv4 or IPv6 packet.
switch header.IPVersion(e.views[0][0]) {
case header.IPv4Version:
p = header.IPv4ProtocolNumber
case header.IPv6Version:
p = header.IPv6ProtocolNumber
default:
return true, nil
}
}
used := e.capViews(0, n, BufConfig)
vv := buffer.NewVectorisedView(n, e.views[0][:used])
vv.TrimFront(e.hdrSize)
e.dispatcher.DeliverNetworkPacket(e, remote, local, p, vv)
// Prepare e.views for another packet: release used views.
for i := 0; i < used; i++ {
e.views[0][i] = nil
}
return true, nil
}
// recvMMsgDispatch reads more than one packet at a time from the file
// descriptor and dispatches it.
func (e *endpoint) recvMMsgDispatch() (bool, *tcpip.Error) {
e.allocateViews(BufConfig)
nMsgs, err := rawfile.BlockingRecvMMsg(e.fd, e.msgHdrs)
if err != nil {
return false, err
}
// Process each of received packets.
for k := 0; k < nMsgs; k++ {
n := int(e.msgHdrs[k].Len)
if e.Capabilities()&stack.CapabilityGSO != 0 {
n -= virtioNetHdrSize
}
if n <= e.hdrSize {
return false, nil
}
var (
p tcpip.NetworkProtocolNumber
remote, local tcpip.LinkAddress
)
if e.hdrSize > 0 {
eth := header.Ethernet(e.views[k][0])
p = eth.Type()
remote = eth.SourceAddress()
local = eth.DestinationAddress()
} else {
// We don't get any indication of what the packet is, so try to guess
// if it's an IPv4 or IPv6 packet.
switch header.IPVersion(e.views[k][0]) {
case header.IPv4Version:
p = header.IPv4ProtocolNumber
case header.IPv6Version:
p = header.IPv6ProtocolNumber
default:
return true, nil
}
}
used := e.capViews(k, int(n), BufConfig)
vv := buffer.NewVectorisedView(int(n), e.views[k][:used])
vv.TrimFront(e.hdrSize)
e.dispatcher.DeliverNetworkPacket(e, remote, local, p, vv)
// Prepare e.views for another packet: release used views.
for i := 0; i < used; i++ {
e.views[k][i] = nil
}
}
for k := 0; k < nMsgs; k++ {
e.msgHdrs[k].Len = 0
}
return true, nil
}
// dispatchLoop reads packets from the file descriptor in a loop and dispatches
// them to the network stack.
func (e *endpoint) dispatchLoop() *tcpip.Error {
for {
cont, err := e.inboundDispatcher()
if err != nil || !cont {
if e.closed != nil {
e.closed(err)
}
return err
}
}
}
// GSOMaxSize returns the maximum GSO packet size.
func (e *endpoint) GSOMaxSize() uint32 {
return e.gsoMaxSize
}
// InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes
// to the FD, but does not read from it. All reads come from injected packets.
type InjectableEndpoint struct {
endpoint
dispatcher stack.NetworkDispatcher
}
// Attach saves the stack network-layer dispatcher for use later when packets
// are injected.
func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
e.dispatcher = dispatcher
}
// Inject injects an inbound packet.
func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv)
}
// NewInjectable creates a new fd-based InjectableEndpoint.
func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (tcpip.LinkEndpointID, *InjectableEndpoint) {
syscall.SetNonblock(fd, true)
e := &InjectableEndpoint{endpoint: endpoint{
fd: fd,
mtu: mtu,
caps: capabilities,
}}
return stack.RegisterLinkEndpoint(e), e
}