gvisor/pkg/tcpip/link/fdbased/packet_dispatchers.go

295 lines
7.7 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
package fdbased
import (
"syscall"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
// BufConfig defines the shape of the vectorised view used to read packets from the NIC.
var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
type iovecBuffer struct {
// views are the actual buffers that hold the packet contents.
views []buffer.View
// iovecs are initialized with base pointers/len of the corresponding
// entries in the views defined above, except when GSO is enabled
// (skipsVnetHdr) then the first iovec points to a buffer for the vnet header
// which is stripped before the views are passed up the stack for further
// processing.
iovecs []syscall.Iovec
// sizes is an array of buffer sizes for the underlying views. sizes is
// immutable.
sizes []int
// skipsVnetHdr is true if virtioNetHdr is to skipped.
skipsVnetHdr bool
}
func newIovecBuffer(sizes []int, skipsVnetHdr bool) *iovecBuffer {
b := &iovecBuffer{
views: make([]buffer.View, len(sizes)),
sizes: sizes,
skipsVnetHdr: skipsVnetHdr,
}
niov := len(b.views)
if b.skipsVnetHdr {
niov++
}
b.iovecs = make([]syscall.Iovec, niov)
return b
}
func (b *iovecBuffer) nextIovecs() []syscall.Iovec {
vnetHdrOff := 0
if b.skipsVnetHdr {
var vnetHdr [virtioNetHdrSize]byte
// The kernel adds virtioNetHdr before each packet, but
// we don't use it, so so we allocate a buffer for it,
// add it in iovecs but don't add it in a view.
b.iovecs[0] = syscall.Iovec{
Base: &vnetHdr[0],
Len: uint64(virtioNetHdrSize),
}
vnetHdrOff++
}
for i := range b.views {
if b.views[i] != nil {
break
}
v := buffer.NewView(b.sizes[i])
b.views[i] = v
b.iovecs[i+vnetHdrOff] = syscall.Iovec{
Base: &v[0],
Len: uint64(len(v)),
}
}
return b.iovecs
}
func (b *iovecBuffer) pullViews(n int) buffer.VectorisedView {
var views []buffer.View
c := 0
if b.skipsVnetHdr {
c += virtioNetHdrSize
if c >= n {
// Nothing in the packet.
return buffer.NewVectorisedView(0, nil)
}
}
for i, v := range b.views {
c += len(v)
if c >= n {
b.views[i].CapLength(len(v) - (c - n))
views = append([]buffer.View(nil), b.views[:i+1]...)
break
}
}
// Remove the first len(views) used views from the state.
for i := range views {
b.views[i] = nil
}
if b.skipsVnetHdr {
// Exclude the size of the vnet header.
n -= virtioNetHdrSize
}
return buffer.NewVectorisedView(n, views)
}
// readVDispatcher uses readv() system call to read inbound packets and
// dispatches them.
type readVDispatcher struct {
// fd is the file descriptor used to send and receive packets.
fd int
// e is the endpoint this dispatcher is attached to.
e *endpoint
// buf is the iovec buffer that contains the packet contents.
buf *iovecBuffer
}
func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
d := &readVDispatcher{fd: fd, e: e}
skipsVnetHdr := d.e.Capabilities()&stack.CapabilityHardwareGSO != 0
d.buf = newIovecBuffer(BufConfig, skipsVnetHdr)
return d, nil
}
// dispatch reads one packet from the file descriptor and dispatches it.
func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
n, err := rawfile.BlockingReadv(d.fd, d.buf.nextIovecs())
if n == 0 || err != nil {
return false, err
}
pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: d.buf.pullViews(n),
})
var (
p tcpip.NetworkProtocolNumber
remote, local tcpip.LinkAddress
)
if d.e.hdrSize > 0 {
hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize)
if !ok {
return false, nil
}
eth := header.Ethernet(hdr)
p = eth.Type()
remote = eth.SourceAddress()
local = eth.DestinationAddress()
} else {
// We don't get any indication of what the packet is, so try to guess
// if it's an IPv4 or IPv6 packet.
// IP version information is at the first octet, so pulling up 1 byte.
h, ok := pkt.Data.PullUp(1)
if !ok {
return true, nil
}
switch header.IPVersion(h) {
case header.IPv4Version:
p = header.IPv4ProtocolNumber
case header.IPv6Version:
p = header.IPv6ProtocolNumber
default:
return true, nil
}
}
d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
return true, nil
}
// recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and
// dispatches them.
type recvMMsgDispatcher struct {
// fd is the file descriptor used to send and receive packets.
fd int
// e is the endpoint this dispatcher is attached to.
e *endpoint
// bufs is an array of iovec buffers that contain packet contents.
bufs []*iovecBuffer
// msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to
// reference an array of iovecs in the iovecs field defined above. This
// array is passed as the parameter to recvmmsg call to retrieve
// potentially more than 1 packet per syscall.
msgHdrs []rawfile.MMsgHdr
}
const (
// MaxMsgsPerRecv is the maximum number of packets we want to retrieve
// in a single RecvMMsg call.
MaxMsgsPerRecv = 8
)
func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
d := &recvMMsgDispatcher{
fd: fd,
e: e,
bufs: make([]*iovecBuffer, MaxMsgsPerRecv),
msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv),
}
skipsVnetHdr := d.e.Capabilities()&stack.CapabilityHardwareGSO != 0
for i := range d.bufs {
d.bufs[i] = newIovecBuffer(BufConfig, skipsVnetHdr)
}
return d, nil
}
// recvMMsgDispatch reads more than one packet at a time from the file
// descriptor and dispatches it.
func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
// Fill message headers.
for k := range d.msgHdrs {
if d.msgHdrs[k].Msg.Iovlen > 0 {
break
}
iovecs := d.bufs[k].nextIovecs()
iovLen := len(iovecs)
d.msgHdrs[k].Len = 0
d.msgHdrs[k].Msg.Iov = &iovecs[0]
d.msgHdrs[k].Msg.Iovlen = uint64(iovLen)
}
nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs)
if err != nil {
return false, err
}
// Process each of received packets.
for k := 0; k < nMsgs; k++ {
n := int(d.msgHdrs[k].Len)
pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: d.bufs[k].pullViews(n),
})
// Mark that this iovec has been processed.
d.msgHdrs[k].Msg.Iovlen = 0
var (
p tcpip.NetworkProtocolNumber
remote, local tcpip.LinkAddress
)
if d.e.hdrSize > 0 {
hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize)
if !ok {
return false, nil
}
eth := header.Ethernet(hdr)
p = eth.Type()
remote = eth.SourceAddress()
local = eth.DestinationAddress()
} else {
// We don't get any indication of what the packet is, so try to guess
// if it's an IPv4 or IPv6 packet.
// IP version information is at the first octet, so pulling up 1 byte.
h, ok := pkt.Data.PullUp(1)
if !ok {
// Skip this packet.
continue
}
switch header.IPVersion(h) {
case header.IPv4Version:
p = header.IPv4ProtocolNumber
case header.IPv6Version:
p = header.IPv6ProtocolNumber
default:
// Skip this packet.
continue
}
}
d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
}
return true, nil
}