Add support for using PACKET_RX_RING to receive packets.

PACKET_RX_RING allows the use of an mmapped buffer to receive packets from the
kernel. This should cut down the number of host syscalls that need to be made
to receive packets when the underlying fd is a socket of the AF_PACKET type.

PiperOrigin-RevId: 233834998
Change-Id: I8060025c6ced206986e94cc46b8f382b81bfa47f
This commit is contained in:
Bhasker Hariharan 2019-02-13 14:52:06 -08:00 committed by Shentubot
parent 0e84ae72e0
commit e0b3d3323f
9 changed files with 349 additions and 58 deletions

View File

@ -4,7 +4,11 @@ package(licenses = ["notice"])
go_library(
name = "fdbased",
srcs = ["endpoint.go"],
srcs = [
"endpoint.go",
"mmap.go",
"mmap_amd64_unsafe.go",
],
importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased",
visibility = [
"//visibility:public",
@ -15,6 +19,7 @@ go_library(
"//pkg/tcpip/header",
"//pkg/tcpip/link/rawfile",
"//pkg/tcpip/stack",
"@org_golang_x_sys//unix:go_default_library",
],
)

View File

@ -47,6 +47,30 @@ var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
// NetworkDispatcher.
type linkDispatcher func() (bool, *tcpip.Error)
// PacketDispatchMode are the various supported methods of receiving and
// dispatching packets from the underlying FD.
type PacketDispatchMode int
const (
// Readv is the default dispatch mode and is the least performant of the
// dispatch options but the one that is supported by all underlying FD
// types.
Readv PacketDispatchMode = iota
// RecvMMsg enables use of recvmmsg() syscall instead of readv() to
// read inbound packets. This reduces # of syscalls needed to process
// packets.
//
// NOTE: recvmmsg() is only supported for sockets, so if the underlying
// FD is not a socket then the code will still fall back to the readv()
// path.
RecvMMsg
// PacketMMap enables use of PACKET_RX_RING to receive packets from the
// NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
// primary use-case for this is runsc which uses an AF_PACKET FD to
// receive packets from the veth device.
PacketMMap
)
type endpoint struct {
// fd is the file descriptor used to send and receive packets.
fd int
@ -68,9 +92,11 @@ type endpoint struct {
// its end of the communication pipe.
closed func(*tcpip.Error)
views [][]buffer.View
iovecs [][]syscall.Iovec
msgHdrs []rawfile.MMsgHdr
views [][]buffer.View
iovecs [][]syscall.Iovec
// msgHdrs is only used by the RecvMMsg dispatcher.
msgHdrs []rawfile.MMsgHdr
inboundDispatcher linkDispatcher
dispatcher stack.NetworkDispatcher
@ -79,28 +105,31 @@ type endpoint struct {
// endpoint (false).
handleLocal bool
// useRecvMMsg enables use of recvmmsg() syscall instead of readv() to
// read inbound packets. This reduces # of syscalls needed to process
// packets.
//
// NOTE: recvmmsg() is only supported for sockets, so if the underlying
// FD is not a socket then the code will still fall back to the readv()
// path.
useRecvMMsg bool
// packetDispatchMode controls the packet dispatcher used by this
// endpoint.
packetDispatchMode PacketDispatchMode
// ringBuffer is only used when PacketMMap dispatcher is used and points
// to the start of the mmapped PACKET_RX_RING buffer.
ringBuffer []byte
// ringOffset is the current offset into the ring buffer where the next
// inbound packet will be placed by the kernel.
ringOffset int
}
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
FD int
MTU uint32
EthernetHeader bool
ChecksumOffload bool
ClosedFunc func(*tcpip.Error)
Address tcpip.LinkAddress
SaveRestore bool
DisconnectOk bool
HandleLocal bool
UseRecvMMsg bool
FD int
MTU uint32
EthernetHeader bool
ChecksumOffload bool
ClosedFunc func(*tcpip.Error)
Address tcpip.LinkAddress
SaveRestore bool
DisconnectOk bool
HandleLocal bool
PacketDispatchMode PacketDispatchMode
}
// New creates a new fd-based endpoint.
@ -133,21 +162,31 @@ func New(opts *Options) tcpip.LinkEndpointID {
}
e := &endpoint{
fd: opts.FD,
mtu: opts.MTU,
caps: caps,
closed: opts.ClosedFunc,
addr: opts.Address,
hdrSize: hdrSize,
handleLocal: opts.HandleLocal,
useRecvMMsg: opts.UseRecvMMsg,
fd: opts.FD,
mtu: opts.MTU,
caps: caps,
closed: opts.ClosedFunc,
addr: opts.Address,
hdrSize: hdrSize,
handleLocal: opts.HandleLocal,
packetDispatchMode: opts.PacketDispatchMode,
}
if isSocketFD(opts.FD) && e.packetDispatchMode == PacketMMap {
if err := e.setupPacketRXRing(); err != nil {
// TODO: replace panic with an error return.
panic(fmt.Sprintf("e.setupPacketRXRing failed: %v", err))
}
e.inboundDispatcher = e.packetMMapDispatch
return stack.RegisterLinkEndpoint(e)
}
// For non-socket FDs we read one packet a time (e.g. TAP devices)
msgsPerRecv := 1
e.inboundDispatcher = e.dispatch
// If the provided FD is a socket then we optimize packet reads by
// using recvmmsg() instead of read() to read packets in a batch.
if isSocketFD(opts.FD) && e.useRecvMMsg {
if isSocketFD(opts.FD) && e.packetDispatchMode == RecvMMsg {
e.inboundDispatcher = e.recvMMsgDispatch
msgsPerRecv = MaxMsgsPerRecv
}
@ -165,6 +204,7 @@ func New(opts *Options) tcpip.LinkEndpointID {
e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0]
e.msgHdrs[i].Msg.Iovlen = uint64(len(BufConfig))
}
return stack.RegisterLinkEndpoint(e)
}

View File

@ -0,0 +1,33 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !linux !amd64
package fdbased
import "gvisor.googlesource.com/gvisor/pkg/tcpip"
// Stubbed out versions for non-linux/non-amd64 platforms.
func (e *endpoint) setupPacketRXRing() error {
return nil
}
func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) {
return nil, nil
}
func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) {
return false, nil
}

View File

@ -0,0 +1,210 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux,amd64
package fdbased
import (
"encoding/binary"
"fmt"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
"gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
)
const (
tPacketAlignment = uintptr(16)
tpStatusKernel = 0
tpStatusUser = 1
tpStatusCopy = 2
tpStatusLosing = 4
)
// We overallocate the frame size to accommodate space for the
// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
//
// NOTE: Frames need to be aligned at 16 byte boundaries.
const (
tpFrameSize = 65536 + 128
tpBlockSize = tpFrameSize * 128
tpBlockNR = 10
tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize
)
// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct
// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>.
func tPacketAlign(v uintptr) uintptr {
return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1))
}
// tPacketHdrlen is the TPACKET_HDRLEN variable defined in <linux/if_packet.h>.
var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{}))
// tPacketReq is the tpacket_req structure as described in
// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
type tPacketReq struct {
tpBlockSize uint32
tpBlockNR uint32
tpFrameSize uint32
tpFrameNR uint32
}
// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h>
type tPacketHdr []byte
const (
tpStatusOffset = 0
tpLenOffset = 8
tpSnapLenOffset = 12
tpMacOffset = 16
tpNetOffset = 18
tpSecOffset = 20
tpUSecOffset = 24
)
func (t tPacketHdr) tpStatus() uint32 {
return binary.LittleEndian.Uint32(t[tpStatusOffset:])
}
func (t tPacketHdr) setTPStatus(status uint32) {
binary.LittleEndian.PutUint32(t[tpStatusOffset:], status)
}
func (t tPacketHdr) tpLen() uint32 {
return binary.LittleEndian.Uint32(t[tpLenOffset:])
}
func (t tPacketHdr) tpSnapLen() uint32 {
return binary.LittleEndian.Uint32(t[tpSnapLenOffset:])
}
func (t tPacketHdr) tpMac() uint16 {
return binary.LittleEndian.Uint16(t[tpMacOffset:])
}
func (t tPacketHdr) tpNet() uint16 {
return binary.LittleEndian.Uint16(t[tpNetOffset:])
}
func (t tPacketHdr) tpSec() uint32 {
return binary.LittleEndian.Uint32(t[tpSecOffset:])
}
func (t tPacketHdr) tpUSec() uint32 {
return binary.LittleEndian.Uint32(t[tpUSecOffset:])
}
func (t tPacketHdr) Payload() []byte {
return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()]
}
func (e *endpoint) setupPacketRXRing() error {
tReq := tPacketReq{
tpBlockSize: uint32(tpBlockSize),
tpBlockNR: uint32(tpBlockNR),
tpFrameSize: uint32(tpFrameSize),
tpFrameNR: uint32(tpFrameNR),
}
// Setup PACKET_RX_RING.
if err := setsockopt(e.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil {
return fmt.Errorf("failed to enable PACKET_RX_RING: %v", err)
}
// Let's mmap the blocks.
sz := tpBlockSize * tpBlockNR
buf, err := syscall.Mmap(e.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
if err != nil {
return fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err)
}
e.ringBuffer = buf
return nil
}
func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) {
hdr := (tPacketHdr)(e.ringBuffer[0+e.ringOffset*tpFrameSize:])
for (hdr.tpStatus() & tpStatusUser) == 0 {
event := rawfile.PollEvent{
FD: int32(e.fd),
Events: unix.POLLIN | unix.POLLERR,
}
_, errno := rawfile.BlockingPoll(&event, 1, -1)
if errno != 0 {
if errno == syscall.EINTR {
continue
}
return nil, rawfile.TranslateErrno(errno)
}
if hdr.tpStatus()&tpStatusCopy != 0 {
continue
}
if hdr.tpStatus()&tpStatusLosing != 0 {
continue
}
}
// Copy out the packet from the mmapped frame to a locally owned buffer.
pkt := make([]byte, hdr.tpSnapLen())
copy(pkt, hdr.Payload())
// Release packet to kernel.
hdr.setTPStatus(tpStatusKernel)
e.ringOffset = (e.ringOffset + 1) % tpFrameNR
return pkt, nil
}
// packetMMapDispatch reads packets from an mmaped ring buffer and dispatches
// them to the network stack.
func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) {
pkt, err := e.readMMappedPacket()
if err != nil {
return false, err
}
var (
p tcpip.NetworkProtocolNumber
remote, local tcpip.LinkAddress
)
if e.hdrSize > 0 {
eth := header.Ethernet(pkt)
p = eth.Type()
remote = eth.SourceAddress()
local = eth.DestinationAddress()
} else {
// We don't get any indication of what the packet is, so try to guess
// if it's an IPv4 or IPv6 packet.
switch header.IPVersion(pkt) {
case header.IPv4Version:
p = header.IPv4ProtocolNumber
case header.IPv6Version:
p = header.IPv6ProtocolNumber
default:
return true, nil
}
}
pkt = pkt[e.hdrSize:]
e.dispatcher.DeliverNetworkPacket(e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
return true, nil
}
func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error {
if _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 {
return error(errno)
}
return nil
}

View File

@ -14,12 +14,12 @@
#include "textflag.h"
// blockingPoll makes the poll() syscall while calling the version of
// BlockingPoll makes the poll() syscall while calling the version of
// entersyscall that relinquishes the P so that other Gs can run. This is meant
// to be called in cases when the syscall is expected to block.
//
// func blockingPoll(fds *pollEvent, nfds int, timeout int64) (n int, err syscall.Errno)
TEXT ·blockingPoll(SB),NOSPLIT,$0-40
// func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (n int, err syscall.Errno)
TEXT ·BlockingPoll(SB),NOSPLIT,$0-40
CALL ·callEntersyscallblock(SB)
MOVQ fds+0(FP), DI
MOVQ nfds+8(FP), SI

View File

@ -25,7 +25,7 @@ import (
)
//go:noescape
func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno)
func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno)
// Use go:linkname to call into the runtime. As of Go 1.12 this has to
// be done from Go code so that we make an ABIInternal call to an

View File

@ -21,7 +21,9 @@ import (
"unsafe"
)
func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) {
// BlockingPoll is just a stub function that forwards to the poll() system call
// on non-amd64 platforms.
func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) {
n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout))
return int(n), e
}

View File

@ -94,10 +94,11 @@ func NonBlockingWrite2(fd int, b1, b2 []byte) *tcpip.Error {
return nil
}
type pollEvent struct {
fd int32
events int16
revents int16
// PollEvent represents the pollfd structure passed to a poll() system call.
type PollEvent struct {
FD int32
Events int16
Revents int16
}
// BlockingRead reads from a file descriptor that is set up as non-blocking. If
@ -110,12 +111,12 @@ func BlockingRead(fd int, b []byte) (int, *tcpip.Error) {
return int(n), nil
}
event := pollEvent{
fd: int32(fd),
events: 1, // POLLIN
event := PollEvent{
FD: int32(fd),
Events: 1, // POLLIN
}
_, e = blockingPoll(&event, 1, -1)
_, e = BlockingPoll(&event, 1, -1)
if e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
@ -132,12 +133,12 @@ func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) {
return int(n), nil
}
event := pollEvent{
fd: int32(fd),
events: 1, // POLLIN
event := PollEvent{
FD: int32(fd),
Events: 1, // POLLIN
}
_, e = blockingPoll(&event, 1, -1)
_, e = BlockingPoll(&event, 1, -1)
if e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
@ -162,12 +163,12 @@ func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
return int(n), nil
}
event := pollEvent{
fd: int32(fd),
events: 1, // POLLIN
event := PollEvent{
FD: int32(fd),
Events: 1, // POLLIN
}
if _, e := blockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR {
if _, e := BlockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
}

View File

@ -135,12 +135,12 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
mac := tcpip.LinkAddress(generateRndMac())
linkEP := fdbased.New(&fdbased.Options{
FD: newFD,
MTU: uint32(link.MTU),
EthernetHeader: true,
HandleLocal: true,
Address: mac,
UseRecvMMsg: true,
FD: newFD,
MTU: uint32(link.MTU),
EthernetHeader: true,
HandleLocal: true,
Address: mac,
PacketDispatchMode: fdbased.PacketMMap,
})
log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)