From e0b3d3323fbb4b27280f0087427bb04c3e71238b Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Wed, 13 Feb 2019 14:52:06 -0800 Subject: [PATCH] Add support for using PACKET_RX_RING to receive packets. PACKET_RX_RING allows the use of an mmapped buffer to receive packets from the kernel. This should cut down the number of host syscalls that need to be made to receive packets when the underlying fd is a socket of the AF_PACKET type. PiperOrigin-RevId: 233834998 Change-Id: I8060025c6ced206986e94cc46b8f382b81bfa47f --- pkg/tcpip/link/fdbased/BUILD | 7 +- pkg/tcpip/link/fdbased/endpoint.go | 100 ++++++--- pkg/tcpip/link/fdbased/mmap.go | 33 +++ pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go | 210 ++++++++++++++++++ pkg/tcpip/link/rawfile/blockingpoll_amd64.s | 6 +- .../link/rawfile/blockingpoll_amd64_unsafe.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_unsafe.go | 4 +- pkg/tcpip/link/rawfile/rawfile_unsafe.go | 33 +-- runsc/boot/network.go | 12 +- 9 files changed, 349 insertions(+), 58 deletions(-) create mode 100644 pkg/tcpip/link/fdbased/mmap.go create mode 100644 pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 0d78c9b15..bcf9c023e 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -4,7 +4,11 @@ package(licenses = ["notice"]) go_library( name = "fdbased", - srcs = ["endpoint.go"], + srcs = [ + "endpoint.go", + "mmap.go", + "mmap_amd64_unsafe.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased", visibility = [ "//visibility:public", @@ -15,6 +19,7 @@ go_library( "//pkg/tcpip/header", "//pkg/tcpip/link/rawfile", "//pkg/tcpip/stack", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 87c8ab1fc..20f379ab0 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -47,6 +47,30 @@ var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} // NetworkDispatcher. type linkDispatcher func() (bool, *tcpip.Error) +// PacketDispatchMode are the various supported methods of receiving and +// dispatching packets from the underlying FD. +type PacketDispatchMode int + +const ( + // Readv is the default dispatch mode and is the least performant of the + // dispatch options but the one that is supported by all underlying FD + // types. + Readv PacketDispatchMode = iota + // RecvMMsg enables use of recvmmsg() syscall instead of readv() to + // read inbound packets. This reduces # of syscalls needed to process + // packets. + // + // NOTE: recvmmsg() is only supported for sockets, so if the underlying + // FD is not a socket then the code will still fall back to the readv() + // path. + RecvMMsg + // PacketMMap enables use of PACKET_RX_RING to receive packets from the + // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The + // primary use-case for this is runsc which uses an AF_PACKET FD to + // receive packets from the veth device. + PacketMMap +) + type endpoint struct { // fd is the file descriptor used to send and receive packets. fd int @@ -68,9 +92,11 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - views [][]buffer.View - iovecs [][]syscall.Iovec - msgHdrs []rawfile.MMsgHdr + views [][]buffer.View + iovecs [][]syscall.Iovec + // msgHdrs is only used by the RecvMMsg dispatcher. + msgHdrs []rawfile.MMsgHdr + inboundDispatcher linkDispatcher dispatcher stack.NetworkDispatcher @@ -79,28 +105,31 @@ type endpoint struct { // endpoint (false). handleLocal bool - // useRecvMMsg enables use of recvmmsg() syscall instead of readv() to - // read inbound packets. This reduces # of syscalls needed to process - // packets. - // - // NOTE: recvmmsg() is only supported for sockets, so if the underlying - // FD is not a socket then the code will still fall back to the readv() - // path. - useRecvMMsg bool + // packetDispatchMode controls the packet dispatcher used by this + // endpoint. + packetDispatchMode PacketDispatchMode + + // ringBuffer is only used when PacketMMap dispatcher is used and points + // to the start of the mmapped PACKET_RX_RING buffer. + ringBuffer []byte + + // ringOffset is the current offset into the ring buffer where the next + // inbound packet will be placed by the kernel. + ringOffset int } // Options specify the details about the fd-based endpoint to be created. type Options struct { - FD int - MTU uint32 - EthernetHeader bool - ChecksumOffload bool - ClosedFunc func(*tcpip.Error) - Address tcpip.LinkAddress - SaveRestore bool - DisconnectOk bool - HandleLocal bool - UseRecvMMsg bool + FD int + MTU uint32 + EthernetHeader bool + ChecksumOffload bool + ClosedFunc func(*tcpip.Error) + Address tcpip.LinkAddress + SaveRestore bool + DisconnectOk bool + HandleLocal bool + PacketDispatchMode PacketDispatchMode } // New creates a new fd-based endpoint. @@ -133,21 +162,31 @@ func New(opts *Options) tcpip.LinkEndpointID { } e := &endpoint{ - fd: opts.FD, - mtu: opts.MTU, - caps: caps, - closed: opts.ClosedFunc, - addr: opts.Address, - hdrSize: hdrSize, - handleLocal: opts.HandleLocal, - useRecvMMsg: opts.UseRecvMMsg, + fd: opts.FD, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: hdrSize, + handleLocal: opts.HandleLocal, + packetDispatchMode: opts.PacketDispatchMode, } + + if isSocketFD(opts.FD) && e.packetDispatchMode == PacketMMap { + if err := e.setupPacketRXRing(); err != nil { + // TODO: replace panic with an error return. + panic(fmt.Sprintf("e.setupPacketRXRing failed: %v", err)) + } + e.inboundDispatcher = e.packetMMapDispatch + return stack.RegisterLinkEndpoint(e) + } + // For non-socket FDs we read one packet a time (e.g. TAP devices) msgsPerRecv := 1 e.inboundDispatcher = e.dispatch // If the provided FD is a socket then we optimize packet reads by // using recvmmsg() instead of read() to read packets in a batch. - if isSocketFD(opts.FD) && e.useRecvMMsg { + if isSocketFD(opts.FD) && e.packetDispatchMode == RecvMMsg { e.inboundDispatcher = e.recvMMsgDispatch msgsPerRecv = MaxMsgsPerRecv } @@ -165,6 +204,7 @@ func New(opts *Options) tcpip.LinkEndpointID { e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0] e.msgHdrs[i].Msg.Iovlen = uint64(len(BufConfig)) } + return stack.RegisterLinkEndpoint(e) } diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go new file mode 100644 index 000000000..f1e71c233 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -0,0 +1,33 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !linux !amd64 + +package fdbased + +import "gvisor.googlesource.com/gvisor/pkg/tcpip" + +// Stubbed out versions for non-linux/non-amd64 platforms. + +func (e *endpoint) setupPacketRXRing() error { + return nil +} + +func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) { + return nil, nil +} + +func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) { + return false, nil +} diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go new file mode 100644 index 000000000..d88c3f8a5 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go @@ -0,0 +1,210 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux,amd64 + +package fdbased + +import ( + "encoding/binary" + "fmt" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" +) + +const ( + tPacketAlignment = uintptr(16) + tpStatusKernel = 0 + tpStatusUser = 1 + tpStatusCopy = 2 + tpStatusLosing = 4 +) + +// We overallocate the frame size to accommodate space for the +// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding. +// +// NOTE: Frames need to be aligned at 16 byte boundaries. +const ( + tpFrameSize = 65536 + 128 + tpBlockSize = tpFrameSize * 128 + tpBlockNR = 10 + tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize +) + +// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct +// translation of the TPACKET_ALIGN macro in . +func tPacketAlign(v uintptr) uintptr { + return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1)) +} + +// tPacketHdrlen is the TPACKET_HDRLEN variable defined in . +var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{})) + +// tPacketReq is the tpacket_req structure as described in +// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt +type tPacketReq struct { + tpBlockSize uint32 + tpBlockNR uint32 + tpFrameSize uint32 + tpFrameNR uint32 +} + +// tPacketHdr is tpacket_hdr structure as described in +type tPacketHdr []byte + +const ( + tpStatusOffset = 0 + tpLenOffset = 8 + tpSnapLenOffset = 12 + tpMacOffset = 16 + tpNetOffset = 18 + tpSecOffset = 20 + tpUSecOffset = 24 +) + +func (t tPacketHdr) tpStatus() uint32 { + return binary.LittleEndian.Uint32(t[tpStatusOffset:]) +} + +func (t tPacketHdr) setTPStatus(status uint32) { + binary.LittleEndian.PutUint32(t[tpStatusOffset:], status) +} + +func (t tPacketHdr) tpLen() uint32 { + return binary.LittleEndian.Uint32(t[tpLenOffset:]) +} + +func (t tPacketHdr) tpSnapLen() uint32 { + return binary.LittleEndian.Uint32(t[tpSnapLenOffset:]) +} + +func (t tPacketHdr) tpMac() uint16 { + return binary.LittleEndian.Uint16(t[tpMacOffset:]) +} + +func (t tPacketHdr) tpNet() uint16 { + return binary.LittleEndian.Uint16(t[tpNetOffset:]) +} + +func (t tPacketHdr) tpSec() uint32 { + return binary.LittleEndian.Uint32(t[tpSecOffset:]) +} + +func (t tPacketHdr) tpUSec() uint32 { + return binary.LittleEndian.Uint32(t[tpUSecOffset:]) +} + +func (t tPacketHdr) Payload() []byte { + return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()] +} + +func (e *endpoint) setupPacketRXRing() error { + tReq := tPacketReq{ + tpBlockSize: uint32(tpBlockSize), + tpBlockNR: uint32(tpBlockNR), + tpFrameSize: uint32(tpFrameSize), + tpFrameNR: uint32(tpFrameNR), + } + // Setup PACKET_RX_RING. + if err := setsockopt(e.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil { + return fmt.Errorf("failed to enable PACKET_RX_RING: %v", err) + } + // Let's mmap the blocks. + sz := tpBlockSize * tpBlockNR + buf, err := syscall.Mmap(e.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + return fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err) + } + e.ringBuffer = buf + return nil +} + +func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) { + hdr := (tPacketHdr)(e.ringBuffer[0+e.ringOffset*tpFrameSize:]) + for (hdr.tpStatus() & tpStatusUser) == 0 { + event := rawfile.PollEvent{ + FD: int32(e.fd), + Events: unix.POLLIN | unix.POLLERR, + } + _, errno := rawfile.BlockingPoll(&event, 1, -1) + if errno != 0 { + if errno == syscall.EINTR { + continue + } + return nil, rawfile.TranslateErrno(errno) + } + if hdr.tpStatus()&tpStatusCopy != 0 { + continue + } + if hdr.tpStatus()&tpStatusLosing != 0 { + continue + } + } + + // Copy out the packet from the mmapped frame to a locally owned buffer. + pkt := make([]byte, hdr.tpSnapLen()) + copy(pkt, hdr.Payload()) + // Release packet to kernel. + hdr.setTPStatus(tpStatusKernel) + e.ringOffset = (e.ringOffset + 1) % tpFrameNR + return pkt, nil +} + +// packetMMapDispatch reads packets from an mmaped ring buffer and dispatches +// them to the network stack. +func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) { + pkt, err := e.readMMappedPacket() + if err != nil { + return false, err + } + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if e.hdrSize > 0 { + eth := header.Ethernet(pkt) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(pkt) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + pkt = pkt[e.hdrSize:] + e.dispatcher.DeliverNetworkPacket(e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)})) + return true, nil +} + +func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error { + if _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 { + return error(errno) + } + + return nil +} diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index 8e22ba661..9dade5421 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -14,12 +14,12 @@ #include "textflag.h" -// blockingPoll makes the poll() syscall while calling the version of +// BlockingPoll makes the poll() syscall while calling the version of // entersyscall that relinquishes the P so that other Gs can run. This is meant // to be called in cases when the syscall is expected to block. // -// func blockingPoll(fds *pollEvent, nfds int, timeout int64) (n int, err syscall.Errno) -TEXT ·blockingPoll(SB),NOSPLIT,$0-40 +// func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (n int, err syscall.Errno) +TEXT ·BlockingPoll(SB),NOSPLIT,$0-40 CALL ·callEntersyscallblock(SB) MOVQ fds+0(FP), DI MOVQ nfds+8(FP), SI diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go index 93479cd0d..3ba96a123 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go @@ -25,7 +25,7 @@ import ( ) //go:noescape -func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) +func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) // Use go:linkname to call into the runtime. As of Go 1.12 this has to // be done from Go code so that we make an ABIInternal call to an diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go index 6a3e956ad..94ddad8ea 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -21,7 +21,9 @@ import ( "unsafe" ) -func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) { +// BlockingPoll is just a stub function that forwards to the poll() system call +// on non-amd64 platforms. +func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) { n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) return int(n), e } diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 5deea093a..5d36ebe57 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -94,10 +94,11 @@ func NonBlockingWrite2(fd int, b1, b2 []byte) *tcpip.Error { return nil } -type pollEvent struct { - fd int32 - events int16 - revents int16 +// PollEvent represents the pollfd structure passed to a poll() system call. +type PollEvent struct { + FD int32 + Events int16 + Revents int16 } // BlockingRead reads from a file descriptor that is set up as non-blocking. If @@ -110,12 +111,12 @@ func BlockingRead(fd int, b []byte) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - _, e = blockingPoll(&event, 1, -1) + _, e = BlockingPoll(&event, 1, -1) if e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } @@ -132,12 +133,12 @@ func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - _, e = blockingPoll(&event, 1, -1) + _, e = BlockingPoll(&event, 1, -1) if e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } @@ -162,12 +163,12 @@ func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - if _, e := blockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR { + if _, e := BlockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 83d56f93a..0cadf48d6 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -135,12 +135,12 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct mac := tcpip.LinkAddress(generateRndMac()) linkEP := fdbased.New(&fdbased.Options{ - FD: newFD, - MTU: uint32(link.MTU), - EthernetHeader: true, - HandleLocal: true, - Address: mac, - UseRecvMMsg: true, + FD: newFD, + MTU: uint32(link.MTU), + EthernetHeader: true, + HandleLocal: true, + Address: mac, + PacketDispatchMode: fdbased.PacketMMap, }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)