diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 0d78c9b15..bcf9c023e 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -4,7 +4,11 @@ package(licenses = ["notice"]) go_library( name = "fdbased", - srcs = ["endpoint.go"], + srcs = [ + "endpoint.go", + "mmap.go", + "mmap_amd64_unsafe.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased", visibility = [ "//visibility:public", @@ -15,6 +19,7 @@ go_library( "//pkg/tcpip/header", "//pkg/tcpip/link/rawfile", "//pkg/tcpip/stack", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 87c8ab1fc..20f379ab0 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -47,6 +47,30 @@ var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} // NetworkDispatcher. type linkDispatcher func() (bool, *tcpip.Error) +// PacketDispatchMode are the various supported methods of receiving and +// dispatching packets from the underlying FD. +type PacketDispatchMode int + +const ( + // Readv is the default dispatch mode and is the least performant of the + // dispatch options but the one that is supported by all underlying FD + // types. + Readv PacketDispatchMode = iota + // RecvMMsg enables use of recvmmsg() syscall instead of readv() to + // read inbound packets. This reduces # of syscalls needed to process + // packets. + // + // NOTE: recvmmsg() is only supported for sockets, so if the underlying + // FD is not a socket then the code will still fall back to the readv() + // path. + RecvMMsg + // PacketMMap enables use of PACKET_RX_RING to receive packets from the + // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The + // primary use-case for this is runsc which uses an AF_PACKET FD to + // receive packets from the veth device. + PacketMMap +) + type endpoint struct { // fd is the file descriptor used to send and receive packets. fd int @@ -68,9 +92,11 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - views [][]buffer.View - iovecs [][]syscall.Iovec - msgHdrs []rawfile.MMsgHdr + views [][]buffer.View + iovecs [][]syscall.Iovec + // msgHdrs is only used by the RecvMMsg dispatcher. + msgHdrs []rawfile.MMsgHdr + inboundDispatcher linkDispatcher dispatcher stack.NetworkDispatcher @@ -79,28 +105,31 @@ type endpoint struct { // endpoint (false). handleLocal bool - // useRecvMMsg enables use of recvmmsg() syscall instead of readv() to - // read inbound packets. This reduces # of syscalls needed to process - // packets. - // - // NOTE: recvmmsg() is only supported for sockets, so if the underlying - // FD is not a socket then the code will still fall back to the readv() - // path. - useRecvMMsg bool + // packetDispatchMode controls the packet dispatcher used by this + // endpoint. + packetDispatchMode PacketDispatchMode + + // ringBuffer is only used when PacketMMap dispatcher is used and points + // to the start of the mmapped PACKET_RX_RING buffer. + ringBuffer []byte + + // ringOffset is the current offset into the ring buffer where the next + // inbound packet will be placed by the kernel. + ringOffset int } // Options specify the details about the fd-based endpoint to be created. type Options struct { - FD int - MTU uint32 - EthernetHeader bool - ChecksumOffload bool - ClosedFunc func(*tcpip.Error) - Address tcpip.LinkAddress - SaveRestore bool - DisconnectOk bool - HandleLocal bool - UseRecvMMsg bool + FD int + MTU uint32 + EthernetHeader bool + ChecksumOffload bool + ClosedFunc func(*tcpip.Error) + Address tcpip.LinkAddress + SaveRestore bool + DisconnectOk bool + HandleLocal bool + PacketDispatchMode PacketDispatchMode } // New creates a new fd-based endpoint. @@ -133,21 +162,31 @@ func New(opts *Options) tcpip.LinkEndpointID { } e := &endpoint{ - fd: opts.FD, - mtu: opts.MTU, - caps: caps, - closed: opts.ClosedFunc, - addr: opts.Address, - hdrSize: hdrSize, - handleLocal: opts.HandleLocal, - useRecvMMsg: opts.UseRecvMMsg, + fd: opts.FD, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: hdrSize, + handleLocal: opts.HandleLocal, + packetDispatchMode: opts.PacketDispatchMode, } + + if isSocketFD(opts.FD) && e.packetDispatchMode == PacketMMap { + if err := e.setupPacketRXRing(); err != nil { + // TODO: replace panic with an error return. + panic(fmt.Sprintf("e.setupPacketRXRing failed: %v", err)) + } + e.inboundDispatcher = e.packetMMapDispatch + return stack.RegisterLinkEndpoint(e) + } + // For non-socket FDs we read one packet a time (e.g. TAP devices) msgsPerRecv := 1 e.inboundDispatcher = e.dispatch // If the provided FD is a socket then we optimize packet reads by // using recvmmsg() instead of read() to read packets in a batch. - if isSocketFD(opts.FD) && e.useRecvMMsg { + if isSocketFD(opts.FD) && e.packetDispatchMode == RecvMMsg { e.inboundDispatcher = e.recvMMsgDispatch msgsPerRecv = MaxMsgsPerRecv } @@ -165,6 +204,7 @@ func New(opts *Options) tcpip.LinkEndpointID { e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0] e.msgHdrs[i].Msg.Iovlen = uint64(len(BufConfig)) } + return stack.RegisterLinkEndpoint(e) } diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go new file mode 100644 index 000000000..f1e71c233 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -0,0 +1,33 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !linux !amd64 + +package fdbased + +import "gvisor.googlesource.com/gvisor/pkg/tcpip" + +// Stubbed out versions for non-linux/non-amd64 platforms. + +func (e *endpoint) setupPacketRXRing() error { + return nil +} + +func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) { + return nil, nil +} + +func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) { + return false, nil +} diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go new file mode 100644 index 000000000..d88c3f8a5 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go @@ -0,0 +1,210 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux,amd64 + +package fdbased + +import ( + "encoding/binary" + "fmt" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" +) + +const ( + tPacketAlignment = uintptr(16) + tpStatusKernel = 0 + tpStatusUser = 1 + tpStatusCopy = 2 + tpStatusLosing = 4 +) + +// We overallocate the frame size to accommodate space for the +// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding. +// +// NOTE: Frames need to be aligned at 16 byte boundaries. +const ( + tpFrameSize = 65536 + 128 + tpBlockSize = tpFrameSize * 128 + tpBlockNR = 10 + tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize +) + +// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct +// translation of the TPACKET_ALIGN macro in . +func tPacketAlign(v uintptr) uintptr { + return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1)) +} + +// tPacketHdrlen is the TPACKET_HDRLEN variable defined in . +var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{})) + +// tPacketReq is the tpacket_req structure as described in +// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt +type tPacketReq struct { + tpBlockSize uint32 + tpBlockNR uint32 + tpFrameSize uint32 + tpFrameNR uint32 +} + +// tPacketHdr is tpacket_hdr structure as described in +type tPacketHdr []byte + +const ( + tpStatusOffset = 0 + tpLenOffset = 8 + tpSnapLenOffset = 12 + tpMacOffset = 16 + tpNetOffset = 18 + tpSecOffset = 20 + tpUSecOffset = 24 +) + +func (t tPacketHdr) tpStatus() uint32 { + return binary.LittleEndian.Uint32(t[tpStatusOffset:]) +} + +func (t tPacketHdr) setTPStatus(status uint32) { + binary.LittleEndian.PutUint32(t[tpStatusOffset:], status) +} + +func (t tPacketHdr) tpLen() uint32 { + return binary.LittleEndian.Uint32(t[tpLenOffset:]) +} + +func (t tPacketHdr) tpSnapLen() uint32 { + return binary.LittleEndian.Uint32(t[tpSnapLenOffset:]) +} + +func (t tPacketHdr) tpMac() uint16 { + return binary.LittleEndian.Uint16(t[tpMacOffset:]) +} + +func (t tPacketHdr) tpNet() uint16 { + return binary.LittleEndian.Uint16(t[tpNetOffset:]) +} + +func (t tPacketHdr) tpSec() uint32 { + return binary.LittleEndian.Uint32(t[tpSecOffset:]) +} + +func (t tPacketHdr) tpUSec() uint32 { + return binary.LittleEndian.Uint32(t[tpUSecOffset:]) +} + +func (t tPacketHdr) Payload() []byte { + return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()] +} + +func (e *endpoint) setupPacketRXRing() error { + tReq := tPacketReq{ + tpBlockSize: uint32(tpBlockSize), + tpBlockNR: uint32(tpBlockNR), + tpFrameSize: uint32(tpFrameSize), + tpFrameNR: uint32(tpFrameNR), + } + // Setup PACKET_RX_RING. + if err := setsockopt(e.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil { + return fmt.Errorf("failed to enable PACKET_RX_RING: %v", err) + } + // Let's mmap the blocks. + sz := tpBlockSize * tpBlockNR + buf, err := syscall.Mmap(e.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + return fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err) + } + e.ringBuffer = buf + return nil +} + +func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) { + hdr := (tPacketHdr)(e.ringBuffer[0+e.ringOffset*tpFrameSize:]) + for (hdr.tpStatus() & tpStatusUser) == 0 { + event := rawfile.PollEvent{ + FD: int32(e.fd), + Events: unix.POLLIN | unix.POLLERR, + } + _, errno := rawfile.BlockingPoll(&event, 1, -1) + if errno != 0 { + if errno == syscall.EINTR { + continue + } + return nil, rawfile.TranslateErrno(errno) + } + if hdr.tpStatus()&tpStatusCopy != 0 { + continue + } + if hdr.tpStatus()&tpStatusLosing != 0 { + continue + } + } + + // Copy out the packet from the mmapped frame to a locally owned buffer. + pkt := make([]byte, hdr.tpSnapLen()) + copy(pkt, hdr.Payload()) + // Release packet to kernel. + hdr.setTPStatus(tpStatusKernel) + e.ringOffset = (e.ringOffset + 1) % tpFrameNR + return pkt, nil +} + +// packetMMapDispatch reads packets from an mmaped ring buffer and dispatches +// them to the network stack. +func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) { + pkt, err := e.readMMappedPacket() + if err != nil { + return false, err + } + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if e.hdrSize > 0 { + eth := header.Ethernet(pkt) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(pkt) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + pkt = pkt[e.hdrSize:] + e.dispatcher.DeliverNetworkPacket(e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)})) + return true, nil +} + +func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error { + if _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 { + return error(errno) + } + + return nil +} diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index 8e22ba661..9dade5421 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -14,12 +14,12 @@ #include "textflag.h" -// blockingPoll makes the poll() syscall while calling the version of +// BlockingPoll makes the poll() syscall while calling the version of // entersyscall that relinquishes the P so that other Gs can run. This is meant // to be called in cases when the syscall is expected to block. // -// func blockingPoll(fds *pollEvent, nfds int, timeout int64) (n int, err syscall.Errno) -TEXT ·blockingPoll(SB),NOSPLIT,$0-40 +// func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (n int, err syscall.Errno) +TEXT ·BlockingPoll(SB),NOSPLIT,$0-40 CALL ·callEntersyscallblock(SB) MOVQ fds+0(FP), DI MOVQ nfds+8(FP), SI diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go index 93479cd0d..3ba96a123 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go @@ -25,7 +25,7 @@ import ( ) //go:noescape -func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) +func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) // Use go:linkname to call into the runtime. As of Go 1.12 this has to // be done from Go code so that we make an ABIInternal call to an diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go index 6a3e956ad..94ddad8ea 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -21,7 +21,9 @@ import ( "unsafe" ) -func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) { +// BlockingPoll is just a stub function that forwards to the poll() system call +// on non-amd64 platforms. +func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) { n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) return int(n), e } diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 5deea093a..5d36ebe57 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -94,10 +94,11 @@ func NonBlockingWrite2(fd int, b1, b2 []byte) *tcpip.Error { return nil } -type pollEvent struct { - fd int32 - events int16 - revents int16 +// PollEvent represents the pollfd structure passed to a poll() system call. +type PollEvent struct { + FD int32 + Events int16 + Revents int16 } // BlockingRead reads from a file descriptor that is set up as non-blocking. If @@ -110,12 +111,12 @@ func BlockingRead(fd int, b []byte) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - _, e = blockingPoll(&event, 1, -1) + _, e = BlockingPoll(&event, 1, -1) if e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } @@ -132,12 +133,12 @@ func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - _, e = blockingPoll(&event, 1, -1) + _, e = BlockingPoll(&event, 1, -1) if e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } @@ -162,12 +163,12 @@ func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) { return int(n), nil } - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN + event := PollEvent{ + FD: int32(fd), + Events: 1, // POLLIN } - if _, e := blockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR { + if _, e := BlockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 83d56f93a..0cadf48d6 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -135,12 +135,12 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct mac := tcpip.LinkAddress(generateRndMac()) linkEP := fdbased.New(&fdbased.Options{ - FD: newFD, - MTU: uint32(link.MTU), - EthernetHeader: true, - HandleLocal: true, - Address: mac, - UseRecvMMsg: true, + FD: newFD, + MTU: uint32(link.MTU), + EthernetHeader: true, + HandleLocal: true, + Address: mac, + PacketDispatchMode: fdbased.PacketMMap, }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)