gvisor/pkg/tcpip/transport/raw/endpoint.go

658 lines
18 KiB
Go
Raw Normal View History

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package raw provides the implementation of raw sockets (see raw(7)). Raw
// sockets allow applications to:
//
// * manually write and inspect transport layer headers and payloads
// * receive all traffic of a given transport protocol (e.g. ICMP or UDP)
// * optionally write and inspect network layer headers of packets
//
// Raw sockets don't have any notion of ports, and incoming packets are
// demultiplexed solely by protocol number. Thus, a raw UDP endpoint will
// receive every UDP packet received by netstack. bind(2) and connect(2) can be
// used to filter incoming packets by source and destination.
package raw
import (
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/iptables"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/waiter"
)
// +stateify savable
type rawPacket struct {
rawPacketEntry
// data holds the actual packet data, including any headers and
// payload.
data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
// timestampNS is the unix time at which the packet was received.
timestampNS int64
// senderAddr is the network address of the sender.
senderAddr tcpip.FullAddress
}
// endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to
// have goroutines make concurrent calls into the endpoint.
//
// Lock order:
// endpoint.mu
// endpoint.rcvMu
//
// +stateify savable
type endpoint struct {
stack.TransportEndpointInfo
// The following fields are initialized at creation time and are
// immutable.
stack *stack.Stack `state:"manual"`
waiterQueue *waiter.Queue
associated bool
// The following fields are used to manage the receive queue and are
// protected by rcvMu.
rcvMu sync.Mutex `state:"nosave"`
rcvList rawPacketList
rcvBufSizeMax int `state:".(int)"`
rcvBufSize int
rcvClosed bool
// The following fields are protected by mu.
mu sync.RWMutex `state:"nosave"`
sndBufSize int
closed bool
connected bool
bound bool
// route is the route to a remote network endpoint. It is set via
// Connect(), and is valid only when conneted is true.
route stack.Route `state:"manual"`
stats tcpip.TransportEndpointStats `state:"nosave"`
}
// NewEndpoint returns a raw endpoint for the given protocols.
func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */)
}
func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) {
if netProto != header.IPv4ProtocolNumber {
return nil, tcpip.ErrUnknownProtocol
}
e := &endpoint{
stack: s,
TransportEndpointInfo: stack.TransportEndpointInfo{
NetProto: netProto,
TransProto: transProto,
},
waiterQueue: waiterQueue,
rcvBufSizeMax: 32 * 1024,
sndBufSize: 32 * 1024,
associated: associated,
}
// Unassociated endpoints are write-only and users call Write() with IP
// headers included. Because they're write-only, We don't need to
// register with the stack.
if !associated {
e.rcvBufSizeMax = 0
e.waiterQueue = nil
return e, nil
}
if err := e.stack.RegisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e); err != nil {
return nil, err
}
return e, nil
}
// Close implements tcpip.Endpoint.Close.
func (e *endpoint) Close() {
e.mu.Lock()
defer e.mu.Unlock()
if e.closed || !e.associated {
return
}
e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e)
e.rcvMu.Lock()
defer e.rcvMu.Unlock()
// Clear the receive list.
e.rcvClosed = true
e.rcvBufSize = 0
for !e.rcvList.Empty() {
e.rcvList.Remove(e.rcvList.Front())
}
if e.connected {
e.route.Release()
e.connected = false
}
e.closed = true
e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
}
// ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
func (e *endpoint) ModerateRecvBuf(copied int) {}
// IPTables implements tcpip.Endpoint.IPTables.
func (e *endpoint) IPTables() (iptables.IPTables, error) {
return e.stack.IPTables(), nil
}
// Read implements tcpip.Endpoint.Read.
func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
if !e.associated {
return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidOptionValue
}
e.rcvMu.Lock()
// If there's no data to read, return that read would block or that the
// endpoint is closed.
if e.rcvList.Empty() {
err := tcpip.ErrWouldBlock
if e.rcvClosed {
e.stats.ReadErrors.ReadClosed.Increment()
err = tcpip.ErrClosedForReceive
}
e.rcvMu.Unlock()
return buffer.View{}, tcpip.ControlMessages{}, err
}
pkt := e.rcvList.Front()
e.rcvList.Remove(pkt)
e.rcvBufSize -= pkt.data.Size()
e.rcvMu.Unlock()
if addr != nil {
*addr = pkt.senderAddr
}
return pkt.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: pkt.timestampNS}, nil
}
// Write implements tcpip.Endpoint.Write.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
n, ch, err := e.write(p, opts)
switch err {
case nil:
e.stats.PacketsSent.Increment()
case tcpip.ErrMessageTooLong, tcpip.ErrInvalidOptionValue:
e.stats.WriteErrors.InvalidArgs.Increment()
case tcpip.ErrClosedForSend:
e.stats.WriteErrors.WriteClosed.Increment()
case tcpip.ErrInvalidEndpointState:
e.stats.WriteErrors.InvalidEndpointState.Increment()
case tcpip.ErrNoLinkAddress:
e.stats.SendErrors.NoLinkAddr.Increment()
case tcpip.ErrNoRoute, tcpip.ErrBroadcastDisabled, tcpip.ErrNetworkUnreachable:
// Errors indicating any problem with IP routing of the packet.
e.stats.SendErrors.NoRoute.Increment()
default:
// For all other errors when writing to the network layer.
e.stats.SendErrors.SendToNetworkFailed.Increment()
}
return n, ch, err
}
func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
// MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op.
if opts.More {
return 0, nil, tcpip.ErrInvalidOptionValue
}
e.mu.RLock()
if e.closed {
e.mu.RUnlock()
return 0, nil, tcpip.ErrInvalidEndpointState
}
payloadBytes, err := p.FullPayload()
if err != nil {
e.mu.RUnlock()
return 0, nil, err
}
// If this is an unassociated socket and callee provided a nonzero
// destination address, route using that address.
if !e.associated {
ip := header.IPv4(payloadBytes)
if !ip.IsValid(len(payloadBytes)) {
e.mu.RUnlock()
return 0, nil, tcpip.ErrInvalidOptionValue
}
dstAddr := ip.DestinationAddress()
// Update dstAddr with the address in the IP header, unless
// opts.To is set (e.g. if sendto specifies a specific
// address).
if dstAddr != tcpip.Address([]byte{0, 0, 0, 0}) && opts.To == nil {
opts.To = &tcpip.FullAddress{
NIC: 0, // NIC is unset.
Addr: dstAddr, // The address from the payload.
Port: 0, // There are no ports here.
}
}
}
// Did the user caller provide a destination? If not, use the connected
// destination.
if opts.To == nil {
// If the user doesn't specify a destination, they should have
// connected to another address.
if !e.connected {
e.mu.RUnlock()
return 0, nil, tcpip.ErrDestinationRequired
}
if e.route.IsResolutionRequired() {
savedRoute := &e.route
// Promote lock to exclusive if using a shared route,
// given that it may need to change in finishWrite.
e.mu.RUnlock()
e.mu.Lock()
// Make sure that the route didn't change during the
// time we didn't hold the lock.
if !e.connected || savedRoute != &e.route {
e.mu.Unlock()
return 0, nil, tcpip.ErrInvalidEndpointState
}
n, ch, err := e.finishWrite(payloadBytes, savedRoute)
e.mu.Unlock()
return n, ch, err
}
n, ch, err := e.finishWrite(payloadBytes, &e.route)
e.mu.RUnlock()
return n, ch, err
}
// The caller provided a destination. Reject destination address if it
// goes through a different NIC than the endpoint was bound to.
nic := opts.To.NIC
if e.bound && nic != 0 && nic != e.BindNICID {
e.mu.RUnlock()
return 0, nil, tcpip.ErrNoRoute
}
// We don't support IPv6 yet, so this has to be an IPv4 address.
if len(opts.To.Addr) != header.IPv4AddressSize {
e.mu.RUnlock()
return 0, nil, tcpip.ErrInvalidEndpointState
}
// Find the route to the destination. If BindAddress is 0,
// FindRoute will choose an appropriate source address.
route, err := e.stack.FindRoute(nic, e.BindAddr, opts.To.Addr, e.NetProto, false)
if err != nil {
e.mu.RUnlock()
return 0, nil, err
}
n, ch, err := e.finishWrite(payloadBytes, &route)
route.Release()
e.mu.RUnlock()
return n, ch, err
}
// finishWrite writes the payload to a route. It resolves the route if
// necessary. It's really just a helper to make defer unnecessary in Write.
func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64, <-chan struct{}, *tcpip.Error) {
// We may need to resolve the route (match a link layer address to the
// network address). If that requires blocking (e.g. to use ARP),
// return a channel on which the caller can wait.
if route.IsResolutionRequired() {
if ch, err := route.Resolve(nil); err != nil {
if err == tcpip.ErrWouldBlock {
return 0, ch, tcpip.ErrNoLinkAddress
}
return 0, nil, err
}
}
switch e.NetProto {
case header.IPv4ProtocolNumber:
if !e.associated {
if err := route.WriteHeaderIncludedPacket(tcpip.PacketBuffer{
Data: buffer.View(payloadBytes).ToVectorisedView(),
}); err != nil {
return 0, nil, err
}
break
}
hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
Header: hdr,
Data: buffer.View(payloadBytes).ToVectorisedView(),
}); err != nil {
return 0, nil, err
}
default:
return 0, nil, tcpip.ErrUnknownProtocol
}
return int64(len(payloadBytes)), nil, nil
}
// Peek implements tcpip.Endpoint.Peek.
func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
return 0, tcpip.ControlMessages{}, nil
}
// Disconnect implements tcpip.Endpoint.Disconnect.
func (*endpoint) Disconnect() *tcpip.Error {
return tcpip.ErrNotSupported
}
// Connect implements tcpip.Endpoint.Connect.
func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
e.mu.Lock()
defer e.mu.Unlock()
if e.closed {
return tcpip.ErrInvalidEndpointState
}
// We don't support IPv6 yet.
if len(addr.Addr) != header.IPv4AddressSize {
return tcpip.ErrInvalidEndpointState
}
nic := addr.NIC
if e.bound {
if e.BindNICID == 0 {
// If we're bound, but not to a specific NIC, the NIC
// in addr will be used. Nothing to do here.
} else if addr.NIC == 0 {
// If we're bound to a specific NIC, but addr doesn't
// specify a NIC, use the bound NIC.
nic = e.BindNICID
} else if addr.NIC != e.BindNICID {
// We're bound and addr specifies a NIC. They must be
// the same.
return tcpip.ErrInvalidEndpointState
}
}
// Find a route to the destination.
route, err := e.stack.FindRoute(nic, tcpip.Address(""), addr.Addr, e.NetProto, false)
if err != nil {
return err
}
defer route.Release()
if e.associated {
// Re-register the endpoint with the appropriate NIC.
if err := e.stack.RegisterRawTransportEndpoint(addr.NIC, e.NetProto, e.TransProto, e); err != nil {
return err
}
e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e)
e.RegisterNICID = nic
}
// Save the route we've connected via.
e.route = route.Clone()
e.connected = true
return nil
}
// Shutdown implements tcpip.Endpoint.Shutdown. It's a noop for raw sockets.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
e.mu.Lock()
defer e.mu.Unlock()
if !e.connected {
return tcpip.ErrNotConnected
}
return nil
}
// Listen implements tcpip.Endpoint.Listen.
func (e *endpoint) Listen(backlog int) *tcpip.Error {
return tcpip.ErrNotSupported
}
// Accept implements tcpip.Endpoint.Accept.
func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
return nil, nil, tcpip.ErrNotSupported
}
// Bind implements tcpip.Endpoint.Bind.
func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
e.mu.Lock()
defer e.mu.Unlock()
// Callers must provide an IPv4 address or no network address (for
// binding to a NIC, but not an address).
if len(addr.Addr) != 0 && len(addr.Addr) != 4 {
return tcpip.ErrInvalidEndpointState
}
// If a local address was specified, verify that it's valid.
if len(addr.Addr) == header.IPv4AddressSize && e.stack.CheckLocalAddress(addr.NIC, e.NetProto, addr.Addr) == 0 {
return tcpip.ErrBadLocalAddress
}
if e.associated {
// Re-register the endpoint with the appropriate NIC.
if err := e.stack.RegisterRawTransportEndpoint(addr.NIC, e.NetProto, e.TransProto, e); err != nil {
return err
}
e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e)
e.RegisterNICID = addr.NIC
e.BindNICID = addr.NIC
}
e.BindAddr = addr.Addr
e.bound = true
return nil
}
// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
return tcpip.FullAddress{}, tcpip.ErrNotSupported
}
// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
// Even a connected socket doesn't return a remote address.
return tcpip.FullAddress{}, tcpip.ErrNotConnected
}
// Readiness implements tcpip.Endpoint.Readiness.
func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
// The endpoint is always writable.
result := waiter.EventOut & mask
// Determine whether the endpoint is readable.
if (mask & waiter.EventIn) != 0 {
e.rcvMu.Lock()
if !e.rcvList.Empty() || e.rcvClosed {
result |= waiter.EventIn
}
e.rcvMu.Unlock()
}
return result
}
// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
return tcpip.ErrUnknownProtocolOption
}
// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
return tcpip.ErrUnknownProtocolOption
}
// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
return tcpip.ErrUnknownProtocolOption
}
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
switch o := opt.(type) {
case tcpip.ErrorOption:
return nil
case *tcpip.KeepaliveEnabledOption:
*o = 0
return nil
default:
return tcpip.ErrUnknownProtocolOption
}
}
// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
return false, tcpip.ErrUnknownProtocolOption
}
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
switch opt {
case tcpip.ReceiveQueueSizeOption:
v := 0
e.rcvMu.Lock()
if !e.rcvList.Empty() {
p := e.rcvList.Front()
v = p.data.Size()
}
e.rcvMu.Unlock()
return v, nil
case tcpip.SendBufferSizeOption:
e.mu.Lock()
v := e.sndBufSize
e.mu.Unlock()
return v, nil
case tcpip.ReceiveBufferSizeOption:
e.rcvMu.Lock()
v := e.rcvBufSizeMax
e.rcvMu.Unlock()
return v, nil
}
return -1, tcpip.ErrUnknownProtocolOption
}
// HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
Use PacketBuffers, rather than VectorisedViews, in netstack. PacketBuffers are analogous to Linux's sk_buff. They hold all information about a packet, headers, and payload. This is important for: * iptables to access various headers of packets * Preventing the clutter of passing different net and link headers along with VectorisedViews to packet handling functions. This change only affects the incoming packet path, and a future change will change the outgoing path. Benchmark Regular PacketBufferPtr PacketBufferConcrete -------------------------------------------------------------------------------- BM_Recvmsg 400.715MB/s 373.676MB/s 396.276MB/s BM_Sendmsg 361.832MB/s 333.003MB/s 335.571MB/s BM_Recvfrom 453.336MB/s 393.321MB/s 381.650MB/s BM_Sendto 378.052MB/s 372.134MB/s 341.342MB/s BM_SendmsgTCP/0/1k 353.711MB/s 316.216MB/s 322.747MB/s BM_SendmsgTCP/0/2k 600.681MB/s 588.776MB/s 565.050MB/s BM_SendmsgTCP/0/4k 995.301MB/s 888.808MB/s 941.888MB/s BM_SendmsgTCP/0/8k 1.517GB/s 1.274GB/s 1.345GB/s BM_SendmsgTCP/0/16k 1.872GB/s 1.586GB/s 1.698GB/s BM_SendmsgTCP/0/32k 1.017GB/s 1.020GB/s 1.133GB/s BM_SendmsgTCP/0/64k 475.626MB/s 584.587MB/s 627.027MB/s BM_SendmsgTCP/0/128k 416.371MB/s 503.434MB/s 409.850MB/s BM_SendmsgTCP/0/256k 323.449MB/s 449.599MB/s 388.852MB/s BM_SendmsgTCP/0/512k 243.992MB/s 267.676MB/s 314.474MB/s BM_SendmsgTCP/0/1M 95.138MB/s 95.874MB/s 95.417MB/s BM_SendmsgTCP/0/2M 96.261MB/s 94.977MB/s 96.005MB/s BM_SendmsgTCP/0/4M 96.512MB/s 95.978MB/s 95.370MB/s BM_SendmsgTCP/0/8M 95.603MB/s 95.541MB/s 94.935MB/s BM_SendmsgTCP/0/16M 94.598MB/s 94.696MB/s 94.521MB/s BM_SendmsgTCP/0/32M 94.006MB/s 94.671MB/s 94.768MB/s BM_SendmsgTCP/0/64M 94.133MB/s 94.333MB/s 94.746MB/s BM_SendmsgTCP/0/128M 93.615MB/s 93.497MB/s 93.573MB/s BM_SendmsgTCP/0/256M 93.241MB/s 95.100MB/s 93.272MB/s BM_SendmsgTCP/1/1k 303.644MB/s 316.074MB/s 308.430MB/s BM_SendmsgTCP/1/2k 537.093MB/s 584.962MB/s 529.020MB/s BM_SendmsgTCP/1/4k 882.362MB/s 939.087MB/s 892.285MB/s BM_SendmsgTCP/1/8k 1.272GB/s 1.394GB/s 1.296GB/s BM_SendmsgTCP/1/16k 1.802GB/s 2.019GB/s 1.830GB/s BM_SendmsgTCP/1/32k 2.084GB/s 2.173GB/s 2.156GB/s BM_SendmsgTCP/1/64k 2.515GB/s 2.463GB/s 2.473GB/s BM_SendmsgTCP/1/128k 2.811GB/s 3.004GB/s 2.946GB/s BM_SendmsgTCP/1/256k 3.008GB/s 3.159GB/s 3.171GB/s BM_SendmsgTCP/1/512k 2.980GB/s 3.150GB/s 3.126GB/s BM_SendmsgTCP/1/1M 2.165GB/s 2.233GB/s 2.163GB/s BM_SendmsgTCP/1/2M 2.370GB/s 2.219GB/s 2.453GB/s BM_SendmsgTCP/1/4M 2.005GB/s 2.091GB/s 2.214GB/s BM_SendmsgTCP/1/8M 2.111GB/s 2.013GB/s 2.109GB/s BM_SendmsgTCP/1/16M 1.902GB/s 1.868GB/s 1.897GB/s BM_SendmsgTCP/1/32M 1.655GB/s 1.665GB/s 1.635GB/s BM_SendmsgTCP/1/64M 1.575GB/s 1.547GB/s 1.575GB/s BM_SendmsgTCP/1/128M 1.524GB/s 1.584GB/s 1.580GB/s BM_SendmsgTCP/1/256M 1.579GB/s 1.607GB/s 1.593GB/s PiperOrigin-RevId: 278940079
2019-11-06 22:24:38 +00:00
func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
e.rcvMu.Lock()
// Drop the packet if our buffer is currently full.
if e.rcvClosed {
e.rcvMu.Unlock()
e.stack.Stats().DroppedPackets.Increment()
e.stats.ReceiveErrors.ClosedReceiver.Increment()
return
}
if e.rcvBufSize >= e.rcvBufSizeMax {
e.rcvMu.Unlock()
e.stack.Stats().DroppedPackets.Increment()
e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
return
}
if e.bound {
// If bound to a NIC, only accept data for that NIC.
if e.BindNICID != 0 && e.BindNICID != route.NICID() {
e.rcvMu.Unlock()
return
}
// If bound to an address, only accept data for that address.
if e.BindAddr != "" && e.BindAddr != route.RemoteAddress {
e.rcvMu.Unlock()
return
}
}
// If connected, only accept packets from the remote address we
// connected to.
if e.connected && e.route.RemoteAddress != route.RemoteAddress {
e.rcvMu.Unlock()
return
}
wasEmpty := e.rcvBufSize == 0
// Push new packet into receive list and increment the buffer size.
Use PacketBuffers, rather than VectorisedViews, in netstack. PacketBuffers are analogous to Linux's sk_buff. They hold all information about a packet, headers, and payload. This is important for: * iptables to access various headers of packets * Preventing the clutter of passing different net and link headers along with VectorisedViews to packet handling functions. This change only affects the incoming packet path, and a future change will change the outgoing path. Benchmark Regular PacketBufferPtr PacketBufferConcrete -------------------------------------------------------------------------------- BM_Recvmsg 400.715MB/s 373.676MB/s 396.276MB/s BM_Sendmsg 361.832MB/s 333.003MB/s 335.571MB/s BM_Recvfrom 453.336MB/s 393.321MB/s 381.650MB/s BM_Sendto 378.052MB/s 372.134MB/s 341.342MB/s BM_SendmsgTCP/0/1k 353.711MB/s 316.216MB/s 322.747MB/s BM_SendmsgTCP/0/2k 600.681MB/s 588.776MB/s 565.050MB/s BM_SendmsgTCP/0/4k 995.301MB/s 888.808MB/s 941.888MB/s BM_SendmsgTCP/0/8k 1.517GB/s 1.274GB/s 1.345GB/s BM_SendmsgTCP/0/16k 1.872GB/s 1.586GB/s 1.698GB/s BM_SendmsgTCP/0/32k 1.017GB/s 1.020GB/s 1.133GB/s BM_SendmsgTCP/0/64k 475.626MB/s 584.587MB/s 627.027MB/s BM_SendmsgTCP/0/128k 416.371MB/s 503.434MB/s 409.850MB/s BM_SendmsgTCP/0/256k 323.449MB/s 449.599MB/s 388.852MB/s BM_SendmsgTCP/0/512k 243.992MB/s 267.676MB/s 314.474MB/s BM_SendmsgTCP/0/1M 95.138MB/s 95.874MB/s 95.417MB/s BM_SendmsgTCP/0/2M 96.261MB/s 94.977MB/s 96.005MB/s BM_SendmsgTCP/0/4M 96.512MB/s 95.978MB/s 95.370MB/s BM_SendmsgTCP/0/8M 95.603MB/s 95.541MB/s 94.935MB/s BM_SendmsgTCP/0/16M 94.598MB/s 94.696MB/s 94.521MB/s BM_SendmsgTCP/0/32M 94.006MB/s 94.671MB/s 94.768MB/s BM_SendmsgTCP/0/64M 94.133MB/s 94.333MB/s 94.746MB/s BM_SendmsgTCP/0/128M 93.615MB/s 93.497MB/s 93.573MB/s BM_SendmsgTCP/0/256M 93.241MB/s 95.100MB/s 93.272MB/s BM_SendmsgTCP/1/1k 303.644MB/s 316.074MB/s 308.430MB/s BM_SendmsgTCP/1/2k 537.093MB/s 584.962MB/s 529.020MB/s BM_SendmsgTCP/1/4k 882.362MB/s 939.087MB/s 892.285MB/s BM_SendmsgTCP/1/8k 1.272GB/s 1.394GB/s 1.296GB/s BM_SendmsgTCP/1/16k 1.802GB/s 2.019GB/s 1.830GB/s BM_SendmsgTCP/1/32k 2.084GB/s 2.173GB/s 2.156GB/s BM_SendmsgTCP/1/64k 2.515GB/s 2.463GB/s 2.473GB/s BM_SendmsgTCP/1/128k 2.811GB/s 3.004GB/s 2.946GB/s BM_SendmsgTCP/1/256k 3.008GB/s 3.159GB/s 3.171GB/s BM_SendmsgTCP/1/512k 2.980GB/s 3.150GB/s 3.126GB/s BM_SendmsgTCP/1/1M 2.165GB/s 2.233GB/s 2.163GB/s BM_SendmsgTCP/1/2M 2.370GB/s 2.219GB/s 2.453GB/s BM_SendmsgTCP/1/4M 2.005GB/s 2.091GB/s 2.214GB/s BM_SendmsgTCP/1/8M 2.111GB/s 2.013GB/s 2.109GB/s BM_SendmsgTCP/1/16M 1.902GB/s 1.868GB/s 1.897GB/s BM_SendmsgTCP/1/32M 1.655GB/s 1.665GB/s 1.635GB/s BM_SendmsgTCP/1/64M 1.575GB/s 1.547GB/s 1.575GB/s BM_SendmsgTCP/1/128M 1.524GB/s 1.584GB/s 1.580GB/s BM_SendmsgTCP/1/256M 1.579GB/s 1.607GB/s 1.593GB/s PiperOrigin-RevId: 278940079
2019-11-06 22:24:38 +00:00
packet := &rawPacket{
senderAddr: tcpip.FullAddress{
NIC: route.NICID(),
Addr: route.RemoteAddress,
},
}
Use PacketBuffers, rather than VectorisedViews, in netstack. PacketBuffers are analogous to Linux's sk_buff. They hold all information about a packet, headers, and payload. This is important for: * iptables to access various headers of packets * Preventing the clutter of passing different net and link headers along with VectorisedViews to packet handling functions. This change only affects the incoming packet path, and a future change will change the outgoing path. Benchmark Regular PacketBufferPtr PacketBufferConcrete -------------------------------------------------------------------------------- BM_Recvmsg 400.715MB/s 373.676MB/s 396.276MB/s BM_Sendmsg 361.832MB/s 333.003MB/s 335.571MB/s BM_Recvfrom 453.336MB/s 393.321MB/s 381.650MB/s BM_Sendto 378.052MB/s 372.134MB/s 341.342MB/s BM_SendmsgTCP/0/1k 353.711MB/s 316.216MB/s 322.747MB/s BM_SendmsgTCP/0/2k 600.681MB/s 588.776MB/s 565.050MB/s BM_SendmsgTCP/0/4k 995.301MB/s 888.808MB/s 941.888MB/s BM_SendmsgTCP/0/8k 1.517GB/s 1.274GB/s 1.345GB/s BM_SendmsgTCP/0/16k 1.872GB/s 1.586GB/s 1.698GB/s BM_SendmsgTCP/0/32k 1.017GB/s 1.020GB/s 1.133GB/s BM_SendmsgTCP/0/64k 475.626MB/s 584.587MB/s 627.027MB/s BM_SendmsgTCP/0/128k 416.371MB/s 503.434MB/s 409.850MB/s BM_SendmsgTCP/0/256k 323.449MB/s 449.599MB/s 388.852MB/s BM_SendmsgTCP/0/512k 243.992MB/s 267.676MB/s 314.474MB/s BM_SendmsgTCP/0/1M 95.138MB/s 95.874MB/s 95.417MB/s BM_SendmsgTCP/0/2M 96.261MB/s 94.977MB/s 96.005MB/s BM_SendmsgTCP/0/4M 96.512MB/s 95.978MB/s 95.370MB/s BM_SendmsgTCP/0/8M 95.603MB/s 95.541MB/s 94.935MB/s BM_SendmsgTCP/0/16M 94.598MB/s 94.696MB/s 94.521MB/s BM_SendmsgTCP/0/32M 94.006MB/s 94.671MB/s 94.768MB/s BM_SendmsgTCP/0/64M 94.133MB/s 94.333MB/s 94.746MB/s BM_SendmsgTCP/0/128M 93.615MB/s 93.497MB/s 93.573MB/s BM_SendmsgTCP/0/256M 93.241MB/s 95.100MB/s 93.272MB/s BM_SendmsgTCP/1/1k 303.644MB/s 316.074MB/s 308.430MB/s BM_SendmsgTCP/1/2k 537.093MB/s 584.962MB/s 529.020MB/s BM_SendmsgTCP/1/4k 882.362MB/s 939.087MB/s 892.285MB/s BM_SendmsgTCP/1/8k 1.272GB/s 1.394GB/s 1.296GB/s BM_SendmsgTCP/1/16k 1.802GB/s 2.019GB/s 1.830GB/s BM_SendmsgTCP/1/32k 2.084GB/s 2.173GB/s 2.156GB/s BM_SendmsgTCP/1/64k 2.515GB/s 2.463GB/s 2.473GB/s BM_SendmsgTCP/1/128k 2.811GB/s 3.004GB/s 2.946GB/s BM_SendmsgTCP/1/256k 3.008GB/s 3.159GB/s 3.171GB/s BM_SendmsgTCP/1/512k 2.980GB/s 3.150GB/s 3.126GB/s BM_SendmsgTCP/1/1M 2.165GB/s 2.233GB/s 2.163GB/s BM_SendmsgTCP/1/2M 2.370GB/s 2.219GB/s 2.453GB/s BM_SendmsgTCP/1/4M 2.005GB/s 2.091GB/s 2.214GB/s BM_SendmsgTCP/1/8M 2.111GB/s 2.013GB/s 2.109GB/s BM_SendmsgTCP/1/16M 1.902GB/s 1.868GB/s 1.897GB/s BM_SendmsgTCP/1/32M 1.655GB/s 1.665GB/s 1.635GB/s BM_SendmsgTCP/1/64M 1.575GB/s 1.547GB/s 1.575GB/s BM_SendmsgTCP/1/128M 1.524GB/s 1.584GB/s 1.580GB/s BM_SendmsgTCP/1/256M 1.579GB/s 1.607GB/s 1.593GB/s PiperOrigin-RevId: 278940079
2019-11-06 22:24:38 +00:00
networkHeader := append(buffer.View(nil), pkt.NetworkHeader...)
combinedVV := networkHeader.ToVectorisedView()
combinedVV.Append(pkt.Data)
packet.data = combinedVV
packet.timestampNS = e.stack.NowNanoseconds()
Use PacketBuffers, rather than VectorisedViews, in netstack. PacketBuffers are analogous to Linux's sk_buff. They hold all information about a packet, headers, and payload. This is important for: * iptables to access various headers of packets * Preventing the clutter of passing different net and link headers along with VectorisedViews to packet handling functions. This change only affects the incoming packet path, and a future change will change the outgoing path. Benchmark Regular PacketBufferPtr PacketBufferConcrete -------------------------------------------------------------------------------- BM_Recvmsg 400.715MB/s 373.676MB/s 396.276MB/s BM_Sendmsg 361.832MB/s 333.003MB/s 335.571MB/s BM_Recvfrom 453.336MB/s 393.321MB/s 381.650MB/s BM_Sendto 378.052MB/s 372.134MB/s 341.342MB/s BM_SendmsgTCP/0/1k 353.711MB/s 316.216MB/s 322.747MB/s BM_SendmsgTCP/0/2k 600.681MB/s 588.776MB/s 565.050MB/s BM_SendmsgTCP/0/4k 995.301MB/s 888.808MB/s 941.888MB/s BM_SendmsgTCP/0/8k 1.517GB/s 1.274GB/s 1.345GB/s BM_SendmsgTCP/0/16k 1.872GB/s 1.586GB/s 1.698GB/s BM_SendmsgTCP/0/32k 1.017GB/s 1.020GB/s 1.133GB/s BM_SendmsgTCP/0/64k 475.626MB/s 584.587MB/s 627.027MB/s BM_SendmsgTCP/0/128k 416.371MB/s 503.434MB/s 409.850MB/s BM_SendmsgTCP/0/256k 323.449MB/s 449.599MB/s 388.852MB/s BM_SendmsgTCP/0/512k 243.992MB/s 267.676MB/s 314.474MB/s BM_SendmsgTCP/0/1M 95.138MB/s 95.874MB/s 95.417MB/s BM_SendmsgTCP/0/2M 96.261MB/s 94.977MB/s 96.005MB/s BM_SendmsgTCP/0/4M 96.512MB/s 95.978MB/s 95.370MB/s BM_SendmsgTCP/0/8M 95.603MB/s 95.541MB/s 94.935MB/s BM_SendmsgTCP/0/16M 94.598MB/s 94.696MB/s 94.521MB/s BM_SendmsgTCP/0/32M 94.006MB/s 94.671MB/s 94.768MB/s BM_SendmsgTCP/0/64M 94.133MB/s 94.333MB/s 94.746MB/s BM_SendmsgTCP/0/128M 93.615MB/s 93.497MB/s 93.573MB/s BM_SendmsgTCP/0/256M 93.241MB/s 95.100MB/s 93.272MB/s BM_SendmsgTCP/1/1k 303.644MB/s 316.074MB/s 308.430MB/s BM_SendmsgTCP/1/2k 537.093MB/s 584.962MB/s 529.020MB/s BM_SendmsgTCP/1/4k 882.362MB/s 939.087MB/s 892.285MB/s BM_SendmsgTCP/1/8k 1.272GB/s 1.394GB/s 1.296GB/s BM_SendmsgTCP/1/16k 1.802GB/s 2.019GB/s 1.830GB/s BM_SendmsgTCP/1/32k 2.084GB/s 2.173GB/s 2.156GB/s BM_SendmsgTCP/1/64k 2.515GB/s 2.463GB/s 2.473GB/s BM_SendmsgTCP/1/128k 2.811GB/s 3.004GB/s 2.946GB/s BM_SendmsgTCP/1/256k 3.008GB/s 3.159GB/s 3.171GB/s BM_SendmsgTCP/1/512k 2.980GB/s 3.150GB/s 3.126GB/s BM_SendmsgTCP/1/1M 2.165GB/s 2.233GB/s 2.163GB/s BM_SendmsgTCP/1/2M 2.370GB/s 2.219GB/s 2.453GB/s BM_SendmsgTCP/1/4M 2.005GB/s 2.091GB/s 2.214GB/s BM_SendmsgTCP/1/8M 2.111GB/s 2.013GB/s 2.109GB/s BM_SendmsgTCP/1/16M 1.902GB/s 1.868GB/s 1.897GB/s BM_SendmsgTCP/1/32M 1.655GB/s 1.665GB/s 1.635GB/s BM_SendmsgTCP/1/64M 1.575GB/s 1.547GB/s 1.575GB/s BM_SendmsgTCP/1/128M 1.524GB/s 1.584GB/s 1.580GB/s BM_SendmsgTCP/1/256M 1.579GB/s 1.607GB/s 1.593GB/s PiperOrigin-RevId: 278940079
2019-11-06 22:24:38 +00:00
e.rcvList.PushBack(packet)
e.rcvBufSize += packet.data.Size()
e.rcvMu.Unlock()
e.stats.PacketsReceived.Increment()
// Notify waiters that there's data to be read.
if wasEmpty {
e.waiterQueue.Notify(waiter.EventIn)
}
}
// State implements socket.Socket.State.
func (e *endpoint) State() uint32 {
return 0
}
// Info returns a copy of the endpoint info.
func (e *endpoint) Info() tcpip.EndpointInfo {
e.mu.RLock()
// Make a copy of the endpoint info.
ret := e.TransportEndpointInfo
e.mu.RUnlock()
return &ret
}
// Stats returns a pointer to the endpoint stats.
func (e *endpoint) Stats() tcpip.EndpointStats {
return &e.stats
}
// Wait implements stack.TransportEndpoint.Wait.
func (*endpoint) Wait() {}