From 12235d533ae5c8b4b03c6d7f24cae785a2d6ec22 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 21 Oct 2019 13:21:58 -0700 Subject: [PATCH] AF_PACKET support for netstack (aka epsocket). Like (AF_INET, SOCK_RAW) sockets, AF_PACKET sockets require CAP_NET_RAW. With runsc, you'll need to pass `--net-raw=true` to enable them. Binding isn't supported yet. PiperOrigin-RevId: 275909366 --- pkg/abi/linux/socket.go | 12 + pkg/sentry/socket/netstack/netstack.go | 40 +- pkg/sentry/socket/netstack/provider.go | 53 ++- pkg/tcpip/header/eth.go | 18 + pkg/tcpip/link/channel/channel.go | 19 +- pkg/tcpip/link/fdbased/endpoint.go | 15 +- pkg/tcpip/link/fdbased/endpoint_test.go | 21 +- pkg/tcpip/link/fdbased/mmap.go | 5 +- pkg/tcpip/link/fdbased/packet_dispatchers.go | 10 +- pkg/tcpip/link/loopback/BUILD | 1 + pkg/tcpip/link/loopback/loopback.go | 22 +- pkg/tcpip/link/muxed/injectable.go | 19 +- pkg/tcpip/link/muxed/injectable_test.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem.go | 17 +- pkg/tcpip/link/sharedmem/sharedmem_test.go | 16 +- pkg/tcpip/link/sniffer/sniffer.go | 62 +++- pkg/tcpip/link/waitable/waitable.go | 15 +- pkg/tcpip/link/waitable/waitable_test.go | 13 +- pkg/tcpip/network/ip_test.go | 4 + pkg/tcpip/stack/nic.go | 73 +++- pkg/tcpip/stack/registration.go | 55 ++- pkg/tcpip/stack/stack.go | 136 ++++++- pkg/tcpip/stack/transport_demuxer.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/transport/packet/BUILD | 46 +++ pkg/tcpip/transport/packet/endpoint.go | 363 +++++++++++++++++++ pkg/tcpip/transport/packet/endpoint_state.go | 72 ++++ pkg/tcpip/transport/raw/BUILD | 15 +- pkg/tcpip/transport/raw/endpoint.go | 30 +- pkg/tcpip/transport/raw/endpoint_state.go | 14 +- pkg/tcpip/transport/raw/protocol.go | 12 +- pkg/tcpip/transport/udp/protocol.go | 11 +- runsc/boot/loader.go | 2 +- test/syscalls/linux/packet_socket.cc | 28 +- test/syscalls/linux/packet_socket_raw.cc | 27 +- 35 files changed, 1090 insertions(+), 162 deletions(-) create mode 100644 pkg/tcpip/transport/packet/BUILD create mode 100644 pkg/tcpip/transport/packet/endpoint.go create mode 100644 pkg/tcpip/transport/packet/endpoint_state.go diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index d5b731390..2e2cc6be7 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -256,6 +256,17 @@ type SockAddrInet6 struct { Scope_id uint32 } +// SockAddrLink is a struct sockaddr_ll, from uapi/linux/if_packet.h. +type SockAddrLink struct { + Family uint16 + Protocol uint16 + InterfaceIndex int32 + ARPHardwareType uint16 + PacketType byte + HardwareAddrLen byte + HardwareAddr [8]byte +} + // UnixPathMax is the maximum length of the path in an AF_UNIX socket. // // From uapi/linux/un.h. @@ -278,6 +289,7 @@ type SockAddr interface { func (s *SockAddrInet) implementsSockAddr() {} func (s *SockAddrInet6) implementsSockAddr() {} +func (s *SockAddrLink) implementsSockAddr() {} func (s *SockAddrUnix) implementsSockAddr() {} func (s *SockAddrNetlink) implementsSockAddr() {} diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 0ae573b45..debf9bce0 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -53,6 +53,7 @@ import ( "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" @@ -296,6 +297,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{})) var sockAddrInet6Size = int(binary.Size(linux.SockAddrInet6{})) +var sockAddrLinkSize = int(binary.Size(linux.SockAddrLink{})) // bytesToIPAddress converts an IPv4 or IPv6 address from the user to the // netstack representation taking any addresses into account. @@ -307,12 +309,12 @@ func bytesToIPAddress(addr []byte) tcpip.Address { } // AddressAndFamily reads an sockaddr struct from the given address and -// converts it to the FullAddress format. It supports AF_UNIX, AF_INET and -// AF_INET6 addresses. +// converts it to the FullAddress format. It supports AF_UNIX, AF_INET, +// AF_INET6, and AF_PACKET addresses. // // strict indicates whether addresses with the AF_UNSPEC family are accepted of not. // -// AddressAndFamily returns an address, its family. +// AddressAndFamily returns an address and its family. func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { @@ -371,6 +373,22 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, } return out, family, nil + case linux.AF_PACKET: + var a linux.SockAddrLink + if len(addr) < sockAddrLinkSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a) + if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + + // TODO(b/129292371): Return protocol too. + return tcpip.FullAddress{ + NIC: tcpip.NICID(a.InterfaceIndex), + Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + }, family, nil + case linux.AF_UNSPEC: return tcpip.FullAddress{}, family, nil @@ -1951,12 +1969,14 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) return &out, uint32(2 + l) } return &out, uint32(3 + l) + case linux.AF_INET: var out linux.SockAddrInet copy(out.Addr[:], addr.Addr) out.Family = linux.AF_INET out.Port = htons(addr.Port) - return &out, uint32(binary.Size(out)) + return &out, uint32(sockAddrInetSize) + case linux.AF_INET6: var out linux.SockAddrInet6 if len(addr.Addr) == 4 { @@ -1972,7 +1992,17 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) if isLinkLocal(addr.Addr) { out.Scope_id = uint32(addr.NIC) } - return &out, uint32(binary.Size(out)) + return &out, uint32(sockAddrInet6Size) + + case linux.AF_PACKET: + // TODO(b/129292371): Return protocol too. + var out linux.SockAddrLink + out.Family = linux.AF_PACKET + out.InterfaceIndex = int32(addr.NIC) + out.HardwareAddrLen = header.EthernetAddressSize + copy(out.HardwareAddr[:], addr.Addr) + return &out, uint32(sockAddrLinkSize) + default: return nil, 0 } diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index 357a664cc..2d2c1ba2a 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -62,6 +62,10 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in } case linux.SOCK_RAW: + // TODO(b/142504697): "In order to create a raw socket, a + // process must have the CAP_NET_RAW capability in the user + // namespace that governs its network namespace." - raw(7) + // Raw sockets require CAP_NET_RAW. creds := auth.CredentialsFromContext(ctx) if !creds.HasCapability(linux.CAP_NET_RAW) { @@ -85,7 +89,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in return 0, true, syserr.ErrProtocolNotSupported } -// Socket creates a new socket object for the AF_INET or AF_INET6 family. +// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET +// family. func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Fail right away if we don't have a stack. stack := t.NetworkContext() @@ -99,6 +104,12 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* return nil, nil } + // Packet sockets are handled separately, since they are neither INET + // nor INET6 specific. + if p.family == linux.AF_PACKET { + return packetSocket(t, eps, stype, protocol) + } + // Figure out the transport protocol. transProto, associated, err := getTransportProtocol(t, stype, protocol) if err != nil { @@ -121,12 +132,47 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* return New(t, p.family, stype, int(transProto), wq, ep) } +func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { + // TODO(b/142504697): "In order to create a packet socket, a process + // must have the CAP_NET_RAW capability in the user namespace that + // governs its network namespace." - packet(7) + + // Packet sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(t) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return nil, syserr.ErrNotPermitted + } + + // "cooked" packets don't contain link layer information. + var cooked bool + switch stype { + case linux.SOCK_DGRAM: + cooked = true + case linux.SOCK_RAW: + cooked = false + default: + return nil, syserr.ErrProtocolNotSupported + } + + // protocol is passed in network byte order, but netstack wants it in + // host order. + netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + + wq := &waiter.Queue{} + ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return New(t, linux.AF_PACKET, stype, protocol, wq, ep) +} + // Pair just returns nil sockets (not supported). func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { return nil, nil, nil } -// init registers socket providers for AF_INET and AF_INET6. +// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET. func init() { // Providers backed by netstack. p := []provider{ @@ -138,6 +184,9 @@ func init() { family: linux.AF_INET6, netProto: ipv6.ProtocolNumber, }, + { + family: linux.AF_PACKET, + }, } for i := range p { diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index 4c3d3311f..1bb3bb5e6 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -50,6 +50,24 @@ const ( EthernetAddressSize = 6 ) +const ( + // EthernetProtocolAll is a catch-all for all protocols carried inside + // an ethernet frame. It is mainly used to create packet sockets that + // capture all traffic. + EthernetProtocolAll tcpip.NetworkProtocolNumber = 0x0003 + + // EthernetProtocolPUP is the PARC Universial Packet protocol ethertype. + EthernetProtocolPUP tcpip.NetworkProtocolNumber = 0x0200 +) + +// Ethertypes holds the protocol numbers describing the payload of an ethernet +// frame. These types aren't necessarily supported by netstack, but can be used +// to catch all traffic of a type via packet endpoints. +var Ethertypes = []tcpip.NetworkProtocolNumber{ + EthernetProtocolAll, + EthernetProtocolPUP, +} + // SourceAddress returns the "MAC source" field of the ethernet frame header. func (b Ethernet) SourceAddress() tcpip.LinkAddress { return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize]) diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index 18adb2085..95c6f0828 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -72,7 +72,7 @@ func (e *Endpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.Vector // InjectLinkAddr injects an inbound packet with a remote link address. func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, vv buffer.VectorisedView) { - e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, vv.Clone(nil)) + e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, vv.Clone(nil), nil /* linkHeader */) } // Attach saves the stack network-layer dispatcher for use later when packets @@ -134,5 +134,22 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, hdr buffer.Prepen return nil } +// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. +func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { + p := PacketInfo{ + Header: packet.ToView(), + Proto: 0, + Payload: buffer.View{}, + GSO: nil, + } + + select { + case e.C <- p: + default: + } + + return nil +} + // Wait implements stack.LinkEndpoint.Wait. func (*Endpoint) Wait() {} diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index f80ac3435..255220d1a 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -430,8 +430,13 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil) } -// WriteRawPacket writes a raw packet directly to the file descriptor. -func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error { +// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. +func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { + return rawfile.NonBlockingWrite(e.fds[0], packet.ToView()) +} + +// InjectOutobund implements stack.InjectableEndpoint.InjectOutbound. +func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error { return rawfile.NonBlockingWrite(e.fds[0], packet) } @@ -468,9 +473,9 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { e.dispatcher = dispatcher } -// Inject injects an inbound packet. -func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { - e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv) +// InjectInbound injects an inbound packet. +func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { + e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */) } // NewInjectable creates a new fd-based InjectableEndpoint. diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 04406bc9a..59378b96c 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -43,9 +43,10 @@ const ( ) type packetInfo struct { - raddr tcpip.LinkAddress - proto tcpip.NetworkProtocolNumber - contents buffer.View + raddr tcpip.LinkAddress + proto tcpip.NetworkProtocolNumber + contents buffer.View + linkHeader buffer.View } type context struct { @@ -92,8 +93,8 @@ func (c *context) cleanup() { syscall.Close(c.fds[1]) } -func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { - c.ch <- packetInfo{remote, protocol, vv.ToView()} +func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) { + c.ch <- packetInfo{remote, protocol, vv.ToView(), linkHeader} } func TestNoEthernetProperties(t *testing.T) { @@ -293,11 +294,12 @@ func TestDeliverPacket(t *testing.T) { b[i] = uint8(rand.Intn(256)) } + var hdr header.Ethernet if !eth { // So that it looks like an IPv4 packet. b[0] = 0x40 } else { - hdr := make(header.Ethernet, header.EthernetMinimumSize) + hdr = make(header.Ethernet, header.EthernetMinimumSize) hdr.Encode(&header.EthernetFields{ SrcAddr: raddr, DstAddr: laddr, @@ -315,9 +317,10 @@ func TestDeliverPacket(t *testing.T) { select { case pi := <-c.ch: want := packetInfo{ - raddr: raddr, - proto: proto, - contents: b, + raddr: raddr, + proto: proto, + contents: b, + linkHeader: buffer.View(hdr), } if !eth { want.proto = header.IPv4ProtocolNumber diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go index 8bfeb97e4..554d45715 100644 --- a/pkg/tcpip/link/fdbased/mmap.go +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -169,9 +169,10 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) { var ( p tcpip.NetworkProtocolNumber remote, local tcpip.LinkAddress + eth header.Ethernet ) if d.e.hdrSize > 0 { - eth := header.Ethernet(pkt) + eth = header.Ethernet(pkt) p = eth.Type() remote = eth.SourceAddress() local = eth.DestinationAddress() @@ -189,6 +190,6 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) { } pkt = pkt[d.e.hdrSize:] - d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)})) + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}), buffer.View(eth)) return true, nil } diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go index 7ca217e5b..9fb760891 100644 --- a/pkg/tcpip/link/fdbased/packet_dispatchers.go +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -118,9 +118,10 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { var ( p tcpip.NetworkProtocolNumber remote, local tcpip.LinkAddress + eth header.Ethernet ) if d.e.hdrSize > 0 { - eth := header.Ethernet(d.views[0]) + eth = header.Ethernet(d.views[0][:header.EthernetMinimumSize]) p = eth.Type() remote = eth.SourceAddress() local = eth.DestinationAddress() @@ -141,7 +142,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { vv := buffer.NewVectorisedView(n, d.views[:used]) vv.TrimFront(d.e.hdrSize) - d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv) + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth)) // Prepare e.views for another packet: release used views. for i := 0; i < used; i++ { @@ -271,9 +272,10 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { var ( p tcpip.NetworkProtocolNumber remote, local tcpip.LinkAddress + eth header.Ethernet ) if d.e.hdrSize > 0 { - eth := header.Ethernet(d.views[k][0]) + eth = header.Ethernet(d.views[k][0]) p = eth.Type() remote = eth.SourceAddress() local = eth.DestinationAddress() @@ -293,7 +295,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { used := d.capViews(k, int(n), BufConfig) vv := buffer.NewVectorisedView(int(n), d.views[k][:used]) vv.TrimFront(d.e.hdrSize) - d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv) + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth)) // Prepare e.views for another packet: release used views. for i := 0; i < used; i++ { diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD index 47a54845c..23e4d1418 100644 --- a/pkg/tcpip/link/loopback/BUILD +++ b/pkg/tcpip/link/loopback/BUILD @@ -10,6 +10,7 @@ go_library( deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index b36629d2c..5f5bacf27 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -23,6 +23,7 @@ package loopback import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -70,6 +71,9 @@ func (*endpoint) LinkAddress() tcpip.LinkAddress { return "" } +// Wait implements stack.LinkEndpoint.Wait. +func (*endpoint) Wait() {} + // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound // packets to the network-layer dispatcher. func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { @@ -81,10 +85,22 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependa // Because we're immediately turning around and writing the packet back to the // rx path, we intentionally don't preserve the remote and local link // addresses from the stack.Route we're passed. - e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv) + e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */) return nil } -// Wait implements stack.LinkEndpoint.Wait. -func (*endpoint) Wait() {} +// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. +func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { + // Reject the packet if it's shorter than an ethernet header. + if packet.Size() < header.EthernetMinimumSize { + return tcpip.ErrBadAddress + } + + // There should be an ethernet header at the beginning of packet. + linkHeader := header.Ethernet(packet.First()[:header.EthernetMinimumSize]) + packet.TrimFront(len(linkHeader)) + e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), packet, buffer.View(linkHeader)) + + return nil +} diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index 7c946101d..41524e271 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -79,9 +79,9 @@ func (m *InjectableEndpoint) IsAttached() bool { return m.dispatcher != nil } -// Inject implements stack.InjectableLinkEndpoint. -func (m *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { - m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv) +// InjectInbound implements stack.InjectableLinkEndpoint. +func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { + m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */) } // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint @@ -94,14 +94,21 @@ func (m *InjectableEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffe return tcpip.ErrNoRoute } -// WriteRawPacket writes outbound packets to the appropriate +// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. +func (m *InjectableEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { + // WriteRawPacket doesn't get a route or network address, so there's + // nowhere to write this. + return tcpip.ErrNoRoute +} + +// InjectOutbound writes outbound packets to the appropriate // LinkInjectableEndpoint based on the dest address. -func (m *InjectableEndpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error { +func (m *InjectableEndpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error { endpoint, ok := m.routes[dest] if !ok { return tcpip.ErrNoRoute } - return endpoint.WriteRawPacket(dest, packet) + return endpoint.InjectOutbound(dest, packet) } // Wait implements stack.LinkEndpoint.Wait. diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go index 3086fec00..9cd300af8 100644 --- a/pkg/tcpip/link/muxed/injectable_test.go +++ b/pkg/tcpip/link/muxed/injectable_test.go @@ -31,7 +31,7 @@ import ( func TestInjectableEndpointRawDispatch(t *testing.T) { endpoint, sock, dstIP := makeTestInjectableEndpoint(t) - endpoint.WriteRawPacket(dstIP, []byte{0xFA}) + endpoint.InjectOutbound(dstIP, []byte{0xFA}) buf := make([]byte, ipv4.MaxTotalSize) bytesRead, err := sock.Read(buf) diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 9e71d4edf..4393403c5 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -212,6 +212,21 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependa return nil } +// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. +func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { + v := packet.ToView() + // Transmit the packet. + e.mu.Lock() + ok := e.tx.transmit(v, buffer.View{}) + e.mu.Unlock() + + if !ok { + return tcpip.ErrWouldBlock + } + + return nil +} + // dispatchLoop reads packets from the rx queue in a loop and dispatches them // to the network stack. func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) { @@ -254,7 +269,7 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) { // Send packet up the stack. eth := header.Ethernet(b) - d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView()) + d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(), buffer.View(eth)) } // Clean state. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 0e9ba0846..f3e9705c9 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -78,9 +78,10 @@ func (q *queueBuffers) cleanup() { } type packetInfo struct { - addr tcpip.LinkAddress - proto tcpip.NetworkProtocolNumber - vv buffer.VectorisedView + addr tcpip.LinkAddress + proto tcpip.NetworkProtocolNumber + vv buffer.VectorisedView + linkHeader buffer.View } type testContext struct { @@ -130,12 +131,13 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress return c } -func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { +func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) { c.mu.Lock() c.packets = append(c.packets, packetInfo{ - addr: remoteLinkAddr, - proto: proto, - vv: vv.Clone(nil), + addr: remoteLinkAddr, + proto: proto, + vv: vv.Clone(nil), + linkHeader: linkHeader, }) c.mu.Unlock() diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index e401dce44..de9ff98cb 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -116,7 +116,7 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is // called by the link-layer endpoint being wrapped when a packet arrives, and // logs the packet before forwarding to the actual dispatcher. -func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { +func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) { if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil { logPacket("recv", protocol, vv.First(), nil) } @@ -147,7 +147,7 @@ func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local panic(err) } } - e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv) + e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader) } // Attach implements the stack.LinkEndpoint interface. It saves the dispatcher @@ -218,21 +218,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen panic(err) } length -= len(hdrBuf) - if length > 0 { - for _, v := range payload.Views() { - if len(v) > length { - v = v[:length] - } - n, err := buf.Write(v) - if err != nil { - panic(err) - } - length -= n - if length == 0 { - break - } - } - } + logVectorisedView(payload, length, buf) if _, err := e.file.Write(buf.Bytes()); err != nil { panic(err) } @@ -240,6 +226,48 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return e.lower.WritePacket(r, gso, hdr, payload, protocol) } +// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. +func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { + if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil { + logPacket("send", 0, buffer.View("[raw packet, no header available]"), nil /* gso */) + } + if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 { + length := packet.Size() + if length > int(e.maxPCAPLen) { + length = int(e.maxPCAPLen) + } + + buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length)) + if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(packet.Size()))); err != nil { + panic(err) + } + logVectorisedView(packet, length, buf) + if _, err := e.file.Write(buf.Bytes()); err != nil { + panic(err) + } + } + return e.lower.WriteRawPacket(packet) +} + +func logVectorisedView(vv buffer.VectorisedView, length int, buf *bytes.Buffer) { + if length <= 0 { + return + } + for _, v := range vv.Views() { + if len(v) > length { + v = v[:length] + } + n, err := buf.Write(v) + if err != nil { + panic(err) + } + length -= n + if length == 0 { + return + } + } +} + // Wait implements stack.LinkEndpoint.Wait. func (*endpoint) Wait() {} diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index 5a1791cb5..e9b50b802 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -50,12 +50,12 @@ func New(lower stack.LinkEndpoint) *Endpoint { // It is called by the link-layer endpoint being wrapped when a packet arrives, // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't // been called. -func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { +func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) { if !e.dispatchGate.Enter() { return } - e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv) + e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader) e.dispatchGate.Leave() } @@ -109,6 +109,17 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return err } +// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. +func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { + if !e.writeGate.Enter() { + return nil + } + + err := e.lower.WriteRawPacket(packet) + e.writeGate.Leave() + return err +} + // WaitWrite prevents new calls to WritePacket from reaching the lower endpoint, // and waits for inflight ones to finish before returning. func (e *Endpoint) WaitWrite() { diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index ae23c96b7..f03bbceec 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -35,7 +35,7 @@ type countedEndpoint struct { dispatcher stack.NetworkDispatcher } -func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { +func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) { e.dispatchCount++ } @@ -70,6 +70,11 @@ func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.P return nil } +func (e *countedEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { + e.writeCount++ + return nil +} + // Wait implements stack.LinkEndpoint.Wait. func (*countedEndpoint) Wait() {} @@ -109,21 +114,21 @@ func TestWaitDispatch(t *testing.T) { } // Dispatch and check that it goes through. - ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}) + ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{}) if want := 1; ep.dispatchCount != want { t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want) } // Wait on writes, then try to dispatch. It must go through. wep.WaitWrite() - ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}) + ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{}) if want := 2; ep.dispatchCount != want { t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want) } // Wait on dispatches, then try to dispatch. It must not go through. wep.WaitDispatch() - ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}) + ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{}) if want := 2; ep.dispatchCount != want { t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want) } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index f644a8b08..bd09b9a1c 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -171,6 +171,10 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prepen return nil } +func (t *testObject) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { + return tcpip.ErrNotSupported +} + func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 38d066cd1..dbd304b7e 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -40,6 +40,9 @@ type NIC struct { endpoints map[NetworkEndpointID]*referencedNetworkEndpoint addressRanges []tcpip.Subnet mcastJoins map[NetworkEndpointID]int32 + // packetEPs is protected by mu, but the contained PacketEndpoint + // values are not. + packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint stats NICStats @@ -78,7 +81,7 @@ const ( ) func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC { - return &NIC{ + nic := &NIC{ stack: stack, id: id, name: name, @@ -87,6 +90,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback primary: make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint), endpoints: make(map[NetworkEndpointID]*referencedNetworkEndpoint), mcastJoins: make(map[NetworkEndpointID]int32), + packetEPs: make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint), stats: NICStats{ Tx: DirectionStats{ Packets: &tcpip.StatCounter{}, @@ -101,6 +105,16 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback dad: make(map[tcpip.Address]dadState), }, } + + // Register supported packet endpoint protocols. + for _, netProto := range header.Ethertypes { + nic.packetEPs[netProto] = []PacketEndpoint{} + } + for _, netProto := range stack.networkProtocols { + nic.packetEPs[netProto.Number()] = []PacketEndpoint{} + } + + return nic } // enable enables the NIC. enable will attach the link to its LinkEndpoint and @@ -631,7 +645,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, // Note that the ownership of the slice backing vv is retained by the caller. // This rule applies only to the slice itself, not to the items of the slice; // the ownership of the items is not retained by the caller. -func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { +func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) { n.stats.Rx.Packets.Increment() n.stats.Rx.Bytes.IncrementBy(uint64(vv.Size())) @@ -641,6 +655,26 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr return } + // If no local link layer address is provided, assume it was sent + // directly to this NIC. + if local == "" { + local = n.linkEP.LinkAddress() + } + + // Are any packet sockets listening for this network protocol? + n.mu.RLock() + packetEPs := n.packetEPs[protocol] + // Check whether there are packet sockets listening for every protocol. + // If we received a packet with protocol EthernetProtocolAll, then the + // previous for loop will have handled it. + if protocol != header.EthernetProtocolAll { + packetEPs = append(packetEPs, n.packetEPs[header.EthernetProtocolAll]...) + } + n.mu.RUnlock() + for _, ep := range packetEPs { + ep.HandlePacket(n.id, local, protocol, vv, linkHeader) + } + if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber { n.stack.stats.IP.PacketsReceived.Increment() } @@ -700,7 +734,10 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr return } - n.stack.stats.IP.InvalidAddressesReceived.Increment() + // If a packet socket handled the packet, don't treat it as invalid. + if len(packetEPs) == 0 { + n.stack.stats.IP.InvalidAddressesReceived.Increment() + } } // DeliverTransportPacket delivers the packets to the appropriate transport @@ -856,6 +893,36 @@ const ( temporary ) +func (n *NIC) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error { + n.mu.Lock() + defer n.mu.Unlock() + + eps, ok := n.packetEPs[netProto] + if !ok { + return tcpip.ErrNotSupported + } + n.packetEPs[netProto] = append(eps, ep) + + return nil +} + +func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { + n.mu.Lock() + defer n.mu.Unlock() + + eps, ok := n.packetEPs[netProto] + if !ok { + return + } + + for i, epOther := range eps { + if epOther == ep { + n.packetEPs[netProto] = append(eps[:i], eps[i+1:]...) + return + } + } +} + type referencedNetworkEndpoint struct { ep NetworkEndpoint nic *NIC diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 9d6157f22..f655d011f 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -71,8 +71,8 @@ type TransportEndpoint interface { // RawTransportEndpoint is the interface that needs to be implemented by raw // transport protocol endpoints. RawTransportEndpoints receive the entire -// packet - including the link, network, and transport headers - as delivered -// to netstack. +// packet - including the network and transport headers - as delivered to +// netstack. type RawTransportEndpoint interface { // HandlePacket is called by the stack when new packets arrive to // this transport endpoint. The packet contains all data from the link @@ -80,6 +80,22 @@ type RawTransportEndpoint interface { HandlePacket(r *Route, netHeader buffer.View, packet buffer.VectorisedView) } +// PacketEndpoint is the interface that needs to be implemented by packet +// transport protocol endpoints. These endpoints receive link layer headers in +// addition to whatever they contain (usually network and transport layer +// headers and a payload). +type PacketEndpoint interface { + // HandlePacket is called by the stack when new packets arrive that + // match the endpoint. + // + // Implementers should treat packet as immutable and should copy it + // before before modification. + // + // linkHeader may have a length of 0, in which case the PacketEndpoint + // should construct its own ethernet header for applications. + HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, packet buffer.VectorisedView, linkHeader buffer.View) +} + // TransportProtocol is the interface that needs to be implemented by transport // protocols (e.g., tcp, udp) that want to be part of the networking stack. type TransportProtocol interface { @@ -242,9 +258,10 @@ type NetworkProtocol interface { // packets to the appropriate network endpoint after it has been handled by // the data link layer. type NetworkDispatcher interface { - // DeliverNetworkPacket finds the appropriate network protocol - // endpoint and hands the packet over for further processing. - DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) + // DeliverNetworkPacket finds the appropriate network protocol endpoint + // and hands the packet over for further processing. linkHeader may have + // length 0 when the caller does not have ethernet data. + DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) } // LinkEndpointCapabilities is the type associated with the capabilities @@ -301,6 +318,10 @@ type LinkEndpoint interface { // r.LocalLinkAddress if it is provided. WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error + // WriteRawPacket writes a packet directly to the link. The packet + // should already have an ethernet header. + WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error + // Attach attaches the data link layer endpoint to the network-layer // dispatcher of the stack. Attach(dispatcher NetworkDispatcher) @@ -324,13 +345,14 @@ type LinkEndpoint interface { type InjectableLinkEndpoint interface { LinkEndpoint - // Inject injects an inbound packet. - Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) + // InjectInbound injects an inbound packet. + InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) - // WriteRawPacket writes a fully formed outbound packet directly to the link. + // InjectOutbound writes a fully formed outbound packet directly to the + // link. // // dest is used by endpoints with multiple raw destinations. - WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error + InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error } // A LinkAddressResolver is an extension to a NetworkProtocol that @@ -379,11 +401,16 @@ type LinkAddressCache interface { RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) } -// UnassociatedEndpointFactory produces endpoints for writing packets not -// associated with a particular transport protocol. Such endpoints can be used -// to write arbitrary packets that include the IP header. -type UnassociatedEndpointFactory interface { - NewUnassociatedRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) +// RawFactory produces endpoints for writing various types of raw packets. +type RawFactory interface { + // NewUnassociatedEndpoint produces endpoints for writing packets not + // associated with a particular transport protocol. Such endpoints can + // be used to write arbitrary packets that include the network header. + NewUnassociatedEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) + + // NewPacketEndpoint produces endpoints for reading and writing packets + // that include network and (when cooked is false) link layer headers. + NewPacketEndpoint(stack *Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) } // GSOType is the type of GSO segments. diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index a199bc1cc..9fa884b8d 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -351,10 +351,9 @@ type Stack struct { networkProtocols map[tcpip.NetworkProtocolNumber]NetworkProtocol linkAddrResolvers map[tcpip.NetworkProtocolNumber]LinkAddressResolver - // unassociatedFactory creates unassociated endpoints. If nil, raw - // endpoints are disabled. It is set during Stack creation and is - // immutable. - unassociatedFactory UnassociatedEndpointFactory + // rawFactory creates raw endpoints. If nil, raw endpoints are + // disabled. It is set during Stack creation and is immutable. + rawFactory RawFactory demux *transportDemuxer @@ -425,16 +424,16 @@ type Options struct { // stack (false). HandleLocal bool - // UnassociatedFactory produces unassociated endpoints raw endpoints. - // Raw endpoints are enabled only if this is non-nil. - UnassociatedFactory UnassociatedEndpointFactory - // NDPConfigs is the NDP configurations used by interfaces. // // By default, NDPConfigs will have a zero value for its // DupAddrDetectTransmits field, implying that DAD will not be performed // before assigning an address to a NIC. NDPConfigs NDPConfigurations + + // RawFactory produces raw endpoints. Raw endpoints are enabled only if + // this is non-nil. + RawFactory RawFactory } // TransportEndpointInfo holds useful information about a transport endpoint @@ -514,8 +513,8 @@ func New(opts Options) *Stack { } } - // Add the factory for unassociated endpoints, if present. - s.unassociatedFactory = opts.UnassociatedFactory + // Add the factory for raw endpoints, if present. + s.rawFactory = opts.RawFactory // Create the global transport demuxer. s.demux = newTransportDemuxer(s) @@ -650,12 +649,12 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp // protocol. Raw endpoints receive all traffic for a given protocol regardless // of address. func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { - if s.unassociatedFactory == nil { + if s.rawFactory == nil { return nil, tcpip.ErrNotPermitted } if !associated { - return s.unassociatedFactory.NewUnassociatedRawEndpoint(s, network, transport, waiterQueue) + return s.rawFactory.NewUnassociatedEndpoint(s, network, transport, waiterQueue) } t, ok := s.transportProtocols[transport] @@ -666,6 +665,16 @@ func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network return t.proto.NewRawEndpoint(s, network, waiterQueue) } +// NewPacketEndpoint creates a new packet endpoint listening for the given +// netProto. +func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if s.rawFactory == nil { + return nil, tcpip.ErrNotPermitted + } + + return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue) +} + // createNIC creates a NIC with the provided id and link-layer endpoint, and // optionally enable it. func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error { @@ -1135,6 +1144,109 @@ func (s *Stack) Resume() { } } +// RegisterPacketEndpoint registers ep with the stack, causing it to receive +// all traffic of the specified netProto on the given NIC. If nicID is 0, it +// receives traffic from every NIC. +func (s *Stack) RegisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error { + s.mu.Lock() + defer s.mu.Unlock() + + // If no NIC is specified, capture on all devices. + if nicID == 0 { + // Register with each NIC. + for _, nic := range s.nics { + if err := nic.registerPacketEndpoint(netProto, ep); err != nil { + s.unregisterPacketEndpointLocked(0, netProto, ep) + return err + } + } + return nil + } + + // Capture on a specific device. + nic, ok := s.nics[nicID] + if !ok { + return tcpip.ErrUnknownNICID + } + if err := nic.registerPacketEndpoint(netProto, ep); err != nil { + return err + } + + return nil +} + +// UnregisterPacketEndpoint unregisters ep for packets of the specified +// netProto from the specified NIC. If nicID is 0, ep is unregistered from all +// NICs. +func (s *Stack) UnregisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { + s.mu.Lock() + defer s.mu.Unlock() + s.unregisterPacketEndpointLocked(nicID, netProto, ep) +} + +func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { + // If no NIC is specified, unregister on all devices. + if nicID == 0 { + // Unregister with each NIC. + for _, nic := range s.nics { + nic.unregisterPacketEndpoint(netProto, ep) + } + return + } + + // Unregister in a single device. + nic, ok := s.nics[nicID] + if !ok { + return + } + nic.unregisterPacketEndpoint(netProto, ep) +} + +// WritePacket writes data directly to the specified NIC. It adds an ethernet +// header based on the arguments. +func (s *Stack) WritePacket(nicid tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error { + s.mu.Lock() + nic, ok := s.nics[nicid] + s.mu.Unlock() + if !ok { + return tcpip.ErrUnknownDevice + } + + // Add our own fake ethernet header. + ethFields := header.EthernetFields{ + SrcAddr: nic.linkEP.LinkAddress(), + DstAddr: dst, + Type: netProto, + } + fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) + fakeHeader.Encode(ðFields) + ethHeader := buffer.View(fakeHeader).ToVectorisedView() + ethHeader.Append(payload) + + if err := nic.linkEP.WriteRawPacket(ethHeader); err != nil { + return err + } + + return nil +} + +// WriteRawPacket writes data directly to the specified NIC without adding any +// headers. +func (s *Stack) WriteRawPacket(nicid tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error { + s.mu.Lock() + nic, ok := s.nics[nicid] + s.mu.Unlock() + if !ok { + return tcpip.ErrUnknownDevice + } + + if err := nic.linkEP.WriteRawPacket(payload); err != nil { + return err + } + + return nil +} + // NetworkProtocolInstance returns the protocol instance in the stack for the // specified network protocol. This method is public for protocol implementers // and tests to use. diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index 92267ce4d..97a1aec4b 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -465,7 +465,7 @@ func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error { eps, ok := d.protocol[protocolIDs{netProto, transProto}] if !ok { - return nil + return tcpip.ErrNotSupported } eps.mu.Lock() diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 678a94616..3d10001d8 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -255,7 +255,7 @@ type FullAddress struct { // This may not be used by all endpoint types. NIC NICID - // Addr is the network address. + // Addr is the network or link layer address. Addr Address // Port is the transport port. diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD new file mode 100644 index 000000000..8ea2e6ee5 --- /dev/null +++ b/pkg/tcpip/transport/packet/BUILD @@ -0,0 +1,46 @@ +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_template_instance( + name = "packet_list", + out = "packet_list.go", + package = "packet", + prefix = "packet", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*packet", + "Linker": "*packet", + }, +) + +go_library( + name = "packet", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "packet_list.go", + ], + importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/packet", + imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/log", + "//pkg/sleep", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/iptables", + "//pkg/tcpip/stack", + "//pkg/waiter", + ], +) + +filegroup( + name = "autogen", + srcs = [ + "packet_list.go", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go new file mode 100644 index 000000000..73cdaa265 --- /dev/null +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -0,0 +1,363 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package packet provides the implementation of packet sockets (see +// packet(7)). Packet sockets allow applications to: +// +// * manually write and inspect link, network, and transport headers +// * receive all traffic of a given network protocol, or all protocols +// +// Packet sockets are similar to raw sockets, but provide even more power to +// users, letting them effectively talk directly to the network device. +// +// Packet sockets skip the input and output iptables chains. +package packet + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/waiter" +) + +// +stateify savable +type packet struct { + packetEntry + // data holds the actual packet data, including any headers and + // payload. + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + // views is pre-allocated space to back data. As long as the packet is + // made up of fewer than 8 buffer.Views, no extra allocation is + // necessary to store packet data. + views [8]buffer.View `state:"nosave"` + // timestampNS is the unix time at which the packet was received. + timestampNS int64 + // senderAddr is the network address of the sender. + senderAddr tcpip.FullAddress +} + +// endpoint is the packet socket implementation of tcpip.Endpoint. It is legal +// to have goroutines make concurrent calls into the endpoint. +// +// Lock order: +// endpoint.mu +// endpoint.rcvMu +// +// +stateify savable +type endpoint struct { + stack.TransportEndpointInfo + // The following fields are initialized at creation time and are + // immutable. + stack *stack.Stack `state:"manual"` + netProto tcpip.NetworkProtocolNumber + waiterQueue *waiter.Queue + cooked bool + + // The following fields are used to manage the receive queue and are + // protected by rcvMu. + rcvMu sync.Mutex `state:"nosave"` + rcvList packetList + rcvBufSizeMax int `state:".(int)"` + rcvBufSize int + rcvClosed bool + + // The following fields are protected by mu. + mu sync.RWMutex `state:"nosave"` + sndBufSize int + closed bool + stats tcpip.TransportEndpointStats `state:"nosave"` +} + +// NewEndpoint returns a new packet endpoint. +func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + ep := &endpoint{ + stack: s, + TransportEndpointInfo: stack.TransportEndpointInfo{ + NetProto: netProto, + }, + cooked: cooked, + netProto: netProto, + waiterQueue: waiterQueue, + rcvBufSizeMax: 32 * 1024, + sndBufSize: 32 * 1024, + } + + if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil { + return nil, err + } + return ep, nil +} + +// Close implements tcpip.Endpoint.Close. +func (ep *endpoint) Close() { + ep.mu.Lock() + defer ep.mu.Unlock() + + if ep.closed { + return + } + + ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep) + + ep.rcvMu.Lock() + defer ep.rcvMu.Unlock() + + // Clear the receive list. + ep.rcvClosed = true + ep.rcvBufSize = 0 + for !ep.rcvList.Empty() { + ep.rcvList.Remove(ep.rcvList.Front()) + } + + ep.closed = true + ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) +} + +// ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. +func (ep *endpoint) ModerateRecvBuf(copied int) {} + +// IPTables implements tcpip.Endpoint.IPTables. +func (ep *endpoint) IPTables() (iptables.IPTables, error) { + return ep.stack.IPTables(), nil +} + +// Read implements tcpip.Endpoint.Read. +func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + ep.rcvMu.Lock() + + // If there's no data to read, return that read would block or that the + // endpoint is closed. + if ep.rcvList.Empty() { + err := tcpip.ErrWouldBlock + if ep.rcvClosed { + ep.stats.ReadErrors.ReadClosed.Increment() + err = tcpip.ErrClosedForReceive + } + ep.rcvMu.Unlock() + return buffer.View{}, tcpip.ControlMessages{}, err + } + + packet := ep.rcvList.Front() + ep.rcvList.Remove(packet) + ep.rcvBufSize -= packet.data.Size() + + ep.rcvMu.Unlock() + + if addr != nil { + *addr = packet.senderAddr + } + + return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil +} + +func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { + // TODO(b/129292371): Implement. + return 0, nil, tcpip.ErrInvalidOptionValue +} + +// Peek implements tcpip.Endpoint.Peek. +func (ep *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil +} + +// Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be +// disconnected, and this function always returns tpcip.ErrNotSupported. +func (*endpoint) Disconnect() *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be +// connected, and this function always returnes tcpip.ErrNotSupported. +func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used +// with Shutdown, and this function always returns tcpip.ErrNotSupported. +func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with +// Listen, and this function always returns tcpip.ErrNotSupported. +func (ep *endpoint) Listen(backlog int) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with +// Accept, and this function always returns tcpip.ErrNotSupported. +func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { + return nil, nil, tcpip.ErrNotSupported +} + +// Bind implements tcpip.Endpoint.Bind. +func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { + // TODO(gvisor.dev/issue/173): Add Bind support. + + // "By default, all packets of the specified protocol type are passed + // to a packet socket. To get packets only from a specific interface + // use bind(2) specifying an address in a struct sockaddr_ll to bind + // the packet socket to an interface. Fields used for binding are + // sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex." + // - packet(7). + + return tcpip.ErrNotSupported +} + +// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. +func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, tcpip.ErrNotSupported +} + +// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. +func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + // Even a connected socket doesn't return a remote address. + return tcpip.FullAddress{}, tcpip.ErrNotConnected +} + +// Readiness implements tcpip.Endpoint.Readiness. +func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + // The endpoint is always writable. + result := waiter.EventOut & mask + + // Determine whether the endpoint is readable. + if (mask & waiter.EventIn) != 0 { + ep.rcvMu.Lock() + if !ep.rcvList.Empty() || ep.rcvClosed { + result |= waiter.EventIn + } + ep.rcvMu.Unlock() + } + + return result +} + +// SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be +// used with SetSockOpt, and this function always returns +// tcpip.ErrNotSupported. +func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. +func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. +func (ep *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { + return 0, tcpip.ErrNotSupported +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. +func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// HandlePacket implements stack.PacketEndpoint.HandlePacket. +func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, ethHeader buffer.View) { + ep.rcvMu.Lock() + + // Drop the packet if our buffer is currently full. + if ep.rcvClosed { + ep.rcvMu.Unlock() + ep.stack.Stats().DroppedPackets.Increment() + ep.stats.ReceiveErrors.ClosedReceiver.Increment() + return + } + + if ep.rcvBufSize >= ep.rcvBufSizeMax { + ep.rcvMu.Unlock() + ep.stack.Stats().DroppedPackets.Increment() + ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() + return + } + + wasEmpty := ep.rcvBufSize == 0 + + // Push new packet into receive list and increment the buffer size. + var packet packet + // TODO(b/129292371): Return network protocol. + if len(ethHeader) > 0 { + // Get info directly from the ethernet header. + hdr := header.Ethernet(ethHeader) + packet.senderAddr = tcpip.FullAddress{ + NIC: nicid, + Addr: tcpip.Address(hdr.SourceAddress()), + } + } else { + // Guess the would-be ethernet header. + packet.senderAddr = tcpip.FullAddress{ + NIC: nicid, + Addr: tcpip.Address(localAddr), + } + } + + if ep.cooked { + // Cooked packets can simply be queued. + packet.data = vv.Clone(packet.views[:]) + } else { + // Raw packets need their ethernet headers prepended before + // queueing. + if len(ethHeader) == 0 { + // We weren't provided with an actual ethernet header, + // so fake one. + ethFields := header.EthernetFields{ + SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), + DstAddr: localAddr, + Type: netProto, + } + fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) + fakeHeader.Encode(ðFields) + ethHeader = buffer.View(fakeHeader) + } + combinedVV := buffer.View(ethHeader).ToVectorisedView() + combinedVV.Append(vv) + packet.data = combinedVV.Clone(packet.views[:]) + } + packet.timestampNS = ep.stack.NowNanoseconds() + + ep.rcvList.PushBack(&packet) + ep.rcvBufSize += packet.data.Size() + + ep.rcvMu.Unlock() + ep.stats.PacketsReceived.Increment() + // Notify waiters that there's data to be read. + if wasEmpty { + ep.waiterQueue.Notify(waiter.EventIn) + } +} + +// State implements socket.Socket.State. +func (ep *endpoint) State() uint32 { + return 0 +} + +// Info returns a copy of the endpoint info. +func (ep *endpoint) Info() tcpip.EndpointInfo { + ep.mu.RLock() + // Make a copy of the endpoint info. + ret := ep.TransportEndpointInfo + ep.mu.RUnlock() + return &ret +} + +// Stats returns a pointer to the endpoint stats. +func (ep *endpoint) Stats() tcpip.EndpointStats { + return &ep.stats +} diff --git a/pkg/tcpip/transport/packet/endpoint_state.go b/pkg/tcpip/transport/packet/endpoint_state.go new file mode 100644 index 000000000..9b88f17e4 --- /dev/null +++ b/pkg/tcpip/transport/packet/endpoint_state.go @@ -0,0 +1,72 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package packet + +import ( + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// saveData saves packet.data field. +func (p *packet) saveData() buffer.VectorisedView { + // We cannot save p.data directly as p.data.views may alias to p.views, + // which is not allowed by state framework (in-struct pointer). + return p.data.Clone(nil) +} + +// loadData loads packet.data field. +func (p *packet) loadData(data buffer.VectorisedView) { + // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization + // here because data.views is not guaranteed to be loaded by now. Plus, + // data.views will be allocated anyway so there really is little point + // of utilizing p.views for data.views. + p.data = data +} + +// beforeSave is invoked by stateify. +func (ep *endpoint) beforeSave() { + // Stop incoming packets from being handled (and mutate endpoint state). + // The lock will be released after saveRcvBufSizeMax(), which would have + // saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming + // packets. + ep.rcvMu.Lock() +} + +// saveRcvBufSizeMax is invoked by stateify. +func (ep *endpoint) saveRcvBufSizeMax() int { + max := ep.rcvBufSizeMax + // Make sure no new packets will be handled regardless of the lock. + ep.rcvBufSizeMax = 0 + // Release the lock acquired in beforeSave() so regular endpoint closing + // logic can proceed after save. + ep.rcvMu.Unlock() + return max +} + +// loadRcvBufSizeMax is invoked by stateify. +func (ep *endpoint) loadRcvBufSizeMax(max int) { + ep.rcvBufSizeMax = max +} + +// afterLoad is invoked by stateify. +func (ep *endpoint) afterLoad() { + // StackFromEnv is a stack used specifically for save/restore. + ep.stack = stack.StackFromEnv + + // TODO(gvisor.dev/173): Once bind is supported, choose the right NIC. + if err := ep.stack.RegisterPacketEndpoint(0, ep.netProto, ep); err != nil { + panic(*err) + } +} diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index fba598d51..4af49218c 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -4,14 +4,14 @@ load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) go_template_instance( - name = "packet_list", - out = "packet_list.go", + name = "raw_packet_list", + out = "raw_packet_list.go", package = "raw", - prefix = "packet", + prefix = "rawPacket", template = "//pkg/ilist:generic_list", types = { - "Element": "*packet", - "Linker": "*packet", + "Element": "*rawPacket", + "Linker": "*rawPacket", }, ) @@ -20,8 +20,8 @@ go_library( srcs = [ "endpoint.go", "endpoint_state.go", - "packet_list.go", "protocol.go", + "raw_packet_list.go", ], importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], @@ -34,6 +34,7 @@ go_library( "//pkg/tcpip/header", "//pkg/tcpip/iptables", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/packet", "//pkg/waiter", ], ) @@ -41,7 +42,7 @@ go_library( filegroup( name = "autogen", srcs = [ - "packet_list.go", + "raw_packet_list.go", ], visibility = ["//:sandbox"], ) diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index b4c660859..308f10d24 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -17,8 +17,7 @@ // // * manually write and inspect transport layer headers and payloads // * receive all traffic of a given transport protocol (e.g. ICMP or UDP) -// * optionally write and inspect network layer and link layer headers for -// packets +// * optionally write and inspect network layer headers of packets // // Raw sockets don't have any notion of ports, and incoming packets are // demultiplexed solely by protocol number. Thus, a raw UDP endpoint will @@ -38,8 +37,8 @@ import ( ) // +stateify savable -type packet struct { - packetEntry +type rawPacket struct { + rawPacketEntry // data holds the actual packet data, including any headers and // payload. data buffer.VectorisedView `state:".(buffer.VectorisedView)"` @@ -72,7 +71,7 @@ type endpoint struct { // The following fields are used to manage the receive queue and are // protected by rcvMu. rcvMu sync.Mutex `state:"nosave"` - rcvList packetList + rcvList rawPacketList rcvBufSizeMax int `state:".(int)"` rcvBufSize int rcvClosed bool @@ -90,7 +89,6 @@ type endpoint struct { } // NewEndpoint returns a raw endpoint for the given protocols. -// TODO(b/129292371): IP_HDRINCL and AF_PACKET. func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */) } @@ -187,17 +185,17 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess return buffer.View{}, tcpip.ControlMessages{}, err } - packet := e.rcvList.Front() - e.rcvList.Remove(packet) - e.rcvBufSize -= packet.data.Size() + pkt := e.rcvList.Front() + e.rcvList.Remove(pkt) + e.rcvBufSize -= pkt.data.Size() e.rcvMu.Unlock() if addr != nil { - *addr = packet.senderAddr + *addr = pkt.senderAddr } - return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil + return pkt.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: pkt.timestampNS}, nil } // Write implements tcpip.Endpoint.Write. @@ -602,7 +600,7 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu wasEmpty := e.rcvBufSize == 0 // Push new packet into receive list and increment the buffer size. - packet := &packet{ + pkt := &rawPacket{ senderAddr: tcpip.FullAddress{ NIC: route.NICID(), Addr: route.RemoteAddress, @@ -611,11 +609,11 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu combinedVV := netHeader.ToVectorisedView() combinedVV.Append(vv) - packet.data = combinedVV.Clone(packet.views[:]) - packet.timestampNS = e.stack.NowNanoseconds() + pkt.data = combinedVV.Clone(pkt.views[:]) + pkt.timestampNS = e.stack.NowNanoseconds() - e.rcvList.PushBack(packet) - e.rcvBufSize += packet.data.Size() + e.rcvList.PushBack(pkt) + e.rcvBufSize += pkt.data.Size() e.rcvMu.Unlock() e.stats.PacketsReceived.Increment() diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go index a6c7cc43a..33bfb56cd 100644 --- a/pkg/tcpip/transport/raw/endpoint_state.go +++ b/pkg/tcpip/transport/raw/endpoint_state.go @@ -20,15 +20,15 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" ) -// saveData saves packet.data field. -func (p *packet) saveData() buffer.VectorisedView { +// saveData saves rawPacket.data field. +func (p *rawPacket) saveData() buffer.VectorisedView { // We cannot save p.data directly as p.data.views may alias to p.views, // which is not allowed by state framework (in-struct pointer). return p.data.Clone(nil) } -// loadData loads packet.data field. -func (p *packet) loadData(data buffer.VectorisedView) { +// loadData loads rawPacket.data field. +func (p *rawPacket) loadData(data buffer.VectorisedView) { // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization // here because data.views is not guaranteed to be loaded by now. Plus, // data.views will be allocated anyway so there really is little point @@ -86,7 +86,9 @@ func (ep *endpoint) Resume(s *stack.Stack) { } } - if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil { - panic(err) + if ep.associated { + if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil { + panic(err) + } } } diff --git a/pkg/tcpip/transport/raw/protocol.go b/pkg/tcpip/transport/raw/protocol.go index a2512d666..f30aa2a4a 100644 --- a/pkg/tcpip/transport/raw/protocol.go +++ b/pkg/tcpip/transport/raw/protocol.go @@ -17,13 +17,19 @@ package raw import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/packet" "gvisor.dev/gvisor/pkg/waiter" ) -// EndpointFactory implements stack.UnassociatedEndpointFactory. +// EndpointFactory implements stack.RawFactory. type EndpointFactory struct{} -// NewUnassociatedRawEndpoint implements stack.UnassociatedEndpointFactory. -func (EndpointFactory) NewUnassociatedRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { +// NewUnassociatedEndpoint implements stack.RawFactory.NewUnassociatedEndpoint. +func (EndpointFactory) NewUnassociatedEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { return newEndpoint(stack, netProto, transProto, waiterQueue, false /* associated */) } + +// NewPacketEndpoint implements stack.RawFactory.NewPacketEndpoint. +func (EndpointFactory) NewPacketEndpoint(stack *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return packet.NewEndpoint(stack, cooked, netProto, waiterQueue) +} diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index de026880f..5c3358a5e 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -121,8 +121,15 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans payloadLen = available } - payload := buffer.NewVectorisedView(len(netHeader), []buffer.View{netHeader}) - payload.Append(vv) + // The buffers used by vv and netHeader may be used elsewhere + // in the system. For example, a raw or packet socket may use + // what UDP considers an unreachable destination. Thus we deep + // copy vv and netHeader to prevent multiple ownership and SR + // errors. + newNetHeader := make(buffer.View, len(netHeader)) + copy(newNetHeader, netHeader) + payload := buffer.NewVectorisedView(len(newNetHeader), []buffer.View{newNetHeader}) + payload.Append(vv.ToView().ToVectorisedView()) payload.CapLength(payloadLen) hdr := buffer.NewPrependable(headerLen) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c8e5e86ee..0c0eba99e 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -922,7 +922,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. - UnassociatedFactory: raw.EndpointFactory{}, + RawFactory: raw.EndpointFactory{}, })} // Enable SACK Recovery. diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc index 37b4e6575..fcf64ee59 100644 --- a/test/syscalls/linux/packet_socket.cc +++ b/test/syscalls/linux/packet_socket.cc @@ -61,6 +61,9 @@ namespace testing { namespace { +using ::testing::AnyOf; +using ::testing::Eq; + constexpr char kMessage[] = "soweoneul malhaebwa"; constexpr in_port_t kPort = 0x409c; // htons(40000) @@ -83,17 +86,14 @@ void SendUDPMessage(int sock) { // Send an IP packet and make sure ETH_P_ doesn't pick it up. TEST(BasicCookedPacketTest, WrongType) { - // (b/129292371): Remove once we support packet sockets. - SKIP_IF(IsRunningOnGvisor()); - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP), SyscallFailsWithErrno(EPERM)); GTEST_SKIP(); } - FileDescriptor sock = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP)); + FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_PUP))); // Let's use a simple IP payload: a UDP datagram. FileDescriptor udp_sock = @@ -124,9 +124,6 @@ class CookedPacketTest : public ::testing::TestWithParam { }; void CookedPacketTest::SetUp() { - // (b/129292371): Remove once we support packet sockets. - SKIP_IF(IsRunningOnGvisor()); - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())), SyscallFailsWithErrno(EPERM)); @@ -138,9 +135,6 @@ void CookedPacketTest::SetUp() { } void CookedPacketTest::TearDown() { - // (b/129292371): Remove once we support packet sockets. - SKIP_IF(IsRunningOnGvisor()); - // TearDown will be run even if we skip the test. if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { EXPECT_THAT(close(socket_), SyscallSucceeds()); @@ -177,13 +171,16 @@ TEST_P(CookedPacketTest, Receive) { ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0, reinterpret_cast(&src), &src_len), SyscallSucceedsWithValue(packet_size)); - ASSERT_EQ(src_len, sizeof(src)); + // sockaddr_ll ends with an 8 byte physical address field, but ethernet + // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2 + // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns + // sizeof(sockaddr_ll). + ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); + // TODO(b/129292371): Verify protocol once we return it. // Verify the source address. EXPECT_EQ(src.sll_family, AF_PACKET); - EXPECT_EQ(src.sll_protocol, htons(ETH_P_IP)); EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex()); - EXPECT_EQ(src.sll_hatype, ARPHRD_LOOPBACK); EXPECT_EQ(src.sll_halen, ETH_ALEN); // This came from the loopback device, so the address is all 0s. for (int i = 0; i < src.sll_halen; i++) { @@ -213,6 +210,9 @@ TEST_P(CookedPacketTest, Receive) { // Send via a packet socket. TEST_P(CookedPacketTest, Send) { + // TODO(b/129292371): Remove once we support packet socket writing. + SKIP_IF(IsRunningOnGvisor()); + // Let's send a UDP packet and receive it using a regular UDP socket. FileDescriptor udp_sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc index 6491453b6..d258d353c 100644 --- a/test/syscalls/linux/packet_socket_raw.cc +++ b/test/syscalls/linux/packet_socket_raw.cc @@ -26,6 +26,7 @@ #include #include +#include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/base/internal/endian.h" #include "test/syscalls/linux/socket_test_util.h" @@ -61,6 +62,9 @@ namespace testing { namespace { +using ::testing::AnyOf; +using ::testing::Eq; + constexpr char kMessage[] = "soweoneul malhaebwa"; constexpr in_port_t kPort = 0x409c; // htons(40000) @@ -97,9 +101,6 @@ class RawPacketTest : public ::testing::TestWithParam { }; void RawPacketTest::SetUp() { - // (b/129292371): Remove once we support packet sockets. - SKIP_IF(IsRunningOnGvisor()); - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { ASSERT_THAT(socket(AF_PACKET, SOCK_RAW, htons(GetParam())), SyscallFailsWithErrno(EPERM)); @@ -125,9 +126,6 @@ void RawPacketTest::SetUp() { } void RawPacketTest::TearDown() { - // (b/129292371): Remove once we support packet sockets. - SKIP_IF(IsRunningOnGvisor()); - // TearDown will be run even if we skip the test. if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { EXPECT_THAT(close(socket_), SyscallSucceeds()); @@ -164,16 +162,16 @@ TEST_P(RawPacketTest, Receive) { ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0, reinterpret_cast(&src), &src_len), SyscallSucceedsWithValue(packet_size)); - // sizeof(src) is the size of a struct sockaddr_ll. sockaddr_ll ends with an 8 - // byte physical address field, but ethernet (MAC) addresses only use 6 bytes. - // Thus src_len should get modified to be 2 less than the size of sockaddr_ll. - ASSERT_EQ(src_len, sizeof(src) - 2); + // sockaddr_ll ends with an 8 byte physical address field, but ethernet + // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2 + // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns + // sizeof(sockaddr_ll). + ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); + // TODO(b/129292371): Verify protocol once we return it. // Verify the source address. EXPECT_EQ(src.sll_family, AF_PACKET); - EXPECT_EQ(src.sll_protocol, htons(ETH_P_IP)); EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex()); - EXPECT_EQ(src.sll_hatype, ARPHRD_LOOPBACK); EXPECT_EQ(src.sll_halen, ETH_ALEN); // This came from the loopback device, so the address is all 0s. for (int i = 0; i < src.sll_halen; i++) { @@ -214,6 +212,9 @@ TEST_P(RawPacketTest, Receive) { // Send via a packet socket. TEST_P(RawPacketTest, Send) { + // TODO(b/129292371): Remove once we support packet socket writing. + SKIP_IF(IsRunningOnGvisor()); + // Let's send a UDP packet and receive it using a regular UDP socket. FileDescriptor udp_sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); @@ -309,7 +310,7 @@ TEST_P(RawPacketTest, Send) { } INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest, - ::testing::Values(ETH_P_IP /*, ETH_P_ALL*/)); + ::testing::Values(ETH_P_IP, ETH_P_ALL)); } // namespace