gvisor/pkg/tcpip/transport/tcp/forwarder.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"sync"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/waiter"
)

// Forwarder is a connection request forwarder, which allows clients to decide
// what to do with a connection request, for example: ignore it, send a RST, or
// attempt to complete the 3-way handshake.
//
// The canonical way of using it is to pass the Forwarder.HandlePacket function
// to stack.SetTransportProtocolHandler.
type Forwarder struct {
	maxInFlight int
	handler     func(*ForwarderRequest)

	mu       sync.Mutex
	inFlight map[stack.TransportEndpointID]struct{}
	listen   *listenContext
}

// NewForwarder allocates and initializes a new forwarder with the given
// maximum number of in-flight connection attempts. Once the maximum is
// reached, new incoming connection requests will be ignored.
//
// If rcvWnd is set to zero, the default buffer size is used instead.
func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*ForwarderRequest)) *Forwarder {
	if rcvWnd == 0 {
		rcvWnd = DefaultReceiveBufferSize
	}
	return &Forwarder{
		maxInFlight: maxInFlight,
		handler:     handler,
		inFlight:    make(map[stack.TransportEndpointID]struct{}),
		listen:      newListenContext(s, nil /* listenEP */, seqnum.Size(rcvWnd), true, 0),
	}
}
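
// A minimal usage sketch (illustrative, not part of this file): construct a
// Forwarder and register its HandlePacket with a stack.Stack s. The in-flight
// limit of 100 is an arbitrary value chosen for the example.
//
//	fwd := tcp.NewForwarder(s, 0 /* rcvWnd: use the default */, 100 /* maxInFlight */,
//		func(r *tcp.ForwarderRequest) {
//			// Decide what to do with r; see ForwarderRequest below.
//		})
//	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)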

// HandlePacket handles a packet if it is of interest to the forwarder (i.e.,
// if it's a SYN packet), returning true if so; otherwise the packet is not
// handled and false is returned.
//
// This function is expected to be passed as an argument to the
// stack.SetTransportProtocolHandler function.
func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
	s := newSegment(r, id, pkt)
	defer s.decRef()

	// We only care about well-formed SYN packets.
	if !s.parse() || !s.csumValid || s.flags != header.TCPFlagSyn {
		return false
	}

	opts := parseSynSegmentOptions(s)

	f.mu.Lock()
	defer f.mu.Unlock()

	// We have an inflight request for this id, ignore this one for now.
	if _, ok := f.inFlight[id]; ok {
		return true
	}

	// Ignore the segment if we're beyond the limit.
	if len(f.inFlight) >= f.maxInFlight {
		return true
	}

	// Launch a new goroutine to handle the request.
	f.inFlight[id] = struct{}{}
	s.incRef()
	go f.handler(&ForwarderRequest{ // S/R-SAFE: not used by Sentry.
		forwarder:  f,
		segment:    s,
		synOptions: opts,
	})

	return true
}

// ForwarderRequest represents a connection request received by the forwarder
// and passed to the client. Clients must eventually call Complete() on it, and
// may optionally create an endpoint to represent it via CreateEndpoint.
type ForwarderRequest struct {
	mu         sync.Mutex
	forwarder  *Forwarder
	segment    *segment
	synOptions header.TCPSynOptions
}

// ID returns the 4-tuple (src address, src port, dst address, dst port) that
// represents the connection request.
func (r *ForwarderRequest) ID() stack.TransportEndpointID {
	return r.segment.id
}
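
// For instance (an illustrative sketch, not part of this file), a handler can
// inspect the request's 4-tuple before deciding how to respond:
//
//	id := r.ID()
//	log.Printf("SYN %v:%d -> %v:%d", id.RemoteAddress, id.RemotePort, id.LocalAddress, id.LocalPort)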

// Complete completes the request, and optionally sends a RST segment back to
// the sender.
func (r *ForwarderRequest) Complete(sendReset bool) {
	r.mu.Lock()
	defer r.mu.Unlock()

	if r.segment == nil {
		panic("Completing already completed forwarder request")
	}

	// Remove request from the forwarder.
	r.forwarder.mu.Lock()
	delete(r.forwarder.inFlight, r.segment.id)
	r.forwarder.mu.Unlock()

	// If the caller requested, send a reset.
	if sendReset {
		replyWithReset(r.segment)
	}

	// Release all resources.
	r.segment.decRef()
	r.segment = nil
	r.forwarder = nil
}
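
// For example (an illustrative sketch, not part of this file), a handler that
// rejects requests from a blocked address could reply with a RST; blockedAddr
// is a hypothetical tcpip.Address:
//
//	if r.ID().RemoteAddress == blockedAddr {
//		r.Complete(true /* sendReset */)
//		return
//	}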

// CreateEndpoint creates a TCP endpoint for the connection request, performing
// the 3-way handshake in the process.
func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
	r.mu.Lock()
	defer r.mu.Unlock()

	if r.segment == nil {
		return nil, tcpip.ErrInvalidEndpointState
	}

	f := r.forwarder
	ep, err := f.listen.createEndpointAndPerformHandshake(r.segment, &header.TCPSynOptions{
		MSS:           r.synOptions.MSS,
		WS:            r.synOptions.WS,
		TS:            r.synOptions.TS,
		TSVal:         r.synOptions.TSVal,
		TSEcr:         r.synOptions.TSEcr,
		SACKPermitted: r.synOptions.SACKPermitted,
	})
	if err != nil {
		return nil, err
	}

	// Start the protocol goroutine.
	ep.startAcceptedLoop(queue)

	return ep, nil
}
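
// A typical accept path inside a forwarder handler (illustrative sketch; the
// gonet adapter, assumed here from gvisor.dev/gvisor/pkg/tcpip/adapters/gonet,
// wraps the new endpoint in a net.Conn):
//
//	var wq waiter.Queue
//	ep, err := r.CreateEndpoint(&wq)
//	if err != nil {
//		r.Complete(true /* sendReset */)
//		return
//	}
//	r.Complete(false /* sendReset */)
//	conn := gonet.NewTCPConn(&wq, ep)
//	go handleConn(conn) // handleConn is a hypothetical application handler.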