511 lines
14 KiB
Go
511 lines
14 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package ports provides PortManager that manages allocating, reserving and
|
|
// releasing ports.
|
|
package ports
|
|
|
|
import (
|
|
"math/rand"
|
|
"sync/atomic"
|
|
|
|
"gvisor.dev/gvisor/pkg/sync"
|
|
"gvisor.dev/gvisor/pkg/tcpip"
|
|
)
|
|
|
|
// anyIPAddress is the empty address. Within this file it acts as the wildcard
// value: a reservation or destination whose address equals it is checked
// against every other address.
const anyIPAddress = tcpip.Address("")
|
|
|
|
// Reservation describes a port reservation.
|
|
type Reservation struct {
|
|
// Networks is a list of network protocols to which the reservation
|
|
// applies. Can be IPv4, IPv6, or both.
|
|
Networks []tcpip.NetworkProtocolNumber
|
|
|
|
// Transport is the transport protocol to which the reservation applies.
|
|
Transport tcpip.TransportProtocolNumber
|
|
|
|
// Addr is the address of the local endpoint.
|
|
Addr tcpip.Address
|
|
|
|
// Port is the local port number.
|
|
Port uint16
|
|
|
|
// Flags describe features of the reservation.
|
|
Flags Flags
|
|
|
|
// BindToDevice is the NIC to which the reservation applies.
|
|
BindToDevice tcpip.NICID
|
|
|
|
// Dest is the destination address.
|
|
Dest tcpip.FullAddress
|
|
}
|
|
|
|
func (rs Reservation) dst() destination {
|
|
return destination{
|
|
rs.Dest.Addr,
|
|
rs.Dest.Port,
|
|
}
|
|
}
|
|
|
|
type portDescriptor struct {
|
|
network tcpip.NetworkProtocolNumber
|
|
transport tcpip.TransportProtocolNumber
|
|
port uint16
|
|
}
|
|
|
|
type destination struct {
|
|
addr tcpip.Address
|
|
port uint16
|
|
}
|
|
|
|
// destToCounter maps each destination to the FlagCounter that represents
|
|
// endpoints to that destination.
|
|
//
|
|
// destToCounter is never empty. When it has no elements, it is removed from
|
|
// the map that references it.
|
|
type destToCounter map[destination]FlagCounter
|
|
|
|
// intersectionFlags calculates the intersection of flag bit values which affect
|
|
// the specified destination.
|
|
//
|
|
// If no destinations are present, all flag values are returned as there are no
|
|
// entries to limit possible flag values of a new entry.
|
|
//
|
|
// In addition to the intersection, the number of intersecting refs is
|
|
// returned.
|
|
func (dc destToCounter) intersectionFlags(res Reservation) (BitFlags, int) {
|
|
intersection := FlagMask
|
|
var count int
|
|
|
|
for dest, counter := range dc {
|
|
if dest == res.dst() {
|
|
intersection &= counter.SharedFlags()
|
|
count++
|
|
continue
|
|
}
|
|
// Wildcard destinations affect all destinations for TupleOnly.
|
|
if dest.addr == anyIPAddress || res.Dest.Addr == anyIPAddress {
|
|
// Only bitwise and the TupleOnlyFlag.
|
|
intersection &= ((^TupleOnlyFlag) | counter.SharedFlags())
|
|
count++
|
|
}
|
|
}
|
|
|
|
return intersection, count
|
|
}
|
|
|
|
// deviceToDest maps NICs to destinations for which there are port reservations.
|
|
//
|
|
// deviceToDest is never empty. When it has no elements, it is removed from the
|
|
// map that references it.
|
|
type deviceToDest map[tcpip.NICID]destToCounter
|
|
|
|
// isAvailable checks whether binding is possible by device. If not binding to
|
|
// a device, check against all FlagCounters. If binding to a specific device,
|
|
// check against the unspecified device and the provided device.
|
|
//
|
|
// If either of the port reuse flags is enabled on any of the nodes, all nodes
|
|
// sharing a port must share at least one reuse flag. This matches Linux's
|
|
// behavior.
|
|
func (dd deviceToDest) isAvailable(res Reservation) bool {
|
|
flagBits := res.Flags.Bits()
|
|
if res.BindToDevice == 0 {
|
|
intersection := FlagMask
|
|
for _, dest := range dd {
|
|
flags, count := dest.intersectionFlags(res)
|
|
if count == 0 {
|
|
continue
|
|
}
|
|
intersection &= flags
|
|
if intersection&flagBits == 0 {
|
|
// Can't bind because the (addr,port) was
|
|
// previously bound without reuse.
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
intersection := FlagMask
|
|
|
|
if dests, ok := dd[0]; ok {
|
|
var count int
|
|
intersection, count = dests.intersectionFlags(res)
|
|
if count > 0 && intersection&flagBits == 0 {
|
|
return false
|
|
}
|
|
}
|
|
|
|
if dests, ok := dd[res.BindToDevice]; ok {
|
|
flags, count := dests.intersectionFlags(res)
|
|
intersection &= flags
|
|
if count > 0 && intersection&flagBits == 0 {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// addrToDevice maps IP addresses to NICs that have port reservations.
|
|
type addrToDevice map[tcpip.Address]deviceToDest
|
|
|
|
// isAvailable checks whether an IP address is available to bind to. If the
|
|
// address is the "any" address, check all other addresses. Otherwise, just
|
|
// check against the "any" address and the provided address.
|
|
func (ad addrToDevice) isAvailable(res Reservation) bool {
|
|
if res.Addr == anyIPAddress {
|
|
// If binding to the "any" address then check that there are no
|
|
// conflicts with all addresses.
|
|
for _, devices := range ad {
|
|
if !devices.isAvailable(res) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// Check that there is no conflict with the "any" address.
|
|
if devices, ok := ad[anyIPAddress]; ok {
|
|
if !devices.isAvailable(res) {
|
|
return false
|
|
}
|
|
}
|
|
|
|
// Check that this is no conflict with the provided address.
|
|
if devices, ok := ad[res.Addr]; ok {
|
|
if !devices.isAvailable(res) {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// PortManager manages allocating, reserving and releasing ports.
|
|
type PortManager struct {
|
|
// mu protects allocatedPorts.
|
|
// LOCK ORDERING: mu > ephemeralMu.
|
|
mu sync.RWMutex
|
|
// allocatedPorts is a nesting of maps that ultimately map Reservations
|
|
// to FlagCounters describing whether the Reservation is valid and can
|
|
// be reused.
|
|
allocatedPorts map[portDescriptor]addrToDevice
|
|
|
|
// ephemeralMu protects firstEphemeral and numEphemeral.
|
|
ephemeralMu sync.RWMutex
|
|
firstEphemeral uint16
|
|
numEphemeral uint16
|
|
|
|
// hint is used to pick ports ephemeral ports in a stable order for
|
|
// a given port offset.
|
|
//
|
|
// hint must be accessed using the portHint/incPortHint helpers.
|
|
// TODO(gvisor.dev/issue/940): S/R this field.
|
|
hint uint32
|
|
}
|
|
|
|
// NewPortManager creates new PortManager.
|
|
func NewPortManager() *PortManager {
|
|
return &PortManager{
|
|
allocatedPorts: make(map[portDescriptor]addrToDevice),
|
|
// Match Linux's default ephemeral range. See:
|
|
// https://github.com/torvalds/linux/blob/e54937963fa249595824439dc839c948188dea83/net/ipv4/af_inet.c#L1842
|
|
firstEphemeral: 32768,
|
|
numEphemeral: 28232,
|
|
}
|
|
}
|
|
|
|
// PortTester indicates whether the passed in port is suitable. Returning an
|
|
// error causes the function to which the PortTester is passed to return that
|
|
// error.
|
|
type PortTester func(port uint16) (good bool, err tcpip.Error)
|
|
|
|
// PickEphemeralPort randomly chooses a starting point and iterates over all
|
|
// possible ephemeral ports, allowing the caller to decide whether a given port
|
|
// is suitable for its needs, and stopping when a port is found or an error
|
|
// occurs.
|
|
func (pm *PortManager) PickEphemeralPort(testPort PortTester) (port uint16, err tcpip.Error) {
|
|
pm.ephemeralMu.RLock()
|
|
firstEphemeral := pm.firstEphemeral
|
|
numEphemeral := pm.numEphemeral
|
|
pm.ephemeralMu.RUnlock()
|
|
|
|
offset := uint16(rand.Int31n(int32(numEphemeral)))
|
|
return pickEphemeralPort(offset, firstEphemeral, numEphemeral, testPort)
|
|
}
|
|
|
|
// portHint atomically reads and returns the pm.hint value.
|
|
func (pm *PortManager) portHint() uint16 {
|
|
return uint16(atomic.LoadUint32(&pm.hint))
|
|
}
|
|
|
|
// incPortHint atomically increments pm.hint by 1.
|
|
func (pm *PortManager) incPortHint() {
|
|
atomic.AddUint32(&pm.hint, 1)
|
|
}
|
|
|
|
// PickEphemeralPortStable starts at the specified offset + pm.portHint and
|
|
// iterates over all ephemeral ports, allowing the caller to decide whether a
|
|
// given port is suitable for its needs and stopping when a port is found or an
|
|
// error occurs.
|
|
func (pm *PortManager) PickEphemeralPortStable(offset uint16, testPort PortTester) (port uint16, err tcpip.Error) {
|
|
pm.ephemeralMu.RLock()
|
|
firstEphemeral := pm.firstEphemeral
|
|
numEphemeral := pm.numEphemeral
|
|
pm.ephemeralMu.RUnlock()
|
|
|
|
p, err := pickEphemeralPort(pm.portHint()+offset, firstEphemeral, numEphemeral, testPort)
|
|
if err == nil {
|
|
pm.incPortHint()
|
|
}
|
|
return p, err
|
|
}
|
|
|
|
// pickEphemeralPort starts at the offset specified from the FirstEphemeral port
|
|
// and iterates over the number of ports specified by count and allows the
|
|
// caller to decide whether a given port is suitable for its needs, and stopping
|
|
// when a port is found or an error occurs.
|
|
func pickEphemeralPort(offset, first, count uint16, testPort PortTester) (port uint16, err tcpip.Error) {
|
|
for i := uint16(0); i < count; i++ {
|
|
port = first + (offset+i)%count
|
|
ok, err := testPort(port)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
if ok {
|
|
return port, nil
|
|
}
|
|
}
|
|
|
|
return 0, &tcpip.ErrNoPortAvailable{}
|
|
}
|
|
|
|
// ReservePort marks a port/IP combination as reserved so that it cannot be
|
|
// reserved by another endpoint. If port is zero, ReservePort will search for
|
|
// an unreserved ephemeral port and reserve it, returning its value in the
|
|
// "port" return value.
|
|
//
|
|
// An optional PortTester can be passed in which if provided will be used to
|
|
// test if the picked port can be used. The function should return true if the
|
|
// port is safe to use, false otherwise.
|
|
func (pm *PortManager) ReservePort(res Reservation, testPort PortTester) (reservedPort uint16, err tcpip.Error) {
|
|
pm.mu.Lock()
|
|
defer pm.mu.Unlock()
|
|
|
|
// If a port is specified, just try to reserve it for all network
|
|
// protocols.
|
|
if res.Port != 0 {
|
|
if !pm.reserveSpecificPortLocked(res) {
|
|
return 0, &tcpip.ErrPortInUse{}
|
|
}
|
|
if testPort != nil {
|
|
ok, err := testPort(res.Port)
|
|
if err != nil {
|
|
pm.releasePortLocked(res)
|
|
return 0, err
|
|
}
|
|
if !ok {
|
|
pm.releasePortLocked(res)
|
|
return 0, &tcpip.ErrPortInUse{}
|
|
}
|
|
}
|
|
return res.Port, nil
|
|
}
|
|
|
|
// A port wasn't specified, so try to find one.
|
|
return pm.PickEphemeralPort(func(p uint16) (bool, tcpip.Error) {
|
|
res.Port = p
|
|
if !pm.reserveSpecificPortLocked(res) {
|
|
return false, nil
|
|
}
|
|
if testPort != nil {
|
|
ok, err := testPort(p)
|
|
if err != nil {
|
|
pm.releasePortLocked(res)
|
|
return false, err
|
|
}
|
|
if !ok {
|
|
pm.releasePortLocked(res)
|
|
return false, nil
|
|
}
|
|
}
|
|
return true, nil
|
|
})
|
|
}
|
|
|
|
// reserveSpecificPortLocked tries to reserve the given port on all given
|
|
// protocols.
|
|
func (pm *PortManager) reserveSpecificPortLocked(res Reservation) bool {
|
|
// Make sure the port is available.
|
|
for _, network := range res.Networks {
|
|
desc := portDescriptor{network, res.Transport, res.Port}
|
|
if addrs, ok := pm.allocatedPorts[desc]; ok {
|
|
if !addrs.isAvailable(res) {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
|
|
// Reserve port on all network protocols.
|
|
flagBits := res.Flags.Bits()
|
|
dst := res.dst()
|
|
for _, network := range res.Networks {
|
|
desc := portDescriptor{network, res.Transport, res.Port}
|
|
addrToDev, ok := pm.allocatedPorts[desc]
|
|
if !ok {
|
|
addrToDev = make(addrToDevice)
|
|
pm.allocatedPorts[desc] = addrToDev
|
|
}
|
|
devToDest, ok := addrToDev[res.Addr]
|
|
if !ok {
|
|
devToDest = make(deviceToDest)
|
|
addrToDev[res.Addr] = devToDest
|
|
}
|
|
destToCntr := devToDest[res.BindToDevice]
|
|
if destToCntr == nil {
|
|
destToCntr = make(destToCounter)
|
|
}
|
|
counter := destToCntr[dst]
|
|
counter.AddRef(flagBits)
|
|
destToCntr[dst] = counter
|
|
devToDest[res.BindToDevice] = destToCntr
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// ReserveTuple adds a port reservation for the tuple on all given protocol.
|
|
func (pm *PortManager) ReserveTuple(res Reservation) bool {
|
|
flagBits := res.Flags.Bits()
|
|
dst := res.dst()
|
|
|
|
pm.mu.Lock()
|
|
defer pm.mu.Unlock()
|
|
|
|
// It is easier to undo the entire reservation, so if we find that the
|
|
// tuple can't be fully added, finish and undo the whole thing.
|
|
undo := false
|
|
|
|
// Reserve port on all network protocols.
|
|
for _, network := range res.Networks {
|
|
desc := portDescriptor{network, res.Transport, res.Port}
|
|
addrToDev, ok := pm.allocatedPorts[desc]
|
|
if !ok {
|
|
addrToDev = make(addrToDevice)
|
|
pm.allocatedPorts[desc] = addrToDev
|
|
}
|
|
devToDest, ok := addrToDev[res.Addr]
|
|
if !ok {
|
|
devToDest = make(deviceToDest)
|
|
addrToDev[res.Addr] = devToDest
|
|
}
|
|
destToCntr := devToDest[res.BindToDevice]
|
|
if destToCntr == nil {
|
|
destToCntr = make(destToCounter)
|
|
}
|
|
|
|
counter := destToCntr[dst]
|
|
if counter.TotalRefs() != 0 && counter.SharedFlags()&flagBits == 0 {
|
|
// Tuple already exists.
|
|
undo = true
|
|
}
|
|
counter.AddRef(flagBits)
|
|
destToCntr[dst] = counter
|
|
devToDest[res.BindToDevice] = destToCntr
|
|
}
|
|
|
|
if undo {
|
|
// releasePortLocked decrements the counts (rather than setting
|
|
// them to zero), so it will undo the incorrect incrementing
|
|
// above.
|
|
pm.releasePortLocked(res)
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// ReleasePort releases the reservation on a port/IP combination so that it can
|
|
// be reserved by other endpoints.
|
|
func (pm *PortManager) ReleasePort(res Reservation) {
|
|
pm.mu.Lock()
|
|
defer pm.mu.Unlock()
|
|
|
|
pm.releasePortLocked(res)
|
|
}
|
|
|
|
func (pm *PortManager) releasePortLocked(res Reservation) {
|
|
dst := res.dst()
|
|
for _, network := range res.Networks {
|
|
desc := portDescriptor{network, res.Transport, res.Port}
|
|
addrToDev, ok := pm.allocatedPorts[desc]
|
|
if !ok {
|
|
continue
|
|
}
|
|
devToDest, ok := addrToDev[res.Addr]
|
|
if !ok {
|
|
continue
|
|
}
|
|
destToCounter, ok := devToDest[res.BindToDevice]
|
|
if !ok {
|
|
continue
|
|
}
|
|
counter, ok := destToCounter[dst]
|
|
if !ok {
|
|
continue
|
|
}
|
|
counter.DropRef(res.Flags.Bits())
|
|
if counter.TotalRefs() > 0 {
|
|
destToCounter[dst] = counter
|
|
continue
|
|
}
|
|
delete(destToCounter, dst)
|
|
if len(destToCounter) > 0 {
|
|
continue
|
|
}
|
|
delete(devToDest, res.BindToDevice)
|
|
if len(devToDest) > 0 {
|
|
continue
|
|
}
|
|
delete(addrToDev, res.Addr)
|
|
if len(addrToDev) > 0 {
|
|
continue
|
|
}
|
|
delete(pm.allocatedPorts, desc)
|
|
}
|
|
}
|
|
|
|
// PortRange returns the UDP and TCP inclusive range of ephemeral ports used in
|
|
// both IPv4 and IPv6.
|
|
func (pm *PortManager) PortRange() (uint16, uint16) {
|
|
pm.ephemeralMu.RLock()
|
|
defer pm.ephemeralMu.RUnlock()
|
|
return pm.firstEphemeral, pm.firstEphemeral + pm.numEphemeral - 1
|
|
}
|
|
|
|
// SetPortRange sets the UDP and TCP IPv4 and IPv6 ephemeral port range
|
|
// (inclusive).
|
|
func (pm *PortManager) SetPortRange(start uint16, end uint16) tcpip.Error {
|
|
if start > end {
|
|
return &tcpip.ErrInvalidPortRange{}
|
|
}
|
|
pm.ephemeralMu.Lock()
|
|
defer pm.ephemeralMu.Unlock()
|
|
pm.firstEphemeral = start
|
|
pm.numEphemeral = end - start + 1
|
|
return nil
|
|
}
|