garbage collect connections
As in Linux, we must periodically clean up unused connections. PiperOrigin-RevId: 321003353
This commit is contained in:
parent
76b392bc26
commit
43c209f48e
|
@ -27,6 +27,18 @@ go_template_instance(
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
go_template_instance(
|
||||||
|
name = "tuple_list",
|
||||||
|
out = "tuple_list.go",
|
||||||
|
package = "stack",
|
||||||
|
prefix = "tuple",
|
||||||
|
template = "//pkg/ilist:generic_list",
|
||||||
|
types = {
|
||||||
|
"Element": "*tuple",
|
||||||
|
"Linker": "*tuple",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
go_library(
|
go_library(
|
||||||
name = "stack",
|
name = "stack",
|
||||||
srcs = [
|
srcs = [
|
||||||
|
@ -35,6 +47,7 @@ go_library(
|
||||||
"forwarder.go",
|
"forwarder.go",
|
||||||
"icmp_rate_limit.go",
|
"icmp_rate_limit.go",
|
||||||
"iptables.go",
|
"iptables.go",
|
||||||
|
"iptables_state.go",
|
||||||
"iptables_targets.go",
|
"iptables_targets.go",
|
||||||
"iptables_types.go",
|
"iptables_types.go",
|
||||||
"linkaddrcache.go",
|
"linkaddrcache.go",
|
||||||
|
@ -50,6 +63,7 @@ go_library(
|
||||||
"stack_global_state.go",
|
"stack_global_state.go",
|
||||||
"stack_options.go",
|
"stack_options.go",
|
||||||
"transport_demuxer.go",
|
"transport_demuxer.go",
|
||||||
|
"tuple_list.go",
|
||||||
],
|
],
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
|
|
|
@ -15,9 +15,12 @@
|
||||||
package stack
|
package stack
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/binary"
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"gvisor.dev/gvisor/pkg/tcpip"
|
"gvisor.dev/gvisor/pkg/tcpip"
|
||||||
|
"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
|
||||||
"gvisor.dev/gvisor/pkg/tcpip/header"
|
"gvisor.dev/gvisor/pkg/tcpip/header"
|
||||||
"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
|
"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
|
||||||
)
|
)
|
||||||
|
@ -30,6 +33,10 @@ import (
|
||||||
//
|
//
|
||||||
// Currently, only TCP tracking is supported.
|
// Currently, only TCP tracking is supported.
|
||||||
|
|
||||||
|
// Our hash table has 16K buckets.
|
||||||
|
// TODO(gvisor.dev/issue/170): These should be tunable.
|
||||||
|
const numBuckets = 1 << 14
|
||||||
|
|
||||||
// Direction of the tuple.
|
// Direction of the tuple.
|
||||||
type direction int
|
type direction int
|
||||||
|
|
||||||
|
@ -48,7 +55,12 @@ const (
|
||||||
|
|
||||||
// tuple holds a connection's identifying and manipulating data in one
|
// tuple holds a connection's identifying and manipulating data in one
|
||||||
// direction. It is immutable.
|
// direction. It is immutable.
|
||||||
|
//
|
||||||
|
// +stateify savable
|
||||||
type tuple struct {
|
type tuple struct {
|
||||||
|
// tupleEntry is used to build an intrusive list of tuples.
|
||||||
|
tupleEntry
|
||||||
|
|
||||||
tupleID
|
tupleID
|
||||||
|
|
||||||
// conn is the connection tracking entry this tuple belongs to.
|
// conn is the connection tracking entry this tuple belongs to.
|
||||||
|
@ -61,6 +73,8 @@ type tuple struct {
|
||||||
// tupleID uniquely identifies a connection in one direction. It currently
|
// tupleID uniquely identifies a connection in one direction. It currently
|
||||||
// contains enough information to distinguish between any TCP or UDP
|
// contains enough information to distinguish between any TCP or UDP
|
||||||
// connection, and will need to be extended to support other protocols.
|
// connection, and will need to be extended to support other protocols.
|
||||||
|
//
|
||||||
|
// +stateify savable
|
||||||
type tupleID struct {
|
type tupleID struct {
|
||||||
srcAddr tcpip.Address
|
srcAddr tcpip.Address
|
||||||
srcPort uint16
|
srcPort uint16
|
||||||
|
@ -83,6 +97,8 @@ func (ti tupleID) reply() tupleID {
|
||||||
}
|
}
|
||||||
|
|
||||||
// conn is a tracked connection.
|
// conn is a tracked connection.
|
||||||
|
//
|
||||||
|
// +stateify savable
|
||||||
type conn struct {
|
type conn struct {
|
||||||
// original is the tuple in original direction. It is immutable.
|
// original is the tuple in original direction. It is immutable.
|
||||||
original tuple
|
original tuple
|
||||||
|
@ -98,22 +114,67 @@ type conn struct {
|
||||||
tcbHook Hook
|
tcbHook Hook
|
||||||
|
|
||||||
// mu protects tcb.
|
// mu protects tcb.
|
||||||
mu sync.Mutex
|
mu sync.Mutex `state:"nosave"`
|
||||||
|
|
||||||
// tcb is TCB control block. It is used to keep track of states
|
// tcb is TCB control block. It is used to keep track of states
|
||||||
// of tcp connection and is protected by mu.
|
// of tcp connection and is protected by mu.
|
||||||
tcb tcpconntrack.TCB
|
tcb tcpconntrack.TCB
|
||||||
|
|
||||||
|
// lastUsed is the last time the connection saw a relevant packet, and
|
||||||
|
// is updated by each packet on the connection. It is protected by mu.
|
||||||
|
lastUsed time.Time `state:".(unixTime)"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// timedOut returns whether the connection timed out based on its state.
|
||||||
|
func (cn *conn) timedOut(now time.Time) bool {
|
||||||
|
const establishedTimeout = 5 * 24 * time.Hour
|
||||||
|
const defaultTimeout = 120 * time.Second
|
||||||
|
cn.mu.Lock()
|
||||||
|
defer cn.mu.Unlock()
|
||||||
|
if cn.tcb.State() == tcpconntrack.ResultAlive {
|
||||||
|
// Use the same default as Linux, which doesn't delete
|
||||||
|
// established connections for 5(!) days.
|
||||||
|
return now.Sub(cn.lastUsed) > establishedTimeout
|
||||||
|
}
|
||||||
|
// Use the same default as Linux, which lets connections in most states
|
||||||
|
// other than established remain for <= 120 seconds.
|
||||||
|
return now.Sub(cn.lastUsed) > defaultTimeout
|
||||||
}
|
}
|
||||||
|
|
||||||
// ConnTrack tracks all connections created for NAT rules. Most users are
|
// ConnTrack tracks all connections created for NAT rules. Most users are
|
||||||
// expected to only call handlePacket and createConnFor.
|
// expected to only call handlePacket and createConnFor.
|
||||||
|
//
|
||||||
|
// ConnTrack keeps all connections in a slice of buckets, each of which holds a
|
||||||
|
// linked list of tuples. This gives us some desirable properties:
|
||||||
|
// - Each bucket has its own lock, lessening lock contention.
|
||||||
|
// - The slice is large enough that lists stay short (<10 elements on average).
|
||||||
|
// Thus traversal is fast.
|
||||||
|
// - During linked list traversal we reap expired connections. This amortizes
|
||||||
|
// the cost of reaping them and makes reapUnused faster.
|
||||||
|
//
|
||||||
|
// Locks are ordered by their location in the buckets slice. That is, a
|
||||||
|
// goroutine that locks buckets[i] can only lock buckets[j] s.t. i < j.
|
||||||
|
//
|
||||||
|
// +stateify savable
|
||||||
type ConnTrack struct {
|
type ConnTrack struct {
|
||||||
// mu protects conns.
|
// seed is a one-time random value initialized at stack startup
|
||||||
mu sync.RWMutex
|
// and is used in the calculation of hash keys for the list of buckets.
|
||||||
|
// It is immutable.
|
||||||
|
seed uint32
|
||||||
|
|
||||||
// conns maintains a map of tuples needed for connection tracking for
|
// mu protects the buckets slice, but not buckets' contents. Only take
|
||||||
// iptables NAT rules. It is protected by mu.
|
// the write lock if you are modifying the slice or saving for S/R.
|
||||||
conns map[tupleID]tuple
|
mu sync.RWMutex `state:"nosave"`
|
||||||
|
|
||||||
|
// buckets is protected by mu.
|
||||||
|
buckets []bucket
|
||||||
|
}
|
||||||
|
|
||||||
|
// +stateify savable
|
||||||
|
type bucket struct {
|
||||||
|
// mu protects tuples.
|
||||||
|
mu sync.Mutex `state:"nosave"`
|
||||||
|
tuples tupleList
|
||||||
}
|
}
|
||||||
|
|
||||||
// packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid
|
// packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid
|
||||||
|
@ -143,8 +204,9 @@ func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) {
|
||||||
// newConn creates new connection.
|
// newConn creates new connection.
|
||||||
func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn {
|
func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn {
|
||||||
conn := conn{
|
conn := conn{
|
||||||
manip: manip,
|
manip: manip,
|
||||||
tcbHook: hook,
|
tcbHook: hook,
|
||||||
|
lastUsed: time.Now(),
|
||||||
}
|
}
|
||||||
conn.original = tuple{conn: &conn, tupleID: orig}
|
conn.original = tuple{conn: &conn, tupleID: orig}
|
||||||
conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply}
|
conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply}
|
||||||
|
@ -162,14 +224,28 @@ func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) {
|
||||||
return nil, dirOriginal
|
return nil, dirOriginal
|
||||||
}
|
}
|
||||||
|
|
||||||
ct.mu.Lock()
|
bucket := ct.bucket(tid)
|
||||||
defer ct.mu.Unlock()
|
now := time.Now()
|
||||||
|
|
||||||
tuple, ok := ct.conns[tid]
|
ct.mu.RLock()
|
||||||
if !ok {
|
defer ct.mu.RUnlock()
|
||||||
return nil, dirOriginal
|
ct.buckets[bucket].mu.Lock()
|
||||||
|
defer ct.buckets[bucket].mu.Unlock()
|
||||||
|
|
||||||
|
// Iterate over the tuples in a bucket, cleaning up any unused
|
||||||
|
// connections we find.
|
||||||
|
for other := ct.buckets[bucket].tuples.Front(); other != nil; other = other.Next() {
|
||||||
|
// Clean up any timed-out connections we happen to find.
|
||||||
|
if ct.reapTupleLocked(other, bucket, now) {
|
||||||
|
// The tuple expired.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if tid == other.tupleID {
|
||||||
|
return other.conn, other.direction
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return tuple.conn, tuple.direction
|
|
||||||
|
return nil, dirOriginal
|
||||||
}
|
}
|
||||||
|
|
||||||
// createConnFor creates a new conn for pkt.
|
// createConnFor creates a new conn for pkt.
|
||||||
|
@ -197,13 +273,31 @@ func (ct *ConnTrack) createConnFor(pkt *PacketBuffer, hook Hook, rt RedirectTarg
|
||||||
}
|
}
|
||||||
conn := newConn(tid, replyTID, manip, hook)
|
conn := newConn(tid, replyTID, manip, hook)
|
||||||
|
|
||||||
// Add the changed tuple to the map.
|
// Lock the buckets in the correct order.
|
||||||
// TODO(gvisor.dev/issue/170): Need to support collisions using linked
|
tupleBucket := ct.bucket(tid)
|
||||||
// list.
|
replyBucket := ct.bucket(replyTID)
|
||||||
ct.mu.Lock()
|
ct.mu.RLock()
|
||||||
defer ct.mu.Unlock()
|
defer ct.mu.RUnlock()
|
||||||
ct.conns[tid] = conn.original
|
if tupleBucket < replyBucket {
|
||||||
ct.conns[replyTID] = conn.reply
|
ct.buckets[tupleBucket].mu.Lock()
|
||||||
|
ct.buckets[replyBucket].mu.Lock()
|
||||||
|
} else if tupleBucket > replyBucket {
|
||||||
|
ct.buckets[replyBucket].mu.Lock()
|
||||||
|
ct.buckets[tupleBucket].mu.Lock()
|
||||||
|
} else {
|
||||||
|
// Both tuples are in the same bucket.
|
||||||
|
ct.buckets[tupleBucket].mu.Lock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add the tuple to the map.
|
||||||
|
ct.buckets[tupleBucket].tuples.PushFront(&conn.original)
|
||||||
|
ct.buckets[replyBucket].tuples.PushFront(&conn.reply)
|
||||||
|
|
||||||
|
// Unlocking can happen in any order.
|
||||||
|
ct.buckets[tupleBucket].mu.Unlock()
|
||||||
|
if tupleBucket != replyBucket {
|
||||||
|
ct.buckets[replyBucket].mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
return conn
|
return conn
|
||||||
}
|
}
|
||||||
|
@ -297,35 +391,134 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou
|
||||||
// other tcp states.
|
// other tcp states.
|
||||||
conn.mu.Lock()
|
conn.mu.Lock()
|
||||||
defer conn.mu.Unlock()
|
defer conn.mu.Unlock()
|
||||||
var st tcpconntrack.Result
|
|
||||||
tcpHeader := header.TCP(pkt.TransportHeader)
|
// Mark the connection as having been used recently so it isn't reaped.
|
||||||
if conn.tcb.IsEmpty() {
|
conn.lastUsed = time.Now()
|
||||||
|
// Update connection state.
|
||||||
|
if tcpHeader := header.TCP(pkt.TransportHeader); conn.tcb.IsEmpty() {
|
||||||
conn.tcb.Init(tcpHeader)
|
conn.tcb.Init(tcpHeader)
|
||||||
conn.tcbHook = hook
|
conn.tcbHook = hook
|
||||||
|
} else if hook == conn.tcbHook {
|
||||||
|
conn.tcb.UpdateStateOutbound(tcpHeader)
|
||||||
} else {
|
} else {
|
||||||
switch hook {
|
conn.tcb.UpdateStateInbound(tcpHeader)
|
||||||
case conn.tcbHook:
|
}
|
||||||
st = conn.tcb.UpdateStateOutbound(tcpHeader)
|
}
|
||||||
default:
|
|
||||||
st = conn.tcb.UpdateStateInbound(tcpHeader)
|
// bucket gets the conntrack bucket for a tupleID.
|
||||||
|
func (ct *ConnTrack) bucket(id tupleID) int {
|
||||||
|
h := jenkins.Sum32(ct.seed)
|
||||||
|
h.Write([]byte(id.srcAddr))
|
||||||
|
h.Write([]byte(id.dstAddr))
|
||||||
|
shortBuf := make([]byte, 2)
|
||||||
|
binary.LittleEndian.PutUint16(shortBuf, id.srcPort)
|
||||||
|
h.Write([]byte(shortBuf))
|
||||||
|
binary.LittleEndian.PutUint16(shortBuf, id.dstPort)
|
||||||
|
h.Write([]byte(shortBuf))
|
||||||
|
binary.LittleEndian.PutUint16(shortBuf, uint16(id.transProto))
|
||||||
|
h.Write([]byte(shortBuf))
|
||||||
|
binary.LittleEndian.PutUint16(shortBuf, uint16(id.netProto))
|
||||||
|
h.Write([]byte(shortBuf))
|
||||||
|
ct.mu.RLock()
|
||||||
|
defer ct.mu.RUnlock()
|
||||||
|
return int(h.Sum32()) % len(ct.buckets)
|
||||||
|
}
|
||||||
|
|
||||||
|
// reapUnused deletes timed out entries from the conntrack map. The rules for
|
||||||
|
// reaping are:
|
||||||
|
// - Most reaping occurs in connFor, which is called on each packet. connFor
|
||||||
|
// cleans up the bucket the packet's connection maps to. Thus calls to
|
||||||
|
// reapUnused should be fast.
|
||||||
|
// - Each call to reapUnused traverses a fraction of the conntrack table.
|
||||||
|
// Specifically, it traverses len(ct.buckets)/fractionPerReaping.
|
||||||
|
// - After reaping, reapUnused decides when it should next run based on the
|
||||||
|
// ratio of expired connections to examined connections. If the ratio is
|
||||||
|
// greater than maxExpiredPct, it schedules the next run quickly. Otherwise it
|
||||||
|
// slightly increases the interval between runs.
|
||||||
|
// - maxFullTraversal caps the time it takes to traverse the entire table.
|
||||||
|
//
|
||||||
|
// reapUnused returns the next bucket that should be checked and the time after
|
||||||
|
// which it should be called again.
|
||||||
|
func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, time.Duration) {
|
||||||
|
// TODO(gvisor.dev/issue/170): This can be more finely controlled, as
|
||||||
|
// it is in Linux via sysctl.
|
||||||
|
const fractionPerReaping = 128
|
||||||
|
const maxExpiredPct = 50
|
||||||
|
const maxFullTraversal = 60 * time.Second
|
||||||
|
const minInterval = 10 * time.Millisecond
|
||||||
|
const maxInterval = maxFullTraversal / fractionPerReaping
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
checked := 0
|
||||||
|
expired := 0
|
||||||
|
var idx int
|
||||||
|
ct.mu.RLock()
|
||||||
|
defer ct.mu.RUnlock()
|
||||||
|
for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ {
|
||||||
|
idx = (i + start) % len(ct.buckets)
|
||||||
|
ct.buckets[idx].mu.Lock()
|
||||||
|
for tuple := ct.buckets[idx].tuples.Front(); tuple != nil; tuple = tuple.Next() {
|
||||||
|
checked++
|
||||||
|
if ct.reapTupleLocked(tuple, idx, now) {
|
||||||
|
expired++
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
ct.buckets[idx].mu.Unlock()
|
||||||
}
|
}
|
||||||
|
// We already checked buckets[idx].
|
||||||
|
idx++
|
||||||
|
|
||||||
// Delete conn if tcp connection is closed.
|
// If half or more of the connections are expired, the table has gotten
|
||||||
if st == tcpconntrack.ResultClosedByPeer || st == tcpconntrack.ResultClosedBySelf || st == tcpconntrack.ResultReset {
|
// stale. Reschedule quickly.
|
||||||
ct.deleteConn(conn)
|
expiredPct := 0
|
||||||
|
if checked != 0 {
|
||||||
|
expiredPct = expired * 100 / checked
|
||||||
}
|
}
|
||||||
|
if expiredPct > maxExpiredPct {
|
||||||
|
return idx, minInterval
|
||||||
|
}
|
||||||
|
if interval := prevInterval + minInterval; interval <= maxInterval {
|
||||||
|
// Increment the interval between runs.
|
||||||
|
return idx, interval
|
||||||
|
}
|
||||||
|
// We've hit the maximum interval.
|
||||||
|
return idx, maxInterval
|
||||||
}
|
}
|
||||||
|
|
||||||
// deleteConn deletes the connection.
|
// reapTupleLocked tries to remove tuple and its reply from the table. It
|
||||||
func (ct *ConnTrack) deleteConn(conn *conn) {
|
// returns whether the tuple's connection has timed out.
|
||||||
if conn == nil {
|
//
|
||||||
return
|
// Preconditions: ct.mu is locked for reading and bucket is locked.
|
||||||
|
func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool {
|
||||||
|
if !tuple.conn.timedOut(now) {
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
ct.mu.Lock()
|
// To maintain lock order, we can only reap these tuples if the reply
|
||||||
defer ct.mu.Unlock()
|
// appears later in the table.
|
||||||
|
replyBucket := ct.bucket(tuple.reply())
|
||||||
|
if bucket > replyBucket {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
delete(ct.conns, conn.original.tupleID)
|
// Don't re-lock if both tuples are in the same bucket.
|
||||||
delete(ct.conns, conn.reply.tupleID)
|
differentBuckets := bucket != replyBucket
|
||||||
|
if differentBuckets {
|
||||||
|
ct.buckets[replyBucket].mu.Lock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// We have the buckets locked and can remove both tuples.
|
||||||
|
if tuple.direction == dirOriginal {
|
||||||
|
ct.buckets[replyBucket].tuples.Remove(&tuple.conn.reply)
|
||||||
|
} else {
|
||||||
|
ct.buckets[replyBucket].tuples.Remove(&tuple.conn.original)
|
||||||
|
}
|
||||||
|
ct.buckets[bucket].tuples.Remove(tuple)
|
||||||
|
|
||||||
|
// Don't re-unlock if both tuples are in the same bucket.
|
||||||
|
if differentBuckets {
|
||||||
|
ct.buckets[replyBucket].mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
return true
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,7 @@ package stack
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
"gvisor.dev/gvisor/pkg/tcpip"
|
"gvisor.dev/gvisor/pkg/tcpip"
|
||||||
"gvisor.dev/gvisor/pkg/tcpip/header"
|
"gvisor.dev/gvisor/pkg/tcpip/header"
|
||||||
|
@ -41,6 +42,9 @@ const (
|
||||||
// underflow.
|
// underflow.
|
||||||
const HookUnset = -1
|
const HookUnset = -1
|
||||||
|
|
||||||
|
// reaperDelay is how long to wait before starting to reap connections.
|
||||||
|
const reaperDelay = 5 * time.Second
|
||||||
|
|
||||||
// DefaultTables returns a default set of tables. Each chain is set to accept
|
// DefaultTables returns a default set of tables. Each chain is set to accept
|
||||||
// all packets.
|
// all packets.
|
||||||
func DefaultTables() *IPTables {
|
func DefaultTables() *IPTables {
|
||||||
|
@ -112,8 +116,9 @@ func DefaultTables() *IPTables {
|
||||||
Output: []string{TablenameMangle, TablenameNat, TablenameFilter},
|
Output: []string{TablenameMangle, TablenameNat, TablenameFilter},
|
||||||
},
|
},
|
||||||
connections: ConnTrack{
|
connections: ConnTrack{
|
||||||
conns: make(map[tupleID]tuple),
|
seed: generateRandUint32(),
|
||||||
},
|
},
|
||||||
|
reaperDone: make(chan struct{}, 1),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -169,6 +174,12 @@ func (it *IPTables) GetTable(name string) (Table, bool) {
|
||||||
func (it *IPTables) ReplaceTable(name string, table Table) {
|
func (it *IPTables) ReplaceTable(name string, table Table) {
|
||||||
it.mu.Lock()
|
it.mu.Lock()
|
||||||
defer it.mu.Unlock()
|
defer it.mu.Unlock()
|
||||||
|
// If iptables is being enabled, initialize the conntrack table and
|
||||||
|
// reaper.
|
||||||
|
if !it.modified {
|
||||||
|
it.connections.buckets = make([]bucket, numBuckets)
|
||||||
|
it.startReaper(reaperDelay)
|
||||||
|
}
|
||||||
it.modified = true
|
it.modified = true
|
||||||
it.tables[name] = table
|
it.tables[name] = table
|
||||||
}
|
}
|
||||||
|
@ -249,6 +260,35 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// beforeSave is invoked by stateify.
|
||||||
|
func (it *IPTables) beforeSave() {
|
||||||
|
// Ensure the reaper exits cleanly.
|
||||||
|
it.reaperDone <- struct{}{}
|
||||||
|
// Prevent others from modifying the connection table.
|
||||||
|
it.connections.mu.Lock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// afterLoad is invoked by stateify.
|
||||||
|
func (it *IPTables) afterLoad() {
|
||||||
|
it.startReaper(reaperDelay)
|
||||||
|
}
|
||||||
|
|
||||||
|
// startReaper starts a goroutine that wakes up periodically to reap timed out
|
||||||
|
// connections.
|
||||||
|
func (it *IPTables) startReaper(interval time.Duration) {
|
||||||
|
go func() { // S/R-SAFE: reaperDone is signalled when iptables is saved.
|
||||||
|
bucket := 0
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-it.reaperDone:
|
||||||
|
return
|
||||||
|
case <-time.After(interval):
|
||||||
|
bucket, interval = it.connections.reapUnused(bucket, interval)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
// CheckPackets runs pkts through the rules for hook and returns a map of packets that
|
// CheckPackets runs pkts through the rules for hook and returns a map of packets that
|
||||||
// should not go forward.
|
// should not go forward.
|
||||||
//
|
//
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
// Copyright 2020 The gVisor Authors.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package stack
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// +stateify savable
|
||||||
|
type unixTime struct {
|
||||||
|
second int64
|
||||||
|
nano int64
|
||||||
|
}
|
||||||
|
|
||||||
|
// saveLastUsed is invoked by stateify.
|
||||||
|
func (cn *conn) saveLastUsed() unixTime {
|
||||||
|
return unixTime{cn.lastUsed.Unix(), cn.lastUsed.UnixNano()}
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadLastUsed is invoked by stateify.
|
||||||
|
func (cn *conn) loadLastUsed(unix unixTime) {
|
||||||
|
cn.lastUsed = time.Unix(unix.second, unix.nano)
|
||||||
|
}
|
||||||
|
|
||||||
|
// beforeSave is invoked by stateify.
|
||||||
|
func (ct *ConnTrack) beforeSave() {
|
||||||
|
ct.mu.Lock()
|
||||||
|
}
|
|
@ -78,6 +78,8 @@ const (
|
||||||
)
|
)
|
||||||
|
|
||||||
// IPTables holds all the tables for a netstack.
|
// IPTables holds all the tables for a netstack.
|
||||||
|
//
|
||||||
|
// +stateify savable
|
||||||
type IPTables struct {
|
type IPTables struct {
|
||||||
// mu protects tables, priorities, and modified.
|
// mu protects tables, priorities, and modified.
|
||||||
mu sync.RWMutex
|
mu sync.RWMutex
|
||||||
|
@ -97,10 +99,15 @@ type IPTables struct {
|
||||||
modified bool
|
modified bool
|
||||||
|
|
||||||
connections ConnTrack
|
connections ConnTrack
|
||||||
|
|
||||||
|
// reaperDone can be signalled to stop the reaper goroutine.
|
||||||
|
reaperDone chan struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// A Table defines a set of chains and hooks into the network stack. It is
|
// A Table defines a set of chains and hooks into the network stack. It is
|
||||||
// really just a list of rules.
|
// really just a list of rules.
|
||||||
|
//
|
||||||
|
// +stateify savable
|
||||||
type Table struct {
|
type Table struct {
|
||||||
// Rules holds the rules that make up the table.
|
// Rules holds the rules that make up the table.
|
||||||
Rules []Rule
|
Rules []Rule
|
||||||
|
@ -130,6 +137,8 @@ func (table *Table) ValidHooks() uint32 {
|
||||||
// contains zero or more matchers, each of which is a specification of which
|
// contains zero or more matchers, each of which is a specification of which
|
||||||
// packets this rule applies to. If there are no matchers in the rule, it
|
// packets this rule applies to. If there are no matchers in the rule, it
|
||||||
// applies to any packet.
|
// applies to any packet.
|
||||||
|
//
|
||||||
|
// +stateify savable
|
||||||
type Rule struct {
|
type Rule struct {
|
||||||
// Filter holds basic IP filtering fields common to every rule.
|
// Filter holds basic IP filtering fields common to every rule.
|
||||||
Filter IPHeaderFilter
|
Filter IPHeaderFilter
|
||||||
|
@ -142,6 +151,8 @@ type Rule struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
// IPHeaderFilter holds basic IP filtering data common to every rule.
|
// IPHeaderFilter holds basic IP filtering data common to every rule.
|
||||||
|
//
|
||||||
|
// +stateify savable
|
||||||
type IPHeaderFilter struct {
|
type IPHeaderFilter struct {
|
||||||
// Protocol matches the transport protocol.
|
// Protocol matches the transport protocol.
|
||||||
Protocol tcpip.TransportProtocolNumber
|
Protocol tcpip.TransportProtocolNumber
|
||||||
|
|
|
@ -425,6 +425,7 @@ type Stack struct {
|
||||||
handleLocal bool
|
handleLocal bool
|
||||||
|
|
||||||
// tables are the iptables packet filtering and manipulation rules.
|
// tables are the iptables packet filtering and manipulation rules.
|
||||||
|
// TODO(gvisor.dev/issue/170): S/R this field.
|
||||||
tables *IPTables
|
tables *IPTables
|
||||||
|
|
||||||
// resumableEndpoints is a list of endpoints that need to be resumed if the
|
// resumableEndpoints is a list of endpoints that need to be resumed if the
|
||||||
|
|
|
@ -106,6 +106,11 @@ func (t *TCB) UpdateStateOutbound(tcp header.TCP) Result {
|
||||||
return st
|
return st
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// State returns the current state of the TCB.
|
||||||
|
func (t *TCB) State() Result {
|
||||||
|
return t.state
|
||||||
|
}
|
||||||
|
|
||||||
// IsAlive returns true as long as the connection is established(Alive)
|
// IsAlive returns true as long as the connection is established(Alive)
|
||||||
// or connecting state.
|
// or connecting state.
|
||||||
func (t *TCB) IsAlive() bool {
|
func (t *TCB) IsAlive() bool {
|
||||||
|
|
Loading…
Reference in New Issue