Track number of packets queued to Failed neighbors

Add a NIC-specific neighbor table statistic so we can determine how many
packets have been queued to Failed neighbors, indicating an unhealthy local
network. This change assists us to debug in-field issues where subsequent
traffic to a neighbor fails.

Fixes #4819

PiperOrigin-RevId: 344131119
This commit is contained in:
Sam Balana 2020-11-24 14:20:15 -08:00 committed by gVisor bot
parent d492b21319
commit f90ab60a8a
5 changed files with 159 additions and 6 deletions

View File

@ -24,9 +24,16 @@ import (
const neighborCacheSize = 512 // max entries per interface
// NeighborStats holds metrics for the neighbor table.
type NeighborStats struct {
// FailedEntryLookups counts the number of lookups performed on an entry in
// Failed state.
FailedEntryLookups *tcpip.StatCounter
}
// neighborCache maps IP addresses to link addresses. It uses the Least
// Recently Used (LRU) eviction strategy to implement a bounded cache for
// dynmically acquired entries. It contains the state machine and configuration
// dynamically acquired entries. It contains the state machine and configuration
// for running Neighbor Unreachability Detection (NUD).
//
// There are two types of entries in the neighbor cache:

View File

@ -86,7 +86,8 @@ func newTestNeighborCache(nudDisp NUDDispatcher, config NUDConfigurations, clock
clock: clock,
nudDisp: nudDisp,
},
id: 1,
id: 1,
stats: makeNICStats(),
},
state: NewNUDState(config, rng),
cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize),

View File

@ -347,9 +347,10 @@ func (e *neighborEntry) handlePacketQueuedLocked(localAddr tcpip.Address) {
e.setStateLocked(Delay)
e.dispatchChangeEventLocked()
case Incomplete, Reachable, Delay, Probe, Static, Failed:
case Incomplete, Reachable, Delay, Probe, Static:
// Do nothing
case Failed:
e.nic.stats.Neighbor.FailedEntryLookups.Increment()
default:
panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State))
}

View File

@ -89,7 +89,7 @@ func eventDiffOptsWithSort() []cmp.Option {
// | Stale | Reachable | Solicited confirmation w/o address | Notify wakers | Changed |
// | Stale | Stale | Override confirmation | Update LinkAddr | Changed |
// | Stale | Stale | Probe w/ different address | Update LinkAddr | Changed |
// | Stale | Delay | Packet sent | | Changed |
// | Stale | Delay | Packet queued | | Changed |
// | Delay | Reachable | Upper-layer confirmation | | Changed |
// | Delay | Reachable | Solicited override confirmation | Update LinkAddr | Changed |
// | Delay | Reachable | Solicited confirmation w/o address | Notify wakers | Changed |
@ -101,6 +101,7 @@ func eventDiffOptsWithSort() []cmp.Option {
// | Probe | Stale | Probe or confirmation w/ different address | | Changed |
// | Probe | Probe | Retransmit timer expired | Send probe | Changed |
// | Probe | Failed | Max probes sent without reply | Notify wakers | Removed |
// | Failed | Failed | Packet queued | | |
// | Failed | | Unreachability timer expired | Delete entry | |
type testEntryEventType uint8
@ -228,6 +229,7 @@ func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *e
clock: clock,
nudDisp: &disp,
},
stats: makeNICStats(),
}
nic.networkEndpoints = map[tcpip.NetworkProtocolNumber]NetworkEndpoint{
header.IPv6ProtocolNumber: (&testIPv6Protocol{}).NewEndpoint(&nic, nil, nil, nil),
@ -3433,6 +3435,146 @@ func TestEntryProbeToFailed(t *testing.T) {
nudDisp.mu.Unlock()
}
func TestEntryFailedToFailed(t *testing.T) {
c := DefaultNUDConfigurations()
c.MaxMulticastProbes = 3
c.MaxUnicastProbes = 3
e, nudDisp, linkRes, clock := entryTestSetup(c)
// Verify the cache contains the entry.
if _, ok := e.nic.neigh.cache[entryTestAddr1]; !ok {
t.Errorf("expected entry %q to exist in the neighbor cache", entryTestAddr1)
}
// TODO(gvisor.dev/issue/4872): Use helper functions to start entry tests in
// their expected state.
e.mu.Lock()
e.handlePacketQueuedLocked(entryTestAddr2)
e.mu.Unlock()
runImmediatelyScheduledJobs(clock)
{
wantProbes := []entryTestProbeInfo{
{
RemoteAddress: entryTestAddr1,
LocalAddress: entryTestAddr2,
},
}
linkRes.mu.Lock()
diff := cmp.Diff(linkRes.probes, wantProbes)
linkRes.probes = nil
linkRes.mu.Unlock()
if diff != "" {
t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
}
}
e.mu.Lock()
e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
Solicited: false,
Override: false,
IsRouter: false,
})
e.handlePacketQueuedLocked(entryTestAddr2)
e.mu.Unlock()
waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes)
clock.Advance(waitFor)
{
wantProbes := []entryTestProbeInfo{
{
RemoteAddress: entryTestAddr1,
RemoteLinkAddress: entryTestLinkAddr1,
},
{
RemoteAddress: entryTestAddr1,
RemoteLinkAddress: entryTestLinkAddr1,
},
{
RemoteAddress: entryTestAddr1,
RemoteLinkAddress: entryTestLinkAddr1,
},
}
linkRes.mu.Lock()
diff := cmp.Diff(linkRes.probes, wantProbes)
linkRes.mu.Unlock()
if diff != "" {
t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
}
}
wantEvents := []testEntryEventInfo{
{
EventType: entryTestAdded,
NICID: entryTestNICID,
Entry: NeighborEntry{
Addr: entryTestAddr1,
LinkAddr: tcpip.LinkAddress(""),
State: Incomplete,
},
},
{
EventType: entryTestChanged,
NICID: entryTestNICID,
Entry: NeighborEntry{
Addr: entryTestAddr1,
LinkAddr: entryTestLinkAddr1,
State: Stale,
},
},
{
EventType: entryTestChanged,
NICID: entryTestNICID,
Entry: NeighborEntry{
Addr: entryTestAddr1,
LinkAddr: entryTestLinkAddr1,
State: Delay,
},
},
{
EventType: entryTestChanged,
NICID: entryTestNICID,
Entry: NeighborEntry{
Addr: entryTestAddr1,
LinkAddr: entryTestLinkAddr1,
State: Probe,
},
},
{
EventType: entryTestRemoved,
NICID: entryTestNICID,
Entry: NeighborEntry{
Addr: entryTestAddr1,
LinkAddr: entryTestLinkAddr1,
State: Probe,
},
},
}
nudDisp.mu.Lock()
if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
}
nudDisp.mu.Unlock()
failedLookups := e.nic.stats.Neighbor.FailedEntryLookups
if got := failedLookups.Value(); got != 0 {
t.Errorf("got Neighbor.FailedEntryLookups = %d, want = 0", got)
}
e.mu.Lock()
// Verify queuing a packet to the entry immediately fails.
e.handlePacketQueuedLocked(entryTestAddr2)
state := e.neigh.State
e.mu.Unlock()
if state != Failed {
t.Errorf("got e.neigh.State = %q, want = %q", state, Failed)
}
if got := failedLookups.Value(); got != 1 {
t.Errorf("got Neighbor.FailedEntryLookups = %d, want = 1", got)
}
}
func TestEntryFailedGetsDeleted(t *testing.T) {
c := DefaultNUDConfigurations()
c.MaxMulticastProbes = 3

View File

@ -60,12 +60,14 @@ type NIC struct {
}
}
// NICStats includes transmitted and received stats.
// NICStats hold statistics for a NIC.
type NICStats struct {
Tx DirectionStats
Rx DirectionStats
DisabledRx DirectionStats
Neighbor NeighborStats
}
func makeNICStats() NICStats {