531 lines
16 KiB
Go
531 lines
16 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package metric provides primitives for collecting metrics.
|
|
package metric
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"sort"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"google.golang.org/protobuf/types/known/timestamppb"
|
|
"gvisor.dev/gvisor/pkg/eventchannel"
|
|
"gvisor.dev/gvisor/pkg/log"
|
|
pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto"
|
|
"gvisor.dev/gvisor/pkg/sync"
|
|
)
|
|
|
|
var (
|
|
// ErrNameInUse indicates that another metric is already defined for
|
|
// the given name.
|
|
ErrNameInUse = errors.New("metric name already in use")
|
|
|
|
// ErrInitializationDone indicates that the caller tried to create a
|
|
// new metric after initialization.
|
|
ErrInitializationDone = errors.New("metric cannot be created after initialization is complete")
|
|
|
|
// WeirdnessMetric is a metric with fields created to track the number
|
|
// of weird occurrences such as time fallback, partial_result, vsyscall
|
|
// count, watchdog startup timeouts and stuck tasks.
|
|
WeirdnessMetric = MustCreateNewUint64Metric("/weirdness", true /* sync */, "Increment for weird occurrences of problems such as time fallback, partial result, vsyscalls invoked in the sandbox, watchdog startup timeouts and stuck tasks.",
|
|
Field{
|
|
name: "weirdness_type",
|
|
allowedValues: []string{"time_fallback", "partial_result", "vsyscall_count", "watchdog_stuck_startup", "watchdog_stuck_tasks"},
|
|
})
|
|
|
|
// SuspiciousOperationsMetric is a metric with fields created to detect
|
|
// operations such as opening an executable file to write from a gofer.
|
|
SuspiciousOperationsMetric = MustCreateNewUint64Metric("/suspicious_operations", true /* sync */, "Increment for suspicious operations such as opening an executable file to write from a gofer.",
|
|
Field{
|
|
name: "operation_type",
|
|
allowedValues: []string{"opened_write_execute_file"},
|
|
})
|
|
)
|
|
|
|
// InitStage names one stage of Sentry initialization. The set of known
// stages is declared alongside this type.
type InitStage string
|
|
|
|
// List of all Sentry initialization stages.
|
|
var (
|
|
InitRestoreConfig InitStage = "restore_config"
|
|
InitExecConfig InitStage = "exec_config"
|
|
InitRestore InitStage = "restore"
|
|
InitCreateProcess InitStage = "create_process"
|
|
InitTaskStart InitStage = "task_start"
|
|
|
|
// allStages is the list of allowed stages.
|
|
allStages = []InitStage{
|
|
InitRestoreConfig,
|
|
InitExecConfig,
|
|
InitRestore,
|
|
InitCreateProcess,
|
|
InitTaskStart,
|
|
}
|
|
)
|
|
|
|
// Uint64Metric is a monitored uint64 value. A metric has either no field
// (a single counter held in value) or exactly one field (one counter per
// allowed field value, held in fields).
//
// Metrics are not saved across save/restore and thus reset to zero on restore.
//
// TODO(b/67298427): Support metric fields.
type Uint64Metric struct {
	// value holds the counter of a field-less metric. It must be accessed
	// atomically.
	value uint64

	// numFields is the number of fields this metric was created with. It
	// is immutable once initialized.
	numFields int

	// mu protects the fields below.
	mu sync.RWMutex `state:"nosave"`

	// fields maps each field value to its counter.
	fields map[string]uint64
}
|
|
|
|
var (
|
|
// initialized indicates that all metrics are registered. allMetrics is
|
|
// immutable once initialized is true.
|
|
initialized bool
|
|
|
|
// allMetrics are the registered metrics.
|
|
allMetrics = makeMetricSet()
|
|
)
|
|
|
|
// Initialize sends a metric registration event over the event channel.
|
|
//
|
|
// Precondition:
|
|
// * All metrics are registered.
|
|
// * Initialize/Disable has not been called.
|
|
func Initialize() error {
|
|
if initialized {
|
|
return errors.New("metric.Initialize called after metric.Initialize or metric.Disable")
|
|
}
|
|
|
|
m := pb.MetricRegistration{}
|
|
for _, v := range allMetrics.m {
|
|
m.Metrics = append(m.Metrics, v.metadata)
|
|
}
|
|
m.Stages = make([]string, 0, len(allStages))
|
|
for _, s := range allStages {
|
|
m.Stages = append(m.Stages, string(s))
|
|
}
|
|
if err := eventchannel.Emit(&m); err != nil {
|
|
return fmt.Errorf("unable to emit metric initialize event: %w", err)
|
|
}
|
|
|
|
initialized = true
|
|
return nil
|
|
}
|
|
|
|
// Disable sends an empty metric registration event over the event channel,
|
|
// disabling metric collection.
|
|
//
|
|
// Precondition:
|
|
// * All metrics are registered.
|
|
// * Initialize/Disable has not been called.
|
|
func Disable() error {
|
|
if initialized {
|
|
return errors.New("metric.Disable called after metric.Initialize or metric.Disable")
|
|
}
|
|
|
|
m := pb.MetricRegistration{}
|
|
if err := eventchannel.Emit(&m); err != nil {
|
|
return fmt.Errorf("unable to emit metric disable event: %w", err)
|
|
}
|
|
|
|
initialized = true
|
|
return nil
|
|
}
|
|
|
|
type customUint64Metric struct {
|
|
// metadata describes the metric. It is immutable.
|
|
metadata *pb.MetricMetadata
|
|
|
|
// value returns the current value of the metric for the given set of
|
|
// fields. It takes a variadic number of field values as argument.
|
|
value func(fieldValues ...string) uint64
|
|
}
|
|
|
|
// Field describes one metric field at registration time: its name and the
// values it is allowed to take.
type Field struct {
	// name is the name of the metric field.
	name string

	// allowedValues lists every value the field may take.
	allowedValues []string
}
|
|
|
|
// RegisterCustomUint64Metric registers a metric with the given name.
|
|
//
|
|
// Register must only be called at init and will return and error if called
|
|
// after Initialized.
|
|
//
|
|
// Preconditions:
|
|
// * name must be globally unique.
|
|
// * Initialize/Disable have not been called.
|
|
// * value is expected to accept exactly len(fields) arguments.
|
|
func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func(...string) uint64, fields ...Field) error {
|
|
if initialized {
|
|
return ErrInitializationDone
|
|
}
|
|
|
|
if _, ok := allMetrics.m[name]; ok {
|
|
return ErrNameInUse
|
|
}
|
|
|
|
allMetrics.m[name] = customUint64Metric{
|
|
metadata: &pb.MetricMetadata{
|
|
Name: name,
|
|
Description: description,
|
|
Cumulative: cumulative,
|
|
Sync: sync,
|
|
Type: pb.MetricMetadata_TYPE_UINT64,
|
|
Units: units,
|
|
},
|
|
value: value,
|
|
}
|
|
|
|
// Metrics can exist without fields.
|
|
if l := len(fields); l > 1 {
|
|
return fmt.Errorf("%d fields provided, must be <= 1", l)
|
|
}
|
|
|
|
for _, field := range fields {
|
|
allMetrics.m[name].metadata.Fields = append(allMetrics.m[name].metadata.Fields, &pb.MetricMetadata_Field{
|
|
FieldName: field.name,
|
|
AllowedValues: field.allowedValues,
|
|
})
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric for metrics
|
|
// without fields and panics if it returns an error.
|
|
func MustRegisterCustomUint64Metric(name string, cumulative, sync bool, description string, value func(...string) uint64, fields ...Field) {
|
|
if err := RegisterCustomUint64Metric(name, cumulative, sync, pb.MetricMetadata_UNITS_NONE, description, value, fields...); err != nil {
|
|
panic(fmt.Sprintf("Unable to register metric %q: %s", name, err))
|
|
}
|
|
}
|
|
|
|
// NewUint64Metric creates and registers a new cumulative metric with the given
|
|
// name.
|
|
//
|
|
// Metrics must be statically defined (i.e., at init).
|
|
func NewUint64Metric(name string, sync bool, units pb.MetricMetadata_Units, description string, fields ...Field) (*Uint64Metric, error) {
|
|
m := Uint64Metric{
|
|
numFields: len(fields),
|
|
}
|
|
|
|
if m.numFields == 1 {
|
|
m.fields = make(map[string]uint64)
|
|
for _, fieldValue := range fields[0].allowedValues {
|
|
m.fields[fieldValue] = 0
|
|
}
|
|
}
|
|
return &m, RegisterCustomUint64Metric(name, true /* cumulative */, sync, units, description, m.Value, fields...)
|
|
}
|
|
|
|
// MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns an
|
|
// error.
|
|
func MustCreateNewUint64Metric(name string, sync bool, description string, fields ...Field) *Uint64Metric {
|
|
m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NONE, description, fields...)
|
|
if err != nil {
|
|
panic(fmt.Sprintf("Unable to create metric %q: %s", name, err))
|
|
}
|
|
return m
|
|
}
|
|
|
|
// MustCreateNewUint64NanosecondsMetric calls NewUint64Metric and panics if it
|
|
// returns an error.
|
|
func MustCreateNewUint64NanosecondsMetric(name string, sync bool, description string) *Uint64Metric {
|
|
m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NANOSECONDS, description)
|
|
if err != nil {
|
|
panic(fmt.Sprintf("Unable to create metric %q: %s", name, err))
|
|
}
|
|
return m
|
|
}
|
|
|
|
// Value returns the current value of the metric for the given set of fields.
|
|
func (m *Uint64Metric) Value(fieldValues ...string) uint64 {
|
|
if m.numFields != len(fieldValues) {
|
|
panic(fmt.Sprintf("Number of fieldValues %d is not equal to the number of metric fields %d", len(fieldValues), m.numFields))
|
|
}
|
|
|
|
switch m.numFields {
|
|
case 0:
|
|
return atomic.LoadUint64(&m.value)
|
|
case 1:
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
fieldValue := fieldValues[0]
|
|
if _, ok := m.fields[fieldValue]; !ok {
|
|
panic(fmt.Sprintf("Metric does not allow to have field value %s", fieldValue))
|
|
}
|
|
return m.fields[fieldValue]
|
|
default:
|
|
panic("Sentry metrics do not support more than one field")
|
|
}
|
|
}
|
|
|
|
// Increment increments the metric field by 1.
|
|
func (m *Uint64Metric) Increment(fieldValues ...string) {
|
|
m.IncrementBy(1, fieldValues...)
|
|
}
|
|
|
|
// IncrementBy increments the metric by v.
|
|
func (m *Uint64Metric) IncrementBy(v uint64, fieldValues ...string) {
|
|
if m.numFields != len(fieldValues) {
|
|
panic(fmt.Sprintf("Number of fieldValues %d is not equal to the number of metric fields %d", len(fieldValues), m.numFields))
|
|
}
|
|
|
|
switch m.numFields {
|
|
case 0:
|
|
atomic.AddUint64(&m.value, v)
|
|
return
|
|
case 1:
|
|
fieldValue := fieldValues[0]
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if _, ok := m.fields[fieldValue]; !ok {
|
|
panic(fmt.Sprintf("Metric does not allow to have field value %s", fieldValue))
|
|
}
|
|
m.fields[fieldValue] += v
|
|
default:
|
|
panic("Sentry metrics do not support more than one field")
|
|
}
|
|
}
|
|
|
|
// stageTiming contains timing data for an initialization stage.
|
|
type stageTiming struct {
|
|
stage InitStage
|
|
started time.Time
|
|
// ended is the zero time when the stage has not ended yet.
|
|
ended time.Time
|
|
}
|
|
|
|
// inProgress returns whether this stage hasn't ended yet.
|
|
func (s stageTiming) inProgress() bool {
|
|
return !s.started.IsZero() && s.ended.IsZero()
|
|
}
|
|
|
|
// metricSet holds metric data.
|
|
type metricSet struct {
|
|
// Map of metrics.
|
|
m map[string]customUint64Metric
|
|
|
|
// mu protects the fields below.
|
|
mu sync.RWMutex
|
|
|
|
// Information about the stages reached by the Sentry. Only appended to, so
|
|
// reading a shallow copy of the slice header concurrently is safe.
|
|
finished []stageTiming
|
|
|
|
// The current stage in progress.
|
|
currentStage stageTiming
|
|
}
|
|
|
|
// makeMetricSet returns a new metricSet.
|
|
func makeMetricSet() metricSet {
|
|
return metricSet{
|
|
m: make(map[string]customUint64Metric),
|
|
finished: make([]stageTiming, 0, len(allStages)),
|
|
}
|
|
}
|
|
|
|
// Values returns a snapshot of all values in m.
|
|
func (m *metricSet) Values() metricValues {
|
|
m.mu.Lock()
|
|
stages := m.finished[:]
|
|
m.mu.Unlock()
|
|
|
|
vals := metricValues{
|
|
m: make(map[string]interface{}, len(m.m)),
|
|
stages: stages,
|
|
}
|
|
|
|
for k, v := range m.m {
|
|
fields := v.metadata.GetFields()
|
|
switch len(fields) {
|
|
case 0:
|
|
vals.m[k] = v.value()
|
|
case 1:
|
|
values := fields[0].GetAllowedValues()
|
|
fieldsMap := make(map[string]uint64)
|
|
for _, fieldValue := range values {
|
|
fieldsMap[fieldValue] = v.value(fieldValue)
|
|
}
|
|
vals.m[k] = fieldsMap
|
|
default:
|
|
panic(fmt.Sprintf("Unsupported number of metric fields: %d", len(fields)))
|
|
}
|
|
}
|
|
return vals
|
|
}
|
|
|
|
// metricValues contains a copy of the values of all metrics.
|
|
type metricValues struct {
|
|
// m is a map with key as metric name and value can be either uint64 or
|
|
// map[string]uint64 to support metrics with one field.
|
|
m map[string]interface{}
|
|
|
|
// Information on when initialization stages were reached. Does not include
|
|
// the currently-ongoing stage, if any.
|
|
stages []stageTiming
|
|
}
|
|
|
|
var (
|
|
// emitMu protects metricsAtLastEmit and ensures that all emitted
|
|
// metrics are strongly ordered (older metrics are never emitted after
|
|
// newer metrics).
|
|
emitMu sync.Mutex
|
|
|
|
// metricsAtLastEmit contains the state of the metrics at the last emit event.
|
|
metricsAtLastEmit metricValues
|
|
)
|
|
|
|
// EmitMetricUpdate emits a MetricUpdate over the event channel.
|
|
//
|
|
// Only metrics that have changed since the last call are emitted.
|
|
//
|
|
// EmitMetricUpdate is thread-safe.
|
|
//
|
|
// Preconditions:
|
|
// * Initialize has been called.
|
|
func EmitMetricUpdate() {
|
|
emitMu.Lock()
|
|
defer emitMu.Unlock()
|
|
|
|
snapshot := allMetrics.Values()
|
|
|
|
m := pb.MetricUpdate{}
|
|
// On the first call metricsAtLastEmit will be empty. Include all
|
|
// metrics then.
|
|
for k, v := range snapshot.m {
|
|
prev, ok := metricsAtLastEmit.m[k]
|
|
switch t := v.(type) {
|
|
case uint64:
|
|
// Metric exists and value did not change.
|
|
if ok && prev.(uint64) == t {
|
|
continue
|
|
}
|
|
|
|
m.Metrics = append(m.Metrics, &pb.MetricValue{
|
|
Name: k,
|
|
Value: &pb.MetricValue_Uint64Value{Uint64Value: t},
|
|
})
|
|
case map[string]uint64:
|
|
for fieldValue, metricValue := range t {
|
|
// Emit data on the first call only if the field
|
|
// value has been incremented. For all other
|
|
// calls, emit data if the field value has been
|
|
// changed from the previous emit.
|
|
if (!ok && metricValue == 0) || (ok && prev.(map[string]uint64)[fieldValue] == metricValue) {
|
|
continue
|
|
}
|
|
|
|
m.Metrics = append(m.Metrics, &pb.MetricValue{
|
|
Name: k,
|
|
FieldValues: []string{fieldValue},
|
|
Value: &pb.MetricValue_Uint64Value{Uint64Value: metricValue},
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
for s := len(metricsAtLastEmit.stages); s < len(snapshot.stages); s++ {
|
|
newStage := snapshot.stages[s]
|
|
m.StageTiming = append(m.StageTiming, &pb.StageTiming{
|
|
Stage: string(newStage.stage),
|
|
Started: ×tamppb.Timestamp{
|
|
Seconds: newStage.started.Unix(),
|
|
Nanos: int32(newStage.started.Nanosecond()),
|
|
},
|
|
Ended: ×tamppb.Timestamp{
|
|
Seconds: newStage.ended.Unix(),
|
|
Nanos: int32(newStage.ended.Nanosecond()),
|
|
},
|
|
})
|
|
}
|
|
|
|
metricsAtLastEmit = snapshot
|
|
if len(m.Metrics) == 0 && len(m.StageTiming) == 0 {
|
|
return
|
|
}
|
|
|
|
if log.IsLogging(log.Debug) {
|
|
sort.Slice(m.Metrics, func(i, j int) bool {
|
|
return m.Metrics[i].Name < m.Metrics[j].Name
|
|
})
|
|
log.Debugf("Emitting metrics:")
|
|
for _, metric := range m.Metrics {
|
|
log.Debugf("%s: %+v", metric.Name, metric.Value)
|
|
}
|
|
for _, stage := range m.StageTiming {
|
|
duration := time.Duration(stage.Ended.Seconds-stage.Started.Seconds)*time.Second + time.Duration(stage.Ended.Nanos-stage.Started.Nanos)*time.Nanosecond
|
|
log.Debugf("Stage %s took %v", stage.GetStage(), duration)
|
|
}
|
|
}
|
|
|
|
if err := eventchannel.Emit(&m); err != nil {
|
|
log.Warningf("Unable to emit metrics: %s", err)
|
|
}
|
|
}
|
|
|
|
// StartStage should be called when an initialization stage is started.
|
|
// It returns a function that must be called to indicate that the stage ended.
|
|
// Alternatively, future calls to StartStage will implicitly indicate that the
|
|
// previous stage ended.
|
|
// Stage information will be emitted in the next call to EmitMetricUpdate after
|
|
// a stage has ended.
|
|
//
|
|
// This function may (and is expected to) be called prior to final
|
|
// initialization of this metric library, as it has to capture early stages
|
|
// of Sentry initialization.
|
|
func StartStage(stage InitStage) func() {
|
|
now := time.Now()
|
|
allMetrics.mu.Lock()
|
|
defer allMetrics.mu.Unlock()
|
|
if allMetrics.currentStage.inProgress() {
|
|
endStage(now)
|
|
}
|
|
allMetrics.currentStage.stage = stage
|
|
allMetrics.currentStage.started = now
|
|
return func() {
|
|
now := time.Now()
|
|
allMetrics.mu.Lock()
|
|
defer allMetrics.mu.Unlock()
|
|
// The current stage may have been ended by another call to StartStage, so
|
|
// double-check prior to clearing the current stage.
|
|
if allMetrics.currentStage.inProgress() && allMetrics.currentStage.stage == stage {
|
|
endStage(now)
|
|
}
|
|
}
|
|
}
|
|
|
|
// endStage marks allMetrics.currentStage as ended, adding it to the list of
|
|
// finished stages. It assumes allMetrics.mu is locked.
|
|
func endStage(when time.Time) {
|
|
allMetrics.currentStage.ended = when
|
|
allMetrics.finished = append(allMetrics.finished, allMetrics.currentStage)
|
|
allMetrics.currentStage = stageTiming{}
|
|
}
|