From 538a6d5c0124452c86c80175453bb47aa68e28e7 Mon Sep 17 00:00:00 2001
From: Katarzyna Kujawa
Date: Mon, 6 Jul 2020 11:28:27 +0200
Subject: [PATCH] Aggregate perf metrics

Add documentation about core perf events aggregation

Signed-off-by: Katarzyna Kujawa
---
 docs/runtime_options.md                    |   6 +
 metrics/prometheus.go                      | 142 ++++++++++---
 metrics/prometheus_test.go                 | 197 ++++++++++++++++++
 .../prometheus_metrics_perf_aggregated     |  37 ++++
 4 files changed, 348 insertions(+), 34 deletions(-)
 create mode 100644 metrics/testdata/prometheus_metrics_perf_aggregated

diff --git a/docs/runtime_options.md b/docs/runtime_options.md
index 9621a69474..2fc8149fae 100644
--- a/docs/runtime_options.md
+++ b/docs/runtime_options.md
@@ -140,6 +140,12 @@ cAdvisor stores the latest historical data in memory. How long of a history it s
 --perf_events_config="" Path to a JSON file containing configuration of perf events to measure. Empty value disables perf events measuring.
 ```
+Core perf events can be exposed on the Prometheus endpoint either per CPU or aggregated by event. This is controlled through the `--disable_metrics` parameter with the `percpu` option, e.g.:
+- `--disable_metrics="percpu"` - core perf events are aggregated by event
+- `--disable_metrics=""` - core perf events are exposed per CPU.
+
+The aggregated form of core perf events significantly decreases the volume of exposed data. In the aggregated form, the scaling ratio (`container_perf_events_scaling_ratio`) indicates the lowest scaling ratio observed for a given event, i.e. the worst measurement precision.
+
 
 ### Perf subsystem introduction
 
 One of the goals of kernel perf subsystem is to instrument CPU performance counters that allow to profile applications.
diff --git a/metrics/prometheus.go b/metrics/prometheus.go
index 23457f4e99..1064e045a2 100644
--- a/metrics/prometheus.go
+++ b/metrics/prometheus.go
@@ -1577,41 +1577,48 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
 		}...)
 	}
 	if includedMetrics.Has(container.PerfMetrics) {
+		if includedMetrics.Has(container.PerCpuUsageMetrics) {
+			c.containerMetrics = append(c.containerMetrics, []containerMetric{
+				{
+					name:        "container_perf_events_total",
+					help:        "Perf event metric.",
+					valueType:   prometheus.CounterValue,
+					extraLabels: []string{"cpu", "event"},
+					getValues: func(s *info.ContainerStats) metricValues {
+						return getPerCPUCorePerfEvents(s)
+					},
+				},
+				{
+					name:        "container_perf_events_scaling_ratio",
+					help:        "Perf event metric scaling ratio.",
+					valueType:   prometheus.GaugeValue,
+					extraLabels: []string{"cpu", "event"},
+					getValues: func(s *info.ContainerStats) metricValues {
+						return getPerCPUCoreScalingRatio(s)
+					},
+				}}...)
+		} else {
+			c.containerMetrics = append(c.containerMetrics, []containerMetric{
+				{
+					name:        "container_perf_events_total",
+					help:        "Perf event metric.",
+					valueType:   prometheus.CounterValue,
+					extraLabels: []string{"cpu", "event"},
+					getValues: func(s *info.ContainerStats) metricValues {
+						return getAggregatedCorePerfEvents(s)
+					},
+				},
+				{
+					name:        "container_perf_events_scaling_ratio",
+					help:        "Perf event metric scaling ratio.",
+					valueType:   prometheus.GaugeValue,
+					extraLabels: []string{"cpu", "event"},
+					getValues: func(s *info.ContainerStats) metricValues {
+						return getMinCoreScalingRatio(s)
+					},
+				}}...)
+		}
 		c.containerMetrics = append(c.containerMetrics, []containerMetric{
-			{
-				name:        "container_perf_events_total",
-				help:        "Perf event metric.",
-				valueType:   prometheus.CounterValue,
-				extraLabels: []string{"cpu", "event"},
-				getValues: func(s *info.ContainerStats) metricValues {
-					values := make(metricValues, 0, len(s.PerfStats))
-					for _, metric := range s.PerfStats {
-						values = append(values, metricValue{
-							value:     float64(metric.Value),
-							labels:    []string{strconv.Itoa(metric.Cpu), metric.Name},
-							timestamp: s.Timestamp,
-						})
-					}
-					return values
-				},
-			},
-			{
-				name:        "container_perf_events_scaling_ratio",
-				help:        "Perf event metric scaling ratio.",
-				valueType:   prometheus.GaugeValue,
-				extraLabels: []string{"cpu", "event"},
-				getValues: func(s *info.ContainerStats) metricValues {
-					values := make(metricValues, 0, len(s.PerfStats))
-					for _, metric := range s.PerfStats {
-						values = append(values, metricValue{
-							value:     metric.ScalingRatio,
-							labels:    []string{strconv.Itoa(metric.Cpu), metric.Name},
-							timestamp: s.Timestamp,
-						})
-					}
-					return values
-				},
-			},
 			{
 				name:        "container_perf_uncore_events_total",
 				help:        "Perf uncore event metric.",
@@ -1940,3 +1947,70 @@ func getNumaStatsPerNode(nodeStats map[uint8]uint64, labels []string, timestamp
 	}
 	return mValues
 }
+
+func getPerCPUCorePerfEvents(s *info.ContainerStats) metricValues {
+	values := make(metricValues, 0, len(s.PerfStats))
+	for _, metric := range s.PerfStats {
+		values = append(values, metricValue{
+			value:     float64(metric.Value),
+			labels:    []string{strconv.Itoa(metric.Cpu), metric.Name},
+			timestamp: s.Timestamp,
+		})
+	}
+	return values
+}
+
+func getPerCPUCoreScalingRatio(s *info.ContainerStats) metricValues {
+	values := make(metricValues, 0, len(s.PerfStats))
+	for _, metric := range s.PerfStats {
+		values = append(values, metricValue{
+			value:     metric.ScalingRatio,
+			labels:    []string{strconv.Itoa(metric.Cpu), metric.Name},
+			timestamp: s.Timestamp,
+		})
+	}
+	return values
+}
+
+func getAggregatedCorePerfEvents(s *info.ContainerStats) metricValues {
+	values := make(metricValues, 0)
+
+	perfEventStatAgg := make(map[string]uint64)
+	// aggregate by event
+	for _, perfStat := range s.PerfStats {
+		perfEventStatAgg[perfStat.Name] += perfStat.Value
+	}
+	// create aggregated metrics
+	for perfEvent, perfValue := range perfEventStatAgg {
+		values = append(values, metricValue{
+			value:     float64(perfValue),
+			labels:    []string{"", perfEvent},
+			timestamp: s.Timestamp,
+		})
+	}
+	return values
+}
+
+func getMinCoreScalingRatio(s *info.ContainerStats) metricValues {
+	values := make(metricValues, 0)
+	perfEventStatMin := make(map[string]float64)
+	// search for the minimal value of the scaling ratio for each event
+	for _, perfStat := range s.PerfStats {
+		if _, ok := perfEventStatMin[perfStat.Name]; !ok {
+			// found a new event
+			perfEventStatMin[perfStat.Name] = perfStat.ScalingRatio
+		} else if perfStat.ScalingRatio < perfEventStatMin[perfStat.Name] {
+			// found a lower value of the scaling ratio, so replace the current minimum
+			perfEventStatMin[perfStat.Name] = perfStat.ScalingRatio
+		}
+	}
+
+	for perfEvent, perfScalingRatio := range perfEventStatMin {
+		values = append(values, metricValue{
+			value:     perfScalingRatio,
+			labels:    []string{"", perfEvent},
+			timestamp: s.Timestamp,
+		})
+	}
+	return values
+}
diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go
index 886ec6b99e..3da41c9505 100644
--- a/metrics/prometheus_test.go
+++ b/metrics/prometheus_test.go
@@ -44,6 +44,21 @@ func TestPrometheusCollector(t *testing.T) {
 	testPrometheusCollector(t, reg, "testdata/prometheus_metrics")
 }
 
+func TestPrometheusCollectorWithPerfAggregated(t *testing.T) {
+	metrics := container.MetricSet{
+		container.PerfMetrics: struct{}{},
+	}
+	c := NewPrometheusCollector(testSubcontainersInfoProvider{}, func(container *info.ContainerInfo) map[string]string {
+		s := DefaultContainerLabels(container)
+		s["zone.name"] = "hello"
+		return s
+	}, metrics, now, v2.RequestOptions{})
+	reg := prometheus.NewRegistry()
+	reg.MustRegister(c)
+
+	testPrometheusCollector(t, reg, "testdata/prometheus_metrics_perf_aggregated")
+}
+
 func testPrometheusCollector(t *testing.T, gatherer prometheus.Gatherer, metricsFile string) {
 	wantMetrics, err := os.Open(metricsFile)
 	if err != nil {
@@ -122,3 +137,185 @@ func (m *mockInfoProvider) GetMachineInfo() (*info.MachineInfo, error) {
 func mockLabelFunc(*info.ContainerInfo) map[string]string {
 	return map[string]string{}
 }
+
+func TestGetPerCpuCorePerfEvents(t *testing.T) {
+	containerStats := &info.ContainerStats{
+		Timestamp: time.Unix(1395066367, 0),
+		PerfStats: []info.PerfStat{
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 1.0,
+					Value:        123,
+					Name:         "instructions",
+				},
+				Cpu: 0,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.5,
+					Value:        456,
+					Name:         "instructions",
+				},
+				Cpu: 1,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.7,
+					Value:        321,
+					Name:         "instructions_retired"},
+				Cpu: 0,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.3,
+					Value:        789,
+					Name:         "instructions_retired"},
+				Cpu: 1,
+			},
+		},
+	}
+	metricVals := getPerCPUCorePerfEvents(containerStats)
+	assert.Equal(t, 4, len(metricVals))
+	values := []float64{}
+	for _, metric := range metricVals {
+		values = append(values, metric.value)
+	}
+	assert.Contains(t, values, 123.0)
+	assert.Contains(t, values, 456.0)
+	assert.Contains(t, values, 321.0)
+	assert.Contains(t, values, 789.0)
+}
+
+func TestGetPerCpuCoreScalingRatio(t *testing.T) {
+	containerStats := &info.ContainerStats{
+		Timestamp: time.Unix(1395066367, 0),
+		PerfStats: []info.PerfStat{
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 1.0,
+					Value:        123,
+					Name:         "instructions"},
+				Cpu: 0,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.5,
+					Value:        456,
+					Name:         "instructions"},
+				Cpu: 1,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.7,
+					Value:        321,
+					Name:         "instructions_retired"},
+				Cpu: 0,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.3,
+					Value:        789,
+					Name:         "instructions_retired"},
+				Cpu: 1,
+			},
+		},
+	}
+	metricVals := getPerCPUCoreScalingRatio(containerStats)
+	assert.Equal(t, 4, len(metricVals))
+	values := []float64{}
+	for _, metric := range metricVals {
+		values = append(values, metric.value)
+	}
+	assert.Contains(t, values, 1.0)
+	assert.Contains(t, values, 0.5)
+	assert.Contains(t, values, 0.7)
+	assert.Contains(t, values, 0.3)
+}
+
+func TestGetAggCorePerfEvents(t *testing.T) {
+	containerStats := &info.ContainerStats{
+		Timestamp: time.Unix(1395066367, 0),
+		PerfStats: []info.PerfStat{
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 1.0,
+					Value:        123,
+					Name:         "instructions"},
+				Cpu: 0,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.5,
+					Value:        456,
+					Name:         "instructions"},
+				Cpu: 1,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.7,
+					Value:        321,
+					Name:         "instructions_retired"},
+				Cpu: 0,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.3,
+					Value:        789,
+					Name:         "instructions_retired"},
+				Cpu: 1,
+			},
+		},
+	}
+	metricVals := getAggregatedCorePerfEvents(containerStats)
+	assert.Equal(t, 2, len(metricVals))
+	values := []float64{}
+	for _, metric := range metricVals {
+		values = append(values, metric.value)
+	}
+	assert.Contains(t, values, 579.0)
+	assert.Contains(t, values, 1110.0)
+}
+
+func TestGetMinCoreScalingRatio(t *testing.T) {
+	containerStats := &info.ContainerStats{
+		Timestamp: time.Unix(1395066367, 0),
+		PerfStats: []info.PerfStat{
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 1.0,
+					Value:        123,
+					Name:         "instructions"},
+				Cpu: 0,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.5,
+					Value:        456,
+					Name:         "instructions"},
+				Cpu: 1,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.7,
+					Value:        321,
+					Name:         "instructions_retired"},
+				Cpu: 0,
+			},
+			{
+				PerfValue: info.PerfValue{
+					ScalingRatio: 0.3,
+					Value:        789,
+					Name:         "instructions_retired"},
+				Cpu: 1,
+			},
+		},
+	}
+	metricVals := getMinCoreScalingRatio(containerStats)
+	assert.Equal(t, 2, len(metricVals))
+	values := []float64{}
+	for _, metric := range metricVals {
+		values = append(values, metric.value)
+	}
+	assert.Contains(t, values, 0.5)
+	assert.Contains(t, values, 0.3)
+}
diff --git a/metrics/testdata/prometheus_metrics_perf_aggregated b/metrics/testdata/prometheus_metrics_perf_aggregated
new file mode 100644
index 0000000000..bee60f5141
--- /dev/null
+++ b/metrics/testdata/prometheus_metrics_perf_aggregated
@@ -0,0 +1,37 @@
+# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
+# TYPE cadvisor_version_info gauge
+cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
+# HELP container_last_seen Last time a container was seen by the exporter
+# TYPE container_last_seen gauge
+container_last_seen{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.395066363e+09 1395066363000
+# HELP container_perf_events_scaling_ratio Perf event metric scaling ratio.
+# TYPE container_perf_events_scaling_ratio gauge
+container_perf_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
+container_perf_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions_retired",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.33333333333 1395066363000
+# HELP container_perf_events_total Perf event metric.
+# TYPE container_perf_events_total counter
+container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 579 1395066363000
+container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions_retired",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1110 1395066363000
+# HELP container_perf_uncore_events_scaling_ratio Perf uncore event metric scaling ratio.
+# TYPE container_perf_uncore_events_scaling_ratio gauge
+container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
+container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
+# HELP container_perf_uncore_events_total Perf uncore event metric.
+# TYPE container_perf_uncore_events_total counter
+container_perf_uncore_events_total{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1.231231512e+09 1395066363000
+container_perf_uncore_events_total{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1.111231331e+09 1395066363000
+# HELP container_scrape_error 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE container_scrape_error gauge
+container_scrape_error 0
+# HELP container_spec_cpu_period CPU period of the container.
+# TYPE container_spec_cpu_period gauge
+container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 100000
+# HELP container_spec_cpu_quota CPU quota of the container.
+# TYPE container_spec_cpu_quota gauge
+container_spec_cpu_quota{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10000
+# HELP container_spec_cpu_shares CPU share of the container.
+# TYPE container_spec_cpu_shares gauge
+container_spec_cpu_shares{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1000
+# HELP container_start_time_seconds Start time of the container since unix epoch in seconds.
+# TYPE container_start_time_seconds gauge
+container_start_time_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.257894e+09
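
For reference, below is a minimal standalone sketch (not part of the patch) of the aggregation semantics introduced above: per-CPU counter values are summed per event, and the reported scaling ratio is the minimum observed across CPUs, i.e. the worst precision. The perfStat type and the sample numbers are simplified stand-ins for info.PerfStat and the unit-test fixtures.

// perf_aggregation_example.go - illustrative only, standard library only.
package main

import "fmt"

// perfStat mirrors the info.PerfStat fields used by the helpers above.
type perfStat struct {
	name         string
	cpu          int
	value        uint64
	scalingRatio float64
}

func main() {
	// Sample values taken from the unit tests above.
	stats := []perfStat{
		{"instructions", 0, 123, 1.0},
		{"instructions", 1, 456, 0.5},
		{"instructions_retired", 0, 321, 0.7},
		{"instructions_retired", 1, 789, 0.3},
	}

	sums := map[string]uint64{}
	minRatio := map[string]float64{}
	for _, s := range stats {
		// Sum counter values per event (container_perf_events_total, cpu="").
		sums[s.name] += s.value
		// Keep the lowest scaling ratio per event
		// (container_perf_events_scaling_ratio, cpu="").
		if r, ok := minRatio[s.name]; !ok || s.scalingRatio < r {
			minRatio[s.name] = s.scalingRatio
		}
	}

	// Prints instructions: total=579 min_scaling_ratio=0.5 and
	// instructions_retired: total=1110 min_scaling_ratio=0.3 (map order varies),
	// matching the expectations in the unit tests and testdata above.
	for name, total := range sums {
		fmt.Printf("%s: total=%d min_scaling_ratio=%g\n", name, total, minRatio[name])
	}
}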