go_metrics.go: use histogram buckets instead of summary for Go runtime histogram

It is unclear how and when to reset a summary, since its values are smoothed over the entire uptime of the Go app.
On the other hand, histogram buckets can be wrapped in increase() or rate() in order
to calculate the histogram distribution over an arbitrary time range.

Limit the number of buckets per Go runtime histogram to 30 in order to prevent high cardinality issues.
This commit is contained in:
Aliaksandr Valialkin 2023-11-30 01:47:03 +02:00
parent 5b58446f57
commit 2ec14979a8
No known key found for this signature in database
GPG Key ID: 52C003EE2BCDB9EB
2 changed files with 52 additions and 33 deletions

View File

@ -6,6 +6,7 @@ import (
"math" "math"
"runtime" "runtime"
runtimemetrics "runtime/metrics" runtimemetrics "runtime/metrics"
"strings"
"github.com/valyala/histogram" "github.com/valyala/histogram"
) )
@ -104,23 +105,40 @@ func writeRuntimeMetric(w io.Writer, name string, sample *runtimemetrics.Sample)
} }
func writeRuntimeHistogramMetric(w io.Writer, name string, h *runtimemetrics.Float64Histogram) { func writeRuntimeHistogramMetric(w io.Writer, name string, h *runtimemetrics.Float64Histogram) {
// Expose histogram metric as summary, since Go runtime returns too many histogram buckets,
// which may lead to high cardinality issues at the scraper side.
buckets := h.Buckets buckets := h.Buckets
counts := h.Counts counts := h.Counts
if len(buckets) != len(counts)+1 {
panic(fmt.Errorf("the number of buckets must be bigger than the number of counts by 1 in histogram %s; got buckets=%d, counts=%d", name, len(buckets), len(counts)))
}
tailCount := uint64(0)
if strings.HasSuffix(name, "_seconds") {
// Limit the maximum bucket to 1 second, since Go runtime exposes buckets with 10K seconds,
// which have little sense. At the same time such buckets may lead to high cardinality issues
// at the scraper side.
for len(buckets) > 0 && buckets[len(buckets)-1] > 1 {
buckets = buckets[:len(buckets)-1]
tailCount += counts[len(counts)-1]
counts = counts[:len(counts)-1]
}
}
iStep := float64(len(buckets)) / maxRuntimeHistogramBuckets
totalCount := uint64(0) totalCount := uint64(0)
for _, count := range counts { iNext := 0.0
totalCount += count
}
for _, q := range defaultSummaryQuantiles {
upperBound := uint64(math.Ceil(q * float64(totalCount)))
runningCount := uint64(0)
for i, count := range counts { for i, count := range counts {
runningCount += count totalCount += count
if runningCount >= upperBound { if float64(i) >= iNext {
fmt.Fprintf(w, `%s{quantile="%g"} %g`+"\n", name, q, buckets[i+1]) iNext += iStep
break le := buckets[i+1]
if !math.IsInf(le, 1) {
fmt.Fprintf(w, `%s_bucket{le="%g"} %d`+"\n", name, le, totalCount)
} }
} }
} }
totalCount += tailCount
fmt.Fprintf(w, `%s_bucket{le="+Inf"} %d`+"\n", name, totalCount)
} }
// Limit the number of buckets for Go runtime histograms in order to prevent high cardinality issues at the scraper side.
const maxRuntimeHistogramBuckets = 30

View File

@ -22,40 +22,41 @@ func TestWriteRuntimeHistogramMetricOk(t *testing.T) {
f(&runtimemetrics.Float64Histogram{ f(&runtimemetrics.Float64Histogram{
Counts: []uint64{1, 2, 3}, Counts: []uint64{1, 2, 3},
Buckets: []float64{1, 2, 3, 4}, Buckets: []float64{1, 2, 3, 4},
}, `foo{quantile="0.5"} 3 }, `foo_bucket{le="2"} 1
foo{quantile="0.9"} 4 foo_bucket{le="3"} 3
foo{quantile="0.97"} 4 foo_bucket{le="4"} 6
foo{quantile="0.99"} 4 foo_bucket{le="+Inf"} 6
foo{quantile="1"} 4
`) `)
f(&runtimemetrics.Float64Histogram{ f(&runtimemetrics.Float64Histogram{
Counts: []uint64{0, 25, 1, 0}, Counts: []uint64{0, 25, 1, 0},
Buckets: []float64{1, 2, 3, 4, math.Inf(1)}, Buckets: []float64{1, 2, 3, 4, math.Inf(1)},
}, `foo{quantile="0.5"} 3 }, `foo_bucket{le="2"} 0
foo{quantile="0.9"} 3 foo_bucket{le="3"} 25
foo{quantile="0.97"} 4 foo_bucket{le="4"} 26
foo{quantile="0.99"} 4 foo_bucket{le="+Inf"} 26
foo{quantile="1"} 4
`) `)
f(&runtimemetrics.Float64Histogram{ f(&runtimemetrics.Float64Histogram{
Counts: []uint64{0, 25, 1, 3, 0, 44, 15, 132, 10, 0}, Counts: []uint64{0, 25, 1, 3, 0, 44, 15, 132, 10, 0},
Buckets: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, math.Inf(1)}, Buckets: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, math.Inf(1)},
}, `foo{quantile="0.5"} 9 }, `foo_bucket{le="2"} 0
foo{quantile="0.9"} 9 foo_bucket{le="3"} 25
foo{quantile="0.97"} 10 foo_bucket{le="4"} 26
foo{quantile="0.99"} 10 foo_bucket{le="5"} 29
foo{quantile="1"} 10 foo_bucket{le="6"} 29
foo_bucket{le="7"} 73
foo_bucket{le="8"} 88
foo_bucket{le="9"} 220
foo_bucket{le="10"} 230
foo_bucket{le="+Inf"} 230
`) `)
f(&runtimemetrics.Float64Histogram{ f(&runtimemetrics.Float64Histogram{
Counts: []uint64{1, 5, 0}, Counts: []uint64{1, 5, 0},
Buckets: []float64{math.Inf(-1), 4, 5, math.Inf(1)}, Buckets: []float64{math.Inf(-1), 4, 5, math.Inf(1)},
}, `foo{quantile="0.5"} 5 }, `foo_bucket{le="4"} 1
foo{quantile="0.9"} 5 foo_bucket{le="5"} 6
foo{quantile="0.97"} 5 foo_bucket{le="+Inf"} 6
foo{quantile="0.99"} 5
foo{quantile="1"} 5
`) `)
} }