go_metrics.go: use histogram buckets instead of summary for Go runtime histogram
It is unclear how and when to reset a summary, since it is smoothed over the whole uptime of the Go app. Histogram buckets, on the other hand, can be wrapped into increase() or rate() in order to calculate the histogram distribution over an arbitrary time range. Limit the number of buckets per Go runtime histogram to 30 in order to prevent high cardinality issues.
This commit is contained in:
parent
5b58446f57
commit
2ec14979a8
@ -6,6 +6,7 @@ import (
|
||||
"math"
|
||||
"runtime"
|
||||
runtimemetrics "runtime/metrics"
|
||||
"strings"
|
||||
|
||||
"github.com/valyala/histogram"
|
||||
)
|
||||
@ -104,23 +105,40 @@ func writeRuntimeMetric(w io.Writer, name string, sample *runtimemetrics.Sample)
|
||||
}
|
||||
|
||||
func writeRuntimeHistogramMetric(w io.Writer, name string, h *runtimemetrics.Float64Histogram) {
|
||||
// Expose histogram metric as summary, since Go runtime returns too many histogram buckets,
|
||||
// which may lead to high cardinality issues at the scraper side.
|
||||
buckets := h.Buckets
|
||||
counts := h.Counts
|
||||
if len(buckets) != len(counts)+1 {
|
||||
panic(fmt.Errorf("the number of buckets must be bigger than the number of counts by 1 in histogram %s; got buckets=%d, counts=%d", name, len(buckets), len(counts)))
|
||||
}
|
||||
tailCount := uint64(0)
|
||||
if strings.HasSuffix(name, "_seconds") {
|
||||
// Limit the maximum bucket to 1 second, since Go runtime exposes buckets with 10K seconds,
|
||||
// which make little sense. At the same time such buckets may lead to high cardinality issues
|
||||
// at the scraper side.
|
||||
for len(buckets) > 0 && buckets[len(buckets)-1] > 1 {
|
||||
buckets = buckets[:len(buckets)-1]
|
||||
tailCount += counts[len(counts)-1]
|
||||
counts = counts[:len(counts)-1]
|
||||
}
|
||||
}
|
||||
|
||||
iStep := float64(len(buckets)) / maxRuntimeHistogramBuckets
|
||||
|
||||
totalCount := uint64(0)
|
||||
for _, count := range counts {
|
||||
totalCount += count
|
||||
}
|
||||
for _, q := range defaultSummaryQuantiles {
|
||||
upperBound := uint64(math.Ceil(q * float64(totalCount)))
|
||||
runningCount := uint64(0)
|
||||
iNext := 0.0
|
||||
for i, count := range counts {
|
||||
runningCount += count
|
||||
if runningCount >= upperBound {
|
||||
fmt.Fprintf(w, `%s{quantile="%g"} %g`+"\n", name, q, buckets[i+1])
|
||||
break
|
||||
totalCount += count
|
||||
if float64(i) >= iNext {
|
||||
iNext += iStep
|
||||
le := buckets[i+1]
|
||||
if !math.IsInf(le, 1) {
|
||||
fmt.Fprintf(w, `%s_bucket{le="%g"} %d`+"\n", name, le, totalCount)
|
||||
}
|
||||
}
|
||||
}
|
||||
totalCount += tailCount
|
||||
fmt.Fprintf(w, `%s_bucket{le="+Inf"} %d`+"\n", name, totalCount)
|
||||
}
|
||||
|
||||
// Limit the number of buckets for Go runtime histograms in order to prevent high cardinality issues at the scraper side.
|
||||
const maxRuntimeHistogramBuckets = 30
|
||||
|
@ -22,40 +22,41 @@ func TestWriteRuntimeHistogramMetricOk(t *testing.T) {
|
||||
f(&runtimemetrics.Float64Histogram{
|
||||
Counts: []uint64{1, 2, 3},
|
||||
Buckets: []float64{1, 2, 3, 4},
|
||||
}, `foo{quantile="0.5"} 3
|
||||
foo{quantile="0.9"} 4
|
||||
foo{quantile="0.97"} 4
|
||||
foo{quantile="0.99"} 4
|
||||
foo{quantile="1"} 4
|
||||
}, `foo_bucket{le="2"} 1
|
||||
foo_bucket{le="3"} 3
|
||||
foo_bucket{le="4"} 6
|
||||
foo_bucket{le="+Inf"} 6
|
||||
`)
|
||||
|
||||
f(&runtimemetrics.Float64Histogram{
|
||||
Counts: []uint64{0, 25, 1, 0},
|
||||
Buckets: []float64{1, 2, 3, 4, math.Inf(1)},
|
||||
}, `foo{quantile="0.5"} 3
|
||||
foo{quantile="0.9"} 3
|
||||
foo{quantile="0.97"} 4
|
||||
foo{quantile="0.99"} 4
|
||||
foo{quantile="1"} 4
|
||||
}, `foo_bucket{le="2"} 0
|
||||
foo_bucket{le="3"} 25
|
||||
foo_bucket{le="4"} 26
|
||||
foo_bucket{le="+Inf"} 26
|
||||
`)
|
||||
|
||||
f(&runtimemetrics.Float64Histogram{
|
||||
Counts: []uint64{0, 25, 1, 3, 0, 44, 15, 132, 10, 0},
|
||||
Buckets: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, math.Inf(1)},
|
||||
}, `foo{quantile="0.5"} 9
|
||||
foo{quantile="0.9"} 9
|
||||
foo{quantile="0.97"} 10
|
||||
foo{quantile="0.99"} 10
|
||||
foo{quantile="1"} 10
|
||||
}, `foo_bucket{le="2"} 0
|
||||
foo_bucket{le="3"} 25
|
||||
foo_bucket{le="4"} 26
|
||||
foo_bucket{le="5"} 29
|
||||
foo_bucket{le="6"} 29
|
||||
foo_bucket{le="7"} 73
|
||||
foo_bucket{le="8"} 88
|
||||
foo_bucket{le="9"} 220
|
||||
foo_bucket{le="10"} 230
|
||||
foo_bucket{le="+Inf"} 230
|
||||
`)
|
||||
|
||||
f(&runtimemetrics.Float64Histogram{
|
||||
Counts: []uint64{1, 5, 0},
|
||||
Buckets: []float64{math.Inf(-1), 4, 5, math.Inf(1)},
|
||||
}, `foo{quantile="0.5"} 5
|
||||
foo{quantile="0.9"} 5
|
||||
foo{quantile="0.97"} 5
|
||||
foo{quantile="0.99"} 5
|
||||
foo{quantile="1"} 5
|
||||
}, `foo_bucket{le="4"} 1
|
||||
foo_bucket{le="5"} 6
|
||||
foo_bucket{le="+Inf"} 6
|
||||
`)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user