go_metrics.go: use histogram buckets instead of summary for Go runtime histogram

It is unclear how and when to reset the summary, since it is smoothed over the whole uptime of the Go app. Histogram buckets, on the other hand, can be wrapped into increase() or rate() in order to calculate the histogram distribution over an arbitrary time range. Limit the number of buckets per Go runtime histogram to 30 in order to prevent high cardinality issues.
2023-11-30 01:47:03 +02:00 · 2023-11-30 01:47:03 +02:00 · 2ec14979a8
commit 2ec14979a8
parent 5b58446f57
2 changed files with 52 additions and 33 deletions
--- a/go_metrics.go
+++ b/go_metrics.go
@ -6,6 +6,7 @@ import (
 	"math"
 	"runtime"
 	runtimemetrics "runtime/metrics"
+	"strings"

 	"github.com/valyala/histogram"
 )
@ -104,23 +105,40 @@ func writeRuntimeMetric(w io.Writer, name string, sample *runtimemetrics.Sample)
 }

 func writeRuntimeHistogramMetric(w io.Writer, name string, h *runtimemetrics.Float64Histogram) {
-	// Expose histogram metric as summary, since Go runtime returns too many histogram buckets,
-	// which may lead to high cardinality issues at the scraper side.
 	buckets := h.Buckets
 	counts := h.Counts
-	totalCount := uint64(0)
-	for _, count := range counts {
-		totalCount += count
+	if len(buckets) != len(counts)+1 {
+		panic(fmt.Errorf("the number of buckets must be bigger than the number of counts by 1 in histogram %s; got buckets=%d, counts=%d", name, len(buckets), len(counts)))
 	}
-	for _, q := range defaultSummaryQuantiles {
-		upperBound := uint64(math.Ceil(q * float64(totalCount)))
-		runningCount := uint64(0)
-		for i, count := range counts {
-			runningCount += count
-			if runningCount >= upperBound {
-				fmt.Fprintf(w, `%s{quantile="%g"} %g`+"\n", name, q, buckets[i+1])
-				break
+	tailCount := uint64(0)
+	if strings.HasSuffix(name, "_seconds") {
+		// Limit the maximum bucket to 1 second, since Go runtime exposes buckets with 10K seconds,
+		// which have little sense. At the same time such buckets may lead to high cardinality issues
+		// at the scraper side.
+		for len(buckets) > 0 && buckets[len(buckets)-1] > 1 {
+			buckets = buckets[:len(buckets)-1]
+			tailCount += counts[len(counts)-1]
+			counts = counts[:len(counts)-1]
+		}
+	}
+
+	iStep := float64(len(buckets)) / maxRuntimeHistogramBuckets
+
+	totalCount := uint64(0)
+	iNext := 0.0
+	for i, count := range counts {
+		totalCount += count
+		if float64(i) >= iNext {
+			iNext += iStep
+			le := buckets[i+1]
+			if !math.IsInf(le, 1) {
+				fmt.Fprintf(w, `%s_bucket{le="%g"} %d`+"\n", name, le, totalCount)
 			}
 		}
 	}
+	totalCount += tailCount
+	fmt.Fprintf(w, `%s_bucket{le="+Inf"} %d`+"\n", name, totalCount)
 }
+
+// Limit the number of buckets for Go runtime histograms in order to prevent from high cardinality issues at scraper side.
+const maxRuntimeHistogramBuckets = 30
--- a/go_metrics_test.go
+++ b/go_metrics_test.go
@ -22,40 +22,41 @@ func TestWriteRuntimeHistogramMetricOk(t *testing.T) {
 	f(&runtimemetrics.Float64Histogram{
 		Counts:  []uint64{1, 2, 3},
 		Buckets: []float64{1, 2, 3, 4},
-	}, `foo{quantile="0.5"} 3
-foo{quantile="0.9"} 4
-foo{quantile="0.97"} 4
-foo{quantile="0.99"} 4
-foo{quantile="1"} 4
+	}, `foo_bucket{le="2"} 1
+foo_bucket{le="3"} 3
+foo_bucket{le="4"} 6
+foo_bucket{le="+Inf"} 6
 `)

 	f(&runtimemetrics.Float64Histogram{
 		Counts:  []uint64{0, 25, 1, 0},
 		Buckets: []float64{1, 2, 3, 4, math.Inf(1)},
-	}, `foo{quantile="0.5"} 3
-foo{quantile="0.9"} 3
-foo{quantile="0.97"} 4
-foo{quantile="0.99"} 4
-foo{quantile="1"} 4
+	}, `foo_bucket{le="2"} 0
+foo_bucket{le="3"} 25
+foo_bucket{le="4"} 26
+foo_bucket{le="+Inf"} 26
 `)

 	f(&runtimemetrics.Float64Histogram{
 		Counts:  []uint64{0, 25, 1, 3, 0, 44, 15, 132, 10, 0},
 		Buckets: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, math.Inf(1)},
-	}, `foo{quantile="0.5"} 9
-foo{quantile="0.9"} 9
-foo{quantile="0.97"} 10
-foo{quantile="0.99"} 10
-foo{quantile="1"} 10
+	}, `foo_bucket{le="2"} 0
+foo_bucket{le="3"} 25
+foo_bucket{le="4"} 26
+foo_bucket{le="5"} 29
+foo_bucket{le="6"} 29
+foo_bucket{le="7"} 73
+foo_bucket{le="8"} 88
+foo_bucket{le="9"} 220
+foo_bucket{le="10"} 230
+foo_bucket{le="+Inf"} 230
 `)

 	f(&runtimemetrics.Float64Histogram{
 		Counts:  []uint64{1, 5, 0},
 		Buckets: []float64{math.Inf(-1), 4, 5, math.Inf(1)},
-	}, `foo{quantile="0.5"} 5
-foo{quantile="0.9"} 5
-foo{quantile="0.97"} 5
-foo{quantile="0.99"} 5
-foo{quantile="1"} 5
+	}, `foo_bucket{le="4"} 1
+foo_bucket{le="5"} 6
+foo_bucket{le="+Inf"} 6
 `)
 }