From 2ec14979a8f5e9665b7490fe580d3c49d831b4eb Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Thu, 30 Nov 2023 01:47:03 +0200
Subject: [PATCH] go_metrics.go: use histogram buckets instead of summary for
 Go runtime histogram

It is unclear how and when to reset a summary - it is smoothed over the whole Go app uptime.
Histogram buckets, on the other hand, can be wrapped into increase() or rate() in order
to calculate the histogram distribution over an arbitrary time range.

Limit the number of buckets per Go runtime histogram to 30 in order to prevent high cardinality issues.
---
 go_metrics.go      | 44 +++++++++++++++++++++++++++++++-------------
 go_metrics_test.go | 41 +++++++++++++++++++++--------------------
 2 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/go_metrics.go b/go_metrics.go
index 0bdb20a..5de7cb8 100644
--- a/go_metrics.go
+++ b/go_metrics.go
@@ -6,6 +6,7 @@ import (
 	"math"
 	"runtime"
 	runtimemetrics "runtime/metrics"
+	"strings"
 
 	"github.com/valyala/histogram"
 )
@@ -104,23 +105,40 @@ func writeRuntimeMetric(w io.Writer, name string, sample *runtimemetrics.Sample)
 }
 
 func writeRuntimeHistogramMetric(w io.Writer, name string, h *runtimemetrics.Float64Histogram) {
-	// Expose histogram metric as summary, since Go runtime returns too many histogram buckets,
-	// which may lead to high cardinality issues at the scraper side.
 	buckets := h.Buckets
 	counts := h.Counts
-	totalCount := uint64(0)
-	for _, count := range counts {
-		totalCount += count
+	if len(buckets) != len(counts)+1 {
+		panic(fmt.Errorf("the number of buckets must be bigger than the number of counts by 1 in histogram %s; got buckets=%d, counts=%d", name, len(buckets), len(counts)))
 	}
-	for _, q := range defaultSummaryQuantiles {
-		upperBound := uint64(math.Ceil(q * float64(totalCount)))
-		runningCount := uint64(0)
-		for i, count := range counts {
-			runningCount += count
-			if runningCount >= upperBound {
-				fmt.Fprintf(w, `%s{quantile="%g"} %g`+"\n", name, q, buckets[i+1])
-				break
+	tailCount := uint64(0)
+	if strings.HasSuffix(name, "_seconds") {
+		// Limit the maximum bucket to 1 second, since Go runtime exposes buckets with 10K seconds,
+		// which make little sense. At the same time such buckets may lead to high cardinality issues
+		// at the scraper side.
+		for len(buckets) > 0 && buckets[len(buckets)-1] > 1 {
+			buckets = buckets[:len(buckets)-1]
+			tailCount += counts[len(counts)-1]
+			counts = counts[:len(counts)-1]
+		}
+	}
+
+	iStep := float64(len(buckets)) / maxRuntimeHistogramBuckets
+
+	totalCount := uint64(0)
+	iNext := 0.0
+	for i, count := range counts {
+		totalCount += count
+		if float64(i) >= iNext {
+			iNext += iStep
+			le := buckets[i+1]
+			if !math.IsInf(le, 1) {
+				fmt.Fprintf(w, `%s_bucket{le="%g"} %d`+"\n", name, le, totalCount)
 			}
 		}
 	}
+	totalCount += tailCount
+	fmt.Fprintf(w, `%s_bucket{le="+Inf"} %d`+"\n", name, totalCount)
 }
+
+// Limit the number of buckets for Go runtime histograms in order to prevent high cardinality issues at the scraper side.
+const maxRuntimeHistogramBuckets = 30
diff --git a/go_metrics_test.go b/go_metrics_test.go
index 48adc05..cd510e7 100644
--- a/go_metrics_test.go
+++ b/go_metrics_test.go
@@ -22,40 +22,41 @@ func TestWriteRuntimeHistogramMetricOk(t *testing.T) {
 	f(&runtimemetrics.Float64Histogram{
 		Counts:  []uint64{1, 2, 3},
 		Buckets: []float64{1, 2, 3, 4},
-	}, `foo{quantile="0.5"} 3
-foo{quantile="0.9"} 4
-foo{quantile="0.97"} 4
-foo{quantile="0.99"} 4
-foo{quantile="1"} 4
+	}, `foo_bucket{le="2"} 1
+foo_bucket{le="3"} 3
+foo_bucket{le="4"} 6
+foo_bucket{le="+Inf"} 6
 `)
 
 	f(&runtimemetrics.Float64Histogram{
 		Counts:  []uint64{0, 25, 1, 0},
 		Buckets: []float64{1, 2, 3, 4, math.Inf(1)},
-	}, `foo{quantile="0.5"} 3
-foo{quantile="0.9"} 3
-foo{quantile="0.97"} 4
-foo{quantile="0.99"} 4
-foo{quantile="1"} 4
+	}, `foo_bucket{le="2"} 0
+foo_bucket{le="3"} 25
+foo_bucket{le="4"} 26
+foo_bucket{le="+Inf"} 26
 `)
 
 	f(&runtimemetrics.Float64Histogram{
 		Counts:  []uint64{0, 25, 1, 3, 0, 44, 15, 132, 10, 0},
 		Buckets: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, math.Inf(1)},
-	}, `foo{quantile="0.5"} 9
-foo{quantile="0.9"} 9
-foo{quantile="0.97"} 10
-foo{quantile="0.99"} 10
-foo{quantile="1"} 10
+	}, `foo_bucket{le="2"} 0
+foo_bucket{le="3"} 25
+foo_bucket{le="4"} 26
+foo_bucket{le="5"} 29
+foo_bucket{le="6"} 29
+foo_bucket{le="7"} 73
+foo_bucket{le="8"} 88
+foo_bucket{le="9"} 220
+foo_bucket{le="10"} 230
+foo_bucket{le="+Inf"} 230
 `)
 
 	f(&runtimemetrics.Float64Histogram{
 		Counts:  []uint64{1, 5, 0},
 		Buckets: []float64{math.Inf(-1), 4, 5, math.Inf(1)},
-	}, `foo{quantile="0.5"} 5
-foo{quantile="0.9"} 5
-foo{quantile="0.97"} 5
-foo{quantile="0.99"} 5
-foo{quantile="1"} 5
+	}, `foo_bucket{le="4"} 1
+foo_bucket{le="5"} 6
+foo_bucket{le="+Inf"} 6
 `)
 }