Skip to content

Commit 6470faf

Browse files
craig[bot]aa-joshi
andcommitted
Merge #144453
144453: metrics: add SQLHistogram to support high cardinality metrics r=aa-joshi a=aa-joshi This patch introduces `SQLHistogram`, which is an aggregation histogram of `SQLChildHistogram` metrics. SQLHistogram supports a combination of `database` and `application_name` labels. The SQLChildHistogram stores the value of a histogram for a given combination of database and application name. SQLHistogram internally uses cache.UnorderedCache to store child metrics with a default size of 5000. SQLHistogram will report to crdb-internal time series only the aggregate sum of all its children, while its children are additionally exported to prometheus. SQLHistogram differs from AggHistogram in that a SQLHistogram creates child metrics dynamically while AggHistogram needs child creation up front. We have moved dynamic child creation out of AggHistogram and into SQLHistogram. Epic: [CRDB-43153](https://cockroachlabs.atlassian.net/browse/CRDB-43153) Part of: [CRDB-48489](https://cockroachlabs.atlassian.net/browse/CRDB-48489) Release note: None Co-authored-by: Akshay Joshi <[email protected]>
2 parents 77bb5da + 9245fcf commit 6470faf

13 files changed

+583
-447
lines changed

pkg/util/metric/aggmetric/agg_metric.go

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -236,11 +236,11 @@ func (sm *SQLMetric) add(metric ChildMetric) {
236236
sm.mu.children.Add(metric)
237237
}
238238

239+
type createChildMetricFunc func(labelValues labelValuesSlice) ChildMetric
240+
239241
// getOrAddChild returns the child metric for the given label values. If the child
240242
// doesn't exist, it creates a new one and adds it to the collection.
241-
func (sm *SQLMetric) getOrAddChild(
242-
metricType io_prometheus_client.MetricType, labelValues ...string,
243-
) ChildMetric {
243+
func (sm *SQLMetric) getOrAddChild(f createChildMetricFunc, labelValues ...string) ChildMetric {
244244
sm.mu.Lock()
245245
defer sm.mu.Unlock()
246246

@@ -249,20 +249,7 @@ func (sm *SQLMetric) getOrAddChild(
249249
return child
250250
}
251251

252-
// Otherwise, create a new child, add and return it.
253-
var child ChildMetric
254-
switch metricType {
255-
case io_prometheus_client.MetricType_COUNTER:
256-
child = &SQLChildCounter{
257-
labelValuesSlice: labelValuesSlice(labelValues),
258-
}
259-
case io_prometheus_client.MetricType_GAUGE:
260-
child = &SQLChildGauge{
261-
labelValuesSlice: labelValuesSlice(labelValues),
262-
}
263-
default:
264-
panic(errors.AssertionFailedf("unrecognised metric type %v", metricType))
265-
}
252+
child := f(labelValues)
266253

267254
sm.add(child)
268255
return child
@@ -273,18 +260,18 @@ func (sm *SQLMetric) getOrAddChild(
273260
// If the label configuration is either LabelConfigDisabled or unrecognised, it returns
274261
// ChildMetric as nil and false.
275262
func (sm *SQLMetric) getChildByLabelConfig(
276-
metricType io_prometheus_client.MetricType, db string, app string,
263+
f createChildMetricFunc, db string, app string,
277264
) (ChildMetric, bool) {
278265
var childMetric ChildMetric
279266
switch sm.labelConfig.Load() {
280267
case LabelConfigDB:
281-
childMetric = sm.getOrAddChild(metricType, db)
268+
childMetric = sm.getOrAddChild(f, db)
282269
return childMetric, true
283270
case LabelConfigApp:
284-
childMetric = sm.getOrAddChild(metricType, app)
271+
childMetric = sm.getOrAddChild(f, app)
285272
return childMetric, true
286273
case LabelConfigAppAndDB:
287-
childMetric = sm.getOrAddChild(metricType, db, app)
274+
childMetric = sm.getOrAddChild(f, db, app)
288275
return childMetric, true
289276
default:
290277
return nil, false

pkg/util/metric/aggmetric/counter.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -320,14 +320,20 @@ func (c *SQLCounter) Inspect(f func(interface{})) {
320320
func (c *SQLCounter) Inc(i int64, db, app string) {
321321
c.g.Inc(i)
322322

323-
childMetric, isChildMetricEnabled := c.getChildByLabelConfig(*c.GetType(), db, app)
323+
childMetric, isChildMetricEnabled := c.getChildByLabelConfig(c.createChildCounter, db, app)
324324
if !isChildMetricEnabled {
325325
return
326326
}
327327

328328
childMetric.(*SQLChildCounter).Inc(i)
329329
}
330330

331+
func (c *SQLCounter) createChildCounter(labelValues labelValuesSlice) ChildMetric {
332+
return &SQLChildCounter{
333+
labelValuesSlice: labelValues,
334+
}
335+
}
336+
331337
// SQLChildCounter is a child of a SQLCounter. When metrics are collected by prometheus,
332338
// each of the children will appear with a distinct label, however, when cockroach
333339
// internally collects metrics, only the parent is collected.

pkg/util/metric/aggmetric/gauge.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ func (sg *SQLGauge) Inspect(f func(interface{})) {
421421
// Gauge and updates it. Update increments parent metrics
422422
// irrespective of labelConfig.
423423
func (sg *SQLGauge) Update(val int64, db, app string) {
424-
childMetric, isChildMetricEnabled := sg.getChildByLabelConfig(*sg.GetType(), db, app)
424+
childMetric, isChildMetricEnabled := sg.getChildByLabelConfig(sg.createChildGauge, db, app)
425425

426426
// If the label configuration is either LabelConfigDisabled or unrecognised,
427427
// then only update aggregated gauge value.
@@ -442,7 +442,7 @@ func (sg *SQLGauge) Update(val int64, db, app string) {
442442
func (sg *SQLGauge) Inc(i int64, db, app string) {
443443
sg.g.Inc(i)
444444

445-
childMetric, isChildMetricEnabled := sg.getChildByLabelConfig(*sg.GetType(), db, app)
445+
childMetric, isChildMetricEnabled := sg.getChildByLabelConfig(sg.createChildGauge, db, app)
446446
if !isChildMetricEnabled {
447447
return
448448
}
@@ -456,13 +456,19 @@ func (sg *SQLGauge) Inc(i int64, db, app string) {
456456
func (sg *SQLGauge) Dec(i int64, db, app string) {
457457
sg.g.Dec(i)
458458

459-
childMetric, isChildMetricEnabled := sg.getChildByLabelConfig(*sg.GetType(), db, app)
459+
childMetric, isChildMetricEnabled := sg.getChildByLabelConfig(sg.createChildGauge, db, app)
460460
if !isChildMetricEnabled {
461461
return
462462
}
463463
childMetric.(*SQLChildGauge).Dec(i)
464464
}
465465

466+
func (sg *SQLGauge) createChildGauge(labelValues labelValuesSlice) ChildMetric {
467+
return &SQLChildGauge{
468+
labelValuesSlice: labelValues,
469+
}
470+
}
471+
466472
// SQLChildGauge is a child of a SQLGauge. When metrics are collected by prometheus,
467473
// each of the children will appear with a distinct label, however, when cockroach
468474
// internally collects metrics, only the parent is collected.

pkg/util/metric/aggmetric/histogram.go

Lines changed: 166 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ func (a *AggHistogram) ToPrometheusMetric() *prometheusgo.Metric {
141141
return a.h.ToPrometheusMetric()
142142
}
143143

144-
// AddChild adds a Counter to this AggHistogram. This method panics if a Counter
144+
// AddChild adds a Histogram to this AggHistogram. This method panics if a Histogram
145145
// already exists for this set of labelVals.
146146
func (a *AggHistogram) AddChild(labelVals ...string) *Histogram {
147147
child := &Histogram{
@@ -153,37 +153,6 @@ func (a *AggHistogram) AddChild(labelVals ...string) *Histogram {
153153
return child
154154
}
155155

156-
// RecordValue adds the given value to the histogram for the given label values. If a
157-
// histogram with the given label values doesn't exist yet, it creates a new
158-
// histogram and increments it. Panics if the number of label values doesn't
159-
// match the number of labels defined for this histogram.
160-
// Recording a value in excess of the configured maximum value for that histogram
161-
// results in recording the maximum value instead.
162-
func (a *AggHistogram) RecordValue(v int64, labelVals ...string) {
163-
if len(a.labels) != len(labelVals) {
164-
panic(errors.AssertionFailedf(
165-
"cannot increment child with %d label values %v to a metric with %d labels %v",
166-
len(labelVals), labelVals, len(a.labels), a.labels))
167-
}
168-
169-
// If the child already exists, update it.
170-
if child, ok := a.get(labelVals...); ok {
171-
child.(*Histogram).RecordValue(v)
172-
return
173-
}
174-
175-
// Otherwise, create a new child and update it.
176-
child := a.AddChild(labelVals...)
177-
child.RecordValue(v)
178-
}
179-
180-
// RemoveChild removes a Gauge from this AggGauge. This method panics if a Gauge
181-
// does not exist for this set of labelVals.
182-
func (g *AggHistogram) RemoveChild(labelVals ...string) {
183-
key := &Histogram{labelValuesSlice: labelValuesSlice(labelVals)}
184-
g.remove(key)
185-
}
186-
187156
// Histogram is a child of an AggHistogram. When values are recorded, so too is the
188157
// parent. When metrics are collected by prometheus, each of the children will
189158
// appear with a distinct label, however, when cockroach internally collects
@@ -218,3 +187,168 @@ func (g *Histogram) RecordValue(v int64) {
218187
g.h.RecordValue(v)
219188
g.parent.h.RecordValue(v)
220189
}
190+
191+
// SQLHistogram maintains a histogram as the sum of its children. The histogram will
192+
// report to crdb-internal time series only the aggregate sum of all of its
193+
// children, while its children are additionally exported to prometheus via the
194+
// PrometheusIterable interface. SQLHistogram differs from AggHistogram in that
195+
// a SQLHistogram creates child metrics dynamically while AggHistogram needs the
196+
// child creation up front.
197+
type SQLHistogram struct {
198+
h metric.IHistogram
199+
create func() metric.IHistogram
200+
*SQLMetric
201+
ticker struct {
202+
// We use a RWMutex, because we don't want child histograms to contend when
203+
// recording values, unless we're rotating histograms for the parent & children.
204+
// In this instance, the "writer" for the RWMutex is the ticker, and the "readers"
205+
// are all the child histograms recording their values.
206+
syncutil.RWMutex
207+
*tick.Ticker
208+
}
209+
}
210+
211+
var _ metric.Iterable = (*SQLHistogram)(nil)
212+
var _ metric.PrometheusIterable = (*SQLHistogram)(nil)
213+
var _ metric.PrometheusExportable = (*SQLHistogram)(nil)
214+
var _ metric.WindowedHistogram = (*SQLHistogram)(nil)
215+
var _ metric.CumulativeHistogram = (*SQLHistogram)(nil)
216+
217+
func NewSQLHistogram(opts metric.HistogramOptions) *SQLHistogram {
218+
create := func() metric.IHistogram {
219+
return metric.NewHistogram(opts)
220+
}
221+
s := &SQLHistogram{
222+
h: create(),
223+
create: create,
224+
}
225+
s.SQLMetric = NewSQLMetric(LabelConfigDisabled)
226+
s.ticker.Ticker = tick.NewTicker(
227+
now(),
228+
opts.Duration/metric.WindowedHistogramWrapNum,
229+
func() {
230+
// Atomically rotate the histogram window for the
231+
// parent histogram, and all the child histograms.
232+
s.h.Tick()
233+
s.apply(func(childMetric ChildMetric) {
234+
childHist, ok := childMetric.(*SQLChildHistogram)
235+
if !ok {
236+
panic(errors.AssertionFailedf(
237+
"unable to assert type of child for histogram %q when rotating histogram windows",
238+
opts.Metadata.Name))
239+
}
240+
childHist.h.Tick()
241+
})
242+
})
243+
return s
244+
}
245+
246+
// apply applies the given applyFn to every child metric in children.
247+
func (sh *SQLHistogram) apply(applyFn func(childMetric ChildMetric)) {
248+
sh.mu.Lock()
249+
defer sh.mu.Unlock()
250+
sh.mu.children.Do(func(e interface{}) {
251+
applyFn(sh.mu.children.GetChildMetric(e).(*SQLChildHistogram))
252+
})
253+
}
254+
255+
// GetType is part of the metric.PrometheusExportable interface.
256+
func (sh *SQLHistogram) GetType() *prometheusgo.MetricType {
257+
return sh.h.GetType()
258+
}
259+
260+
// GetLabels is part of the metric.PrometheusExportable interface.
261+
func (sh *SQLHistogram) GetLabels(useStaticLabels bool) []*prometheusgo.LabelPair {
262+
return sh.h.GetLabels(useStaticLabels)
263+
}
264+
265+
// ToPrometheusMetric is part of the metric.PrometheusExportable interface.
266+
func (sh *SQLHistogram) ToPrometheusMetric() *prometheusgo.Metric {
267+
return sh.h.ToPrometheusMetric()
268+
}
269+
270+
// GetName is part of the metric.Iterable interface.
271+
func (sh *SQLHistogram) GetName(useStaticLabels bool) string {
272+
return sh.h.GetName(useStaticLabels)
273+
}
274+
275+
// GetHelp is part of the metric.Iterable interface.
276+
func (sh *SQLHistogram) GetHelp() string {
277+
return sh.h.GetHelp()
278+
}
279+
280+
// GetMeasurement is part of the metric.Iterable interface.
281+
func (sh *SQLHistogram) GetMeasurement() string {
282+
return sh.h.GetMeasurement()
283+
}
284+
285+
// GetUnit is part of the metric.Iterable interface.
286+
func (sh *SQLHistogram) GetUnit() metric.Unit {
287+
return sh.h.GetUnit()
288+
}
289+
290+
// GetMetadata is part of the metric.Iterable interface.
291+
func (sh *SQLHistogram) GetMetadata() metric.Metadata {
292+
return sh.h.GetMetadata()
293+
}
294+
295+
// Inspect is part of the metric.Iterable interface.
296+
func (sh *SQLHistogram) Inspect(f func(interface{})) {
297+
f(sh)
298+
}
299+
300+
// RecordValue records the Histogram value for the given label values. If a
301+
// Histogram with the given label values doesn't exist yet, it creates a new
302+
// Histogram and records the value against it. RecordValue records the value in the parent metric
303+
// irrespective of labelConfig.
304+
func (sh *SQLHistogram) RecordValue(v int64, db, app string) {
305+
childMetric, isChildMetricEnabled := sh.getChildByLabelConfig(sh.createChildHistogram, db, app)
306+
sh.ticker.RLock()
307+
defer sh.ticker.RUnlock()
308+
309+
sh.h.RecordValue(v)
310+
if !isChildMetricEnabled {
311+
return
312+
}
313+
childMetric.(*SQLChildHistogram).RecordValue(v)
314+
}
315+
316+
// CumulativeSnapshot is part of the metric.CumulativeHistogram interface.
317+
func (sh *SQLHistogram) CumulativeSnapshot() metric.HistogramSnapshot {
318+
return sh.h.CumulativeSnapshot()
319+
}
320+
321+
// WindowedSnapshot is part of the metric.WindowedHistogram interface.
322+
func (sh *SQLHistogram) WindowedSnapshot() metric.HistogramSnapshot {
323+
return sh.h.WindowedSnapshot()
324+
}
325+
326+
func (sh *SQLHistogram) createChildHistogram(labelValues labelValuesSlice) ChildMetric {
327+
return &SQLChildHistogram{
328+
h: sh.create(),
329+
labelValuesSlice: labelValues,
330+
}
331+
}
332+
333+
// SQLChildHistogram is a child of a SQLHistogram. When metrics are collected by prometheus,
334+
// each of the children will appear with a distinct label, however, when cockroach
335+
// internally collects metrics, only the parent is collected.
336+
type SQLChildHistogram struct {
337+
labelValuesSlice
338+
h metric.IHistogram
339+
}
340+
341+
// ToPrometheusMetric constructs a prometheus metric for this Histogram.
342+
func (sch *SQLChildHistogram) ToPrometheusMetric() *prometheusgo.Metric {
343+
return sch.h.ToPrometheusMetric()
344+
}
345+
346+
// RecordValue records the given value in the child histogram.
347+
func (sch *SQLChildHistogram) RecordValue(v int64) {
348+
sch.h.RecordValue(v)
349+
}
350+
351+
// Value returns the cumulative snapshot of the SQLChildHistogram.
352+
func (sch *SQLChildHistogram) Value() metric.HistogramSnapshot {
353+
return sch.h.CumulativeSnapshot()
354+
}

pkg/util/metric/aggmetric/histogram_test.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import (
2323
"github.com/prometheus/common/expfmt"
2424
)
2525

26-
func TestAggHistogram(t *testing.T) {
26+
func TestSQLHistogram(t *testing.T) {
2727
defer leaktest.AfterTest(t)()
2828
const cacheSize = 10
2929
r := metric.NewRegistry()
@@ -44,15 +44,15 @@ func TestAggHistogram(t *testing.T) {
4444
return strings.Join(lines, "\n")
4545
}
4646

47-
h := NewHistogram(metric.HistogramOptions{
47+
h := NewSQLHistogram(metric.HistogramOptions{
4848
Metadata: metric.Metadata{
4949
Name: "histo_gram",
5050
},
5151
Duration: base.DefaultHistogramWindowInterval(),
5252
MaxVal: 100,
5353
SigFigs: 1,
5454
BucketConfig: metric.Percent100Buckets,
55-
}, "tenant_id", "hist_label")
55+
})
5656
r.AddMetric(h)
5757
cacheStorage := cache.NewUnorderedCache(cache.Config{
5858
Policy: cache.CacheLRU,
@@ -63,14 +63,15 @@ func TestAggHistogram(t *testing.T) {
6363
h.mu.children = &UnorderedCacheWrapper{
6464
cache: cacheStorage,
6565
}
66+
h.labelConfig.Store(LabelConfigAppAndDB)
6667

6768
for i := 0; i < cacheSize; i++ {
6869
h.RecordValue(1, "1", strconv.Itoa(i))
6970
}
7071

71-
testFile := "aggHistogram_pre_eviction.txt"
72+
testFile := "SQLHistogram_pre_eviction.txt"
7273
if metric.HdrEnabled() {
73-
testFile = "aggHistogram_pre_eviction_hdr.txt"
74+
testFile = "SQLHistogram_pre_eviction_hdr.txt"
7475
}
7576

7677
echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, testFile))
@@ -79,9 +80,9 @@ func TestAggHistogram(t *testing.T) {
7980
h.RecordValue(10, "2", strconv.Itoa(i))
8081
}
8182

82-
testFile = "aggHistogram_post_eviction.txt"
83+
testFile = "SQLHistogram_post_eviction.txt"
8384
if metric.HdrEnabled() {
84-
testFile = "aggHistogram_post_eviction_hdr.txt"
85+
testFile = "SQLHistogram_post_eviction_hdr.txt"
8586
}
8687
echotest.Require(t, writePrometheusMetrics(t), datapathutils.TestDataPath(t, testFile))
8788
}

0 commit comments

Comments
 (0)