Skip to content

Commit 9c4cd10

Browse files
SuperQ authored and cloudbehl committed
ceph-mixin: Update monitoring mixin
Update `rate()` queries to be more accurate. The use of `irate()` leads to misleading graphs because it only looks at the last 2 samples within each step interval of the selected time range. Also use `$__rate_interval` consistently so that queries scale over both short and long time ranges.

* Replace `irate()` with `rate()` to avoid sample bias.
* Use `$__rate_interval` consistently.
* Update `auto_count`/`auto_min` to provide higher detail graphs.

Fixes: https://tracker.ceph.com/issues/72343

Signed-off-by: SuperQ <[email protected]>
Signed-off-by: Ankush Behl <[email protected]>
1 parent 48a0613 commit 9c4cd10

21 files changed

+194
-92
lines changed

monitoring/ceph-mixin/dashboards/ceph-cluster.libsonnet

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ local g = import 'grafonnet/grafana.libsonnet';
3939
current='$__auto_interval_interval',
4040
refresh=2,
4141
label='Interval',
42-
auto_count=10,
42+
auto_count=300,
4343
auto_min='1m',
4444
options=[
4545
{ selected: true, text: 'auto', value: '$__auto_interval_interval' },
@@ -175,7 +175,7 @@ local g = import 'grafonnet/grafana.libsonnet';
175175
{ color: 'green' },
176176
])
177177
.addTarget($.addTargetSchema(
178-
expr='sum(irate(ceph_osd_op_w_in_bytes{%(matchers)s}[5m]))' % $.matchers(),
178+
expr='sum(rate(ceph_osd_op_w_in_bytes{%(matchers)s}[$__rate_interval]))' % $.matchers(),
179179
instant=true,
180180
interval='$interval',
181181
datasource='$datasource',
@@ -198,7 +198,7 @@ local g = import 'grafonnet/grafana.libsonnet';
198198
{ color: '#9ac48a', value: 0 },
199199
])
200200
.addTarget($.addTargetSchema(
201-
expr='sum(irate(ceph_osd_op_r_out_bytes{%(matchers)s}[5m]))' % $.matchers(),
201+
expr='sum(rate(ceph_osd_op_r_out_bytes{%(matchers)s}[$__rate_interval]))' % $.matchers(),
202202
instant=true,
203203
interval='$interval',
204204
datasource='$datasource',
@@ -495,7 +495,7 @@ local g = import 'grafonnet/grafana.libsonnet';
495495
])
496496
.addTargets([
497497
$.addTargetSchema(
498-
expr='sum(irate(ceph_osd_op_w{%(matchers)s}[1m]))' % $.matchers(),
498+
expr='sum(rate(ceph_osd_op_w{%(matchers)s}[$__rate_interval]))' % $.matchers(),
499499
legendFormat='',
500500
datasource='$datasource',
501501
instant=true,
@@ -526,7 +526,7 @@ local g = import 'grafonnet/grafana.libsonnet';
526526
])
527527
.addTargets([
528528
$.addTargetSchema(
529-
expr='sum(irate(ceph_osd_op_r{%(matchers)s}[1m]))' % $.matchers(),
529+
expr='sum(rate(ceph_osd_op_r{%(matchers)s}[$__rate_interval]))' % $.matchers(),
530530
legendFormat='',
531531
datasource='$datasource',
532532
instant=true,
@@ -729,15 +729,15 @@ local g = import 'grafonnet/grafana.libsonnet';
729729
.addTargets(
730730
[
731731
$.addTargetSchema(
732-
expr='sum(irate(ceph_osd_op_w_in_bytes{%(matchers)s}[5m]))' % $.matchers(),
732+
expr='sum(rate(ceph_osd_op_w_in_bytes{%(matchers)s}[$__rate_interval]))' % $.matchers(),
733733
datasource='$datasource',
734734
interval='$interval',
735735
legendFormat='Write',
736736
step=300,
737737
range=true,
738738
),
739739
$.addTargetSchema(
740-
expr='sum(irate(ceph_osd_op_r_out_bytes{%(matchers)s}[5m]))' % $.matchers(),
740+
expr='sum(rate(ceph_osd_op_r_out_bytes{%(matchers)s}[$__rate_interval]))' % $.matchers(),
741741
datasource='$datasource',
742742
interval='$interval',
743743
legendFormat='Read',
@@ -772,15 +772,15 @@ local g = import 'grafonnet/grafana.libsonnet';
772772
.addTargets(
773773
[
774774
$.addTargetSchema(
775-
expr='sum(irate(ceph_osd_op_w{%(matchers)s}[1m]))' % $.matchers(),
775+
expr='sum(rate(ceph_osd_op_w{%(matchers)s}[$__rate_interval]))' % $.matchers(),
776776
datasource='$datasource',
777777
interval='$interval',
778778
legendFormat='Write',
779779
step=300,
780780
range=true,
781781
),
782782
$.addTargetSchema(
783-
expr='sum(irate(ceph_osd_op_r{%(matchers)s}[1m]))' % $.matchers(),
783+
expr='sum(rate(ceph_osd_op_r{%(matchers)s}[$__rate_interval]))' % $.matchers(),
784784
datasource='$datasource',
785785
interval='$interval',
786786
legendFormat='Read',
@@ -1298,7 +1298,7 @@ local g = import 'grafonnet/grafana.libsonnet';
12981298
])
12991299
.addTargets([
13001300
$.addTargetSchema(
1301-
expr='sum(irate(ceph_osd_recovery_ops{%(matchers)s}[$interval]))' % $.matchers(),
1301+
expr='sum(rate(ceph_osd_recovery_ops{%(matchers)s}[$__rate_interval]))' % $.matchers(),
13021302
datasource='$datasource',
13031303
interval='$interval',
13041304
legendFormat='OPS',
@@ -1443,7 +1443,7 @@ local g = import 'grafonnet/grafana.libsonnet';
14431443
yBucketSize=null,
14441444
pluginVersion='9.4.7',
14451445
).addTarget($.addTargetSchema(
1446-
expr='rate(ceph_osd_op_r_latency_sum{%(matchers)s}[5m]) / rate(ceph_osd_op_r_latency_count{%(matchers)s}[5m]) >= 0' % $.matchers(),
1446+
expr='rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) >= 0' % $.matchers(),
14471447
datasource='$datasource',
14481448
interval='$interval',
14491449
instant=false,
@@ -1494,7 +1494,7 @@ local g = import 'grafonnet/grafana.libsonnet';
14941494
yBucketSize=null,
14951495
pluginVersion='9.4.7',
14961496
).addTarget($.addTargetSchema(
1497-
expr='rate(ceph_osd_op_w_latency_sum{%(matchers)s}[5m]) / rate(ceph_osd_op_w_latency_count{%(matchers)s}[5m]) >= 0' % $.matchers(),
1497+
expr='rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) >= 0' % $.matchers(),
14981498
datasource='$datasource',
14991499
interval='$interval',
15001500
legendFormat='',
@@ -1525,12 +1525,12 @@ local g = import 'grafonnet/grafana.libsonnet';
15251525
])
15261526
.addTargets([
15271527
$.addTargetSchema(
1528-
expr='avg(rate(ceph_osd_op_r_latency_sum{%(matchers)s}[5m]) / rate(ceph_osd_op_r_latency_count{%(matchers)s}[5m]) >= 0)' % $.matchers(),
1528+
expr='avg(rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) >= 0)' % $.matchers(),
15291529
datasource='$datasource',
15301530
legendFormat='Read',
15311531
),
15321532
$.addTargetSchema(
1533-
expr='avg(rate(ceph_osd_op_w_latency_sum{%(matchers)s}[5m]) / rate(ceph_osd_op_w_latency_count{%(matchers)s}[5m]) >= 0)' % $.matchers(),
1533+
expr='avg(rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) >= 0)' % $.matchers(),
15341534
datasource='$datasource',
15351535
legendFormat='Write',
15361536
),

monitoring/ceph-mixin/dashboards/ceph-nvmeof-performance.libsonnet

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ local g = import 'grafonnet/grafana.libsonnet';
119119
])
120120
.addTarget(
121121
$.addTargetSchema(
122-
expr="avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode='busy',instance=~'$gateway'}[1m]))",
122+
expr="avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode='busy',instance=~'$gateway'}[$__rate_interval]))",
123123
format='',
124124
instant=false,
125125
legendFormat='{{name}}',
@@ -155,7 +155,7 @@ local g = import 'grafonnet/grafana.libsonnet';
155155
])
156156
.addTarget(
157157
$.addTargetSchema(
158-
expr="avg by (instance) (rate(ceph_nvmeof_reactor_seconds_total{mode='busy', instance=~'$gateway.*'}[1m]))\n",
158+
expr="avg by (instance) (rate(ceph_nvmeof_reactor_seconds_total{mode='busy', instance=~'$gateway.*'}[$__rate_interval]))",
159159
format='',
160160
instant=false,
161161
legendFormat='{{name}}',
@@ -191,7 +191,7 @@ local g = import 'grafonnet/grafana.libsonnet';
191191
])
192192
.addTarget(
193193
$.addTargetSchema(
194-
expr="avg((rate(ceph_nvmeof_bdev_read_seconds_total{instance=~'$gateway'}[30s]) / rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[30s])) > 0)\n",
194+
expr="avg((rate(ceph_nvmeof_bdev_read_seconds_total{instance=~'$gateway'}[$__rate_interval]) / rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[$__rate_interval])) > 0)",
195195
format='time_series',
196196
instant=false,
197197
legendFormat='Reads',
@@ -201,7 +201,7 @@ local g = import 'grafonnet/grafana.libsonnet';
201201
)
202202
.addTarget(
203203
$.addTargetSchema(
204-
expr="avg((rate(ceph_nvmeof_bdev_write_seconds_total{instance=~'$gateway'}[30s]) / rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[30s])) > 0)",
204+
expr="avg((rate(ceph_nvmeof_bdev_write_seconds_total{instance=~'$gateway'}[$__rate_interval]) / rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[$__rate_interval])) > 0)",
205205
format='time_series',
206206
instant=false,
207207
legendFormat='Writes',
@@ -237,7 +237,7 @@ local g = import 'grafonnet/grafana.libsonnet';
237237
])
238238
.addTarget(
239239
$.addTargetSchema(
240-
expr="sum by(instance) (rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[1m]) + rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[1m]))",
240+
expr="sum by(instance) (rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[$__rate_interval]) + rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[$__rate_interval]))",
241241
format='time_series',
242242
instant=false,
243243
legendFormat='__auto',
@@ -272,7 +272,7 @@ local g = import 'grafonnet/grafana.libsonnet';
272272
])
273273
.addTarget(
274274
$.addTargetSchema(
275-
expr="\nsum by(nqn) ((rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[1m]) + rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[1m])) * on(instance,bdev_name) group_right ceph_nvmeof_subsystem_namespace_metadata{instance=~'$gateway'})",
275+
expr="sum by(nqn) ((rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[$__rate_interval]) + rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[$__rate_interval])) * on(instance,bdev_name) group_right ceph_nvmeof_subsystem_namespace_metadata{instance=~'$gateway'})",
276276
format='time_series',
277277
instant=false,
278278
legendFormat='__auto',
@@ -308,7 +308,7 @@ local g = import 'grafonnet/grafana.libsonnet';
308308
])
309309
.addTarget(
310310
$.addTargetSchema(
311-
expr="topk(5, (sum by(pool_name, rbd_name) (((rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[1m]) + rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[1m])) * on(instance,bdev_name) group_right ceph_nvmeof_bdev_metadata{instance=~'$gateway'}) * on(instance, bdev_name) group_left(nqn) ceph_nvmeof_subsystem_namespace_metadata{nqn=~'$subsystem',instance=~'$gateway'})))",
311+
expr="topk(5, (sum by(pool_name, rbd_name) (((rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[$__rate_interval]) + rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[$__rate_interval])) * on(instance,bdev_name) group_right ceph_nvmeof_bdev_metadata{instance=~'$gateway'}) * on(instance, bdev_name) group_left(nqn) ceph_nvmeof_subsystem_namespace_metadata{nqn=~'$subsystem',instance=~'$gateway'})))",
312312
format='time_series',
313313
instant=false,
314314
legendFormat='{{pool_name}}/{{rbd_name}}',
@@ -344,7 +344,7 @@ local g = import 'grafonnet/grafana.libsonnet';
344344
])
345345
.addTarget(
346346
$.addTargetSchema(
347-
expr="sum by(instance) (rate(ceph_nvmeof_bdev_read_bytes_total{instance=~'$gateway'}[1m]) + rate(ceph_nvmeof_bdev_written_bytes_total{instance=~'$gateway'}[1m]))",
347+
expr="sum by(instance) (rate(ceph_nvmeof_bdev_read_bytes_total{instance=~'$gateway'}[$__rate_interval]) + rate(ceph_nvmeof_bdev_written_bytes_total{instance=~'$gateway'}[$__rate_interval]))",
348348
format='time_series',
349349
instant=false,
350350
legendFormat='{{name}}',
@@ -379,7 +379,7 @@ local g = import 'grafonnet/grafana.libsonnet';
379379
])
380380
.addTarget(
381381
$.addTargetSchema(
382-
expr="\nsum by(nqn) ((rate(ceph_nvmeof_bdev_read_bytes_total{instance=~'$gateway'}[1m]) + rate(ceph_nvmeof_bdev_written_bytes_total{instance=~'$gateway'}[1m])) * on(instance,bdev_name) group_right ceph_nvmeof_subsystem_namespace_metadata{instance=~'$gateway'})",
382+
expr="sum by(nqn) ((rate(ceph_nvmeof_bdev_read_bytes_total{instance=~'$gateway'}[$__rate_interval]) + rate(ceph_nvmeof_bdev_written_bytes_total{instance=~'$gateway'}[$__rate_interval])) * on(instance,bdev_name) group_right ceph_nvmeof_subsystem_namespace_metadata{instance=~'$gateway'})",
383383
format='time_series',
384384
instant=false,
385385
legendFormat='__auto',
@@ -415,7 +415,7 @@ local g = import 'grafonnet/grafana.libsonnet';
415415
])
416416
.addTarget(
417417
$.addTargetSchema(
418-
expr="topk(5, (sum by(pool_name, rbd_name) (((rate(ceph_nvmeof_bdev_read_bytes_total{instance=~'$gateway'}[1m]) + rate(ceph_nvmeof_bdev_written_bytes_total{instance=~'$gateway'}[1m])) * on(instance,bdev_name) group_right ceph_nvmeof_bdev_metadata{instance=~'$gateway'}) * on(instance, bdev_name) group_left(nqn) ceph_nvmeof_subsystem_namespace_metadata{nqn=~'$subsystem',instance=~'$gateway'})))",
418+
expr="topk(5, (sum by(pool_name, rbd_name) (((rate(ceph_nvmeof_bdev_read_bytes_total{instance=~'$gateway'}[$__rate_interval]) + rate(ceph_nvmeof_bdev_written_bytes_total{instance=~'$gateway'}[$__rate_interval])) * on(instance,bdev_name) group_right ceph_nvmeof_bdev_metadata{instance=~'$gateway'}) * on(instance, bdev_name) group_left(nqn) ceph_nvmeof_subsystem_namespace_metadata{nqn=~'$subsystem',instance=~'$gateway'})))",
419419
format='time_series',
420420
instant=false,
421421
legendFormat='{{name}}',

monitoring/ceph-mixin/dashboards/ceph-nvmeof.libsonnet

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@ local g = import 'grafonnet/grafana.libsonnet';
696696
])
697697
.addTarget(
698698
$.addTargetSchema(
699-
expr="sum(irate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[$__rate_interval]))",
699+
expr="sum(rate(ceph_nvmeof_bdev_reads_completed_total{instance=~'$gateway'}[$__rate_interval]))",
700700
format='time_series',
701701
instant=null,
702702
legendFormat='Read',
@@ -706,7 +706,7 @@ local g = import 'grafonnet/grafana.libsonnet';
706706
)
707707
.addTarget(
708708
$.addTargetSchema(
709-
expr="sum(irate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[$__rate_interval]))",
709+
expr="sum(rate(ceph_nvmeof_bdev_writes_completed_total{instance=~'$gateway'}[$__rate_interval]))",
710710
format='time_series',
711711
instant=false,
712712
legendFormat='Write',
@@ -735,7 +735,7 @@ local g = import 'grafonnet/grafana.libsonnet';
735735
])
736736
.addTarget(
737737
$.addTargetSchema(
738-
expr="sum (irate(ceph_nvmeof_bdev_read_bytes_total{instance=~'$gateway'}[$__rate_interval]))",
738+
expr="sum (rate(ceph_nvmeof_bdev_read_bytes_total{instance=~'$gateway'}[$__rate_interval]))",
739739
format='time_series',
740740
instant=false,
741741
legendFormat='Read',
@@ -745,7 +745,7 @@ local g = import 'grafonnet/grafana.libsonnet';
745745
)
746746
.addTarget(
747747
$.addTargetSchema(
748-
expr="sum (irate(ceph_nvmeof_bdev_written_bytes_total{instance=~'$gateway'}[$__rate_interval]))",
748+
expr="sum (rate(ceph_nvmeof_bdev_written_bytes_total{instance=~'$gateway'}[$__rate_interval]))",
749749
format='time_series',
750750
instant=false,
751751
legendFormat='Write',

monitoring/ceph-mixin/dashboards/multi-cluster.libsonnet

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ local g = import 'grafonnet/grafana.libsonnet';
440440
])
441441
.addTargets([
442442
$.addTargetSchema(
443-
expr='sum(irate(ceph_pool_wr{cluster=~"$cluster"}[$__interval]))',
443+
expr='sum(rate(ceph_pool_wr{cluster=~"$cluster"}[$__rate_interval]))',
444444
datasource={ type: 'prometheus', uid: '$datasource' },
445445
hide=false,
446446
exemplar=false,
@@ -449,7 +449,7 @@ local g = import 'grafonnet/grafana.libsonnet';
449449
range=true,
450450
),
451451
$.addTargetSchema(
452-
expr='sum(irate(ceph_pool_rd{cluster=~"$cluster"}[$__interval]))',
452+
expr='sum(rate(ceph_pool_rd{cluster=~"$cluster"}[$__rate_interval]))',
453453
datasource={ type: 'prometheus', uid: '$datasource' },
454454
hide=false,
455455
exemplar=false,
@@ -617,7 +617,7 @@ local g = import 'grafonnet/grafana.libsonnet';
617617
])
618618
.addTargets([
619619
$.addTargetSchema(
620-
expr='sum(irate(ceph_pool_rd_bytes{cluster=~"$cluster"}[$__interval]))',
620+
expr='sum(rate(ceph_pool_rd_bytes{cluster=~"$cluster"}[$__rate_interval]))',
621621
datasource={ type: 'prometheus', uid: '$datasource' },
622622
hide=false,
623623
exemplar=false,
@@ -626,7 +626,7 @@ local g = import 'grafonnet/grafana.libsonnet';
626626
range=true,
627627
),
628628
$.addTargetSchema(
629-
expr='sum(irate(ceph_pool_wr_bytes{cluster=~"$cluster"}[$__interval]))',
629+
expr='sum(rate(ceph_pool_wr_bytes{cluster=~"$cluster"}[$__rate_interval]))',
630630
datasource={ type: 'prometheus', uid: '$datasource' },
631631
hide=false,
632632
exemplar=false,
@@ -651,7 +651,7 @@ local g = import 'grafonnet/grafana.libsonnet';
651651
])
652652
.addTargets([
653653
$.addTargetSchema(
654-
expr='sum(irate(ceph_osd_recovery_ops{cluster=~"$cluster"}[$__interval]))',
654+
expr='sum(rate(ceph_osd_recovery_ops{cluster=~"$cluster"}[$__rate_interval]))',
655655
datasource={ type: 'prometheus', uid: '$datasource' },
656656
hide=false,
657657
exemplar=false,
@@ -891,7 +891,7 @@ local g = import 'grafonnet/grafana.libsonnet';
891891
.addTargets(
892892
[
893893
$.addTargetSchema(
894-
expr='topk(10, sum by (cluster) (irate(ceph_osd_op_w[$__interval])) \n+ sum by (cluster) (irate(ceph_osd_op_r[$__interval])) )',
894+
expr='topk(10, sum by (cluster) (rate(ceph_osd_op_w[$__rate_interval])) \n+ sum by (cluster) (rate(ceph_osd_op_r[$__rate_interval])) )',
895895
datasource='$datasource',
896896
instant=false,
897897
legendFormat='{{cluster}}',

monitoring/ceph-mixin/dashboards/utils.libsonnet

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -485,8 +485,8 @@ local timeSeries = import 'timeseries_panel.libsonnet';
485485
valuelabels={},
486486
refresh=0,
487487
label='Interval',
488-
auto_count=10,
489-
auto_min='2m',
488+
auto_count=300,
489+
auto_min='1m',
490490
options=[],
491491
auto=null)::
492492
g.template.interval(name=name,

0 commit comments

Comments
 (0)