Skip to content

Commit 666f8fa

Browse files
author
Aashish Sharma
committed
mgr/dashboard: Add ceph_daemon filter to rgw overview grafana panel queries

Currently, rgw_servers filtering is not working in the RGW Overview Grafana graphs: they show data for all RGW services, even when the filter is set to a single service. This PR intends to solve this issue.

Fixes: https://tracker.ceph.com/issues/69074
Signed-off-by: Aashish Sharma <[email protected]>
1 parent 202d87a commit 666f8fa

File tree

3 files changed

+18
-12
lines changed

3 files changed

+18
-12
lines changed

monitoring/ceph-mixin/dashboards/rgw.libsonnet

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ local g = import 'grafonnet/grafana.libsonnet';
298298
label_replace(
299299
rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) /
300300
rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) *
301-
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
301+
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
302302
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
303303
)
304304
||| % $.matchers(),
@@ -314,7 +314,7 @@ local g = import 'grafonnet/grafana.libsonnet';
314314
label_replace(
315315
rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) /
316316
rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) *
317-
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
317+
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
318318
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
319319
)
320320
||| % $.matchers(),
@@ -331,7 +331,7 @@ local g = import 'grafonnet/grafana.libsonnet';
331331
sum by (rgw_host) (
332332
label_replace(
333333
rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) *
334-
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
334+
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
335335
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
336336
)
337337
)
@@ -351,7 +351,7 @@ local g = import 'grafonnet/grafana.libsonnet';
351351
label_replace(
352352
rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) /
353353
rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) *
354-
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
354+
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
355355
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
356356
)
357357
||| % $.matchers(),
@@ -385,7 +385,7 @@ local g = import 'grafonnet/grafana.libsonnet';
385385
label_replace(sum by (instance_id) (
386386
rate(ceph_rgw_op_get_obj_bytes{%(matchers)s}[$__rate_interval]) +
387387
rate(ceph_rgw_op_put_obj_bytes{%(matchers)s}[$__rate_interval])) *
388-
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
388+
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
389389
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
390390
)
391391
||| % $.matchers(),
@@ -404,7 +404,7 @@ local g = import 'grafonnet/grafana.libsonnet';
404404
label_replace(
405405
rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) /
406406
rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) *
407-
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
407+
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
408408
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
409409
)
410410
||| % $.matchers(),

monitoring/ceph-mixin/dashboards_out/radosgw-overview.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,14 +108,14 @@
108108
"steppedLine": false,
109109
"targets": [
110110
{
111-
"expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
111+
"expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
112112
"format": "time_series",
113113
"intervalFactor": 1,
114114
"legendFormat": "GET {{rgw_host}}",
115115
"refId": "A"
116116
},
117117
{
118-
"expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
118+
"expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
119119
"format": "time_series",
120120
"intervalFactor": 1,
121121
"legendFormat": "PUT {{rgw_host}}",
@@ -210,7 +210,7 @@
210210
"steppedLine": false,
211211
"targets": [
212212
{
213-
"expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n",
213+
"expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n",
214214
"format": "time_series",
215215
"intervalFactor": 1,
216216
"legendFormat": "{{rgw_host}}",
@@ -305,7 +305,7 @@
305305
"steppedLine": false,
306306
"targets": [
307307
{
308-
"expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
308+
"expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
309309
"format": "time_series",
310310
"intervalFactor": 1,
311311
"legendFormat": "{{rgw_host}}",
@@ -502,7 +502,7 @@
502502
"steppedLine": false,
503503
"targets": [
504504
{
505-
"expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
505+
"expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
506506
"format": "time_series",
507507
"intervalFactor": 1,
508508
"legendFormat": "{{rgw_host}}",
@@ -597,7 +597,7 @@
597597
"steppedLine": false,
598598
"targets": [
599599
{
600-
"expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
600+
"expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
601601
"format": "time_series",
602602
"intervalFactor": 1,
603603
"legendFormat": "{{rgw_host}}",

monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ Scenario: "Test Average GET Latencies"
77
| ceph_rgw_op_get_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 20 60 80 |
88
| ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 |
99
When interval is `30s`
10+
And variable `rgw_servers` is `rgw.foo`
1011
Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `GET {{rgw_host}}` shows:
1112
| metrics | values |
1213
| {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1.5 |
@@ -18,6 +19,7 @@ Scenario: "Test Average PUT Latencies"
1819
| ceph_rgw_op_put_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 10 30 50 |
1920
| ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 |
2021
When interval is `30s`
22+
And variable `rgw_servers` is `rgw.foo`
2123
Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `PUT {{rgw_host}}` shows:
2224
| metrics | values |
2325
| {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1 |
@@ -28,6 +30,7 @@ Scenario: "Test Total Requests/sec by RGW Instance"
2830
| ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 10 50 100 |
2931
| ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 1 1 1 |
3032
When interval is `30s`
33+
And variable `rgw_servers` is `rgw.1`
3134
Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows:
3235
| metrics | values |
3336
| {rgw_host="1"} | 1.5 |
@@ -39,6 +42,7 @@ Scenario: "Test GET Latencies by RGW Instance"
3942
| ceph_rgw_op_get_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 20 60 80 |
4043
| ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 |
4144
When interval is `30s`
45+
And variable `rgw_servers` is `rgw.foo`
4246
Then Grafana panel `GET Latencies by RGW Instance` with legend `{{rgw_host}}` shows:
4347
| metrics | values |
4448
| {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1.5 |
@@ -71,6 +75,7 @@ Scenario: "Test Bandwidth by RGW Instance"
7175
| ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 1 1 1 |
7276
When evaluation time is `1m`
7377
And interval is `30s`
78+
And variable `rgw_servers` is `rgw.1`
7479
Then Grafana panel `Bandwidth by RGW Instance` with legend `{{rgw_host}}` shows:
7580
| metrics | values |
7681
| {ceph_daemon="rgw.1", instance_id="92806566", rgw_host="1"} | 2.25 |
@@ -83,6 +88,7 @@ Scenario: "Test PUT Latencies by RGW Instance"
8388
| ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 |
8489
When evaluation time is `1m`
8590
And interval is `30s`
91+
And variable `rgw_servers` is `rgw.foo`
8692
Then Grafana panel `PUT Latencies by RGW Instance` with legend `{{rgw_host}}` shows:
8793
| metrics | values |
8894
| {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1 |

0 commit comments

Comments
 (0)