
Commit dd8a2b2

dashboard: panels with CPU and memory utilization

This patch adds `CPU/memory/virtual memory` utilization panels per instance and total. Closes #TNTP-4365

Parent: 3cc409e

16 files changed: +12603 −2991 lines

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Panel with Cartridge configuration checksum (#242)
 - Panel with `need schema upgrade` status (#243)
-
+- Panels with `CPU/memory/virtual memory` utilization per instance and total (#245)
 
 ## [3.2.1] - 2024-12-06
 Grafana revisions:
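
For orientation, a minimal sketch (not part of the commit) of how the renamed per-instance builders and the new cluster-level builders from `dashboard/panels/cpu.libsonnet` (diff below) could be collected for a dashboard. The `cfg` shape and the `cpu_panels` helper are assumptions; only the import path and the builder names come from this patch.

```jsonnet
local cpu = import 'dashboard/panels/cpu.libsonnet';

// cfg is whatever configuration object the repo already passes to its panel
// builders (datasource type, filters, metrics_prefix, ...).
local cpu_panels(cfg) = [
  cpu.getrusage_cpu_instance_user_time(cfg),    // renamed from getrusage_cpu_user_time
  cpu.getrusage_cpu_instance_system_time(cfg),  // renamed from getrusage_cpu_system_time
  cpu.getrusage_cpu_instance_total_time(cfg),   // new: per-instance total share
  cpu.getrusage_cpu_total_time(cfg),            // new: cluster-wide total share
  cpu.getrusage_cpu_total_user_time(cfg),       // new: cluster-wide user-mode share
  cpu.getrusage_cpu_total_system_time(cfg),     // new: cluster-wide system-mode share
];

cpu_panels
```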

dashboard/panels/cpu.libsonnet

Lines changed: 225 additions & 5 deletions
@@ -1,6 +1,7 @@
 local grafana = import 'grafonnet/grafana.libsonnet';
 
 local common = import 'dashboard/panels/common.libsonnet';
+local common_utils = import 'dashboard/panels/common.libsonnet';
 local variable = import 'dashboard/variable.libsonnet';
 
 local influxdb = grafana.influxdb;
@@ -21,14 +22,14 @@ local prometheus = grafana.prometheus;
     format='percentunit',
     decimalsY1=0,
     min=0,
-    panel_width=12,
+    panel_width=8,
   ).addTarget(
     common.target(cfg, metric_name, rate=true)
   ),
 
-  getrusage_cpu_user_time(
+  getrusage_cpu_instance_user_time(
     cfg,
-    title='CPU user time',
+    title='CPU user time per instance',
     description=|||
       This is the average share of time
       spent by instance process executing in user mode.
@@ -43,9 +44,9 @@ local prometheus = grafana.prometheus;
     metric_name='tnt_cpu_user_time',
   ),
 
-  getrusage_cpu_system_time(
+  getrusage_cpu_instance_system_time(
     cfg,
-    title='CPU system time',
+    title='CPU system time per instance',
     description=|||
       This is the average share of time
       spent by instance process executing in kernel mode.
@@ -60,6 +61,225 @@ local prometheus = grafana.prometheus;
     metric_name='tnt_cpu_system_time',
   ),
 
+  // --------------------------------------------------------------------------
+  local getrusage_cpu_total_percentage_graph(
+    cfg, title, description,
+  ) = common.default_graph(
+    cfg,
+    title=title,
+    description=description,
+    format='percentunit',
+    decimalsY1=0,
+    min=0,
+    panel_width=8,
+  ).addTarget(
+    if cfg.type == variable.datasource_type.prometheus then
+      prometheus.target(
+        expr=std.format(
+          |||
+            rate(%(metrics_prefix)stnt_cpu_user_time{%(filters)s}[$__rate_interval]) +
+            rate(%(metrics_prefix)stnt_cpu_system_time{%(filters)s}[$__rate_interval])
+          |||,
+          {
+            metrics_prefix: cfg.metrics_prefix,
+            filters: common.prometheus_query_filters(cfg.filters),
+          }
+        ),
+        legendFormat='{{alias}}'
+      )
+    else if cfg.type == variable.datasource_type.influxdb then
+      influxdb.target(
+        rawQuery=true,
+        query=std.format(|||
+          SELECT non_negative_derivative(SUM("value"), 1s)
+          FROM %(measurement_with_policy)s
+          WHERE (("metric_name" = '%(metric_user_time)s' OR "metric_name" = '%(metric_system_time)s') AND %(filters)s)
+          AND $timeFilter
+          GROUP BY time($__interval), "label_pairs_alias" fill(none)
+        |||, {
+          measurement_with_policy: std.format('%(policy_prefix)s"%(measurement)s"', {
+            policy_prefix: if cfg.policy == 'default' then '' else std.format('"%(policy)s".', cfg.policy),
+            measurement: cfg.measurement,
+          }),
+          metric_user_time: cfg.metrics_prefix + 'tnt_cpu_user_time',
+          metric_system_time: cfg.metrics_prefix + 'tnt_cpu_system_time',
+          filters: common.influxdb_query_filters(cfg.filters),
+        }),
+        alias='$tag_label_pairs_alias',
+      )
+  ),
+
+  getrusage_cpu_instance_total_time(
+    cfg,
+    title='CPU total time per instance',
+    description=|||
+      This is the average share of time spent
+      by instance process executing.
+
+      Panel minimal requirements: metrics 0.8.0.
+    |||,
+  ):: getrusage_cpu_total_percentage_graph(
+    cfg=cfg,
+    title=title,
+    description=description,
+  ),
+
+  // --------------------------------------------------------------------------
+  local getrusage_cpu_common_percentage_graph(
+    cfg,
+    title,
+    description,
+    prometheus_expr,
+    prometheus_legend,
+    influx_query,
+    influx_alias,
+  ) = common.default_graph(
+    cfg,
+    title=title,
+    description=description,
+    format='percentunit',
+    decimalsY1=0,
+    min=0,
+    panel_width=8,
+  ).addTarget(
+    if cfg.type == variable.datasource_type.prometheus then
+      prometheus.target(
+        expr=prometheus_expr,
+        legendFormat=prometheus_legend,
+      )
+    else if cfg.type == variable.datasource_type.influxdb then
+      influxdb.target(
+        rawQuery=true,
+        query=influx_query,
+        alias=influx_alias,
+      )
+  ),
+
+  getrusage_cpu_total_time(
+    cfg,
+    title='CPU total time per cluster',
+    description=|||
+      This is the total share of time spent
+      by each cluster process executing.
+
+      Panel minimal requirements: metrics 0.8.0.
+    |||,
+  ):: getrusage_cpu_common_percentage_graph(
+    cfg=cfg,
+    title=title,
+    description=description,
+    prometheus_expr=std.format(
+      |||
+        sum(rate(%(metrics_prefix)stnt_cpu_user_time{%(filters)s}[$__rate_interval])) +
+        sum(rate(%(metrics_prefix)stnt_cpu_system_time{%(filters)s}[$__rate_interval]))
+      |||,
+      {
+        metrics_prefix: cfg.metrics_prefix,
+        filters: common.prometheus_query_filters(common.remove_field(cfg.filters, 'alias')),
+      }
+    ),
+    prometheus_legend=title,
+    influx_query=std.format(|||
+      SELECT non_negative_derivative(SUM("value"), 1s)
+      FROM %(measurement_with_policy)s
+      WHERE (("metric_name" = '%(metric_user_time)s' OR "metric_name" = '%(metric_system_time)s') AND %(filters)s)
+      AND $timeFilter
+      GROUP BY time($__interval)
+    |||, {
+      measurement_with_policy: std.format('%(policy_prefix)s"%(measurement)s"', {
+        policy_prefix: if cfg.policy == 'default' then '' else std.format('"%(policy)s".', cfg.policy),
+        measurement: cfg.measurement,
+      }),
+      metric_user_time: cfg.metrics_prefix + 'tnt_cpu_user_time',
+      metric_system_time: cfg.metrics_prefix + 'tnt_cpu_system_time',
+      filters: if common.influxdb_query_filters(common.remove_field(cfg.filters, 'label_pairs_alias')) != ''
+               then common.influxdb_query_filters(common.remove_field(cfg.filters, 'label_pairs_alias'))
+               else 'true',
+    }),
+    influx_alias=title
+  ),
+
+  getrusage_cpu_total_user_time(
+    cfg,
+    title='CPU total user time per cluster',
+    description=|||
+      This is the total share of time
+      spent in user mode per cluster.
+
+      Panel minimal requirements: metrics 0.8.0.
+    |||,
+  ):: getrusage_cpu_common_percentage_graph(
+    cfg=cfg,
+    title=title,
+    description=description,
+    prometheus_expr=std.format(
+      |||
+        sum(rate(%(metrics_prefix)stnt_cpu_user_time{%(filters)s}[$__rate_interval]))
+      |||,
+      {
+        metrics_prefix: cfg.metrics_prefix,
+        filters: common.prometheus_query_filters(common.remove_field(cfg.filters, 'alias')),
+      }
+    ),
+    prometheus_legend=title,
+    influx_query=std.format(|||
+      SELECT non_negative_derivative(SUM("value"), 1s)
+      FROM %(measurement_with_policy)s
+      WHERE "metric_name" = '%(metric_user_time)s' AND %(filters)s
+      AND $timeFilter
+      GROUP BY time($__interval)
+    |||, {
+      measurement_with_policy: std.format('%(policy_prefix)s"%(measurement)s"', {
+        policy_prefix: if cfg.policy == 'default' then '' else std.format('"%(policy)s".', cfg.policy),
+        measurement: cfg.measurement,
+      }),
+      metric_user_time: cfg.metrics_prefix + 'tnt_cpu_user_time',
+      filters: common.influxdb_query_filters(cfg.filters),
+    }),
+    influx_alias=title
+  ),
+
+  getrusage_cpu_total_system_time(
+    cfg,
+    title='CPU total system time per cluster',
+    description=|||
+      This is the total share of time
+      spent in system mode per cluster.
+
+      Panel minimal requirements: metrics 0.8.0.
+    |||,
+  ):: getrusage_cpu_common_percentage_graph(
+    cfg=cfg,
+    title=title,
+    description=description,
+    prometheus_expr=std.format(
+      |||
+        sum(rate(%(metrics_prefix)stnt_cpu_system_time{%(filters)s}[$__rate_interval]))
+      |||,
+      {
+        metrics_prefix: cfg.metrics_prefix,
+        filters: common.prometheus_query_filters(common.remove_field(cfg.filters, 'alias')),
+      }
+    ),
+    prometheus_legend=title,
+    influx_query=std.format(|||
+      SELECT non_negative_derivative(SUM("value"), 1s)
+      FROM %(measurement_with_policy)s
+      WHERE "metric_name" = '%(metric_system_time)s' AND %(filters)s
+      AND $timeFilter
+      GROUP BY time($__interval)
+    |||, {
+      measurement_with_policy: std.format('%(policy_prefix)s"%(measurement)s"', {
+        policy_prefix: if cfg.policy == 'default' then '' else std.format('"%(policy)s".', cfg.policy),
+        measurement: cfg.measurement,
+      }),
+      metric_system_time: cfg.metrics_prefix + 'tnt_cpu_system_time',
+      filters: common.influxdb_query_filters(cfg.filters),
+    }),
+    influx_alias=title
+  ),
+
+  // --------------------------------------------------------------------------
   local procstat_thread_time_graph(
     cfg,
     title,
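
To make the new cluster-level target concrete, here is a sketch (not part of the commit) that renders the same `std.format` template used by `getrusage_cpu_total_time` with hypothetical values; in the dashboard the prefix comes from `cfg.metrics_prefix` and the label filter from `common.prometheus_query_filters` after the `alias` filter is removed.

```jsonnet
// Hypothetical inputs, for illustration only.
local metrics_prefix = '';           // assumption: no metrics prefix configured
local filters = 'job=~"tarantool"';  // assumption: a made-up label filter

// Same template as the cluster-total Prometheus target in this patch.
std.format(
  |||
    sum(rate(%(metrics_prefix)stnt_cpu_user_time{%(filters)s}[$__rate_interval])) +
    sum(rate(%(metrics_prefix)stnt_cpu_system_time{%(filters)s}[$__rate_interval]))
  |||,
  { metrics_prefix: metrics_prefix, filters: filters }
)
// Evaluates to:
//   sum(rate(tnt_cpu_user_time{job=~"tarantool"}[$__rate_interval])) +
//   sum(rate(tnt_cpu_system_time{job=~"tarantool"}[$__rate_interval]))
```

The per-instance panels keep the `{{alias}}` legend so every instance gets its own series, while the cluster panels drop the alias filter and sum the rates, producing a single series per cluster.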
