Skip to content

Commit 4aef755

Browse files
committed
modernize the hbase mixin
1 parent d7c1c77 commit 4aef755

19 files changed

+2180
-3937
lines changed

apache-hbase-mixin/alerts/alerts.libsonnet renamed to apache-hbase-mixin/alerts.libsonnet

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
{
2-
prometheusAlerts+:: {
3-
groups+: [
2+
new(this): {
3+
groups: [
44
{
55
name: 'apache-hbase-alerts',
66
rules: [
77
{
88
alert: 'HBaseHighHeapMemUsage',
99
expr: |||
10-
100 * sum without(context, hostname, processname) (jvm_metrics_mem_heap_used_m{%(filterSelector)s} / clamp_min(jvm_metrics_mem_heap_committed_m{%(filterSelector)s}, 1)) > %(alertsHighHeapMemUsage)s
11-
||| % $._config,
10+
100 * sum without(context, hostname, processname) (jvm_metrics_mem_heap_used_m{%(filteringSelector)s} / clamp_min(jvm_metrics_mem_heap_committed_m{%(filteringSelector)s}, 1)) > %(alertsHighHeapMemUsage)s
11+
||| % this.config,
1212
'for': '5m',
1313
labels: {
1414
severity: 'warning',
@@ -18,14 +18,14 @@
1818
description:
1919
(
2020
'The heap memory usage for the JVM on instance {{$labels.instance}} in cluster {{$labels.hbase_cluster}} is {{printf "%%.0f" $value}} percent, which is above the threshold of %(alertsHighHeapMemUsage)s percent'
21-
) % $._config,
21+
) % this.config,
2222
},
2323
},
2424
{
2525
alert: 'HBaseDeadRegionServer',
2626
expr: |||
2727
server_num_dead_region_servers > %(alertsDeadRegionServer)s
28-
||| % $._config,
28+
||| % this.config,
2929
'for': '5m',
3030
labels: {
3131
severity: 'warning',
@@ -35,14 +35,14 @@
3535
description:
3636
(
3737
'{{$value}} RegionServer(s) in cluster {{$labels.hbase_cluster}} are unresponsive, which is above the threshold of %(alertsDeadRegionServer)s. The name(s) of the dead RegionServer(s) are {{$labels.deadregionservers}}'
38-
) % $._config,
38+
) % this.config,
3939
},
4040
},
4141
{
4242
alert: 'HBaseOldRegionsInTransition',
4343
expr: |||
4444
100 * assignment_manager_rit_count_over_threshold / clamp_min(assignment_manager_rit_count, 1) > %(alertsOldRegionsInTransition)s
45-
||| % $._config,
45+
||| % this.config,
4646
'for': '5m',
4747
labels: {
4848
severity: 'warning',
@@ -52,14 +52,14 @@
5252
description:
5353
(
5454
'{{printf "%%.0f" $value}} percent of RegionServers in transition in cluster {{$labels.hbase_cluster}} are transitioning for longer than expected, which is above the threshold of %(alertsOldRegionsInTransition)s percent'
55-
) % $._config,
55+
) % this.config,
5656
},
5757
},
5858
{
5959
alert: 'HBaseHighMasterAuthFailRate',
6060
expr: |||
6161
100 * rate(master_authentication_failures[5m]) / (clamp_min(rate(master_authentication_successes[5m]), 1) + clamp_min(rate(master_authentication_failures[5m]), 1)) > %(alertsHighMasterAuthFailRate)s
62-
||| % $._config,
62+
||| % this.config,
6363
'for': '5m',
6464
labels: {
6565
severity: 'warning',
@@ -69,14 +69,14 @@
6969
description:
7070
(
7171
'{{printf "%%.0f" $value}} percent of authentication attempts to the master are failing in cluster {{$labels.hbase_cluster}}, which is above the threshold of %(alertsHighMasterAuthFailRate)s percent'
72-
) % $._config,
72+
) % this.config,
7373
},
7474
},
7575
{
7676
alert: 'HBaseHighRSAuthFailRate',
7777
expr: |||
7878
100 * rate(region_server_authentication_failures[5m]) / (clamp_min(rate(region_server_authentication_successes[5m]), 1) + clamp_min(rate(region_server_authentication_failures[5m]), 1)) > %(alertsHighRSAuthFailRate)s
79-
||| % $._config,
79+
||| % this.config,
8080
'for': '5m',
8181
labels: {
8282
severity: 'warning',
@@ -86,7 +86,7 @@
8686
description:
8787
(
8888
'{{printf "%%.0f" $value}} percent of authentication attempts to the RegionServer {{$labels.instance}} are failing in cluster {{$labels.hbase_cluster}}, which is above the threshold of %(alertsHighRSAuthFailRate)s percent'
89-
) % $._config,
89+
) % this.config,
9090
},
9191
},
9292
],
Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,35 @@
11
{
2-
_config+:: {
3-
filterSelector: 'job="integrations/apache-hbase"',
2+
local this = self,
3+
filteringSelector: 'job="integrations/apache-hbase"',
4+
groupLabels: ['job', 'hbase_cluster'],
5+
instanceLabels: ['instance'],
6+
logLabels: ['job', 'hbase_cluster', 'instance'],
47

5-
dashboardTags: ['apache-hbase-mixin'],
6-
dashboardPeriod: 'now-30m',
7-
dashboardTimezone: 'default',
8-
dashboardRefresh: '1m',
8+
dashboardTags: [self.uid + '-mixin'],
9+
uid: 'apache-hbase',
10+
dashboardNamePrefix: 'Apache HBase',
11+
dashboardPeriod: 'now-30m',
12+
dashboardTimezone: 'default',
13+
dashboardRefresh: '1m',
14+
metricsSource: ['prometheus', 'prometheusv2'],
915

10-
// alerts thresholds
11-
alertsHighHeapMemUsage: 80, // percentage
12-
alertsHighNonHeapMemUsage: 80, // percentage
13-
alertsDeadRegionServer: 0, // count
14-
alertsOldRegionsInTransition: 50, // percentage
15-
alertsHighMasterAuthFailRate: 35, // percentage
16-
alertsHighRSAuthFailRate: 35, // percentage
16+
// Logging configuration
17+
enableLokiLogs: true,
18+
extraLogLabels: ['level'],
19+
logsVolumeGroupBy: 'level',
20+
showLogsVolume: true,
1721

18-
enableLokiLogs: true,
22+
// Alerts thresholds
23+
alertsHighHeapMemUsage: 80, // percentage
24+
alertsHighNonHeapMemUsage: 80, // percentage
25+
alertsDeadRegionServer: 0, // count
26+
alertsOldRegionsInTransition: 50, // percentage
27+
alertsHighMasterAuthFailRate: 35, // percentage
28+
alertsHighRSAuthFailRate: 35, // percentage
29+
30+
// Signals configuration
31+
signals+: {
32+
cluster: (import './signals/cluster.libsonnet')(this),
33+
regionserver: (import './signals/regionserver.libsonnet')(this),
1934
},
2035
}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
local g = import './g.libsonnet';
2+
local commonlib = import 'common-lib/common/main.libsonnet';
3+
local logslib = import 'logs-lib/logs/main.libsonnet';
4+
5+
{
6+
local root = self,
7+
new(this)::
8+
local prefix = this.config.dashboardNamePrefix;
9+
local links = this.grafana.links;
10+
local tags = this.config.dashboardTags;
11+
local uid = g.util.string.slugify(this.config.uid);
12+
local vars = this.grafana.variables;
13+
local annotations = this.grafana.annotations;
14+
local refresh = this.config.dashboardRefresh;
15+
local period = this.config.dashboardPeriod;
16+
local timezone = this.config.dashboardTimezone;
17+
{
18+
'apache-hbase-cluster-overview.json':
19+
g.dashboard.new(prefix + ' cluster overview')
20+
+ g.dashboard.withPanels(
21+
g.util.panel.resolveCollapsedFlagOnRows(
22+
g.util.grid.wrapPanels(
23+
[
24+
this.grafana.rows.clusterOverview,
25+
]
26+
)
27+
)
28+
) + root.applyCommon(
29+
vars.multiInstance,
30+
uid + '-cluster-overview',
31+
tags,
32+
links { clusterOverview:: {} },
33+
annotations,
34+
timezone,
35+
refresh,
36+
period,
37+
),
38+
39+
'apache-hbase-regionserver-overview.json':
40+
g.dashboard.new(prefix + ' RegionServer overview')
41+
+ g.dashboard.withPanels(
42+
g.util.panel.resolveCollapsedFlagOnRows(
43+
g.util.grid.wrapPanels(
44+
[
45+
this.grafana.rows.regionServerOverview,
46+
]
47+
)
48+
)
49+
) + root.applyCommon(
50+
vars.multiInstance,
51+
uid + '-regionserver-overview',
52+
tags,
53+
links { regionServerOverview:: {} },
54+
annotations,
55+
timezone,
56+
refresh,
57+
period,
58+
),
59+
}
60+
+
61+
if this.config.enableLokiLogs then
62+
{
63+
'apache-hbase-logs.json':
64+
logslib.new(
65+
prefix + ' logs',
66+
datasourceName=this.grafana.variables.datasources.loki.name,
67+
datasourceRegex=this.grafana.variables.datasources.loki.regex,
68+
filterSelector=this.config.filteringSelector,
69+
labels=this.config.logLabels + this.config.extraLogLabels,
70+
formatParser=null,
71+
showLogsVolume=this.config.showLogsVolume,
72+
logsVolumeGroupBy=this.config.logsVolumeGroupBy,
73+
)
74+
{
75+
dashboards+:
76+
{
77+
logs+:
78+
root.applyCommon(super.logs.templating.list, uid=uid + '_logs', tags=tags, links=links { logs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
79+
},
80+
panels+:
81+
{
82+
logs+:
83+
g.panel.logs.options.withEnableLogDetails(true)
84+
+ g.panel.logs.options.withShowTime(false)
85+
+ g.panel.logs.options.withWrapLogMessage(false),
86+
},
87+
variables+: {
88+
toArray+: [
89+
this.grafana.variables.datasources.prometheus { hide: 2 },
90+
],
91+
},
92+
}.dashboards.logs,
93+
}
94+
else {},
95+
96+
applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
97+
g.dashboard.withTags(tags)
98+
+ g.dashboard.withUid(uid)
99+
+ g.dashboard.withLinks(std.objectValues(links))
100+
+ g.dashboard.withTimezone(timezone)
101+
+ g.dashboard.withRefresh(refresh)
102+
+ g.dashboard.time.withFrom(period)
103+
+ g.dashboard.withVariables(vars)
104+
+ g.dashboard.withAnnotations(std.objectValues(annotations)),
105+
}

0 commit comments

Comments
 (0)