Skip to content

Commit 6acda51

Browse files
Presto Mixin (#1087)
* finished * dakota 1/n * caleb 1/n * added links * added jvm gauges + updated titles * alert fixes * k8 support + emily + gabe feedback * query shortening due to JMX config update * update alerts metrics * Emily 2/n * sample app revisions * screenshots added --------- Co-authored-by: Emily <[email protected]>
1 parent 356d4fb commit 6acda51

12 files changed

+3729
-0
lines changed

presto-mixin/.lint

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
# Lint exclusions for the grafana dashboard-linter (mixtool lint).
# Each key under `exclusions` names a linter rule to skip, with a reason;
# an optional `entries` list narrows the exclusion to specific dashboards.
exclusions:
  template-job-rule:
    reason: "Prometheus datasource variable is being named as prometheus_datasource now while linter expects 'datasource'"
  panel-datasource-rule:
    reason: "Loki datasource variable is being named as loki_datasource now while linter expects 'datasource'"
  template-datasource-rule:
    reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
  template-instance-rule:
    reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
  target-instance-rule:
    reason: "The dashboard is a 'cluster' dashboard where the instance refers to nodes, this dashboard focuses only on the cluster view."
    entries:
      - dashboard: "Presto overview"
  panel-title-description-rule:
    reason: "Not required for logs volume"
  panel-units-rule:
    reason: "Logs volume has no unit"

presto-mixin/Makefile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
# Shared jsonnetfmt invocation so `fmt` and `lint` agree on style settings.
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 1 --string-style s --comment-style s

# Remove half-written outputs when a recipe fails, so a rerun is not fooled
# by a corrupt-but-newer target.
.DELETE_ON_ERROR:

.PHONY: all
all: build dashboards_out prometheus_alerts.yaml

# Vendor the jsonnet dependencies declared in jsonnetfile.json.
vendor: jsonnetfile.json
	jb install

.PHONY: build
build: vendor

# Reformat all jsonnet/libsonnet sources in place (vendor/ is skipped).
.PHONY: fmt
fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

# Show formatting drift as a diff, then run mixtool's semantic lint.
# `|| exit 1` makes any unformatted file fail the target; previously only a
# diff in the *last* file affected the pipeline's exit status.
.PHONY: lint
lint: build
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		while read f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || exit 1; \
		done
	mixtool lint mixin.libsonnet

# Render the Grafana dashboard JSON files into dashboards_out/.
dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*)
	@mkdir -p $@
	mixtool generate dashboards mixin.libsonnet -d $@

# Render the Prometheus alerting rules.
prometheus_alerts.yaml: mixin.libsonnet alerts/*.libsonnet
	mixtool generate alerts mixin.libsonnet -a $@

.PHONY: clean
clean:
	rm -rf dashboards_out prometheus_alerts.yaml

presto-mixin/README.md

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# Presto mixin
2+
3+
The Presto mixin is a set of configurable Grafana dashboards and alerts.
4+
5+
The Presto mixin contains the following dashboards:
6+
7+
- Presto overview
8+
- Presto coordinator
9+
- Presto worker
10+
- Presto logs
11+
12+
and the following alerts:
13+
14+
- PrestoHighInsufficientResources
15+
- PrestoHighTaskFailuresWarning
16+
- PrestoHighTaskFailuresCritical
17+
- PrestoHighQueuedTaskCount
18+
- PrestoHighBlockedNodes
19+
- PrestoHighFailedQueriesWarning
20+
- PrestoHighFailedQueriesCritical
21+
22+
## Presto overview
23+
24+
The Presto overview dashboard provides details on integration status/alerts, workers/coordinators, error failures, data throughput, blocked nodes, and distributed bytes.
25+
![Presto overview dashboard (queries)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_overview_1.png)
26+
![Presto overview dashboard (processing)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_overview_2.png)
27+
28+
## Presto coordinator overview
29+
30+
The Presto coordinator overview dashboard provides details on various query counts and rates, query execution time, CPU time consumed, CPU input throughput, error failures, JVM metrics, and memory pool information.
31+
![Presto coordinator dashboard (queries)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_coordinator_1.png)
32+
![Presto coordinator dashboard (JVM)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_coordinator_2.png)
33+
34+
## Presto worker overview
35+
36+
The Presto worker overview dashboard provides details on various task rates, pool sizes, output positions, data throughput, JVM metrics, and memory pool information.
37+
![Presto worker dashboard (tasks)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_worker_1.png)
38+
![Presto worker dashboard (JVM)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_worker_2.png)
39+
40+
## Presto logs
41+
42+
The Presto logs dashboard provides details on incoming system logs.
43+
![Presto logs dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_logs_overview.png)
44+
45+
Presto system logs are enabled by default in the `config.libsonnet` and can be removed by setting `enableLokiLogs` to `false`. Then run `make` again to regenerate the dashboard:
46+
47+
```
48+
{
49+
_config+:: {
50+
enableLokiLogs: false,
51+
},
52+
}
53+
```
54+
55+
In order for the selectors to properly work for system logs ingested into your logs datasource, please also include the matching `instance`, `job`, and `presto_cluster` labels in the [scrape configs](https://grafana.com/docs/loki/latest/clients/promtail/configuration/#scrape_configs) so that they match the labels for ingested metrics.
56+
57+
```yaml
58+
scrape_configs:
59+
- job_name: integrations/presto
60+
static_configs:
61+
- targets: [localhost]
62+
labels:
63+
job: integrations/presto
64+
instance: "<your-instance-name>"
65+
presto_cluster: "<your-cluster-name>"
66+
__path__: /var/presto/logs/*.log
67+
pipeline_stages:
68+
- multiline:
69+
firstline: '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}'
70+
- regex:
71+
expression: '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\s+(?P<level>\w+)(?P<message>.+)'
72+
- labels:
73+
level:
74+
```
75+
76+
## Alerts overview
77+
78+
- PrestoHighInsufficientResources: The amount of failures that are occurring due to insufficient resources are scaling, causing saturation in the system.
79+
- PrestoHighTaskFailuresWarning: The amount of tasks that are failing is increasing, this might affect query processing and could result in incomplete or incorrect results.
80+
- PrestoHighTaskFailuresCritical: The amount of tasks that are failing has reached a critical level. This might affect query processing and could result in incomplete or incorrect results.
81+
- PrestoHighQueuedTaskCount: The amount of tasks that are being put in queue is increasing. A high number of queued tasks can lead to increased query latencies and degraded system performance.
82+
- PrestoHighBlockedNodes: The amount of nodes that are blocked due to memory restrictions is increasing. Blocked nodes can cause performance degradation and resource starvation.
83+
- PrestoHighFailedQueriesWarning: The amount of queries failing is increasing. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.
84+
- PrestoHighFailedQueriesCritical: The amount of queries failing has increased to critical levels. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.
85+
86+
Default thresholds can be configured in `config.libsonnet`.
87+
88+
```js
89+
{
90+
_config+:: {
91+
92+
// alerts thresholds
93+
alertsHighInsufficientResourceErrors: 0, // count
94+
alertsHighTaskFailuresWarning: 0, // count
95+
alertsHighTaskFailuresCritical: 30, // percent
96+
alertsHighQueuedTaskCount: 5, // count
97+
alertsHighBlockedNodesCount: 0, // count
98+
alertsHighFailedQueryCountWarning: 0, // count
99+
alertsHighFailedQueryCountCritical: 30, // percent
100+
}
101+
}
102+
```
103+
104+
## Install tools
105+
106+
```bash
107+
go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
108+
go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest
109+
```
110+
111+
For linting and formatting, you would also need `jsonnetfmt` installed. If you
112+
have a working Go development environment, it's easiest to run the following:
113+
114+
```bash
115+
go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
116+
```
117+
118+
The files in `dashboards_out` need to be imported
119+
into your Grafana server. The exact details will be depending on your environment.
120+
121+
`prometheus_alerts.yaml` needs to be imported into Prometheus.
122+
123+
## Generate dashboards and alerts
124+
125+
Edit `config.libsonnet` if required and then build JSON dashboard files for Grafana:
126+
127+
```bash
128+
make
129+
```
130+
131+
For more advanced uses of mixins, see
132+
https://github.com/monitoring-mixins/docs.

presto-mixin/alerts/alerts.libsonnet

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
{
  // Prometheus alerting rules for the Presto mixin. Thresholds come from
  // $._config (config.libsonnet); the alert expressions and descriptions are
  // templated with `||| % $._config` / `'...' % $._config`, so a literal '%'
  // in output must be written as '%%'.
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'presto-alerts',
        rules: [
          {
            alert: 'PrestoHighInsufficientResources',
            expr: |||
              increase(presto_QueryManager_InsufficientResourcesFailures_TotalCount[5m]) > %(alertsHighInsufficientResourceErrors)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'The amount of failures that are occurring due to insufficient resources are scaling, causing saturation in the system.',
              description:
                (
                  'The number of insufficient resource failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighInsufficientResourceErrors)s.'
                ) % $._config,
            },
          },
          {
            alert: 'PrestoHighTaskFailuresWarning',
            expr: |||
              increase(presto_TaskManager_FailedTasks_TotalCount[5m]) > %(alertsHighTaskFailuresWarning)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'The amount of tasks that are failing is increasing, this might affect query processing and could result in incomplete or incorrect results.',
              description:
                (
                  'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresWarning)s.'
                ) % $._config,
            },
          },
          {
            alert: 'PrestoHighTaskFailuresCritical',
            // Ratio of 5m failures to 10m failures (clamped to avoid /0),
            // expressed as a percentage and compared to a percent threshold.
            expr: |||
              increase(presto_TaskManager_FailedTasks_TotalCount[5m]) / clamp_min(increase(presto_TaskManager_FailedTasks_TotalCount[10m]), 1) * 100 > %(alertsHighTaskFailuresCritical)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'The amount of tasks that are failing has reached a critical level. This might affect query processing and could result in incomplete or incorrect results.',
              // Fixed: was '...%(alertsHighTaskFailuresCritical)s%%s.', which
              // rendered the stray literal text '%s.' after the threshold.
              description:
                (
                  'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresCritical)s%%.'
                ) % $._config,
            },
          },
          {
            alert: 'PrestoHighQueuedTaskCount',
            expr: |||
              increase(presto_QueryExecution_Executor_QueuedTaskCount[5m]) > %(alertsHighQueuedTaskCount)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'The amount of tasks that are being put in queue is increasing. A high number of queued tasks can lead to increased query latencies and degraded system performance.',
              description:
                (
                  'The number of queued tasks on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighQueuedTaskCount)s'
                ) % $._config,
            },
          },
          {
            alert: 'PrestoHighBlockedNodes',
            expr: |||
              increase(presto_ClusterMemoryPool_general_BlockedNodes[5m]) > %(alertsHighBlockedNodesCount)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'The amount of nodes that are blocked due to memory restrictions is increasing. Blocked nodes can cause performance degradation and resource starvation.',
              description:
                (
                  'The number of blocked nodes on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighBlockedNodesCount)s'
                ) % $._config,
            },
          },
          {
            alert: 'PrestoHighFailedQueriesWarning',
            expr: |||
              increase(presto_QueryManager_FailedQueries_TotalCount[5m]) > %(alertsHighFailedQueryCountWarning)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'The amount of queries failing is increasing. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.',
              description:
                (
                  'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountWarning)s'
                ) % $._config,
            },
          },
          {
            alert: 'PrestoHighFailedQueriesCritical',
            expr: |||
              increase(presto_QueryManager_FailedQueries_TotalCount[5m]) / clamp_min(increase(presto_QueryManager_FailedQueries_TotalCount[10m]), 1) * 100 > %(alertsHighFailedQueryCountCritical)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'The amount of queries failing has increased to critical levels. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.',
              // Fixed: was '...%(alertsHighFailedQueryCountCritical)s%%s.',
              // which rendered the stray literal text '%s.' after the threshold.
              description:
                (
                  'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountCritical)s%%.'
                ) % $._config,
            },
          },
        ],
      },
    ],
  },
}

presto-mixin/config.libsonnet

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
{
  // Mixin configuration consumed by the dashboards and alerts via $._config.
  _config+:: {
    // When true, all selectors/legends additionally carry a `cluster` label
    // (for environments scraping multiple clusters into one datasource).
    enableMultiCluster: false,
    // Selector for cluster-level (overview) panels: job, optionally cluster.
    prestoOverviewSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
    // Selector for per-instance panels: job + instance, optionally cluster.
    prestoSelector: if self.enableMultiCluster then 'job=~"$job", instance=~"$instance", cluster=~"$cluster"' else 'job=~"$job", instance=~"$instance"',
    // Selector used in alert expressions (regex-escaped template variables).
    prestoAlertSelector: if self.enableMultiCluster then 'job=~"${job:regex}", cluster=~"${cluster:regex}"' else 'job=~"${job:regex}"',
    // Legend formats for overview (per presto_cluster) and per-instance panels.
    prestoOverviewLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{presto_cluster}}' else '{{presto_cluster}}',
    prestoLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{instance}}' else '{{instance}}',
    // Base filter matching the integration's scrape job name.
    filterSelector: 'job=~"integrations/presto"',

    // Dashboard defaults (tags, time range, timezone, refresh interval).
    dashboardTags: ['presto-mixin'],
    dashboardPeriod: 'now-30m',
    dashboardTimezone: 'default',
    dashboardRefresh: '1m',

    // alerts thresholds
    alertsHighInsufficientResourceErrors: 0,  // count
    alertsHighTaskFailuresWarning: 0,  // count
    alertsHighTaskFailuresCritical: 30,  // percent
    alertsHighQueuedTaskCount: 5,  // count
    alertsHighBlockedNodesCount: 0,  // count
    alertsHighFailedQueryCountWarning: 0,  // count
    alertsHighFailedQueryCountCritical: 30,  // percent
    // Toggle for the Loki-backed logs dashboard/panels; set to false and
    // rerun `make` to drop them (see README).
    enableLokiLogs: true,
  },
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
// Aggregate all of the mixin's dashboard objects into a single object by
// jsonnet object merge (`+`); fields from later imports win on collision.
(import 'presto-overview.libsonnet') +
(import 'presto-coordinator.libsonnet') +
(import 'presto-worker.libsonnet') +
(import 'presto-logs-overview.libsonnet')

0 commit comments

Comments
 (0)