Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions presto-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
{
  // Prometheus alerting rules for the Presto mixin.
  //
  // `new(this)` receives the mixin object; every threshold is read from
  // `this.config` via the `alertsHigh*` keys (see config.libsonnet).
  //
  // Formatting note: these strings pass through Jsonnet `%` formatting
  // (`||| ... ||| % this.config`), so `%%` renders as a single literal `%`
  // in the final rule, and `{{ ... }}` is left for Prometheus/Alertmanager
  // templating to expand at alert time.
  new(this):
    {
      groups+: [
        {
          name: 'presto-alerts',
          rules: [
            {
              // Count-based: absolute number of insufficient-resource
              // failures over the last 5 minutes.
              alert: 'PrestoHighInsufficientResources',
              expr: |||
                increase(presto_QueryManager_InsufficientResourcesFailures_TotalCount[5m]) > %(alertsHighInsufficientResourceErrors)s
              ||| % this.config,
              'for': '5m',
              labels: {
                severity: 'critical',
              },
              annotations: {
                summary: 'The amount of failures that are occurring due to insufficient resources are scaling, causing saturation in the system.',
                description:
                  (
                    'The number of insufficient resource failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighInsufficientResourceErrors)s.'
                  ) % this.config,
              },
            },
            {
              // Count-based warning: absolute number of failed tasks over 5m.
              alert: 'PrestoHighTaskFailuresWarning',
              expr: |||
                increase(presto_TaskManager_FailedTasks_TotalCount[5m]) > %(alertsHighTaskFailuresWarning)s
              ||| % this.config,
              'for': '5m',
              labels: {
                severity: 'warning',
              },
              annotations: {
                summary: 'The amount of tasks that are failing is increasing, this might affect query processing and could result in incomplete or incorrect results.',
                description:
                  (
                    'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresWarning)s.'
                  ) % this.config,
              },
            },
            {
              // Percentage-based critical: share of the 10m failure increase
              // that happened in the last 5m (clamp_min avoids division by
              // zero). NOTE(review): the denominator is the *failed* task
              // count, not total tasks — confirm this ratio is the intended
              // "acceleration of failures" semantic.
              alert: 'PrestoHighTaskFailuresCritical',
              expr: |||
                increase(presto_TaskManager_FailedTasks_TotalCount[5m]) / clamp_min(increase(presto_TaskManager_FailedTasks_TotalCount[10m]), 1) * 100 > %(alertsHighTaskFailuresCritical)s
              ||| % this.config,
              'for': '5m',
              labels: {
                severity: 'critical',
              },
              annotations: {
                summary: 'The amount of tasks that are failing has reached a critical level. This might affect query processing and could result in incomplete or incorrect results.',
                // $value here is a percentage, and `%%s.` would render a
                // stray "s" after the threshold ("30%s.") — use `%%.` so the
                // rendered text reads "30%.".
                description:
                  (
                    'The percentage of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }}%% which is above the threshold of %(alertsHighTaskFailuresCritical)s%%.'
                  ) % this.config,
              },
            },
            {
              // Count-based: growth of the executor's queued-task gauge.
              alert: 'PrestoHighQueuedTaskCount',
              expr: |||
                increase(presto_QueryExecution_Executor_QueuedTaskCount[5m]) > %(alertsHighQueuedTaskCount)s
              ||| % this.config,
              'for': '5m',
              labels: {
                severity: 'warning',
              },
              annotations: {
                summary: 'The amount of tasks that are being put in queue is increasing. A high number of queued tasks can lead to increased query latencies and degraded system performance.',
                description:
                  (
                    'The number of queued tasks on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighQueuedTaskCount)s'
                  ) % this.config,
              },
            },
            {
              // Count-based: nodes blocked on the general memory pool.
              alert: 'PrestoHighBlockedNodes',
              expr: |||
                increase(presto_ClusterMemoryPool_general_BlockedNodes[5m]) > %(alertsHighBlockedNodesCount)s
              ||| % this.config,
              'for': '5m',
              labels: {
                severity: 'critical',
              },
              annotations: {
                summary: 'The amount of nodes that are blocked due to memory restrictions is increasing. Blocked nodes can cause performance degradation and resource starvation.',
                description:
                  (
                    'The number of blocked nodes on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighBlockedNodesCount)s'
                  ) % this.config,
              },
            },
            {
              // Count-based warning: absolute number of failed queries over 5m.
              alert: 'PrestoHighFailedQueriesWarning',
              expr: |||
                increase(presto_QueryManager_FailedQueries_TotalCount[5m]) > %(alertsHighFailedQueryCountWarning)s
              ||| % this.config,
              'for': '5m',
              labels: {
                severity: 'warning',
              },
              annotations: {
                summary: 'The amount of queries failing is increasing. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.',
                description:
                  (
                    'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountWarning)s'
                  ) % this.config,
              },
            },
            {
              // Percentage-based critical: same ratio construction as
              // PrestoHighTaskFailuresCritical, applied to failed queries.
              alert: 'PrestoHighFailedQueriesCritical',
              expr: |||
                increase(presto_QueryManager_FailedQueries_TotalCount[5m]) / clamp_min(increase(presto_QueryManager_FailedQueries_TotalCount[10m]), 1) * 100 > %(alertsHighFailedQueryCountCritical)s
              ||| % this.config,
              'for': '5m',
              labels: {
                severity: 'critical',
              },
              annotations: {
                summary: 'The amount of queries failing has increased to critical levels. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.',
                // Same `%%s.` -> `%%.` fix as the task-failure critical alert:
                // $value is a percentage, so print it with a trailing "%".
                description:
                  (
                    'The percentage of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }}%% which is greater than the threshold of %(alertsHighFailedQueryCountCritical)s%%.'
                  ) % this.config,
              },
            },
          ],
        },
      ],
    },
}
130 changes: 0 additions & 130 deletions presto-mixin/alerts/alerts.libsonnet

This file was deleted.

53 changes: 32 additions & 21 deletions presto-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,26 +1,37 @@
{
_config+:: {
enableMultiCluster: false,
prestoOverviewSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
prestoSelector: if self.enableMultiCluster then 'job=~"$job", instance=~"$instance", cluster=~"$cluster"' else 'job=~"$job", instance=~"$instance"',
prestoAlertSelector: if self.enableMultiCluster then 'job=~"${job:regex}", cluster=~"${cluster:regex}"' else 'job=~"${job:regex}"',
prestoOverviewLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{presto_cluster}}' else '{{presto_cluster}}',
prestoLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{instance}}' else '{{instance}}',
filterSelector: 'job=~"integrations/presto"',
local this = self,
filteringSelector: 'job=~"integrations/presto"',
groupLabels: ['job', 'cluster', 'presto_cluster'],
instanceLabels: ['instance'],
uid: 'presto',

dashboardTags: ['presto-mixin'],
dashboardPeriod: 'now-30m',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardNamePrefix: 'Presto',
dashboardTags: ['presto-mixin'],
dashboardPeriod: 'now-30m',
dashboardTimezone: 'default',
dashboardRefresh: '1m',

// Data source configuration
metricsSource: 'prometheus',
enableLokiLogs: true,
logLabels: this.groupLabels + this.instanceLabels,
extraLogLabels: [],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

// Alerts configuration
alertsHighInsufficientResourceErrors: 0, // count
alertsHighTaskFailuresWarning: 0, // count
alertsHighTaskFailuresCritical: 30, // percent
alertsHighQueuedTaskCount: 5, // count
alertsHighBlockedNodesCount: 0, // count
alertsHighFailedQueryCountWarning: 0, // count
alertsHighFailedQueryCountCritical: 30, // percent

signals+: {
overview: (import './signals/overview.libsonnet')(this),
coordinator: (import './signals/coordinator.libsonnet')(this),
worker: (import './signals/worker.libsonnet')(this),

// alerts thresholds
alertsHighInsufficientResourceErrors: 0, // count
alertsHighTaskFailuresWarning: 0, // count
alertsHighTaskFailuresCritical: 30, // percent
alertsHighQueuedTaskCount: 5, // count
alertsHighBlockedNodesCount: 0, // count
alertsHighFailedQueryCountWarning: 0, // count
alertsHighFailedQueryCountCritical: 30, // percent
enableLokiLogs: true,
},
}
Loading
Loading