diff --git a/presto-mixin/alerts.libsonnet b/presto-mixin/alerts.libsonnet new file mode 100644 index 000000000..11c2e71a6 --- /dev/null +++ b/presto-mixin/alerts.libsonnet @@ -0,0 +1,131 @@ +{ /* new(this) returns the 'presto-alerts' Prometheus rule group; every %(...)s threshold is filled in from this.config. */ + new(this): + { + groups+: [ + { + name: 'presto-alerts', + rules: [ + { + alert: 'PrestoHighInsufficientResources', + expr: ||| + increase(presto_QueryManager_InsufficientResourcesFailures_TotalCount[5m]) > %(alertsHighInsufficientResourceErrors)s + ||| % this.config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'The amount of failures that are occurring due to insufficient resources are scaling, causing saturation in the system.', + description: + ( + 'The number of insufficient resource failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighInsufficientResourceErrors)s.' + ) % this.config, + }, + }, + { + alert: 'PrestoHighTaskFailuresWarning', + expr: ||| + increase(presto_TaskManager_FailedTasks_TotalCount[5m]) > %(alertsHighTaskFailuresWarning)s + ||| % this.config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'The amount of tasks that are failing is increasing, this might affect query processing and could result in incomplete or incorrect results.', + description: + ( + 'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresWarning)s.' + ) % this.config, + }, + }, + { + alert: 'PrestoHighTaskFailuresCritical', + expr: ||| + increase(presto_TaskManager_FailedTasks_TotalCount[5m]) / clamp_min(increase(presto_TaskManager_FailedTasks_TotalCount[10m]), 1) * 100 > %(alertsHighTaskFailuresCritical)s + ||| % this.config, /* NOTE(review): this is the 5m increase as a percentage of the 10m increase, not failures over total; confirm intent */ + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'The amount of tasks that are failing has reached a critical level. 
This might affect query processing and could result in incomplete or incorrect results.', + description: + ( + 'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresCritical)s%%.' /* "%%" is an escaped percent sign, so the text ends with the threshold followed by "%." */ + ) % this.config, + }, + }, + { + alert: 'PrestoHighQueuedTaskCount', + expr: ||| + increase(presto_QueryExecution_Executor_QueuedTaskCount[5m]) > %(alertsHighQueuedTaskCount)s + ||| % this.config, /* NOTE(review): increase() is intended for counters; confirm QueuedTaskCount is not a gauge */ + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'The amount of tasks that are being put in queue is increasing. A high number of queued tasks can lead to increased query latencies and degraded system performance.', + description: + ( + 'The number of queued tasks on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighQueuedTaskCount)s' + ) % this.config, + }, + }, + { + alert: 'PrestoHighBlockedNodes', + expr: ||| + increase(presto_ClusterMemoryPool_general_BlockedNodes[5m]) > %(alertsHighBlockedNodesCount)s + ||| % this.config, /* NOTE(review): increase() is intended for counters; confirm BlockedNodes is not a gauge */ + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'The amount of nodes that are blocked due to memory restrictions is increasing. Blocked nodes can cause performance degradation and resource starvation.', + description: + ( + 'The number of blocked nodes on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighBlockedNodesCount)s' + ) % this.config, + }, + }, + { + alert: 'PrestoHighFailedQueriesWarning', + expr: ||| + increase(presto_QueryManager_FailedQueries_TotalCount[5m]) > %(alertsHighFailedQueryCountWarning)s + ||| % this.config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'The amount of queries failing is increasing. 
Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.', + description: + ( + 'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountWarning)s' + ) % this.config, + }, + }, + { + alert: 'PrestoHighFailedQueriesCritical', + expr: ||| + increase(presto_QueryManager_FailedQueries_TotalCount[5m]) / clamp_min(increase(presto_QueryManager_FailedQueries_TotalCount[10m]), 1) * 100 > %(alertsHighFailedQueryCountCritical)s + ||| % this.config, /* NOTE(review): this is the 5m increase as a percentage of the 10m increase, not failed over total queries; confirm intent */ + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'The amount of queries failing has increased to critical levels. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.', + description: + ( + 'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountCritical)s%%.' /* "%%" is an escaped percent sign, so the text ends with the threshold followed by "%." */ 
+ ) % this.config, + }, + }, + ], + }, + ], + }, +} diff --git a/presto-mixin/alerts/alerts.libsonnet b/presto-mixin/alerts/alerts.libsonnet deleted file mode 100644 index fb0604d1c..000000000 --- a/presto-mixin/alerts/alerts.libsonnet +++ /dev/null @@ -1,130 +0,0 @@ -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'presto-alerts', - rules: [ - { - alert: 'PrestoHighInsufficientResources', - expr: ||| - increase(presto_QueryManager_InsufficientResourcesFailures_TotalCount[5m]) > %(alertsHighInsufficientResourceErrors)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'The amount of failures that are occurring due to insufficient resources are scaling, causing saturation in the system.', - description: - ( - 'The number of insufficient resource failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighInsufficientResourceErrors)s.' - ) % $._config, - }, - }, - { - alert: 'PrestoHighTaskFailuresWarning', - expr: ||| - increase(presto_TaskManager_FailedTasks_TotalCount[5m]) > %(alertsHighTaskFailuresWarning)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'The amount of tasks that are failing is increasing, this might affect query processing and could result in incomplete or incorrect results.', - description: - ( - 'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresWarning)s.' - ) % $._config, - }, - }, - { - alert: 'PrestoHighTaskFailuresCritical', - expr: ||| - increase(presto_TaskManager_FailedTasks_TotalCount[5m]) / clamp_min(increase(presto_TaskManager_FailedTasks_TotalCount[10m]), 1) * 100 > %(alertsHighTaskFailuresCritical)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'The amount of tasks that are failing has reached a critical level. 
This might affect query processing and could result in incomplete or incorrect results.', - description: - ( - 'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresCritical)s%%s.' - ) % $._config, - }, - }, - { - alert: 'PrestoHighQueuedTaskCount', - expr: ||| - increase(presto_QueryExecution_Executor_QueuedTaskCount[5m]) > %(alertsHighQueuedTaskCount)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'The amount of tasks that are being put in queue is increasing. A high number of queued tasks can lead to increased query latencies and degraded system performance.', - description: - ( - 'The number of queued tasks on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighQueuedTaskCount)s' - ) % $._config, - }, - }, - { - alert: 'PrestoHighBlockedNodes', - expr: ||| - increase(presto_ClusterMemoryPool_general_BlockedNodes[5m]) > %(alertsHighBlockedNodesCount)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'The amount of nodes that are blocked due to memory restrictions is increasing. Blocked nodes can cause performance degradation and resource starvation.', - description: - ( - 'The number of blocked nodes on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighBlockedNodesCount)s' - ) % $._config, - }, - }, - { - alert: 'PrestoHighFailedQueriesWarning', - expr: ||| - increase(presto_QueryManager_FailedQueries_TotalCount[5m]) > %(alertsHighFailedQueryCountWarning)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'The amount of queries failing is increasing. 
Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.', - description: - ( - 'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountWarning)s' - ) % $._config, - }, - }, - { - alert: 'PrestoHighFailedQueriesCritical', - expr: ||| - increase(presto_QueryManager_FailedQueries_TotalCount[5m]) / clamp_min(increase(presto_QueryManager_FailedQueries_TotalCount[10m]), 1) * 100 > %(alertsHighFailedQueryCountCritical)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'The amount of queries failing has increased to critical levels. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.', - description: - ( - 'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountCritical)s%%s.' 
- ) % $._config, - }, - }, - ], - }, - ], - }, -} diff --git a/presto-mixin/config.libsonnet b/presto-mixin/config.libsonnet index 47a4bec50..55277785f 100644 --- a/presto-mixin/config.libsonnet +++ b/presto-mixin/config.libsonnet @@ -1,26 +1,37 @@ { - _config+:: { - enableMultiCluster: false, - prestoOverviewSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"', - prestoSelector: if self.enableMultiCluster then 'job=~"$job", instance=~"$instance", cluster=~"$cluster"' else 'job=~"$job", instance=~"$instance"', - prestoAlertSelector: if self.enableMultiCluster then 'job=~"${job:regex}", cluster=~"${cluster:regex}"' else 'job=~"${job:regex}"', - prestoOverviewLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{presto_cluster}}' else '{{presto_cluster}}', - prestoLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{instance}}' else '{{instance}}', - filterSelector: 'job=~"integrations/presto"', + local this = self, /* alias for the top-level config object, used by logLabels and passed to the signals imports */ + filteringSelector: 'job=~"integrations/presto"', /* passed to the logs dashboard as filterSelector */ + groupLabels: ['job', 'cluster', 'presto_cluster'], + instanceLabels: ['instance'], + uid: 'presto', - dashboardTags: ['presto-mixin'], - dashboardPeriod: 'now-30m', - dashboardTimezone: 'default', - dashboardRefresh: '1m', + dashboardNamePrefix: 'Presto', + dashboardTags: ['presto-mixin'], + dashboardPeriod: 'now-30m', + dashboardTimezone: 'default', + dashboardRefresh: '1m', + + // Data source configuration + metricsSource: 'prometheus', + enableLokiLogs: true, /* when true, dashboards.libsonnet also emits presto-logs.json */ + logLabels: this.groupLabels + this.instanceLabels, + extraLogLabels: [], /* appended to groupLabels for the logs dashboard */ + logsVolumeGroupBy: 'level', + showLogsVolume: true, + + // Alerts configuration + alertsHighInsufficientResourceErrors: 0, // count + alertsHighTaskFailuresWarning: 0, // count + alertsHighTaskFailuresCritical: 30, // percent + alertsHighQueuedTaskCount: 5, // count + alertsHighBlockedNodesCount: 0, // count + alertsHighFailedQueryCountWarning: 0, // count + alertsHighFailedQueryCountCritical: 30, // 
percent + + signals+: { /* each signals file evaluates to a function that is called with this config object */ + overview: (import './signals/overview.libsonnet')(this), + coordinator: (import './signals/coordinator.libsonnet')(this), + worker: (import './signals/worker.libsonnet')(this), - // alerts thresholds - alertsHighInsufficientResourceErrors: 0, // count - alertsHighTaskFailuresWarning: 0, // count - alertsHighTaskFailuresCritical: 30, // percent - alertsHighQueuedTaskCount: 5, // count - alertsHighBlockedNodesCount: 0, // count - alertsHighFailedQueryCountWarning: 0, // count - alertsHighFailedQueryCountCritical: 30, // percent - enableLokiLogs: true, }, } diff --git a/presto-mixin/dashboards.libsonnet b/presto-mixin/dashboards.libsonnet new file mode 100644 index 000000000..6bd86f2ba --- /dev/null +++ b/presto-mixin/dashboards.libsonnet @@ -0,0 +1,124 @@ +local g = import './g.libsonnet'; +local logslib = import 'logs-lib/logs/main.libsonnet'; + +{ + local root = self, /* lets the nested dashboard objects in new() reach applyCommon */ + + new(this):: /* returns an object mapping dashboard file names to dashboards built from this.config and this.grafana */ + local prefix = this.config.dashboardNamePrefix; + local links = this.grafana.links; + local tags = this.config.dashboardTags; + local uid = g.util.string.slugify(this.config.uid); + local vars = this.grafana.variables; + local annotations = this.grafana.annotations; + local refresh = this.config.dashboardRefresh; + local period = this.config.dashboardPeriod; + local timezone = this.config.dashboardTimezone; + { + 'presto-coordinator.json': + g.dashboard.new(prefix + ' coordinator') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + this.grafana.rows.coordinator, + this.grafana.rows.coordinatorJVM, + ] + ) + ) + ) + root.applyCommon( + vars.multiInstance, + uid + '_coordinator', + tags, + links { prestoCoordinator+:: {} }, /* hidden (::) so std.objectValues(links) in applyCommon leaves this link out */ + annotations, + timezone, + refresh, + period + ), + + 'presto-overview.json': + g.dashboard.new(prefix + ' overview') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [this.grafana.rows.overview] + ) + ) + ) + root.applyCommon( 
vars.multiInstance, + uid + '_overview', + tags, + links { prestoOverview+:: {} }, + annotations, + timezone, + refresh, + period + ), + + 'presto-worker.json': + g.dashboard.new(prefix + ' worker') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + this.grafana.rows.worker, + this.grafana.rows.workerJVM, + ] + ) + ) + ) + root.applyCommon( + vars.multiInstance, + uid + '_worker', + tags, + links { prestoWorker+:: {} }, + annotations, + timezone, + refresh, + period + ), + + } + + if this.config.enableLokiLogs then { /* the logs dashboard is added only when enableLokiLogs is set */ + 'presto-logs.json': + logslib.new( + prefix + ' logs', + datasourceName=this.grafana.variables.datasources.loki.name, + datasourceRegex=this.grafana.variables.datasources.loki.regex, + filterSelector=this.config.filteringSelector, + labels=this.config.groupLabels + this.config.extraLogLabels, + formatParser=null, + showLogsVolume=this.config.showLogsVolume, + ) + { + dashboards+: + { + logs+: + root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { logs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period), + }, + panels+: + { + logs+: + g.panel.logs.options.withEnableLogDetails(true) + + g.panel.logs.options.withShowTime(false) + + g.panel.logs.options.withWrapLogMessage(false), + }, + variables+: { + toArray+: [ + this.grafana.variables.datasources.prometheus { hide: 2 }, + ], + }, + }.dashboards.logs, /* keep only the logs dashboard from the object logslib.new() produced */ + } + else {}, + + applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period): /* settings shared by every dashboard above: tags, uid, links, timezone, refresh, time range, variables, annotations */ + g.dashboard.withTags(tags) + + g.dashboard.withUid(uid) + + g.dashboard.withLinks(std.objectValues(links)) + + g.dashboard.withTimezone(timezone) + + g.dashboard.withRefresh(refresh) + + g.dashboard.time.withFrom(period) + + g.dashboard.withVariables(vars) + + g.dashboard.withAnnotations(std.objectValues(annotations)), +} diff --git a/presto-mixin/dashboards/dashboards.libsonnet b/presto-mixin/dashboards/dashboards.libsonnet deleted file 
mode 100644 index 4a71626ea..000000000 --- a/presto-mixin/dashboards/dashboards.libsonnet +++ /dev/null @@ -1,4 +0,0 @@ -(import 'presto-overview.libsonnet') + -(import 'presto-coordinator.libsonnet') + -(import 'presto-worker.libsonnet') + -(import 'presto-logs-overview.libsonnet') diff --git a/presto-mixin/dashboards/presto-coordinator.libsonnet b/presto-mixin/dashboards/presto-coordinator.libsonnet deleted file mode 100644 index 8170fa77a..000000000 --- a/presto-mixin/dashboards/presto-coordinator.libsonnet +++ /dev/null @@ -1,1242 +0,0 @@ -local g = (import 'grafana-builder/grafana.libsonnet'); -local grafana = (import 'grafonnet/grafana.libsonnet'); -local dashboard = grafana.dashboard; -local template = grafana.template; -local prometheus = grafana.prometheus; - -local dashboardUid = 'presto-coordinator'; - -local promDatasourceName = 'prometheus_datasource'; -local getMatcher(cfg) = '%(prestoSelector)s' % cfg; -local getLegendMatcher(cfg) = '%(prestoLegendSelector)s' % cfg; -local promDatasource = { - uid: '${%s}' % promDatasourceName, -}; - -local nonheapMemoryUsagePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'avg (jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} / clamp_min((jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} + jvm_nonheap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}),1))', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - ), - ], - type: 'gauge', - title: 'Non-heap memory usage', - description: "An average gauge of the JVM's non-heap memory usage across coordinators.", - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: '#EAB839', - value: 0.7, - }, - { - color: 'red', - value: 0.8, - }, - ], - }, - unit: 'percentunit', - }, - overrides: [], - }, - options: 
{ - minVizHeight: 75, - minVizWidth: 75, - orientation: 'auto', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - showThresholdLabels: false, - showThresholdMarkers: true, - }, - pluginVersion: '10.2.0-62263', -}; - -local heapMemoryUsagePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'avg (jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} / clamp_min((jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} + jvm_heap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}),1))', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - ), - ], - type: 'gauge', - title: 'Heap memory usage', - description: "An average gauge of the JVM's heap memory usage across coordinators.", - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: '#EAB839', - value: 0.7, - }, - { - color: 'red', - value: 0.8, - }, - ], - }, - unit: 'percentunit', - }, - overrides: [], - }, - options: { - minVizHeight: 75, - minVizWidth: 75, - orientation: 'auto', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - showThresholdLabels: false, - showThresholdMarkers: true, - }, - pluginVersion: '10.2.0-62263', -}; - -local errorFailuresOneMinuteCountPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_InternalFailures_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - internal', - format='time_series', - ), - prometheus.target( - 'presto_QueryManager_UserErrorFailures_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - user', - ), - ], - 
type: 'timeseries', - title: 'Error failures - one minute count', - description: 'The number of internal and user error failures occurring on the instance.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local normalQueryOneMinuteCountPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_CompletedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - completed', - format='time_series', - ), - prometheus.target( - 'presto_QueryManager_RunningQueries{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - running', - ), - prometheus.target( - 'presto_QueryManager_StartedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - started', - ), - ], - type: 'timeseries', - title: 'Normal query - one minute count', - description: 'A count of completed, running, and 
started queries.', - fieldConfig: { - defaults: { - color: { - fixedColor: '#C8F2C2', - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'left', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local abnormalQueryOneMinuteCountPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_FailedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - failed', - format='time_series', - ), - prometheus.target( - 'presto_QueryManager_AbandonedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - abandoned', - ), - prometheus.target( - 'presto_QueryManager_CanceledQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - canceled', - ), - ], - type: 'timeseries', - title: 'Abnormal query - one minute count', - description: 'A count of failed, abandoned, and canceled queries.', - fieldConfig: { - defaults: { - color: { - fixedColor: '#C8F2C2', - mode: 
'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'left', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local normalQueryOneMinuteRatePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_CompletedQueries_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - completed', - format='time_series', - ), - prometheus.target( - 'presto_QueryManager_RunningQueries{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - running', - ), - prometheus.target( - 'presto_QueryManager_StartedQueries_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - started', - ), - ], - type: 'timeseries', - title: 'Normal query - one minute rate', - description: 'The rate of normally operating queries such as the completed, running, and started queries.', - fieldConfig: { - defaults: { - color: { - fixedColor: '#C8F2C2', - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - 
axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'left', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'ops', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local abnormalQueryOneMinuteRatePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'rate(presto_QueryManager_FailedQueries_TotalCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - failed', - format='time_series', - ), - prometheus.target( - 'rate(presto_QueryManager_AbandonedQueries_TotalCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - abandoned', - ), - prometheus.target( - 'rate(presto_QueryManager_CanceledQueries_TotalCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - canceled', - ), - ], - type: 'timeseries', - title: 'Abnormal query - one minute rate', - description: 'The rate of abnormal queries such as the failed, abandoned, and canceled queries.', - fieldConfig: { - defaults: { - color: { - fixedColor: '#C8F2C2', - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - 
axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'left', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'ops', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local queryExecutionTimeOneMinuteCountPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_ExecutionTime_OneMinute_P75{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - p75', - format='time_series', - ), - prometheus.target( - 'presto_QueryManager_ExecutionTime_OneMinute_P95{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - p95', - ), - prometheus.target( - 'presto_QueryManager_ExecutionTime_OneMinute_P99{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - p99', - ), - prometheus.target( - 'presto_QueryManager_ExecutionTime_OneMinute_P50{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - p50', - ), - ], - type: 'timeseries', - title: 'Query execution time - one minute count', - description: 'The time it took to run queries over the past one minute period.\n', - fieldConfig: { - 
defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'ms', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local cpuTimeConsumedOneMinuteRatePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_ConsumedCpuTimeSecs_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' ', - format='time_series', - ), - ], - type: 'timeseries', - title: 'CPU time consumed - one minute rate', - description: "CPU time consumed by Presto's QueryManager for executing queries over one-minute intervals, measured in CPU seconds used.", - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - 
showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local cpuInputThroughputOneMinuteCountPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_CpuInputByteRate_OneMinute_Total{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' ', - format='time_series', - ), - ], - type: 'timeseries', - title: 'CPU input throughput - one minute count', - description: 'The rate at which input data is being read and processed by the CPU.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'Bps', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local jvmMetricsRow = { - datasource: promDatasource, - targets: [], - 
type: 'row', - title: 'JVM metrics', - collapsed: false, -}; - -local garbageCollectionCount(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(jvm_gc_collection_count{' + matcher + ', presto_cluster=~"$presto_cluster", name="G1 Young Generation"}[$__interval:])', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' ', - interval='1m', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Garbage collection count / $__interval', - description: 'The recent increase in the number of garbage collection events for the JVM.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local garbageCollectionDurationPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'jvm_gc_duration{' + matcher + ', presto_cluster=~"$presto_cluster", name="G1 Young Generation"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Garbage collection duration', - description: 'The average duration for 
each garbage collection operation in the JVM.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'ms', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local memoryUsedPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - non heap', - format='time_series', - ), - prometheus.target( - 'jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - heap', - ), - ], - type: 'timeseries', - title: 'Memory used', - description: 'The heap and non-heap memory used by the JVM.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - 
lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'decbytes', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local memoryCommittedPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'jvm_heap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - heap', - format='time_series', - ), - prometheus.target( - 'jvm_nonheap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - non heap', - ), - ], - type: 'timeseries', - title: 'Memory committed', - description: 'The heap and non-heap memory committed.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'decbytes', - }, - overrides: [], - }, - options: { - 
legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -{ - grafanaDashboards+:: { - 'presto-coordinator.json': - dashboard.new( - 'Presto coordinator', - time_from='%s' % $._config.dashboardPeriod, - tags=($._config.dashboardTags), - timezone='%s' % $._config.dashboardTimezone, - refresh='%s' % $._config.dashboardRefresh, - description='', - uid=dashboardUid, - ) - .addLink(grafana.link.dashboards( - asDropdown=false, - title='Other Presto dashboards', - includeVars=true, - keepTime=true, - tags=($._config.dashboardTags), - )) - .addTemplates( - [ - template.datasource( - promDatasourceName, - 'prometheus', - null, - label='Data Source', - refresh='load' - ), - template.new( - 'job', - promDatasource, - 'label_values(presto_HeartbeatDetector_ActiveCount,job)', - label='Job', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'cluster', - promDatasource, - 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job"}, cluster)', - label='Cluster', - refresh=2, - includeAll=true, - multi=true, - allValues='', - hide=if $._config.enableMultiCluster then '' else 'variable', - sort=0 - ), - template.new( - 'presto_cluster', - promDatasource, - 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job"},presto_cluster)', - label='Presto cluster', - refresh=2, - includeAll=false, - multi=false, - allValues='.*', - sort=0 - ), - template.new( - 'instance', - promDatasource, - 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job", presto_cluster=~"$presto_cluster"},instance)', - label='Instance', - refresh=2, - includeAll=false, - multi=true, - allValues='', - sort=0 - ), - ] - ) - .addPanels( - [ - nonheapMemoryUsagePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 6, x: 0, y: 0 } }, - heapMemoryUsagePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 6, x: 
6, y: 0 } }, - errorFailuresOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 12, y: 0 } }, - normalQueryOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 0, y: 9 } }, - abnormalQueryOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 12, y: 9 } }, - normalQueryOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 0, y: 18 } }, - abnormalQueryOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 12, y: 18 } }, - queryExecutionTimeOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 27 } }, - cpuTimeConsumedOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 35 } }, - cpuInputThroughputOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 35 } }, - jvmMetricsRow { gridPos: { h: 1, w: 24, x: 0, y: 43 } }, - garbageCollectionCount(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 44 } }, - garbageCollectionDurationPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 44 } }, - memoryUsedPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 52 } }, - memoryCommittedPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 52 } }, - ] - ), - }, -} diff --git a/presto-mixin/dashboards/presto-logs-overview.libsonnet b/presto-mixin/dashboards/presto-logs-overview.libsonnet deleted file mode 100644 index d963c7e89..000000000 --- a/presto-mixin/dashboards/presto-logs-overview.libsonnet +++ /dev/null @@ -1,32 +0,0 @@ -local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; -local logsDashboard = 
import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; -{ - grafanaDashboards+:: - if $._config.enableLokiLogs then { - local prestoLogs = - logsDashboard.new( - 'Presto logs overview', - datasourceName='loki_datasource', - datasourceRegex='', - filterSelector=$._config.filterSelector, - labels=['job', 'presto_cluster', 'instance', 'level'], - formatParser=null, - showLogsVolume=true - ) - { - panels+: - { - logs+: - // presto logs already have timestamp - g.panel.logs.options.withShowTime(false), - }, - dashboards+: - { - logs+: g.dashboard.withLinksMixin($.grafanaDashboards['presto-overview.json'].links) - + g.dashboard.withTags($._config.dashboardTags) - + g.dashboard.withRefresh($._config.dashboardRefresh), - }, - }, - 'presto-logs.json': prestoLogs.dashboards.logs, - } else {}, -} diff --git a/presto-mixin/dashboards/presto-overview.libsonnet b/presto-mixin/dashboards/presto-overview.libsonnet deleted file mode 100644 index 6304d9c91..000000000 --- a/presto-mixin/dashboards/presto-overview.libsonnet +++ /dev/null @@ -1,993 +0,0 @@ -local g = (import 'grafana-builder/grafana.libsonnet'); -local grafana = (import 'grafonnet/grafana.libsonnet'); -local dashboard = grafana.dashboard; -local template = grafana.template; -local prometheus = grafana.prometheus; - -local dashboardUid = 'presto-overview'; - -local promDatasourceName = 'prometheus_datasource'; -local getMatcher(cfg) = '%(prestoOverviewSelector)s' % cfg; -local getLegendMatcher(cfg) = '%(prestoOverviewLegendSelector)s' % cfg; -local getAlertMatcher(cfg) = '%(prestoAlertSelector)s' % cfg; -local promDatasource = { - uid: '${%s}' % promDatasourceName, -}; - -local activeResourceManagersPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum (max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{' + matcher + ', presto_cluster=~"$presto_cluster"}))', - datasource=promDatasource, - legendFormat='Resource 
manager', - format='time_series', - ), - ], - type: 'stat', - title: 'Active resource managers', - description: 'The number of active resource managers.', - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'text', - value: 0, - }, - { - color: 'green', - value: 1, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - colorMode: 'value', - graphMode: 'none', - justifyMode: 'auto', - orientation: 'auto', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - textMode: 'auto', - }, - pluginVersion: '10.2.0-62263', -}; - -local activeCoordinatorsPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveCoordinatorCount{' + matcher + ', presto_cluster=~"$presto_cluster"}))', - datasource=promDatasource, - legendFormat='Coordinator', - format='time_series', - ), - ], - type: 'stat', - title: 'Active coordinators', - description: 'Number of broker instances across clusters.', - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 0, - }, - { - color: 'green', - value: 1, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - colorMode: 'value', - graphMode: 'none', - justifyMode: 'auto', - orientation: 'auto', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - textMode: 'auto', - }, - pluginVersion: '10.2.0-62263', -}; - -local activeWorkersPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveNodeCount{' + matcher + ', presto_cluster=~"$presto_cluster"}) 
- max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveCoordinatorCount{' + matcher + ', presto_cluster=~"$presto_cluster"}) - max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{' + matcher + ', presto_cluster=~"$presto_cluster"}))', - datasource=promDatasource, - legendFormat='Worker', - format='time_series', - ), - ], - type: 'stat', - title: 'Active workers', - description: 'The number of active workers.', - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 0, - }, - { - color: 'green', - value: 1, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - colorMode: 'value', - graphMode: 'none', - justifyMode: 'auto', - orientation: 'auto', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - textMode: 'auto', - }, - pluginVersion: '10.2.0-62263', -}; - -local inactiveWorkersPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_InactiveNodeCount{' + matcher + ', presto_cluster=~"$presto_cluster"}))', - datasource=promDatasource, - legendFormat='Worker', - format='time_series', - ), - ], - type: 'stat', - title: 'Inactive workers', - description: 'The number of inactive workers.', - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 3, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - colorMode: 'value', - graphMode: 'none', - justifyMode: 'auto', - orientation: 'auto', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - textMode: 'auto', - }, - pluginVersion: '10.2.0-62263', -}; - -local 
completedQueriesOneMinuteCountPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_CompletedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Completed queries - one minute count', - description: 'The number of completed queries.', - fieldConfig: { - defaults: { - color: { - fixedColor: '#C8F2C2', - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local alertsPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - '', - datasource=promDatasource, - legendFormat='', - format='time_series', - ), - ], - type: 'alertlist', - title: 'Alerts', - description: 'Reports firing alerts.', - options: { - alertInstanceLabelFilter: '{' + matcher + ', presto_cluster=~"${presto_cluster:regex}"}', - alertName: '', - dashboardAlerts: false, - datasource: 'Prometheus', - groupBy: [], - groupMode: 'default', - maxItems: 20, - sortOrder: 1, - stateFilter: { - 'error': true, - 
firing: true, - noData: false, - normal: true, - pending: true, - }, - viewMode: 'list', - }, -}; - -local userErrorFailuresOneMinuteRatePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_UserErrorFailures_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'User error failures - one minute rate', - description: 'The rate of user error failures occurring across the clusters.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'err/s', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local queuedQueriesPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_QueuedQueries{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Queued queries', - description: 'The number of queued queries.', - fieldConfig: { - defaults: { 
- color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'bars', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local blockedNodesPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_ClusterMemoryPool_general_BlockedNodes{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Blocked nodes', - description: 'The number of nodes that are blocked due to memory restrictions.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - 
}, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local internalErrorFailuresOneMinuteRatePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_InternalFailures_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Internal error failures - one minute rate', - description: 'The rate of internal failures occurring across the clusters.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'err/s', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local clusterMemoryDistributedBytesPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum by (presto_cluster) 
(presto_ClusterMemoryPool_general_FreeDistributedBytes{' + matcher + ', presto_cluster=~"$presto_cluster"})', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - free', - format='time_series', - ), - prometheus.target( - 'sum by (presto_cluster) (presto_ClusterMemoryPool_reserved_FreeDistributedBytes{' + matcher + ', presto_cluster=~"$presto_cluster"})', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - reserved', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Cluster memory distributed bytes', - description: 'The amount of memory available across the clusters.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'decbytes', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local InsufficientResourceFailuresOneMinuteRatePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_QueryManager_InsufficientResourcesFailures_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: ' 
Insufficient resource failures - one minute rate', - description: 'The rate that failures are occurring due to insufficient resources.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'err/s', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local dataProcessingThroughputOneMinuteRatePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum by (presto_cluster) (presto_TaskManager_InputDataSize_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"})', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - input', - format='time_series', - ), - prometheus.target( - 'sum by (presto_cluster) (presto_TaskManager_OutputDataSize_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"})', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - output', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Data processing throughput - one minute rate', - description: 'The rate at which volumes of data are being processed', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: 
false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'Bps', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -{ - grafanaDashboards+:: { - 'presto-overview.json': - dashboard.new( - 'Presto overview', - time_from='%s' % $._config.dashboardPeriod, - tags=($._config.dashboardTags), - timezone='%s' % $._config.dashboardTimezone, - refresh='%s' % $._config.dashboardRefresh, - description='', - uid=dashboardUid, - ) - .addLink(grafana.link.dashboards( - asDropdown=false, - title='Other Presto dashboards', - includeVars=true, - keepTime=true, - tags=($._config.dashboardTags), - )) - .addTemplates( - [ - template.datasource( - promDatasourceName, - 'prometheus', - null, - label='Data Source', - refresh='load' - ), - template.new( - 'job', - promDatasource, - 'label_values(presto_HeartbeatDetector_ActiveCount,job)', - label='Job', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'cluster', - promDatasource, - 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job"}, cluster)', - label='Cluster', - refresh=2, - includeAll=true, - multi=true, - allValues='', - hide=if $._config.enableMultiCluster then '' else 'variable', - sort=0 - ), - template.new( - 'presto_cluster', - promDatasource, - 
'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job"},presto_cluster)', - label='Presto cluster', - refresh=2, - includeAll=true, - multi=true, - allValues='.*', - sort=0 - ), - ] - ) - .addPanels( - [ - activeResourceManagersPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 4, w: 6, x: 0, y: 0 } }, - activeCoordinatorsPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 4, w: 6, x: 6, y: 0 } }, - activeWorkersPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 4, w: 6, x: 12, y: 0 } }, - inactiveWorkersPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 4, w: 6, x: 18, y: 0 } }, - completedQueriesOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 4 } }, - alertsPanel(getAlertMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 4 } }, - userErrorFailuresOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 12 } }, - queuedQueriesPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 12 } }, - blockedNodesPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 20 } }, - internalErrorFailuresOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 20 } }, - clusterMemoryDistributedBytesPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 28 } }, - InsufficientResourceFailuresOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 28 } }, - dataProcessingThroughputOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 24, x: 0, y: 36 } }, - ] - ), - }, -} diff --git a/presto-mixin/dashboards/presto-worker.libsonnet b/presto-mixin/dashboards/presto-worker.libsonnet deleted file mode 100644 index 
5e22ce362..000000000 --- a/presto-mixin/dashboards/presto-worker.libsonnet +++ /dev/null @@ -1,1083 +0,0 @@ -local g = (import 'grafana-builder/grafana.libsonnet'); -local grafana = (import 'grafonnet/grafana.libsonnet'); -local dashboard = grafana.dashboard; -local template = grafana.template; -local prometheus = grafana.prometheus; - -local dashboardUid = 'presto-worker'; - -local promDatasourceName = 'prometheus_datasource'; -local getMatcher(cfg) = '%(prestoSelector)s' % cfg; -local getLegendMatcher(cfg) = '%(prestoLegendSelector)s' % cfg; -local promDatasource = { - uid: '${%s}' % promDatasourceName, -}; - -local nonheapMemoryUsagePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'avg (jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} / clamp_min((jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} + jvm_nonheap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}),1))', - datasource=promDatasource, - ), - ], - type: 'gauge', - title: 'Non-heap memory usage', - description: "An average gauge of the JVM's non-heap memory usage across coordinators.", - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: '#EAB839', - value: 0.7, - }, - { - color: 'red', - value: 0.8, - }, - ], - }, - unit: 'percentunit', - }, - overrides: [], - }, - options: { - minVizHeight: 75, - minVizWidth: 75, - orientation: 'auto', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - showThresholdLabels: false, - showThresholdMarkers: true, - }, - pluginVersion: '10.2.0-62263', -}; - -local heapMemoryUsagePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'avg (jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} / 
clamp_min((jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} + jvm_heap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}),1))', - datasource=promDatasource, - ), - ], - type: 'gauge', - title: 'Heap memory usage', - description: "An average gauge of the JVM's heap memory usage across workers.", - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: '#EAB839', - value: 0.7, - }, - { - color: 'red', - value: 0.8, - }, - ], - }, - unit: 'percentunit', - }, - overrides: [], - }, - options: { - minVizHeight: 75, - minVizWidth: 75, - orientation: 'auto', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - showThresholdLabels: false, - showThresholdMarkers: true, - }, - pluginVersion: '10.2.0-62263', -}; - -local queuedTasksPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_TaskExecutor_ProcessorExecutor_QueuedTaskCount{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Queued tasks', - description: 'The number of tasks that are being queued by the task executor.', - fieldConfig: { - defaults: { - color: { - fixedColor: '#C8F2C2', - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - 
thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local failedCompletedTasksPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'rate(presto_TaskManager_FailedTasks_TotalCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - failed', - format='time_series', - ), - prometheus.target( - 'rate(presto_TaskExecutor_ProcessorExecutor_CompletedTaskCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - completed', - ), - ], - type: 'timeseries', - title: 'Failed & Completed Tasks', - description: 'The rate at which tasks have failed and completed', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'ops', - }, - overrides: [ - { - matcher: { - id: 'byFrameRefID', - options: 'A', - }, - properties: [ - { - id: 'color', - value: { - 
fixedColor: 'red', - mode: 'fixed', - }, - }, - { - id: 'custom.axisPlacement', - value: 'left', - }, - ], - }, - { - matcher: { - id: 'byFrameRefID', - options: 'B', - }, - properties: [ - { - id: 'color', - value: { - fixedColor: 'green', - mode: 'fixed', - }, - }, - { - id: 'custom.axisPlacement', - value: 'right', - }, - ], - }, - ], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local outputPositionsPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_TaskManager_OutputPositions_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Output positions - one minute rate', - description: 'The rate of rows (or records) produced by an operation.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'rowsps', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local executorPoolSizePanel(legendMatcher, 
matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_TaskManager_TaskNotificationExecutor_PoolSize{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - task notification', - format='time_series', - ), - prometheus.target( - 'presto_TaskExecutor_ProcessorExecutor_CorePoolSize{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - process executor core', - ), - prometheus.target( - 'presto_TaskExecutor_ProcessorExecutor_PoolSize{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - process executor', - ), - ], - type: 'timeseries', - title: 'Executor pool size', - description: 'The pool size of the task notification executor and process executor.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local memoryPoolPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum by (instance, 
presto_cluster) (presto_MemoryPool_general_FreeBytes{' + matcher + ', presto_cluster=~"$presto_cluster"})', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - free', - format='time_series', - ), - prometheus.target( - 'sum by (instance, presto_cluster) (presto_MemoryPool_reserved_FreeBytes{' + matcher + ', presto_cluster=~"$presto_cluster"})', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - reserved', - ), - ], - type: 'timeseries', - title: 'Memory pool', - description: 'The amount of Presto memory available.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'decbytes', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local dataProcessingThroughputOneMinuteRatePanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'presto_TaskManager_InputDataSize_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - input', - format='time_series', - ), - prometheus.target( - 'presto_TaskManager_OutputDataSize_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', - 
datasource=promDatasource, - legendFormat='' + legendMatcher + ' - output', - ), - ], - type: 'timeseries', - title: 'Data processing throughput - one minute rate', - description: 'The rate at which volumes of data are being processed', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'stepBefore', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'Bps', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local jvmMetricsRow = { - datasource: promDatasource, - targets: [], - type: 'row', - title: 'JVM metrics', - collapsed: false, -}; - -local garbageCollectionCount(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(jvm_gc_collection_count{' + matcher + ', presto_cluster=~"$presto_cluster", name="G1 Young Generation"}[$__interval:])', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - interval='1m', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Garbage collection count / $__interval', - description: 'The recent increase in the number of garbage collection events for the JVM.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - 
axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local garbageCollectionDurationPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'jvm_gc_duration{' + matcher + ', presto_cluster=~"$presto_cluster", name="G1 Young Generation"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + '', - format='time_series', - ), - ], - type: 'timeseries', - title: 'Garbage collection duration', - description: 'The average duration for each garbage collection operation in the JVM.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 
'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'ms', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local memoryUsedPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - non heap', - format='time_series', - ), - prometheus.target( - 'jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - heap', - ), - ], - type: 'timeseries', - title: 'Memory used', - description: 'The heap and non-heap memory used by the JVM.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'decbytes', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -local memoryCommittedPanel(legendMatcher, matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'jvm_heap_memory_committed{' + 
matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - heap', - format='time_series', - ), - prometheus.target( - 'jvm_nonheap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}', - datasource=promDatasource, - legendFormat='' + legendMatcher + ' - non heap', - ), - ], - type: 'timeseries', - title: 'Memory committed', - description: 'The heap and non-heap memory committed.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisBorderShow: false, - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 15, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - insertNulls: false, - lineInterpolation: 'smooth', - lineWidth: 2, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'never', - spanNulls: false, - stacking: { - group: 'A', - mode: 'normal', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'decbytes', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, -}; - -{ - grafanaDashboards+:: { - 'presto-worker.json': - dashboard.new( - 'Presto worker', - time_from='%s' % $._config.dashboardPeriod, - tags=($._config.dashboardTags), - timezone='%s' % $._config.dashboardTimezone, - refresh='%s' % $._config.dashboardRefresh, - description='', - uid=dashboardUid, - ) - .addLink(grafana.link.dashboards( - asDropdown=false, - title='Other Presto dashboards', - includeVars=true, - keepTime=true, - tags=($._config.dashboardTags), - )) - .addTemplates( - [ - template.datasource( - promDatasourceName, - 'prometheus', - null, - label='Data 
Source', - refresh='load' - ), - template.new( - 'job', - promDatasource, - 'label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount,job)', - label='Job', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'cluster', - promDatasource, - 'label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~"$job"}, cluster)', - label='Cluster', - refresh=2, - includeAll=true, - multi=true, - allValues='', - hide=if $._config.enableMultiCluster then '' else 'variable', - sort=0 - ), - template.new( - 'presto_cluster', - promDatasource, - 'label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~"$job"},presto_cluster)', - label='Presto cluster', - refresh=2, - includeAll=false, - multi=false, - allValues='.*', - sort=0 - ), - template.new( - 'instance', - promDatasource, - 'label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~"$job", presto_cluster=~"$presto_cluster"},instance)', - label='Instance', - refresh=2, - includeAll=false, - multi=true, - allValues='', - sort=0 - ), - ] - ) - .addPanels( - [ - nonheapMemoryUsagePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 3, x: 0, y: 0 } }, - heapMemoryUsagePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 3, x: 3, y: 0 } }, - queuedTasksPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 6, x: 6, y: 0 } }, - failedCompletedTasksPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 0 } }, - outputPositionsPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 8 } }, - executorPoolSizePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 8 } }, - memoryPoolPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 16 } }, - dataProcessingThroughputOneMinuteRatePanel(getLegendMatcher($._config), 
getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 16 } }, - jvmMetricsRow { gridPos: { h: 1, w: 24, x: 0, y: 24 } }, - garbageCollectionCount(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 25 } }, - garbageCollectionDurationPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 25 } }, - memoryUsedPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 33 } }, - memoryCommittedPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 33 } }, - ] - ), - }, -} diff --git a/presto-mixin/dashboards_out/presto-coordinator.json b/presto-mixin/dashboards_out/presto-coordinator.json index 213cadbcc..13adca11b 100644 --- a/presto-mixin/dashboards_out/presto-coordinator.json +++ b/presto-mixin/dashboards_out/presto-coordinator.json @@ -1,94 +1,97 @@ { - "__inputs": [ ], - "__requires": [ ], "annotations": { "list": [ ] }, - "description": "", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, "links": [ { - "asDropdown": false, - "icon": "external link", + "keepTime": true, + "title": "Presto logs", + "type": "link", + "url": "/d/presto-logs" + }, + { + "asDropdown": true, "includeVars": true, "keepTime": true, "tags": [ "presto-mixin" ], - "targetBlank": false, - "title": "Other Presto dashboards", - "type": "dashboards", - "url": "" + "title": "All dashboards", + "type": "dashboards" + }, + { + "keepTime": true, + "title": "Presto overview", + "type": "link", + "url": "/d/presto_overview" + }, + { + "keepTime": true, + "title": "Presto worker", + "type": "link", + "url": "/d/presto_worker" } ], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Coordinator", + "type": "row" + }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - 
"description": "An average gauge of the JVM's non-heap memory usage across coordinators.", + "description": "Non-heap memory usage", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], "thresholds": { - "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "light-green", + "value": 0 }, { "color": "#EAB839", "value": 0.69999999999999996 }, { - "color": "red", + "color": "light-red", "value": 0.80000000000000004 } ] }, "unit": "percentunit" - }, - "overrides": [ ] + } }, "gridPos": { - "h": 9, + "h": 8, "w": 6, "x": 0, - "y": 0 + "y": 1 }, "id": 2, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.2.0-62263", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "avg (jvm_nonheap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"} / clamp_min((jvm_nonheap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"} + jvm_nonheap_memory_committed{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}),1))", + "expr": "avg (jvm_nonheap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"} / clamp_min((jvm_nonheap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"} + jvm_nonheap_memory_committed{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}), 1))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "legendFormat": "{{job}} 
{{cluster}} {{presto_cluster}}", + "refId": "Non-heap memory usage" } ], "title": "Non-heap memory usage", @@ -96,67 +99,50 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "An average gauge of the JVM's heap memory usage across coordinators.", + "description": "Heap memory usage", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], "thresholds": { - "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "light-green", + "value": 0 }, { "color": "#EAB839", "value": 0.69999999999999996 }, { - "color": "red", + "color": "light-red", "value": 0.80000000000000004 } ] }, "unit": "percentunit" - }, - "overrides": [ ] + } }, "gridPos": { - "h": 9, + "h": 8, "w": 6, "x": 6, - "y": 0 + "y": 1 }, "id": 3, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.2.0-62263", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "avg (jvm_heap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"} / clamp_min((jvm_heap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"} + jvm_heap_memory_committed{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}),1))", + "expr": "avg (jvm_heap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"} / clamp_min((jvm_heap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"} + 
jvm_heap_memory_committed{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}), 1))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Heap memory usage" } ], "title": "Heap memory usage", @@ -164,97 +150,51 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The number of internal and user error failures occurring on the instance.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 12, - "y": 0 + "y": 1 }, "id": 4, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": 
"presto_QueryManager_InternalFailures_OneMinute_Count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_InternalFailures_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - internal" - }, - { - "datasource": { - "uid": "${prometheus_datasource}" - }, - "expr": "presto_QueryManager_UserErrorFailures_OneMinute_Count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - user" + "instant": false, + "legendFormat": "{{presto_cluster}} - internal", + "refId": "Error failures internal" } ], "title": "Error failures - one minute count", @@ -262,63 +202,24 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "A count of completed, running, and started queries.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "#C8F2C2", - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "left", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": 
"never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 0, "y": 9 @@ -327,42 +228,47 @@ "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_CompletedQueries_OneMinute_Count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_CompletedQueries_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - completed" + "instant": false, + "legendFormat": "{{presto_cluster}} - completed", + "refId": "Query completed" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_RunningQueries{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_RunningQueries{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - running" + "instant": false, + "legendFormat": "{{presto_cluster}} - running", + "refId": "Query running" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_StartedQueries_OneMinute_Count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_StartedQueries_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - 
"intervalFactor": 2, - "legendFormat": "{{instance}} - started" + "instant": false, + "legendFormat": "{{presto_cluster}} - started", + "refId": "Query started" } ], "title": "Normal query - one minute count", @@ -370,63 +276,24 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "A count of failed, abandoned, and canceled queries.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "#C8F2C2", - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "left", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 12, "y": 9 @@ -435,42 +302,47 @@ "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_FailedQueries_OneMinute_Count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": 
"presto_QueryManager_FailedQueries_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - failed" + "instant": false, + "legendFormat": "{{presto_cluster}} - failed", + "refId": "Abnormal query failed" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_AbandonedQueries_OneMinute_Count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_AbandonedQueries_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - abandoned" + "instant": false, + "legendFormat": "{{presto_cluster}} - abandoned", + "refId": "Abnormal query abandoned" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_CanceledQueries_OneMinute_Count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_CanceledQueries_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - canceled" + "instant": false, + "legendFormat": "{{presto_cluster}} - canceled", + "refId": "Abnormal query canceled" } ], "title": "Abnormal query - one minute count", @@ -478,107 +350,73 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The rate of normally operating queries such as the completed, running, and started queries.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "#C8F2C2", - "mode": "palette-classic" - 
}, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "left", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, - "unit": "ops" - }, - "overrides": [ ] + "unit": "none" + } }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 17 }, "id": 7, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_CompletedQueries_OneMinute_Rate{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_CompletedQueries_OneMinute_Rate{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - completed" + "instant": false, + "legendFormat": "{{presto_cluster}} - completed", + "refId": "Normal query completed rate" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_RunningQueries{job=~\"$job\", instance=~\"$instance\", 
presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_RunningQueries{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - running" + "instant": false, + "legendFormat": "{{presto_cluster}} - running", + "refId": "Normal query running rate" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_StartedQueries_OneMinute_Rate{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_StartedQueries_OneMinute_Rate{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - started" + "instant": false, + "legendFormat": "{{presto_cluster}} - started", + "refId": "Normal query started rate" } ], "title": "Normal query - one minute rate", @@ -586,107 +424,76 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The rate of abnormal queries such as the failed, abandoned, and canceled queries.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "#C8F2C2", - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "left", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": 
{ - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, - "unit": "ops" - }, - "overrides": [ ] + "unit": "none" + } }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 17 }, "id": 8, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "rate(presto_QueryManager_FailedQueries_TotalCount{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}[$__rate_interval])", + "expr": "rate(presto_QueryManager_FailedQueries_TotalCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - failed" + "instant": false, + "interval": "1m", + "legendFormat": "{{presto_cluster}} - failed", + "refId": "Abnormal query completed rate" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "rate(presto_QueryManager_AbandonedQueries_TotalCount{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}[$__rate_interval])", + "expr": "rate(presto_QueryManager_AbandonedQueries_TotalCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - abandoned" + "instant": false, + "interval": "1m", + "legendFormat": "{{presto_cluster}} - abandoned", + "refId": "Abnormal query abandoned rate" }, { "datasource": { + "type": 
"prometheus", "uid": "${prometheus_datasource}" }, - "expr": "rate(presto_QueryManager_CanceledQueries_TotalCount{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}[$__rate_interval])", + "expr": "rate(presto_QueryManager_CanceledQueries_TotalCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - canceled" + "instant": false, + "interval": "1m", + "legendFormat": "{{presto_cluster}} - canceled", + "refId": "Abnormal query canceled rate" } ], "title": "Abnormal query - one minute rate", @@ -694,115 +501,84 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The time it took to run queries over the past one minute period.\n", + "description": "", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "ms" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 24, "x": 0, - "y": 27 + "y": 25 }, "id": 9, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - 
"placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_ExecutionTime_OneMinute_P75{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_ExecutionTime_OneMinute_P50{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - p75" + "instant": false, + "legendFormat": "{{presto_cluster}} - p50", + "refId": "Query execution time (p50)" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_ExecutionTime_OneMinute_P95{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_ExecutionTime_OneMinute_P75{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - p95" + "instant": false, + "legendFormat": "{{presto_cluster}} - p75", + "refId": "Query execution time (p75)" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_ExecutionTime_OneMinute_P99{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_ExecutionTime_OneMinute_P95{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - p99" + "instant": false, + "legendFormat": "{{presto_cluster}} - p95", + "refId": "Query execution time (p95)" }, { "datasource": { + "type": 
"prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_ExecutionTime_OneMinute_P50{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_ExecutionTime_OneMinute_P99{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - p50" + "instant": false, + "legendFormat": "{{presto_cluster}} - p99", + "refId": "Query execution time (p99)" } ], "title": "Query execution time - one minute count", @@ -810,177 +586,103 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "CPU time consumed by Presto's QueryManager for executing queries over one-minute intervals, measured in CPU seconds used.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, - "unit": "s" - }, - "overrides": [ ] + "unit": "ms" + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 35 + "y": 33 }, "id": 10, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - 
"placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_ConsumedCpuTimeSecs_OneMinute_Count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_ConsumedCpuTimeSecs_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} " + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "CPU time consumed" } ], - "title": "CPU time consumed - one minute rate", + "title": "CPU time consumed - one minute count", "type": "timeseries" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The rate at which input data is being read and processed by the CPU.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "Bps" - }, - "overrides": [ 
] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 35 + "y": 33 }, "id": 11, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_CpuInputByteRate_OneMinute_Total{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_CpuInputByteRate_OneMinute_Total{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} " + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "CPU input throughput" } ], "title": "CPU input throughput - one minute count", @@ -988,105 +690,65 @@ }, { "collapsed": false, - "datasource": { - "uid": "${prometheus_datasource}" - }, "gridPos": { "h": 1, - "w": 24, + "w": 0, "x": 0, - "y": 43 + "y": 41 }, "id": 12, - "targets": [ ], - "title": "JVM metrics", + "panels": [ ], + "title": "JVM", "type": "row" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The recent increase in the number of garbage collection events for the JVM.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - 
"scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 44 + "y": 42 }, "id": 13, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(jvm_gc_collection_count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\", name=\"G1 Young Generation\"}[$__interval:])", + "expr": "increase(jvm_gc_collection_count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\", name=\"G1 Young Generation\"}[$__interval:] offset -$__interval)", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{instance}} " + "instant": false, + "interval": "2m", + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Garbage collector count" } ], "title": "Garbage collection count / $__interval", @@ -1094,88 +756,51 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The average duration for each garbage collection operation in the JVM.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - 
"legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "ms" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 44 + "y": 42 }, "id": 14, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_gc_duration{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\", name=\"G1 Young Generation\"}", + "expr": "jvm_gc_duration{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\", name=\"G1 Young Generation\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Garbage collection duration" } ], "title": "Garbage collection duration", @@ -1183,308 +808,212 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The heap and non-heap memory used by the JVM.", + "description": "The heap and non-heap memory committed by the JVM.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - 
"barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "decbytes" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 52 + "y": 50 }, "id": 15, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_nonheap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "jvm_heap_memory_committed{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - non heap" + "instant": false, + "legendFormat": "{{presto_cluster}} - heap", + "refId": "Heap memory committed" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_heap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "jvm_nonheap_memory_committed{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - 
heap" + "instant": false, + "legendFormat": "{{presto_cluster}} - non heap", + "refId": "Non-heap memory committed" } ], - "title": "Memory used", + "title": "Memory committed", "type": "timeseries" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The heap and non-heap memory committed.", + "description": "The heap and non-heap memory used by the JVM.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "decbytes" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 52 + "y": 50 }, "id": 16, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_heap_memory_committed{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "jvm_heap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": 
"time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - heap" + "instant": false, + "legendFormat": "{{presto_cluster}} - heap", + "refId": "Heap memory usage" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_nonheap_memory_committed{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "jvm_nonheap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - non heap" + "instant": false, + "legendFormat": "{{presto_cluster}} - non heap", + "refId": "Non-heap memory usage" } ], - "title": "Memory committed", + "title": "Memory used", "type": "timeseries" } ], "refresh": "1m", - "rows": [ ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "presto-mixin" ], "templating": { "list": [ { - "current": { }, - "hide": 0, - "label": "Data Source", + "label": "Prometheus data source", "name": "prometheus_datasource", - "options": [ ], "query": "prometheus", - "refresh": 1, "regex": "", "type": "datasource" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Job", "multi": true, "name": "job", - "options": [ ], - "query": "label_values(presto_HeartbeatDetector_ActiveCount,job)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\"}, job)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { - "allValue": "", - "current": { }, + "allValue": ".*", "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 2, "includeAll": true, "label": "Cluster", "multi": true, "name": 
"cluster", - "options": [ ], - "query": "label_values(presto_HeartbeatDetector_ActiveCount{job=~\"$job\"}, cluster)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\"}, cluster)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { - "allValue": ".*", - "current": { }, + "allValue": ".+", "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, - "includeAll": false, + "includeAll": true, "label": "Presto cluster", - "multi": false, + "multi": true, "name": "presto_cluster", - "options": [ ], - "query": "label_values(presto_HeartbeatDetector_ActiveCount{job=~\"$job\"},presto_cluster)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\"}, presto_cluster)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { - "allValue": "", - "current": { }, + "allValue": ".+", "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, - "includeAll": false, + "includeAll": true, "label": "Instance", "multi": true, "name": "instance", - "options": [ ], - "query": "label_values(presto_HeartbeatDetector_ActiveCount{job=~\"$job\", presto_cluster=~\"$presto_cluster\"},instance)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\"}, instance)", "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "hide": 2, + "label": "Loki data source", + "name": "loki_datasource", + "query": "loki", "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": 
"", - "type": "query", - "useTags": false + "type": "datasource" } ] }, @@ -1492,33 +1021,7 @@ "from": "now-30m", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, "timezone": "default", "title": "Presto coordinator", - "uid": "presto-coordinator", - "version": 0 + "uid": "presto_coordinator" } \ No newline at end of file diff --git a/presto-mixin/dashboards_out/presto-logs.json b/presto-mixin/dashboards_out/presto-logs.json index 7780666f4..3820c4474 100644 --- a/presto-mixin/dashboards_out/presto-logs.json +++ b/presto-mixin/dashboards_out/presto-logs.json @@ -1,17 +1,35 @@ { + "annotations": { + "list": [ ] + }, "links": [ { - "asDropdown": false, - "icon": "external link", + "asDropdown": true, "includeVars": true, "keepTime": true, "tags": [ "presto-mixin" ], - "targetBlank": false, - "title": "Other Presto dashboards", - "type": "dashboards", - "url": "" + "title": "All dashboards", + "type": "dashboards" + }, + { + "keepTime": true, + "title": "Presto coordinator", + "type": "link", + "url": "/d/presto_coordinator" + }, + { + "keepTime": true, + "title": "Presto overview", + "type": "link", + "url": "/d/presto_overview" + }, + { + "keepTime": true, + "title": "Presto worker", + "type": "link", + "url": "/d/presto_worker" } ], "panels": [ @@ -161,7 +179,7 @@ "type": "loki", "uid": "${loki_datasource}" }, - "expr": "sum by (level) (count_over_time({job=~\"integrations/presto\",job=~\"$job\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\",level=~\"$level\"}\n|~ \"$regex_search\"\n\n[$__auto]))\n", + "expr": "sum by (level) (count_over_time({job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\"}\n|~ \"$regex_search\"\n\n[$__auto]))\n", "legendFormat": "{{ level }}" } ], @@ -194,7 +212,7 @@ 
"enableLogDetails": true, "prettifyLogMessage": true, "showTime": false, - "wrapLogMessage": true + "wrapLogMessage": false }, "pluginVersion": "v11.0.0", "targets": [ @@ -203,7 +221,7 @@ "type": "loki", "uid": "${loki_datasource}" }, - "expr": "{job=~\"integrations/presto\",job=~\"$job\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\",level=~\"$level\"} \n|~ \"$regex_search\"\n\n\n" + "expr": "{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\"} \n|~ \"$regex_search\"\n\n\n" } ], "title": "Logs", @@ -246,25 +264,10 @@ "uid": "${loki_datasource}" }, "includeAll": true, - "label": "Presto_cluster", - "multi": true, - "name": "presto_cluster", - "query": "label_values({job=~\"integrations/presto\",job=~\"$job\"}, presto_cluster)", - "refresh": 2, - "sort": 1, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "loki", - "uid": "${loki_datasource}" - }, - "includeAll": true, - "label": "Instance", + "label": "Cluster", "multi": true, - "name": "instance", - "query": "label_values({job=~\"integrations/presto\",job=~\"$job\",presto_cluster=~\"$presto_cluster\"}, instance)", + "name": "cluster", + "query": "label_values({job=~\"integrations/presto\",job=~\"$job\"}, cluster)", "refresh": 2, "sort": 1, "type": "query" @@ -276,10 +279,10 @@ "uid": "${loki_datasource}" }, "includeAll": true, - "label": "Level", + "label": "Presto cluster", "multi": true, - "name": "level", - "query": "label_values({job=~\"integrations/presto\",job=~\"$job\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}, level)", + "name": "presto_cluster", + "query": "label_values({job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\"}, presto_cluster)", "refresh": 2, "sort": 1, "type": "query" @@ -301,14 +304,22 @@ ], "query": "", "type": "textbox" + }, + { + "hide": 2, + "label": "Prometheus data source", + "name": "prometheus_datasource", + "query": "prometheus", + "regex": "", + "type": 
"datasource" } ] }, "time": { - "from": "now-6h", + "from": "now-30m", "to": "now" }, - "timezone": "utc", - "title": "Presto logs overview", - "uid": "presto-logs-overview" + "timezone": "default", + "title": "Presto logs", + "uid": "presto-logs" } \ No newline at end of file diff --git a/presto-mixin/dashboards_out/presto-overview.json b/presto-mixin/dashboards_out/presto-overview.json index 562075b43..4fbb51614 100644 --- a/presto-mixin/dashboards_out/presto-overview.json +++ b/presto-mixin/dashboards_out/presto-overview.json @@ -1,94 +1,81 @@ { - "__inputs": [ ], - "__requires": [ ], "annotations": { "list": [ ] }, - "description": "", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, "links": [ { - "asDropdown": false, - "icon": "external link", + "keepTime": true, + "title": "Presto logs", + "type": "link", + "url": "/d/presto-logs" + }, + { + "asDropdown": true, "includeVars": true, "keepTime": true, "tags": [ "presto-mixin" ], - "targetBlank": false, - "title": "Other Presto dashboards", - "type": "dashboards", - "url": "" + "title": "All dashboards", + "type": "dashboards" + }, + { + "keepTime": true, + "title": "Presto coordinator", + "type": "link", + "url": "/d/presto_coordinator" + }, + { + "keepTime": true, + "title": "Presto worker", + "type": "link", + "url": "/d/presto_worker" } ], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Overview", + "type": "row" + }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "The number of active resource managers.", + "description": "Active resource managers", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "text", - "value": 0 - }, - { - "color": 
"green", - "value": 1 - } - ] - }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 4, "w": 6, "x": 0, - "y": 0 + "y": 1 }, "id": 2, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.2.0-62263", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum (max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}))", + "expr": "sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Resource manager" + "instant": false, + "legendFormat": "Resource manager", + "refId": "Active resource managers" } ], "title": "Active resource managers", @@ -96,67 +83,34 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "Number of broker instances across clusters.", + "description": "Active coordinators", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0 - }, - { - "color": "green", - "value": 1 - } - ] - }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 4, "w": 6, "x": 6, - "y": 0 + "y": 1 }, "id": 3, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": 
"auto" - }, - "pluginVersion": "10.2.0-62263", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveCoordinatorCount{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}))", + "expr": "sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveCoordinatorCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Coordinator" + "instant": false, + "legendFormat": "Coordinator", + "refId": "Active coordinators" } ], "title": "Active coordinators", @@ -164,67 +118,34 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "The number of active workers.", + "description": "Active workers", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0 - }, - { - "color": "green", - "value": 1 - } - ] - }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 4, "w": 6, "x": 12, - "y": 0 + "y": 1 }, "id": 4, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.2.0-62263", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}) - max by (presto_cluster) 
(presto_metadata_DiscoveryNodeManager_ActiveCoordinatorCount{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}) - max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}))", + "expr": "sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Worker" + "instant": false, + "legendFormat": "Worker", + "refId": "Active workers" } ], "title": "Active workers", @@ -232,63 +153,34 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "The number of inactive workers.", + "description": "Inactive workers", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 3 - } - ] - }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 4, "w": 6, "x": 18, - "y": 0 + "y": 1 }, "id": 5, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "10.2.0-62263", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_InactiveNodeCount{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}))", + "expr": "sum(max by (presto_cluster) 
(presto_metadata_DiscoveryNodeManager_InactiveNodeCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Worker" + "instant": false, + "legendFormat": "{{presto_cluster}} - inactive", + "refId": "Inactive workers" } ], "title": "Inactive workers", @@ -296,113 +188,68 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The number of completed queries.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "#C8F2C2", - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 4 + "y": 9 }, "id": 6, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": 
"presto_QueryManager_CompletedQueries_OneMinute_Count{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_CompletedQueries_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}}" + "instant": false, + "legendFormat": "{{presto_cluster}} - completed", + "refId": "Completed queries - one minute count" } ], "title": "Completed queries - one minute count", "type": "timeseries" }, { - "datasource": { - "uid": "${prometheus_datasource}" - }, - "description": "Reports firing alerts.", "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 4 + "y": 9 }, "id": 7, "options": { "alertInstanceLabelFilter": "{job=~\"${job:regex}\", presto_cluster=~\"${presto_cluster:regex}\"}", "alertName": "", "dashboardAlerts": false, - "datasource": "Prometheus", - "groupBy": [ ], - "groupMode": "default", "maxItems": 20, "sortOrder": 1, "stateFilter": { @@ -414,104 +261,57 @@ }, "viewMode": "list" }, - "targets": [ - { - "datasource": { - "uid": "${prometheus_datasource}" - }, - "expr": "", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], + "targets": [ ], "title": "Alerts", "type": "alertlist" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The rate of user error failures occurring across the clusters.", + "description": "The rate of user error failures occurring across clusters.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": 
"stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "err/s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 12 + "y": 17 }, "id": 8, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_UserErrorFailures_OneMinute_Rate{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_UserErrorFailures_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}}" + "instant": false, + "legendFormat": "{{presto_cluster}} - user", + "refId": "User error failures - one minute count" } ], "title": "User error failures - one minute rate", @@ -519,88 +319,51 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of queued queries.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "bars", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": 
false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 12 + "y": 17 }, "id": 9, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_QueuedQueries{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_QueuedQueries{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}}" + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Queued queries" } ], "title": "Queued queries", @@ -608,88 +371,51 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of nodes that are blocked due to memory restrictions.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - 
"insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 20 + "y": 25 }, "id": 10, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_ClusterMemoryPool_general_BlockedNodes{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_ClusterMemoryPool_general_BlockedNodes{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}}" + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Blocked nodes" } ], "title": "Blocked nodes", @@ -697,88 +423,51 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The rate of internal failures occurring across the clusters.", + "description": "The rate of internal error failures occurring across clusters.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", 
- "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "err/s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 20 + "y": 25 }, "id": 11, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_QueryManager_InternalFailures_OneMinute_Rate{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_InternalFailures_OneMinute_Count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}}" + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Internal error failures - one minute count" } ], "title": "Internal error failures - one minute rate", @@ -786,97 +475,62 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The amount of memory available across the clusters.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - 
"axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "decbytes" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 28 + "y": 33 }, "id": 12, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum by (presto_cluster) (presto_ClusterMemoryPool_general_FreeDistributedBytes{job=~\"$job\", presto_cluster=~\"$presto_cluster\"})", + "expr": "sum by (presto_cluster) (presto_ClusterMemoryPool_reserved_FreeDistributedBytes{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}} - free" + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Cluster memory distributed bytes reserved" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum by (presto_cluster) (presto_ClusterMemoryPool_reserved_FreeDistributedBytes{job=~\"$job\", presto_cluster=~\"$presto_cluster\"})", + "expr": "sum by 
(presto_cluster) (presto_ClusterMemoryPool_general_FreeDistributedBytes{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}} - reserved" + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Cluster memory distributed bytes free" } ], "title": "Cluster memory distributed bytes", @@ -884,186 +538,114 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The rate that failures are occurring due to insufficient resources.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "err/s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 28 + "y": 33 }, "id": 13, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": 
"${prometheus_datasource}" }, - "expr": "presto_QueryManager_InsufficientResourcesFailures_OneMinute_Rate{job=~\"$job\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_QueryManager_InsufficientResourcesFailures_OneMinute_Rate{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}}" + "instant": false, + "legendFormat": "{{job}} {{cluster}} {{presto_cluster}}", + "refId": "Insufficient resource failures - one minute rate" } ], - "title": " Insufficient resource failures - one minute rate", + "title": "Insufficient resource failures - one minute rate", "type": "timeseries" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The rate at which volumes of data are being processed", + "description": "The rate at which volumes of data are being processed.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "Bps" - }, - "overrides": [ ] + } }, "gridPos": { "h": 9, "w": 24, "x": 0, - "y": 36 + "y": 42 }, "id": 
14, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum by (presto_cluster) (presto_TaskManager_InputDataSize_OneMinute_Rate{job=~\"$job\", presto_cluster=~\"$presto_cluster\"})", + "expr": "sum by (presto_cluster) (presto_TaskManager_InputDataSize_OneMinute_Rate{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}} - input" + "instant": false, + "legendFormat": "{{presto_cluster}} - input", + "refId": "Data processing throughput input - one minute rate" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum by (presto_cluster) (presto_TaskManager_OutputDataSize_OneMinute_Rate{job=~\"$job\", presto_cluster=~\"$presto_cluster\"})", + "expr": "sum by (presto_cluster) (presto_TaskManager_OutputDataSize_OneMinute_Rate{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{presto_cluster}} - output" + "instant": false, + "legendFormat": "{{presto_cluster}} - output", + "refId": "Data processing throughput output - one minute rate" } ], "title": "Data processing throughput - one minute rate", @@ -1071,90 +653,86 @@ } ], "refresh": "1m", - "rows": [ ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "presto-mixin" ], "templating": { "list": [ { - "current": { }, - "hide": 0, - "label": "Data Source", + "label": "Prometheus data source", "name": "prometheus_datasource", - "options": [ ], "query": "prometheus", - "refresh": 1, "regex": "", 
"type": "datasource" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Job", "multi": true, "name": "job", - "options": [ ], - "query": "label_values(presto_HeartbeatDetector_ActiveCount,job)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\"}, job)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { - "allValue": "", - "current": { }, + "allValue": ".*", "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 2, "includeAll": true, "label": "Cluster", "multi": true, "name": "cluster", - "options": [ ], - "query": "label_values(presto_HeartbeatDetector_ActiveCount{job=~\"$job\"}, cluster)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\"}, cluster)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { - "allValue": ".*", - "current": { }, + "allValue": ".+", "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Presto cluster", "multi": true, "name": "presto_cluster", - "options": [ ], - "query": "label_values(presto_HeartbeatDetector_ActiveCount{job=~\"$job\"},presto_cluster)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\"}, presto_cluster)", "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": ".+", + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "includeAll": true, + "label": "Instance", + "multi": true, + 
"name": "instance", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\"}, instance)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "hide": 2, + "label": "Loki data source", + "name": "loki_datasource", + "query": "loki", "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "datasource" } ] }, @@ -1162,33 +740,7 @@ "from": "now-30m", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, "timezone": "default", "title": "Presto overview", - "uid": "presto-overview", - "version": 0 + "uid": "presto_overview" } \ No newline at end of file diff --git a/presto-mixin/dashboards_out/presto-worker.json b/presto-mixin/dashboards_out/presto-worker.json index e711aec0d..fa75f5ebe 100644 --- a/presto-mixin/dashboards_out/presto-worker.json +++ b/presto-mixin/dashboards_out/presto-worker.json @@ -1,94 +1,97 @@ { - "__inputs": [ ], - "__requires": [ ], "annotations": { "list": [ ] }, - "description": "", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, "links": [ { - "asDropdown": false, - "icon": "external link", + "keepTime": true, + "title": "Presto logs", + "type": "link", + "url": "/d/presto-logs" + }, + { + "asDropdown": true, "includeVars": true, "keepTime": true, "tags": [ "presto-mixin" ], - "targetBlank": false, - "title": "Other Presto dashboards", - "type": "dashboards", - "url": "" + "title": "All dashboards", + "type": "dashboards" + }, + { + "keepTime": true, + "title": "Presto coordinator", + "type": "link", + "url": "/d/presto_coordinator" + }, + { + "keepTime": true, + "title": "Presto overview", 
+ "type": "link", + "url": "/d/presto_overview" } ], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Worker", + "type": "row" + }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "An average gauge of the JVM's non-heap memory usage across coordinators.", + "description": "Non-heap memory usage", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], "thresholds": { - "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "light-green", + "value": 0 }, { "color": "#EAB839", "value": 0.69999999999999996 }, { - "color": "red", + "color": "light-red", "value": 0.80000000000000004 } ] }, "unit": "percentunit" - }, - "overrides": [ ] + } }, "gridPos": { - "h": 8, - "w": 3, + "h": 4, + "w": 12, "x": 0, - "y": 0 + "y": 1 }, "id": 2, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.2.0-62263", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "avg (jvm_nonheap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"} / clamp_min((jvm_nonheap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"} + jvm_nonheap_memory_committed{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}),1))", + "expr": "avg (jvm_nonheap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"} / 
clamp_min((jvm_nonheap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"} + jvm_nonheap_memory_committed{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}), 1))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Non-heap memory usage" } ], "title": "Non-heap memory usage", @@ -96,67 +99,50 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "An average gauge of the JVM's heap memory usage across workers.", + "description": "Heap memory usage", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], "thresholds": { - "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "light-green", + "value": 0 }, { "color": "#EAB839", "value": 0.69999999999999996 }, { - "color": "red", + "color": "light-red", "value": 0.80000000000000004 } ] }, "unit": "percentunit" - }, - "overrides": [ ] + } }, "gridPos": { - "h": 8, - "w": 3, - "x": 3, - "y": 0 + "h": 4, + "w": 12, + "x": 12, + "y": 1 }, "id": 3, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.2.0-62263", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "avg (jvm_heap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"} / clamp_min((jvm_heap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"} + jvm_heap_memory_committed{job=~\"$job\", instance=~\"$instance\", 
presto_cluster=~\"$presto_cluster\"}),1))", + "expr": "avg (jvm_heap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"} / clamp_min((jvm_heap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"} + jvm_heap_memory_committed{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}), 1))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Heap memory usage" } ], "title": "Heap memory usage", @@ -164,89 +150,51 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of tasks that are being queued by the task executor.", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "#C8F2C2", - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, - "unit": "none" - }, - "overrides": [ ] + "unit": "ops" + } }, "gridPos": { "h": 8, - "w": 6, - "x": 6, - "y": 0 + "w": 12, + "x": 0, + "y": 9 }, "id": 4, "options": { "legend": { "calcs": [ ], - 
"displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_TaskExecutor_ProcessorExecutor_QueuedTaskCount{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_TaskExecutor_ProcessorExecutor_QueuedTaskCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Queued tasks" } ], "title": "Queued tasks", @@ -254,332 +202,190 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The rate at which tasks have failed and completed", + "description": "The rate at which tasks have failed and completed.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "ops" - }, - "overrides": [ - { - "matcher": { - "id": "byFrameRefID", - 
"options": "A" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - }, - { - "id": "custom.axisPlacement", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byFrameRefID", - "options": "B" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - }, - { - "id": "custom.axisPlacement", - "value": "right" - } - ] - } - ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 0 + "y": 9 }, "id": 5, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "rate(presto_TaskManager_FailedTasks_TotalCount{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}[$__rate_interval])", + "expr": "rate(presto_TaskExecutor_ProcessorExecutor_FailedTaskCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - failed" + "instant": false, + "interval": "1m", + "legendFormat": "{{instance}} - failed", + "refId": "Failed tasks" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "rate(presto_TaskExecutor_ProcessorExecutor_CompletedTaskCount{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}[$__rate_interval])", + "expr": "rate(presto_TaskExecutor_ProcessorExecutor_CompletedTaskCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - completed" + "instant": false, + "interval": "1m", + 
"legendFormat": "{{instance}} - completed", + "refId": "Completed tasks" } ], - "title": "Failed & Completed Tasks", + "title": "Failed & completed tasks", "type": "timeseries" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The rate of rows (or records) produced by an operation.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } + "showPoints": "never" }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "rowsps" - }, - "overrides": [ ] + "unit": "short" + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 8 + "y": 17 }, "id": 6, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_TaskManager_OutputPositions_OneMinute_Rate{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": 
"presto_TaskManager_OutputPositions_OneMinute_Rate{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Output positions" } ], - "title": "Output positions - one minute rate", + "title": "Output positions", "type": "timeseries" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The pool size of the task notification executor and process executor.", + "description": "The pool size of the task notification executor.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 17 }, "id": 7, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": 
"presto_TaskManager_TaskNotificationExecutor_PoolSize{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_TaskExecutor_ProcessorExecutor_PoolSize{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - task notification" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Executor pool size" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_TaskExecutor_ProcessorExecutor_CorePoolSize{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_TaskExecutor_ProcessorExecutor_CorePoolSize{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - process executor core" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Process executor core pool size" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_TaskExecutor_ProcessorExecutor_PoolSize{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_TaskExecutor_ProcessorExecutor_PoolSize{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - process executor" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Process executor pool size" } ], "title": "Executor pool size", @@ -587,97 +393,62 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The amount of Presto memory available.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, 
"custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "decbytes" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 16 + "y": 25 }, "id": 8, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum by (instance, presto_cluster) (presto_MemoryPool_general_FreeBytes{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"})", + "expr": "presto_MemoryPool_general_FreeBytes{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - free" + "instant": false, + "legendFormat": "{{instance}} - free", + "refId": "Memory pool free bytes" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum by (instance, presto_cluster) (presto_MemoryPool_reserved_FreeBytes{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"})", + "expr": 
"presto_MemoryPool_reserved_FreeBytes{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - reserved" + "instant": false, + "legendFormat": "{{instance}} - reserved free", + "refId": "Memory pool reserved free bytes" } ], "title": "Memory pool", @@ -685,203 +456,128 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The rate at which volumes of data are being processed", + "description": "The rate at which volumes of data are being processed.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepBefore", + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "Bps" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 16 + "y": 25 }, "id": 9, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - 
"expr": "presto_TaskManager_InputDataSize_OneMinute_Rate{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_TaskManager_InputDataSize_OneMinute_Rate{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - input" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Data processing input" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "presto_TaskManager_OutputDataSize_OneMinute_Rate{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "presto_TaskManager_OutputDataSize_OneMinute_Rate{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - output" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Data processing output" } ], - "title": "Data processing throughput - one minute rate", + "title": "Data processing throughput", "type": "timeseries" }, { "collapsed": false, - "datasource": { - "uid": "${prometheus_datasource}" - }, "gridPos": { "h": 1, - "w": 24, + "w": 0, "x": 0, - "y": 24 + "y": 33 }, "id": 10, - "targets": [ ], - "title": "JVM metrics", + "panels": [ ], + "title": "JVM", "type": "row" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The recent increase in the number of garbage collection events for the JVM.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - 
"tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 25 + "y": 34 }, "id": 11, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(jvm_gc_collection_count{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\", name=\"G1 Young Generation\"}[$__interval:])", + "expr": "increase(jvm_gc_collection_count{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}[$__interval:] offset -$__interval)", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{instance}}", + "refId": "Garbage collection count" } ], "title": "Garbage collection count / $__interval", @@ -889,88 +585,52 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The average duration for each garbage collection operation in the JVM.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": 
"auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "ms" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 25 + "y": 34 }, "id": 12, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_gc_duration{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\", name=\"G1 Young Generation\"}", + "expr": "jvm_gc_duration{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{instance}}", + "refId": "JVM GC duration" } ], "title": "Garbage collection duration", @@ -978,308 +638,212 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The heap and non-heap memory used by the JVM.", + "description": "The heap and non-heap memory committed by the JVM.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - 
"axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "decbytes" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 33 + "y": 42 }, "id": 13, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_nonheap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "jvm_heap_memory_committed{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - non heap" + "instant": false, + "legendFormat": "{{instance}} - heap", + "refId": "JVM heap memory committed" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_heap_memory_used{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "jvm_nonheap_memory_committed{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": 
"time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - heap" + "instant": false, + "legendFormat": "{{instance}} - non heap", + "refId": "JVM non-heap memory committed" } ], - "title": "Memory used", + "title": "Memory committed", "type": "timeseries" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "description": "The heap and non-heap memory committed.", + "description": "The heap and non-heap memory used by the JVM.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 15, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, + "fillOpacity": 30, + "gradientMode": "opacity", "lineInterpolation": "smooth", "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "showPoints": "never" }, "unit": "decbytes" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 33 + "y": 42 }, "id": 14, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_heap_memory_committed{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": 
"jvm_heap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - heap" + "instant": false, + "legendFormat": "{{instance}} - heap", + "refId": "JVM heap memory usage" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "jvm_nonheap_memory_committed{job=~\"$job\", instance=~\"$instance\", presto_cluster=~\"$presto_cluster\"}", + "expr": "jvm_nonheap_memory_used{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - non heap" + "instant": false, + "legendFormat": "{{instance}} - non heap", + "refId": "JVM non-heap memory usage" } ], - "title": "Memory committed", + "title": "Memory used", "type": "timeseries" } ], "refresh": "1m", - "rows": [ ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "presto-mixin" ], "templating": { "list": [ { - "current": { }, - "hide": 0, - "label": "Data Source", + "label": "Prometheus data source", "name": "prometheus_datasource", - "options": [ ], "query": "prometheus", - "refresh": 1, "regex": "", "type": "datasource" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Job", "multi": true, "name": "job", - "options": [ ], - "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount,job)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\"}, job)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { - "allValue": "", - "current": { }, + 
"allValue": ".*", "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 2, "includeAll": true, "label": "Cluster", "multi": true, "name": "cluster", - "options": [ ], - "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~\"$job\"}, cluster)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\"}, cluster)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { - "allValue": ".*", - "current": { }, + "allValue": ".+", "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, - "includeAll": false, + "includeAll": true, "label": "Presto cluster", - "multi": false, + "multi": true, "name": "presto_cluster", - "options": [ ], - "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~\"$job\"},presto_cluster)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\"}, presto_cluster)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { - "allValue": "", - "current": { }, + "allValue": ".+", "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, - "includeAll": false, + "includeAll": true, "label": "Instance", "multi": true, "name": "instance", - "options": [ ], - "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~\"$job\", presto_cluster=~\"$presto_cluster\"},instance)", + "query": "label_values(presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{job=~\"integrations/presto\",job=~\"$job\",cluster=~\"$cluster\",presto_cluster=~\"$presto_cluster\"}, instance)", 
"refresh": 2, + "sort": 1, + "type": "query" + }, + { + "hide": 2, + "label": "Loki data source", + "name": "loki_datasource", + "query": "loki", "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "datasource" } ] }, @@ -1287,33 +851,7 @@ "from": "now-30m", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, "timezone": "default", "title": "Presto worker", - "uid": "presto-worker", - "version": 0 + "uid": "presto_worker" } \ No newline at end of file diff --git a/presto-mixin/g.libsonnet b/presto-mixin/g.libsonnet new file mode 100644 index 000000000..e6a2060ee --- /dev/null +++ b/presto-mixin/g.libsonnet @@ -0,0 +1 @@ +import 'github.com/grafana/grafonnet/gen/grafonnet-v11.4.0/main.libsonnet' diff --git a/presto-mixin/jsonnetfile.json b/presto-mixin/jsonnetfile.json index e8255b65e..6354d0e12 100644 --- a/presto-mixin/jsonnetfile.json +++ b/presto-mixin/jsonnetfile.json @@ -1,33 +1,42 @@ { "version": 1, "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet.git", - "subdir": "gen/grafonnet-latest" - } + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" }, - "version": "main" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" - } + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" + } + }, + "version": "master" }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "logs-lib" - } + { + "source": { + "git": { + "remote": 
"https://github.com/grafana/jsonnet-libs.git", + "subdir": "grafana-cloud-integration-utils" + } + }, + "version": "master" }, - "version": "master" - } + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib" + } + }, + "version": "master" + } ], "legacyImports": true } diff --git a/presto-mixin/links.libsonnet b/presto-mixin/links.libsonnet new file mode 100644 index 000000000..5a2313490 --- /dev/null +++ b/presto-mixin/links.libsonnet @@ -0,0 +1,30 @@ +local g = import './g.libsonnet'; + +{ + local link = g.dashboard.link, + new(this): + { + prestoOverview: + link.link.new('Presto overview', '/d/' + this.grafana.dashboards['presto-overview.json'].uid) + + link.link.options.withKeepTime(true), + + prestoCoordinator: + link.link.new('Presto coordinator', '/d/' + this.grafana.dashboards['presto-coordinator.json'].uid) + + link.link.options.withKeepTime(true), + + prestoWorker: + link.link.new('Presto worker', '/d/' + this.grafana.dashboards['presto-worker.json'].uid) + + link.link.options.withKeepTime(true), + + otherDashboards: + link.dashboards.new('All dashboards', this.config.dashboardTags) + + link.dashboards.options.withIncludeVars(true) + + link.dashboards.options.withKeepTime(true) + + link.dashboards.options.withAsDropdown(true), + } + + if this.config.enableLokiLogs then { + logs: + link.link.new('Presto logs', '/d/' + this.grafana.dashboards['presto-logs.json'].uid) + + link.link.options.withKeepTime(true), + } else {}, +} diff --git a/presto-mixin/main.libsonnet b/presto-mixin/main.libsonnet new file mode 100644 index 000000000..94f10216c --- /dev/null +++ b/presto-mixin/main.libsonnet @@ -0,0 +1,49 @@ +local alerts = import './alerts.libsonnet'; +local config = import './config.libsonnet'; +local dashboards = import './dashboards.libsonnet'; +local links = import './links.libsonnet'; +local panels = import './panels.libsonnet'; +local rows = import './rows.libsonnet'; +local commonlib = import 
'common-lib/common/main.libsonnet'; + +{ + withConfigMixin(config): { + config+: config, + }, + + new(): { + + local this = self, + config: config, + + signals: + { + [sig]: commonlib.signals.unmarshallJsonMulti( + this.config.signals[sig], + type=this.config.metricsSource + ) + for sig in std.objectFields(this.config.signals) + }, + + grafana: { + variables: commonlib.variables.new( + filteringSelector=this.config.filteringSelector, + groupLabels=this.config.groupLabels, + instanceLabels=this.config.instanceLabels, + varMetric='presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount', + customAllValue='.+', + enableLokiLogs=this.config.enableLokiLogs, + ), + annotations: {}, + links: links.new(this), + panels: panels.new(this), + dashboards: dashboards.new(this), + rows: rows.new(this), + }, + + prometheus: { + alerts: alerts.new(this), + recordingRules: {}, + }, + }, +} diff --git a/presto-mixin/mixin.libsonnet b/presto-mixin/mixin.libsonnet index 4d987cf31..9d1ebfb1f 100644 --- a/presto-mixin/mixin.libsonnet +++ b/presto-mixin/mixin.libsonnet @@ -1,3 +1,34 @@ -(import 'dashboards/dashboards.libsonnet') + -(import 'alerts/alerts.libsonnet') + -(import 'config.libsonnet') +local prestolib = import './main.libsonnet'; +local config = (import './config.libsonnet'); +local util = import 'grafana-cloud-integration-utils/util.libsonnet'; + +local presto = + prestolib.new() + + prestolib.withConfigMixin( + { + filteringSelector: config.filteringSelector, + uid: config.uid, + enableLokiLogs: config.enableLokiLogs, + } + ); + +local optional_labels = { + cluster+: { + allValue: '.*', + }, + presto_cluster+: { + label: 'Presto cluster', + }, +}; + +{ + grafanaDashboards+:: { + [fname]: + local dashboard = presto.grafana.dashboards[fname]; + dashboard + util.patch_variables(dashboard, optional_labels) + + for fname in std.objectFields(presto.grafana.dashboards) + }, + prometheusAlerts+:: presto.prometheus.alerts, + prometheusRules+:: 
presto.prometheus.recordingRules, +} diff --git a/presto-mixin/panels.libsonnet b/presto-mixin/panels.libsonnet new file mode 100644 index 000000000..f583223ea --- /dev/null +++ b/presto-mixin/panels.libsonnet @@ -0,0 +1,352 @@ +local g = import './g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + new(this):: + { + local signals = this.signals, + + /* + ------------------------- + Overview + ------------------------- + */ + overviewActiveResourceManagers: + g.panel.stat.new('Active resource managers') + + g.panel.stat.queryOptions.withTargets([signals.overview.activeResourceManagers.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Active resource managers') + + g.panel.stat.standardOptions.withUnit('none'), + + overviewActiveCoordinators: + g.panel.stat.new('Active coordinators') + + g.panel.stat.queryOptions.withTargets([signals.overview.activeCoordinators.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Active coordinators') + + g.panel.stat.standardOptions.withUnit('none'), + + overviewActiveWorkers: + g.panel.stat.new('Active workers') + + g.panel.stat.queryOptions.withTargets([signals.overview.activeWorkers.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Active workers') + + g.panel.stat.standardOptions.withUnit('none'), + + overviewInactiveWorkers: + g.panel.stat.new('Inactive workers') + + g.panel.stat.queryOptions.withTargets([signals.overview.inactiveWorkers.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Inactive workers') + + g.panel.stat.standardOptions.withUnit('none'), + + overviewCompletedQueries: + commonlib.panels.generic.timeSeries.base.new('Completed queries - one minute count', targets=[signals.overview.completedQueries.asTarget()]) + + g.panel.timeSeries.standardOptions.withUnit('none'), + + overviewAlertsPanel: { + title: 'Alerts', + type: 'alertlist', + targets: [], + options: { + alertInstanceLabelFilter: '{job=~"${job:regex}", 
presto_cluster=~"${presto_cluster:regex}"}', + alertName: '', + dashboardAlerts: false, + maxItems: 20, + sortOrder: 1, + stateFilter: { + 'error': true, + firing: true, + noData: false, + normal: true, + pending: true, + }, + viewMode: 'list', + }, + }, + + overviewUserErrorFailureRate: + commonlib.panels.generic.timeSeries.base.new('User error failures - one minute rate', + targets=[signals.overview.userErrorFailures.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The rate of user error failures occurring across clusters.') + + g.panel.timeSeries.standardOptions.withUnit('err/s'), + + overviewQueuedQueries: + commonlib.panels.generic.timeSeries.base.new('Queued queries', + targets=[signals.overview.queuedQueries.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The number of queued queries.') + + g.panel.timeSeries.standardOptions.withUnit('none'), + + + overviewBlockedNodes: + commonlib.panels.generic.timeSeries.base.new('Blocked nodes', + targets=[signals.overview.blockedNodes.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The number of nodes that are blocked due to memory restrictions.') + + g.panel.timeSeries.standardOptions.withUnit('none'), + + overviewInternalErrorFailureRate: + commonlib.panels.generic.timeSeries.base.new('Internal error failures - one minute rate', + targets=[signals.overview.internalErrorFailures.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The rate of internal error failures occurring across clusters.') + + g.panel.timeSeries.standardOptions.withUnit('err/s'), + + + overviewClusterMemoryDistributed: + commonlib.panels.generic.timeSeries.base.new('Cluster memory distributed bytes', + targets=[signals.overview.clusterMemoryDistributedBytesReserved.asTarget(), signals.overview.clusterMemoryDistributedBytesFree.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The amount of memory available across the clusters.') + + 
g.panel.timeSeries.standardOptions.withUnit('decbytes'), + + + overviewInsufficientResourceFailures: + commonlib.panels.generic.timeSeries.base.new('Insufficient resource failures - one minute rate', + targets=[signals.overview.insufficientResourceFailures.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The rate that failures are occurring due to insufficient resources.') + + g.panel.timeSeries.standardOptions.withUnit('err/s'), + + overviewDataProcessingThroughput: + commonlib.panels.generic.timeSeries.base.new('Data processing throughput - one minute rate', + targets=[signals.overview.dataProcessingThroughputInput.asTarget(), signals.overview.dataProcessingThroughputOutput.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The rate at which volumes of data are being processed.') + + g.panel.timeSeries.standardOptions.withUnit('Bps'), + + /* + ------------------------- + Coordinator + ------------------------- + */ + + coordinatorNonheapMemoryUsage: + g.panel.gauge.new('Non-heap memory usage') + + g.panel.gauge.queryOptions.withTargets(signals.coordinator.nonheapMemoryUsage.asTarget()) + + g.panel.gauge.panelOptions.withDescription('Non-heap memory usage') + + g.panel.gauge.standardOptions.withUnit('percentunit') + + g.panel.gauge.standardOptions.thresholds.withSteps([ + g.panel.gauge.standardOptions.threshold.step.withColor('light-green') + + g.panel.gauge.standardOptions.threshold.step.withValue(0), + g.panel.gauge.standardOptions.threshold.step.withColor('#EAB839') + + g.panel.gauge.standardOptions.threshold.step.withValue(0.7), + g.panel.gauge.standardOptions.threshold.step.withColor('light-red') + + g.panel.gauge.standardOptions.threshold.step.withValue(0.8), + ]), + + coordinatorHeapMemoryUsage: + g.panel.gauge.new('Heap memory usage') + + g.panel.gauge.queryOptions.withTargets(signals.coordinator.heapMemoryUsage.asTarget()) + + g.panel.gauge.panelOptions.withDescription('Heap memory usage') + + 
g.panel.gauge.standardOptions.thresholds.withSteps([ + g.panel.gauge.standardOptions.threshold.step.withColor('light-green') + + g.panel.gauge.standardOptions.threshold.step.withValue(0), + g.panel.gauge.standardOptions.threshold.step.withColor('#EAB839') + + g.panel.gauge.standardOptions.threshold.step.withValue(0.7), + g.panel.gauge.standardOptions.threshold.step.withColor('light-red') + + g.panel.gauge.standardOptions.threshold.step.withValue(0.8), + ]) + + g.panel.gauge.standardOptions.withUnit('percentunit'), + + coordinatorErrorFailures: + commonlib.panels.generic.timeSeries.base.new('Error failures - one minute count', targets=[signals.coordinator.errorFailuresInternal.asTarget()]) + + g.panel.timeSeries.standardOptions.withUnit('none'), + + // coordinatorQueries: + // commonlib.panels.generic.timeSeries.base.new('Queries - one minute count', targets=[signals.coordinator.normalQueries.asTarget()]) + // + g.panel.timeSeries.standardOptions.withUnit('none'), + + coordinatorNormalQueries: + commonlib.panels.generic.timeSeries.base.new('Normal query - one minute count', targets=[ + signals.coordinator.queryCompleted.asTarget(), + signals.coordinator.queryRunning.asTarget(), + signals.coordinator.queryStarted.asTarget(), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none'), + + coordinatorAbnormalQueries: + commonlib.panels.generic.timeSeries.base.new('Abnormal query - one minute count', targets=[ + signals.coordinator.abnormalQueryFailed.asTarget(), + signals.coordinator.abnormalQueryAbandoned.asTarget(), + signals.coordinator.abnormalQueryCanceled.asTarget(), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none'), + + coordinatorNormalQueryRate: + commonlib.panels.generic.timeSeries.base.new('Normal query - one minute rate', targets=[ + signals.coordinator.normalQueryCompletedRate.asTarget(), + signals.coordinator.normalQueryRunningRate.asTarget(), + signals.coordinator.normalQueryStartedRate.asTarget(), + ]) + + 
g.panel.timeSeries.standardOptions.withUnit('none'), + + coordinatorAbnormalQueryRate: + commonlib.panels.generic.timeSeries.base.new('Abnormal query - one minute rate', targets=[ + signals.coordinator.abnormalQueryFailedRate.asTarget() { interval: '1m' }, + signals.coordinator.abnormalQueryAbandonedRate.asTarget() { interval: '1m' }, + signals.coordinator.abnormalQueryCanceledRate.asTarget() { interval: '1m' }, + ]) + + g.panel.timeSeries.standardOptions.withUnit('none'), + + coordinatorQueryExecutionTime: + commonlib.panels.generic.timeSeries.base.new('Query execution time - one minute count', targets=[ + signals.coordinator.queryExecutionTimeP50.asTarget(), + signals.coordinator.queryExecutionTimeP75.asTarget(), + signals.coordinator.queryExecutionTimeP95.asTarget(), + signals.coordinator.queryExecutionTimeP99.asTarget(), + ]) + + g.panel.timeSeries.standardOptions.withUnit('ms'), + + coordinatorCPUTimeConsumed: + commonlib.panels.generic.timeSeries.base.new('CPU time consumed - one minute count', targets=[ + signals.coordinator.cpuTimeConsumed.asTarget(), + ]) + + g.panel.timeSeries.standardOptions.withUnit('ms'), + + coordinatorCPUInputThroughput: + commonlib.panels.generic.timeSeries.base.new('CPU input throughput - one minute count', targets=[ + signals.coordinator.cpuInputThroughput.asTarget(), + ]) + + g.panel.timeSeries.standardOptions.withUnit('Bps'), + + coordinatorGarbageCollections: + commonlib.panels.generic.timeSeries.base.new('Garbage collection count / $__interval', targets=[ + signals.coordinator.jvmGarbageCollectorCount.asTarget() { interval: '2m' }, + ]) + + g.panel.timeSeries.standardOptions.withUnit('none'), + + coordinatorJVMGarbageCollectionDuration: + commonlib.panels.generic.timeSeries.base.new( + 'Garbage collection duration', + targets=[ + signals.coordinator.jvmGarbageCollectionDuration.asTarget(), + ] + ) + + g.panel.timeSeries.panelOptions.withDescription('The average duration for each garbage collection operation in the JVM.') + + 
g.panel.timeSeries.standardOptions.withUnit('ms'), + + coordinatorJVMMemoryUsage: + commonlib.panels.generic.timeSeries.base.new('Memory used', targets=[ + signals.coordinator.jvmHeapMemoryUsage.asTarget(), + signals.coordinator.jvmNonHeapMemoryUsage.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The heap and non-heap memory used by the JVM.') + + g.panel.timeSeries.standardOptions.withUnit('decbytes'), + + coordinatorJVMMemoryCommitted: + commonlib.panels.generic.timeSeries.base.new('Memory committed', targets=[ + signals.coordinator.jvmHeapMemoryCommitted.asTarget(), + signals.coordinator.jvmNonHeapMemoryCommitted.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The heap and non-heap memory committed by the JVM.') + + g.panel.timeSeries.standardOptions.withUnit('decbytes'), + + + /* + ------------------------- + Worker + ------------------------- + */ + + workerNonHeapMemoryUsage: + g.panel.gauge.new('Non-heap memory usage') + + g.panel.gauge.queryOptions.withTargets(signals.worker.nonheapMemoryUsage.asTarget()) + + g.panel.gauge.panelOptions.withDescription('Non-heap memory usage') + + g.panel.gauge.standardOptions.withUnit('percentunit') + + g.panel.gauge.standardOptions.thresholds.withSteps([ + g.panel.gauge.standardOptions.threshold.step.withColor('light-green') + + g.panel.gauge.standardOptions.threshold.step.withValue(0), + g.panel.gauge.standardOptions.threshold.step.withColor('#EAB839') + + g.panel.gauge.standardOptions.threshold.step.withValue(0.7), + g.panel.gauge.standardOptions.threshold.step.withColor('light-red') + + g.panel.gauge.standardOptions.threshold.step.withValue(0.8), + ]), + + workerHeapMemoryUsage: + g.panel.gauge.new('Heap memory usage') + + g.panel.gauge.queryOptions.withTargets(signals.worker.heapMemoryUsage.asTarget()) + + g.panel.gauge.panelOptions.withDescription('Heap memory usage') + + g.panel.gauge.standardOptions.withUnit('percentunit') + + 
g.panel.gauge.standardOptions.thresholds.withSteps([ + g.panel.gauge.standardOptions.threshold.step.withColor('light-green') + + g.panel.gauge.standardOptions.threshold.step.withValue(0), + g.panel.gauge.standardOptions.threshold.step.withColor('#EAB839') + + g.panel.gauge.standardOptions.threshold.step.withValue(0.7), + g.panel.gauge.standardOptions.threshold.step.withColor('light-red') + + g.panel.gauge.standardOptions.threshold.step.withValue(0.8), + ]), + + + workerQueuedTasks: + commonlib.panels.generic.timeSeries.base.new('Queued tasks', targets=[ + signals.worker.queuedTasks.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The number of tasks that are being queued by the task executor.') + + g.panel.timeSeries.standardOptions.withUnit('ops'), + + workerFailedCompletedTasks: + commonlib.panels.generic.timeSeries.base.new('Failed & completed tasks', targets=[ + signals.worker.failedTasks.asTarget() { interval: '1m' }, + signals.worker.completedTasks.asTarget() { interval: '1m' }, + ]) + + g.panel.timeSeries.panelOptions.withDescription('The rate at which tasks have failed and completed.') + + g.panel.timeSeries.standardOptions.withUnit('ops'), + + workerOutputPositions: + commonlib.panels.generic.timeSeries.base.new('Output positions', targets=[ + signals.worker.outputPositions.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The rate of rows (or records) produced by an operation.') + + g.panel.timeSeries.standardOptions.withUnit('short'), + + workerExecutorPoolSize: + commonlib.panels.generic.timeSeries.base.new('Executor pool size', targets=[ + signals.worker.taskNotificationExecutorPoolSize.asTarget(), + signals.worker.processExecutorCorePoolSize.asTarget(), + signals.worker.processExecutorPoolSize.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The pool size of the task notification executor.') + + g.panel.timeSeries.standardOptions.withUnit('none'), + + workerMemoryPool: + 
commonlib.panels.generic.timeSeries.base.new('Memory pool', targets=[ + signals.worker.memoryPoolFreeBytes.asTarget(), + signals.worker.memoryPoolReservedFreeBytes.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The amount of Presto memory available.') + + g.panel.timeSeries.standardOptions.withUnit('decbytes'), + + workerDataProcesssingThroughput: + commonlib.panels.generic.timeSeries.base.new('Data processing throughput', targets=[ + signals.worker.dataProcessingInput.asTarget(), + signals.worker.dataProcessingOutput.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The rate at which volumes of data are being processed.') + + g.panel.timeSeries.standardOptions.withUnit('Bps'), + + // Worker JVM + + workerJVMGarbageCollectorCount: + commonlib.panels.generic.timeSeries.base.new('Garbage collection count / $__interval', targets=[ + signals.worker.garbageCollectionCount.asTarget() { interval: '2m' }, + ]) + + g.panel.timeSeries.panelOptions.withDescription('The recent increase in the number of garbage collection events for the JVM.') + + g.panel.timeSeries.standardOptions.withUnit('none'), + + workerJVMGarbageCollectionDuration: + commonlib.panels.generic.timeSeries.base.new('Garbage collection duration', targets=[ + signals.worker.garbageCollectionDuration.asTarget() { interval: '2m' }, + ]) + + g.panel.timeSeries.panelOptions.withDescription('The average duration for each garbage collection operation in the JVM.') + + g.panel.timeSeries.standardOptions.withUnit('ms'), + + + workerJVMMemoryUsage: + commonlib.panels.generic.timeSeries.base.new('Memory used', targets=[ + signals.worker.jvmHeapMemoryUsage.asTarget(), + signals.worker.jvmNonHeapMemoryUsage.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The heap and non-heap memory used by the JVM.') + + g.panel.timeSeries.standardOptions.withUnit('decbytes'), + + workerJVMMemoryCommitted: + commonlib.panels.generic.timeSeries.base.new('Memory committed', 
targets=[ + signals.worker.jvmHeapMemoryCommitted.asTarget(), + signals.worker.jvmNonHeapMemoryCommitted.asTarget(), + ]) + + g.panel.timeSeries.panelOptions.withDescription('The heap and non-heap memory committed by the JVM.') + + g.panel.timeSeries.standardOptions.withUnit('decbytes'), + + }, +} diff --git a/presto-mixin/rows.libsonnet b/presto-mixin/rows.libsonnet new file mode 100644 index 000000000..26f19ed03 --- /dev/null +++ b/presto-mixin/rows.libsonnet @@ -0,0 +1,82 @@ +local g = import './g.libsonnet'; + +{ + new(this): { + overview: + g.panel.row.new('Overview') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + this.grafana.panels.overviewActiveResourceManagers { gridPos+: { h: 4, w: 6 } }, + this.grafana.panels.overviewActiveCoordinators { gridPos+: { h: 4, w: 6 } }, + this.grafana.panels.overviewActiveWorkers { gridPos+: { h: 4, w: 6 } }, + this.grafana.panels.overviewInactiveWorkers { gridPos+: { h: 4, w: 6 } }, + this.grafana.panels.overviewCompletedQueries { gridPos+: { h: 8, w: 12 } }, + this.grafana.panels.overviewAlertsPanel { gridPos+: { h: 8, w: 12 } }, + this.grafana.panels.overviewUserErrorFailureRate { gridPos+: { h: 8, w: 12 } }, + this.grafana.panels.overviewQueuedQueries { gridPos+: { h: 8, w: 12 } }, + this.grafana.panels.overviewBlockedNodes { gridPos+: { h: 8, w: 12 } }, + this.grafana.panels.overviewInternalErrorFailureRate { gridPos+: { h: 8, w: 12 } }, + this.grafana.panels.overviewClusterMemoryDistributed { gridPos+: { h: 8, w: 12 } }, + this.grafana.panels.overviewInsufficientResourceFailures { gridPos+: { h: 8, w: 12 } }, + this.grafana.panels.overviewDataProcessingThroughput { gridPos+: { h: 9, w: 24 } }, + ], + ), + + coordinator: + g.panel.row.new('Coordinator') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + this.grafana.panels.coordinatorNonheapMemoryUsage { gridPos+: { w: 6 } }, + this.grafana.panels.coordinatorHeapMemoryUsage { gridPos+: { w: 6 } }, + 
this.grafana.panels.coordinatorErrorFailures { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorNormalQueries { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorAbnormalQueries { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorNormalQueryRate { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorAbnormalQueryRate { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorQueryExecutionTime { gridPos+: { w: 24 } }, + this.grafana.panels.coordinatorCPUTimeConsumed { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorCPUInputThroughput { gridPos+: { w: 12 } }, + ], + ), + coordinatorJVM: + g.panel.row.new('JVM') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + this.grafana.panels.coordinatorGarbageCollections { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorJVMGarbageCollectionDuration { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorJVMMemoryCommitted { gridPos+: { w: 12 } }, + this.grafana.panels.coordinatorJVMMemoryUsage { gridPos+: { w: 12 } }, + ], + ), + + worker: + g.panel.row.new('Worker') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + this.grafana.panels.workerNonHeapMemoryUsage { gridPos+: { w: 12, h: 4 } }, + this.grafana.panels.workerHeapMemoryUsage { gridPos+: { w: 12, h: 4 } }, + this.grafana.panels.workerQueuedTasks { gridPos+: { w: 12 } }, + this.grafana.panels.workerFailedCompletedTasks { gridPos+: { w: 12 } }, + this.grafana.panels.workerOutputPositions { gridPos+: { w: 12 } }, + this.grafana.panels.workerExecutorPoolSize { gridPos+: { w: 12 } }, + this.grafana.panels.workerMemoryPool { gridPos+: { w: 12 } }, + this.grafana.panels.workerDataProcesssingThroughput { gridPos+: { w: 12 } }, + ], + ), + workerJVM: + g.panel.row.new('JVM') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels( + [ + this.grafana.panels.workerJVMGarbageCollectorCount { gridPos+: { w: 12 } }, + this.grafana.panels.workerJVMGarbageCollectionDuration { gridPos+: { w: 12 } }, 
+ this.grafana.panels.workerJVMMemoryCommitted { gridPos+: { w: 12 } }, + this.grafana.panels.workerJVMMemoryUsage { gridPos+: { w: 12 } }, + ], + ), + }, +} diff --git a/presto-mixin/signals/coordinator.libsonnet b/presto-mixin/signals/coordinator.libsonnet new file mode 100644 index 000000000..a0b92a872 --- /dev/null +++ b/presto-mixin/signals/coordinator.libsonnet @@ -0,0 +1,409 @@ +function(this) { + local legendCustomTemplate = '{{presto_cluster}}', + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + enableLokiLogs: this.enableLokiLogs, + legendCustomTemplate: std.join(' ', std.map(function(label) '{{' + label + '}}', this.groupLabels)), + aggLevel: 'none', + aggFunction: 'avg', + discoveryMetric: { + prometheus: 'presto_QueryManager_InternalFailures_OneMinute_Count', + }, + signals: { + + nonheapMemoryUsage: { + name: 'Non-heap memory usage', + nameShort: 'Non-heap memory usage', + type: 'gauge', + description: 'The non-heap memory usage of the coordinator.', + unit: 'percentunit', + sources: { + prometheus: { + expr: 'avg (jvm_nonheap_memory_used{%(queriesSelector)s} / clamp_min((jvm_nonheap_memory_used{%(queriesSelector)s} + jvm_nonheap_memory_committed{%(queriesSelector)s}), 1))', + }, + }, + }, + + + heapMemoryUsage: { + name: 'Heap memory usage', + nameShort: 'Heap memory usage', + type: 'gauge', + description: 'The heap memory usage of the coordinator.', + unit: 'percentunit', + sources: { + prometheus: { + expr: 'avg (jvm_heap_memory_used{%(queriesSelector)s} / clamp_min((jvm_heap_memory_used{%(queriesSelector)s} + jvm_heap_memory_committed{%(queriesSelector)s}), 1))', + }, + }, + }, + + errorFailuresInternal: { + name: 'Error failures internal', + nameShort: 'Error failures internal', + type: 'gauge', + description: 'The number of internal error failures occurring on the coordinator.', + unit: 'none', + sources: { + prometheus: { + expr: 
'presto_QueryManager_InternalFailures_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - internal', + }, + }, + }, + + userErrorFailures: { + name: 'User error failures', + nameShort: 'User error failures', + type: 'gauge', + description: 'The number of user error failures occurring on the coordinator.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_UserErrorFailures_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - user', + }, + }, + }, + + queryCompleted: { + name: 'Query completed', + nameShort: 'Query completed', + type: 'gauge', + description: 'The number of queries completed occurring on the coordinator.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_CompletedQueries_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - completed', + }, + }, + }, + + queryRunning: { + name: 'Query running', + nameShort: 'Query running', + type: 'gauge', + description: 'The number of queries running occurring on the coordinator.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_RunningQueries{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - running', + }, + }, + }, + + queryStarted: { + name: 'Query started', + nameShort: 'Query started', + type: 'gauge', + description: 'The number of queries started occurring on the coordinator.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_StartedQueries_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - started', + }, + }, + }, + + abnormalQueryFailed: { + name: 'Abnormal query failed', + nameShort: 'Abnormal query failed', + type: 'gauge', + description: 'A count of failed abnormal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_FailedQueries_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: 
legendCustomTemplate + ' - failed', + }, + }, + }, + + abnormalQueryAbandoned: { + name: 'Abnormal query abandoned', + nameShort: 'Abnormal query abandoned', + type: 'gauge', + description: 'A count of abandoned abnormal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_AbandonedQueries_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - abandoned', + }, + }, + }, + + abnormalQueryCanceled: { + name: 'Abnormal query canceled', + nameShort: 'Abnormal query canceled', + type: 'gauge', + description: 'A count of canceled abnormal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_CanceledQueries_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - canceled', + }, + }, + }, + + normalQueryCompletedRate: { + name: 'Normal query completed rate', + nameShort: 'Normal query completed', + type: 'gauge', + description: 'A rate of completed normal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_CompletedQueries_OneMinute_Rate{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - completed', + }, + }, + }, + + + normalQueryRunningRate: { + name: 'Normal query running rate', + nameShort: 'Normal query running', + type: 'gauge', + description: 'A rate of running normal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_RunningQueries{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - running', + }, + }, + }, + + normalQueryStartedRate: { + name: 'Normal query started rate', + nameShort: 'Normal query started', + type: 'gauge', + description: 'A rate of started normal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_StartedQueries_OneMinute_Rate{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - started', + }, + }, + }, + + abnormalQueryFailedRate: { + name: 'Abnormal 
query failed rate', + nameShort: 'Abnormal query failed', + type: 'counter', + description: 'A rate of failed abnormal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_FailedQueries_TotalCount{%(queriesSelector)s}', + rangeFunction: 'rate', + legendCustomTemplate: legendCustomTemplate + ' - failed', + }, + }, + }, + + abnormalQueryAbandonedRate: { + name: 'Abnormal query abandoned rate', + nameShort: 'Abnormal query abandoned', + type: 'counter', + description: 'A rate of abandoned abnormal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_AbandonedQueries_TotalCount{%(queriesSelector)s}', + rangeFunction: 'rate', + legendCustomTemplate: legendCustomTemplate + ' - abandoned', + }, + }, + }, + + abnormalQueryCanceledRate: { + name: 'Abnormal query canceled rate', + nameShort: 'Abnormal query canceled', + type: 'counter', + description: 'A rate of canceled abnormal queries.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_CanceledQueries_TotalCount{%(queriesSelector)s}', + rangeFunction: 'rate', + legendCustomTemplate: legendCustomTemplate + ' - canceled', + }, + }, + }, + + queryExecutionTimeP50: { + name: 'Query execution time (p50)', + nameShort: 'Query execution time', + type: 'gauge', + description: 'The time it took to run queries over the past one minute period.', + unit: 'ms', + sources: { + prometheus: { + expr: 'presto_QueryManager_ExecutionTime_OneMinute_P50{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - p50', + }, + }, + }, + + queryExecutionTimeP75: { + name: 'Query execution time (p75)', + nameShort: 'Query execution time', + type: 'gauge', + description: 'The time it took to run queries over the past one minute period.', + unit: 'ms', + sources: { + prometheus: { + expr: 'presto_QueryManager_ExecutionTime_OneMinute_P75{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - p75', + }, + }, + }, + + 
queryExecutionTimeP95: { + name: 'Query execution time (p95)', + nameShort: 'Query execution time', + type: 'gauge', + description: 'The time it took to run queries over the past one minute period.', + unit: 'ms', + sources: { + prometheus: { + expr: 'presto_QueryManager_ExecutionTime_OneMinute_P95{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - p95', + }, + }, + }, + + queryExecutionTimeP99: { + name: 'Query execution time (p99)', + nameShort: 'Query execution time', + type: 'gauge', + description: 'The time it took to run queries over the past one minute period.', + unit: 'ms', + sources: { + prometheus: { + expr: 'presto_QueryManager_ExecutionTime_OneMinute_P99{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - p99', + }, + }, + }, + + cpuTimeConsumed: { + name: 'CPU time consumed', + nameShort: 'CPU time consumed', + type: 'gauge', + description: 'The CPU time consumed by queries over the past one minute period.', + unit: 'ms', + sources: { + prometheus: { + expr: 'presto_QueryManager_ConsumedCpuTimeSecs_OneMinute_Count{%(queriesSelector)s}', + }, + }, + }, + + cpuInputThroughput: { + name: 'CPU input throughput', + nameShort: 'CPU input throughput', + type: 'gauge', + description: 'The CPU input byte rate of queries over the past one minute period.', + unit: 'Bps', + sources: { + prometheus: { + expr: 'presto_QueryManager_CpuInputByteRate_OneMinute_Total{%(queriesSelector)s}', + }, + }, + }, + + // JVM metrics + + jvmGarbageCollectorCount: { + name: 'Garbage collector count', + nameShort: 'Garbage collector count', + type: 'counter', + description: 'The number of garbage collections.', + unit: 'none', + sources: { + prometheus: { + expr: 'jvm_gc_collection_count{%(queriesSelector)s, name="G1 Young Generation"}', + rangeFunction: 'increase', + }, + }, + }, + + jvmGarbageCollectionDuration: { + name: 'Garbage collection duration', + nameShort: 'Garbage collection duration', + type: 'gauge', + description: 'The
duration of garbage collections.', + unit: 'ms', + sources: { + prometheus: { + expr: 'jvm_gc_duration{%(queriesSelector)s, name="G1 Young Generation"}', + }, + }, + }, + + jvmHeapMemoryUsage: { + name: 'Heap memory usage', + nameShort: 'Heap memory usage', + type: 'gauge', + description: 'The heap memory usage of the JVM.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'jvm_heap_memory_used{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - heap', + }, + }, + }, + + jvmNonHeapMemoryUsage: { + name: 'Non-heap memory usage', + nameShort: 'Non-heap memory usage', + type: 'gauge', + description: 'The non-heap memory usage of the JVM.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'jvm_nonheap_memory_used{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - non heap', + }, + }, + }, + + jvmHeapMemoryCommitted: { + name: 'Heap memory committed', + nameShort: 'Heap memory committed', + type: 'gauge', + description: 'The heap memory committed of the JVM.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'jvm_heap_memory_committed{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - heap', + }, + }, + }, + + jvmNonHeapMemoryCommitted: { + name: 'Non-heap memory committed', + nameShort: 'Non-heap memory committed', + type: 'gauge', + description: 'The non-heap memory committed of the JVM.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'jvm_nonheap_memory_committed{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - non heap', + }, + }, + }, + }, +} diff --git a/presto-mixin/signals/overview.libsonnet b/presto-mixin/signals/overview.libsonnet new file mode 100644 index 000000000..2247cf74b --- /dev/null +++ b/presto-mixin/signals/overview.libsonnet @@ -0,0 +1,209 @@ +function(this) { + local legendCustomTemplate = '{{presto_cluster}}', + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: 
this.instanceLabels, + enableLokiLogs: this.enableLokiLogs, + legendCustomTemplate: std.join(' ', std.map(function(label) '{{' + label + '}}', this.groupLabels)), + aggLevel: 'none', + aggFunction: 'avg', + alertsInterval: '5m', + discoveryMetric: { + prometheus: 'presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount', + }, + signals: { + + activeResourceManagers: { + name: 'Active resource managers', + nameShort: 'Resource managers', + type: 'raw', + description: 'Number of resource manager instances across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{%(queriesSelector)s}))', + legendCustomTemplate: 'Resource manager', + }, + }, + }, + + activeCoordinators: { + name: 'Active coordinators', + nameShort: 'Coordinators', + type: 'raw', + description: 'Number of coordinator instances across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveCoordinatorCount{%(queriesSelector)s}))', + legendCustomTemplate: 'Coordinator', + }, + }, + }, + + activeWorkers: { + name: 'Active workers', + nameShort: 'Workers', + type: 'raw', + description: 'Number of worker instances across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveNodeCount{%(queriesSelector)s}))', + legendCustomTemplate: 'Worker', + }, + }, + }, + + inactiveWorkers: { + name: 'Inactive workers', + nameShort: 'Inactive workers', + type: 'raw', + description: 'Number of inactive worker instances across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_InactiveNodeCount{%(queriesSelector)s}))', + legendCustomTemplate: legendCustomTemplate + ' - inactive', + }, + }, + }, + + completedQueries: { + name: 'Completed queries - one minute count', + 
nameShort: 'Completed queries', + type: 'gauge', + description: 'Number of completed queries across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_CompletedQueries_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - completed', + }, + }, + }, + + + userErrorFailures: { + name: 'User error failures - one minute count', + nameShort: 'User error failures', + type: 'gauge', + description: 'Number of user error failures across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_UserErrorFailures_OneMinute_Count{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - user', + }, + }, + }, + + queuedQueries: { + name: 'Queued queries', + nameShort: 'Queued queries', + type: 'gauge', + description: 'Number of queued queries across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_QueuedQueries{%(queriesSelector)s}', + }, + }, + }, + + blockedNodes: { + name: 'Blocked nodes', + nameShort: 'Blocked nodes', + type: 'gauge', + description: 'Number of blocked nodes across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_ClusterMemoryPool_general_BlockedNodes{%(queriesSelector)s}', + }, + }, + }, + + internalErrorFailures: { + name: 'Internal error failures - one minute count', + nameShort: 'Internal error failures', + type: 'gauge', + description: 'Number of internal error failures across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_InternalFailures_OneMinute_Count{%(queriesSelector)s}', + }, + }, + }, + + clusterMemoryDistributedBytesFree: { + name: 'Cluster memory distributed bytes free', + nameShort: 'Cluster memory distributed bytes free', + type: 'gauge', + description: 'Number of cluster memory distributed bytes free across clusters.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'sum by (presto_cluster) 
(presto_ClusterMemoryPool_general_FreeDistributedBytes{%(queriesSelector)s})', + }, + }, + }, + + clusterMemoryDistributedBytesReserved: { + name: 'Cluster memory distributed bytes reserved', + nameShort: 'Cluster memory distributed bytes reserved', + type: 'raw', + description: 'Number of cluster memory distributed bytes reserved across clusters.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'sum by (presto_cluster) (presto_ClusterMemoryPool_reserved_FreeDistributedBytes{%(queriesSelector)s})', + }, + }, + }, + + + insufficientResourceFailures: { + name: 'Insufficient resource failures - one minute rate', + nameShort: 'Insufficient resource failures', + type: 'gauge', + description: 'Number of insufficient resource failures across clusters.', + unit: 'none', + sources: { + prometheus: { + expr: 'presto_QueryManager_InsufficientResourcesFailures_OneMinute_Rate{%(queriesSelector)s}', + }, + }, + }, + + dataProcessingThroughputInput: { + name: 'Data processing throughput input - one minute rate', + nameShort: 'Data processing throughput input', + type: 'gauge', + description: 'Number of data processing throughput input across clusters.', + unit: 'Bps', + sources: { + prometheus: { + expr: 'sum by (presto_cluster) (presto_TaskManager_InputDataSize_OneMinute_Rate{%(queriesSelector)s})', + legendCustomTemplate: legendCustomTemplate + ' - input', + }, + }, + }, + + + dataProcessingThroughputOutput: { + name: 'Data processing throughput output - one minute rate', + nameShort: 'Data processing throughput output', + type: 'gauge', + description: 'Number of data processing throughput output across clusters.', + unit: 'Bps', + sources: { + prometheus: { + expr: 'sum by (presto_cluster) (presto_TaskManager_OutputDataSize_OneMinute_Rate{%(queriesSelector)s})', + legendCustomTemplate: legendCustomTemplate + ' - output', + }, + }, + }, + }, +} diff --git a/presto-mixin/signals/worker.libsonnet b/presto-mixin/signals/worker.libsonnet new file mode 100644 index 
000000000..4fdb6302a --- /dev/null +++ b/presto-mixin/signals/worker.libsonnet @@ -0,0 +1,270 @@ +function(this) { + local legendCustomTemplate = '{{instance}}', + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + enableLokiLogs: this.enableLokiLogs, + legendCustomTemplate: std.join(' ', std.map(function(label) '{{' + label + '}}', this.instanceLabels)), + aggLevel: 'none', + alertsInterval: '5m', + discoveryMetric: { + prometheus: 'presto_TaskExecutor_ProcessorExecutor_QueuedTaskCount', + }, + signals: { + nonheapMemoryUsage: { + name: 'Non-heap memory usage', + nameShort: 'Non-heap memory usage', + type: 'raw', + description: 'The non-heap memory usage of the worker.', + unit: 'percentunit', + sources: { + prometheus: { + expr: 'avg (jvm_nonheap_memory_used{%(queriesSelector)s} / clamp_min((jvm_nonheap_memory_used{%(queriesSelector)s} + jvm_nonheap_memory_committed{%(queriesSelector)s}), 1))', + }, + }, + }, + heapMemoryUsage: { + name: 'Heap memory usage', + nameShort: 'Heap', + type: 'raw', + description: 'The heap memory usage of the worker.', + unit: 'percentunit', + sources: { + prometheus: { + expr: 'avg (jvm_heap_memory_used{%(queriesSelector)s} / clamp_min((jvm_heap_memory_used{%(queriesSelector)s} + jvm_heap_memory_committed{%(queriesSelector)s}), 1))', + }, + }, + }, + queuedTasks: { + name: 'Queued tasks', + nameShort: 'Queued', + type: 'gauge', + description: 'The number of tasks that are being queued by the task executor.', + unit: 'short', + sources: { + prometheus: { + expr: 'presto_TaskExecutor_ProcessorExecutor_QueuedTaskCount{%(queriesSelector)s}', + }, + }, + }, + + failedTasks: { + name: 'Failed tasks', + nameShort: 'Failed', + type: 'counter', + description: 'The number of tasks that have failed by the task executor.', + unit: 'ops', + sources: { + prometheus: { + expr: 'presto_TaskExecutor_ProcessorExecutor_FailedTaskCount{%(queriesSelector)s}', + legendCustomTemplate: 
'{{instance}} - failed', + }, + }, + }, + + completedTasks: { + name: 'Completed tasks', + nameShort: 'Completed', + type: 'counter', + description: 'The number of tasks that have completed by the task executor.', + unit: 'ops', + sources: { + prometheus: { + expr: 'presto_TaskExecutor_ProcessorExecutor_CompletedTaskCount{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}} - completed', + }, + }, + }, + + outputPositions: { + name: 'Output positions', + nameShort: 'Output positions', + type: 'gauge', + description: 'The rate of rows (or records) produced by an operation.', + unit: 'short', + sources: { + prometheus: { + expr: 'presto_TaskManager_OutputPositions_OneMinute_Rate{%(queriesSelector)s}', + }, + }, + }, + + taskNotificationExecutorPoolSize: { + name: 'Task notification executor pool size', + nameShort: 'Task notification executor pool size', + type: 'gauge', + description: 'The pool size of the task notification executor.', + unit: 'short', + sources: { + prometheus: { + expr: 'presto_TaskManager_TaskNotificationExecutor_PoolSize{%(queriesSelector)s}', + }, + }, + }, + + processExecutorCorePoolSize: { + name: 'Process executor core pool size', + nameShort: 'Process executor core pool size', + type: 'gauge', + description: 'The core pool size of the process executor.', + unit: 'short', + sources: { + prometheus: { + expr: 'presto_TaskExecutor_ProcessorExecutor_CorePoolSize{%(queriesSelector)s}', + }, + }, + }, + + processExecutorPoolSize: { + name: 'Process executor pool size', + nameShort: 'Process executor pool size', + type: 'gauge', + description: 'The pool size of the process executor.', + unit: 'short', + sources: { + prometheus: { + expr: 'presto_TaskExecutor_ProcessorExecutor_PoolSize{%(queriesSelector)s}', + }, + }, + }, + + memoryPoolFreeBytes: { + name: 'Memory pool free bytes', + nameShort: 'Memory pool free bytes', + type: 'gauge', + description: 'The free bytes of the memory pool.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 
'presto_MemoryPool_general_FreeBytes{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - free', + }, + }, + }, + + memoryPoolReservedFreeBytes: { + name: 'Memory pool reserved free bytes', + nameShort: 'Memory pool reserved free bytes', + type: 'gauge', + description: 'The reserved free bytes of the memory pool.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'presto_MemoryPool_reserved_FreeBytes{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - reserved free', + }, + }, + }, + + + dataProcessingInput: { + name: 'Data processing input', + nameShort: 'Data processing input', + type: 'gauge', + description: 'The input of the data processing.', + unit: 'Bps', + sources: { + prometheus: { + expr: 'presto_TaskManager_InputDataSize_OneMinute_Rate{%(queriesSelector)s}', + }, + }, + }, + + dataProcessingOutput: { + name: 'Data processing output', + nameShort: 'Data processing output', + type: 'gauge', + description: 'The output of the data processing.', + unit: 'Bps', + sources: { + prometheus: { + expr: 'presto_TaskManager_OutputDataSize_OneMinute_Rate{%(queriesSelector)s}', + }, + }, + }, + + garbageCollectionCount: { + name: 'Garbage collection count', + nameShort: 'Garbage collection count', + type: 'counter', + description: 'The number of garbage collections.', + unit: 'ops', + sources: { + prometheus: { + expr: 'jvm_gc_collection_count{%(queriesSelector)s}', + rangeFunction: 'increase', + }, + }, + }, + + garbageCollectionDuration: { + name: 'JVM GC duration', + nameShort: 'JVM GC duration', + type: 'gauge', + description: 'The duration for each garbage collection operation in the JVM.', + unit: 'ms', + sources: { + prometheus: { + expr: 'jvm_gc_duration{%(queriesSelector)s}', + rangeFunction: 'increase', + }, + }, + }, + + jvmNonHeapMemoryUsage: { + name: 'JVM non-heap memory usage', + nameShort: 'JVM non-heap memory usage', + type: 'gauge', + description: 'The non-heap memory usage of the JVM.', + 
unit: 'decbytes', + sources: { + prometheus: { + expr: 'jvm_nonheap_memory_used{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - non heap', + }, + }, + }, + + jvmHeapMemoryUsage: { + name: 'JVM heap memory usage', + nameShort: 'JVM heap memory usage', + type: 'gauge', + description: 'The heap memory usage of the JVM.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'jvm_heap_memory_used{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - heap', + }, + }, + }, + + jvmHeapMemoryCommitted: { + name: 'JVM heap memory committed', + nameShort: 'JVM heap memory committed', + type: 'gauge', + description: 'The heap memory committed of the JVM.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'jvm_heap_memory_committed{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - heap', + }, + }, + }, + + jvmNonHeapMemoryCommitted: { + name: 'JVM non-heap memory committed', + nameShort: 'JVM non-heap memory committed', + type: 'gauge', + description: 'The non-heap memory committed of the JVM.', + unit: 'decbytes', + sources: { + prometheus: { + expr: 'jvm_nonheap_memory_committed{%(queriesSelector)s}', + legendCustomTemplate: legendCustomTemplate + ' - non heap', + }, + }, + }, + }, +}