grafana
diff --git a/‎influxdb-mixin/.lint
Lines changed: 21 additions & 0 deletions b/‎influxdb-mixin/.lint
Lines changed: 21 additions & 0 deletions
diff --git a/‎influxdb-mixin/Makefile
Lines changed: 34 additions & 0 deletions b/‎influxdb-mixin/Makefile
Lines changed: 34 additions & 0 deletions
diff --git a/‎influxdb-mixin/README.md
Lines changed: 128 additions & 0 deletions b/‎influxdb-mixin/README.md
Lines changed: 128 additions & 0 deletions
diff --git a/‎influxdb-mixin/alerts/alerts.libsonnet
Lines changed: 118 additions & 0 deletions b/‎influxdb-mixin/alerts/alerts.libsonnet
Lines changed: 118 additions & 0 deletions
diff --git a/‎influxdb-mixin/config.libsonnet
Lines changed: 23 additions & 0 deletions b/‎influxdb-mixin/config.libsonnet
Lines changed: 23 additions & 0 deletions
diff --git a/‎influxdb-mixin/dashboards/dashboards.libsonnet
Lines changed: 3 additions & 0 deletions b/‎influxdb-mixin/dashboards/dashboards.libsonnet
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,21 @@
+# Linter rule exclusions for this mixin. Each key under `exclusions` names a
+# linter rule and `reason` records why it is excluded; an optional `entries`
+# list narrows the exclusion to the named dashboards.
+exclusions:
+  template-job-rule:
+    reason: "Prometheus datasource variable is being named as prometheus_datasource now while linter expects 'datasource'"
+  panel-datasource-rule:
+    reason: "Loki datasource variable is being named as loki_datasource now while linter expects 'datasource'"
+  template-datasource-rule:
+    reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
+  template-instance-rule:
+    reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
+  target-instance-rule:
+    reason: "The dashboard is a 'cluster' dashboard where the instance refers to nodes, this dashboard focuses only on the cluster view."
+    # Only the cluster overview dashboard is listed for this rule.
+    entries:
+      - dashboard: "InfluxDB cluster overview"
+  target-promql-rule:
+    reason: "Linter does not support selector variable value as a scalar in top-k PromQL queries."
+  template-label-promql-rule:
+    reason: "Defining a selector for the value of top-k requires a predefined label that the linter considers invalid."
+  panel-title-description-rule:
+    reason: "Not required for logs volume"
+  panel-units-rule:
+    reason: "Logs volume has no unit"
@@ -0,0 +1,34 @@
+# jsonnetfmt command line shared by the `fmt` and `lint` recipes.
+JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 1 --string-style s --comment-style s
+
+# Default goal: brings `build`, `dashboards_out` and `prometheus_alerts.yaml` up to date.
+.PHONY: all
+all: build dashboards_out prometheus_alerts.yaml
+
+# Runs `jb install` when `vendor` is missing or older than jsonnetfile.json.
+vendor: jsonnetfile.json
+	jb install
+
+# Phony alias whose only prerequisite is `vendor`.
+.PHONY: build
+build: vendor
+
+# Rewrite every .libsonnet/.jsonnet file in place with $(JSONNET_FMT),
+# pruning anything named `vendor` from the search.
+.PHONY: fmt
+fmt:
+	find . -name 'vendor' -prune -o \( -name '*.libsonnet' -o -name '*.jsonnet' \) -print \
+		| xargs -n 1 -- $(JSONNET_FMT) -i
+
+# Check formatting of every .libsonnet/.jsonnet file outside `vendor`, then
+# run `mixtool lint` on the mixin. A unified diff is printed for each file
+# that jsonnetfmt would change. The loop remembers any failing diff in `rc`
+# and exits with it, so a mis-formatted file fails the target even when a
+# later file is clean (a bare `while` loop only reports its last iteration).
+# `IFS= read -r` keeps backslashes and leading/trailing blanks in file names.
+.PHONY: lint
+lint: build
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		( rc=0; \
+		while IFS= read -r f; do \
+			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || rc=1; \
+		done; \
+		exit $$rc )
+	mixtool lint mixin.libsonnet
+
+# Generate the Grafana dashboards into the `dashboards_out` directory.
+# `vendor` is an order-only prerequisite: it is brought up to date before the
+# dashboards are generated (also under `make -j` or when this target is
+# requested directly), but its timestamp never forces a regeneration.
+# The final `touch` refreshes the directory's mtime, which overwriting
+# existing files inside it does not do; without it the target could stay
+# older than its prerequisites and be rebuilt on every run.
+dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*) | vendor
+	@mkdir -p $@
+	mixtool generate dashboards mixin.libsonnet -d $@
+	@touch $@
+
+# Generate the Prometheus alert rules file.
+# config.libsonnet is a prerequisite because the alert expressions and
+# descriptions interpolate thresholds from `_config`; editing a threshold
+# must regenerate the rules. $(wildcard ...) mirrors the dashboards rule.
+prometheus_alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*.libsonnet)
+	mixtool generate alerts mixin.libsonnet -a $@
+
+# Delete the generated dashboards directory and the generated alert rules file.
+.PHONY: clean
+clean:
+	$(RM) -r dashboards_out prometheus_alerts.yaml
@@ -0,0 +1,128 @@
+# InfluxDB mixin
+
+The InfluxDB mixin is a set of configurable Grafana dashboards and alerts.
+
+The InfluxDB mixin contains the following dashboards:
+
+- InfluxDB cluster overview
+- InfluxDB instance overview
+- InfluxDB logs overview
+
+and the following alerts:
+
+- InfluxDBWarningTaskSchedulerHighFailureRate
+- InfluxDBCriticalTaskSchedulerHighFailureRate
+- InfluxDBHighBusyWorkerPercentage
+- InfluxDBHighHeapMemoryUsage
+- InfluxDBHighAverageAPIRequestLatency
+- InfluxDBSlowAverageIQLExecutionTime
+
+## InfluxDB cluster overview
+
+The InfluxDB cluster overview dashboard provides details on the cluster's performance and highlights top instances. The dashboard covers all available aspects of InfluxDB performance and integration health, including Golang performance, query/request load, and task scheduler activity.
+
+![First screenshot of the InfluxDB cluster overview dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/influxdb/screenshots/influxdb_cluster_overview_1.png)
+![Second screenshot of the InfluxDB cluster overview dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/influxdb/screenshots/influxdb_cluster_overview_2.png)
+![Third screenshot of the InfluxDB cluster overview dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/influxdb/screenshots/influxdb_cluster_overview_3.png)
+
+## InfluxDB instance overview
+
+The InfluxDB instance overview dashboard provides details on one or more instances, including instance configuration stats, Golang performance, query/request load, and task scheduler activity.
+
+![First screenshot of the InfluxDB instance overview dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/influxdb/screenshots/influxdb_instance_overview_1.png)
+![Second screenshot of the InfluxDB instance overview dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/influxdb/screenshots/influxdb_instance_overview_2.png)
+![Third screenshot of the InfluxDB instance overview dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/influxdb/screenshots/influxdb_instance_overview_3.png)
+
+
+## InfluxDB logs overview
+
+The InfluxDB logs overview dashboard allows users to view incoming InfluxDB logs. The dashboard also allows users to filter logs based on level, service, engine, and custom regex.
+
+![Screenshot of the InfluxDB logs dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/influxdb/screenshots/influxdb_logs_overview.png)
+
+InfluxDB system logs are enabled by default in `config.libsonnet`. To disable them, set `enableLokiLogs` to `false` as shown below, then run `make` again to regenerate the dashboards:
+
+```
+{
+  _config+:: {
+    enableLokiLogs: false,
+  },
+}
+```
+
+For the selectors to properly work for InfluxDB logs ingested into your logs datasource, please also include the matching `instance`, `job`, and `influxdb_cluster` labels in the [scrape_configs](https://grafana.com/docs/loki/latest/clients/promtail/configuration/#scrape_configs) to match the labels for ingested metrics.
+
+```yaml
+scrape_configs:
+  - job_name: integrations/influxdb
+    static_configs:
+      - targets: [localhost]
+        labels:
+          job: integrations/influxdb
+          influxdb_cluster: "<your-cluster-name>"
+          instance: "<your-instance-name>"
+          __path__: /var/log/influxdb/influxdb.log
+    pipeline_stages:
+        - multiline:
+            firstline: 'ts=\d{4}'
+        - regex:
+            expression: 'ts=(\S+) lvl=(?P<level>\w+) msg=.* log_id=(\S+) (service=(?P<service>\S+) ){0,1}(engine=(?P<engine>\S*) ){0,1}.*$'
+        - labels:
+            level:
+            service:
+            engine:
+```
+
+## Alerts overview
+
+- InfluxDBWarningTaskSchedulerHighFailureRate: Automated data processing tasks are failing at a high rate.
+- InfluxDBCriticalTaskSchedulerHighFailureRate: Automated data processing tasks are failing at a critical rate.
+- InfluxDBHighBusyWorkerPercentage: There is a high percentage of busy workers.
+- InfluxDBHighHeapMemoryUsage: There is a high amount of heap memory being used.
+- InfluxDBHighAverageAPIRequestLatency: Average API request latency is too high. High latency will negatively affect system performance, degrading data availability and precision.
+- InfluxDBSlowAverageIQLExecutionTime: InfluxQL execution times are too slow. Slow query execution times will negatively affect system performance, degrading data availability and precision.
+
+Default thresholds can be configured in `config.libsonnet`.
+
+```js
+{
+  _config+:: {
+    alertsWarningTaskSchedulerHighFailureRate: 25, // %
+    alertsCriticalTaskSchedulerHighFailureRate: 50,  // %
+    alertsWarningHighBusyWorkerPercentage: 80,  // %
+    alertsWarningHighHeapMemoryUsage: 80,  // %
+    alertsWarningHighAverageAPIRequestLatency: 0.3,  // seconds
+    alertsWarningSlowAverageIQLExecutionTime: 0.1,  // seconds
+  },
+}
+```
+
+## Install tools
+
+```bash
+go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
+go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest
+```
+
+For linting and formatting, you also need `jsonnetfmt` installed. If you
+have a working Go development environment, the easiest way is to run the following:
+
+```bash
+go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
+```
+
+The files in `dashboards_out` need to be imported
+into your Grafana server. The exact details will depend on your environment.
+
+`prometheus_alerts.yaml` needs to be imported into Prometheus.
+
+## Generate dashboards and alerts
+
+Edit `config.libsonnet` if required, then build the JSON dashboard files for Grafana and the `prometheus_alerts.yaml` rules file for Prometheus:
+
+```bash
+make
+```
+
+For more advanced uses of mixins, see
+https://github.com/monitoring-mixins/docs.
@@ -0,0 +1,118 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'influxdb',
+        rules: [
+          {
+            // Warning alert: percentage of task scheduler executions that failed over
+            // the last 5 minutes is at or above alertsWarningTaskSchedulerHighFailureRate.
+            // The ratio uses increase() rather than rate(): with rate(), clamp_min(..., 1)
+            // floors the denominator at one execution per second, which understates the
+            // failure percentage whenever tasks run less often than that. With increase()
+            // the floor only guards against a window with fewer than one execution,
+            // matching the latency alerts in this group.
+            alert: 'InfluxDBWarningTaskSchedulerHighFailureRate',
+            expr: |||
+              100 * increase(task_scheduler_total_execute_failure[5m])/clamp_min(increase(task_scheduler_total_execution_calls[5m]), 1) >= %(alertsWarningTaskSchedulerHighFailureRate)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Automated data processing tasks are failing at a high rate.',
+              // `%%` survives the Jsonnet format operator as a literal `%` for the
+              // alert template's printf.
+              description:
+                (
+                  'Task scheduler task executions for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} are failing at a rate of {{ printf "%%.0f" $value }} percent, ' +
+                  'which is above the threshold of %(alertsWarningTaskSchedulerHighFailureRate)s percent.'
+                ) % $._config,
+            },
+          },
+          {
+            // Critical alert: percentage of task scheduler executions that failed over
+            // the last 5 minutes is at or above alertsCriticalTaskSchedulerHighFailureRate.
+            // The ratio uses increase() rather than rate(): with rate(), clamp_min(..., 1)
+            // floors the denominator at one execution per second, which understates the
+            // failure percentage whenever tasks run less often than that. With increase()
+            // the floor only guards against a window with fewer than one execution,
+            // matching the latency alerts in this group.
+            alert: 'InfluxDBCriticalTaskSchedulerHighFailureRate',
+            expr: |||
+              100 * increase(task_scheduler_total_execute_failure[5m])/clamp_min(increase(task_scheduler_total_execution_calls[5m]), 1) >= %(alertsCriticalTaskSchedulerHighFailureRate)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Automated data processing tasks are failing at a critical rate.',
+              // `%%` survives the Jsonnet format operator as a literal `%` for the
+              // alert template's printf.
+              description:
+                (
+                  'Task scheduler task executions for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} are failing at a rate of {{ printf "%%.0f" $value }} percent, ' +
+                  'which is above the threshold of %(alertsCriticalTaskSchedulerHighFailureRate)s percent.'
+                ) % $._config,
+            },
+          },
+          {
+            // Fires when task_executor_workers_busy has been at or above
+            // alertsWarningHighBusyWorkerPercentage for 5 minutes.
+            // NOTE(review): severity is 'critical' although the threshold key is
+            // named alertsWarning... - confirm this is intended.
+            alert: 'InfluxDBHighBusyWorkerPercentage',
+            expr: |||
+              task_executor_workers_busy >= %(alertsWarningHighBusyWorkerPercentage)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'There is a high percentage of busy workers.',
+              // `%%` is an escaped percent sign for the Jsonnet format operator.
+              description:
+                (
+                  'The busy worker percentage for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} is {{ printf "%%.0f" $value }} percent, ' +
+                  'which is above the threshold of %(alertsWarningHighBusyWorkerPercentage)s percent.'
+                ) % $._config,
+            },
+          },
+          {
+            // Fires when allocated heap, as a percentage of (idle + allocated) heap
+            // bytes, has been at or above alertsWarningHighHeapMemoryUsage for 5 minutes.
+            // clamp_min(..., 1) keeps the denominator from being zero.
+            // NOTE(review): severity is 'critical' although the threshold key is
+            // named alertsWarning... - confirm this is intended.
+            alert: 'InfluxDBHighHeapMemoryUsage',
+            expr: |||
+              100 * go_memstats_heap_alloc_bytes/clamp_min((go_memstats_heap_idle_bytes + go_memstats_heap_alloc_bytes), 1) >= %(alertsWarningHighHeapMemoryUsage)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'There is a high amount of heap memory being used.',
+              // `%%` is an escaped percent sign for the Jsonnet format operator.
+              description:
+                (
+                  'The heap memory usage for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} is {{ printf "%%.0f" $value }} percent, ' +
+                  'which is above the threshold of %(alertsWarningHighHeapMemoryUsage)s percent.'
+                ) % $._config,
+            },
+          },
+          {
+            // Fires when the 5m increase of request duration divided by the 5m increase
+            // of request count (floored at 1), summed without the per-request labels,
+            // has been at or above alertsWarningHighAverageAPIRequestLatency for 1 minute.
+            // NOTE(review): this rule uses a 1m 'for' while the others in the group use
+            // 5m, and severity is 'critical' with an alertsWarning... threshold key -
+            // confirm both are intended.
+            alert: 'InfluxDBHighAverageAPIRequestLatency',
+            expr: |||
+              sum without(handler, method, path, response_code, status, user_agent) (increase(http_api_request_duration_seconds_sum[5m])/clamp_min(increase(http_api_requests_total[5m]), 1)) >= %(alertsWarningHighAverageAPIRequestLatency)s
+            ||| % $._config,
+            'for': '1m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Average API request latency is too high. High latency will negatively affect system performance, degrading data availability and precision.',
+              // `%%` is an escaped percent sign for the Jsonnet format operator.
+              description:
+                (
+                  'The average API request latency for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} is {{ printf "%%.2f" $value }} seconds, which is above the threshold of %(alertsWarningHighAverageAPIRequestLatency)s seconds.'
+                ) % $._config,
+            },
+          },
+          {
+            // Fires when the 5m increase of InfluxQL executing duration divided by the
+            // 5m increase of InfluxQL request count (floored at 1), summed without the
+            // `result` label, has been at or above
+            // alertsWarningSlowAverageIQLExecutionTime for 5 minutes.
+            alert: 'InfluxDBSlowAverageIQLExecutionTime',
+            expr: |||
+              sum without(result) (increase(influxql_service_executing_duration_seconds_sum[5m])/clamp_min(increase(influxql_service_requests_total[5m]), 1)) >= %(alertsWarningSlowAverageIQLExecutionTime)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'InfluxQL execution times are too slow. Slow query execution times will negatively affect system performance, degrading data availability and precision.',
+              // `%%` is an escaped percent sign for the Jsonnet format operator.
+              description:
+                (
+                  'The average InfluxQL query execution time for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} is {{ printf "%%.2f" $value }} seconds, ' +
+                  'which is above the threshold of %(alertsWarningSlowAverageIQLExecutionTime)s seconds.'
+                ) % $._config,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
@@ -0,0 +1,23 @@
+// Settings for the InfluxDB mixin: selectors, dashboard defaults, alert
+// thresholds and the logs toggle. `_config+::` merges into any existing
+// hidden `_config` object.
+{
+  _config+:: {
+    // When true, influxdbSelector also matches on the `cluster` label.
+    enableMultiCluster: false,
+    influxdbSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
+    multiclusterSelector: 'job=~"$job"',
+    filterSelector: 'job=~"integrations/influxdb"',
+
+    // dashboard defaults
+    dashboardTags: ['influxdb-mixin'],
+    dashboardPeriod: 'now-30m',
+    dashboardTimezone: 'default',
+    dashboardRefresh: '1m',
+
+    // alerts thresholds
+    alertsWarningTaskSchedulerHighFailureRate: 25,  // %
+    alertsCriticalTaskSchedulerHighFailureRate: 50,  // %
+    alertsWarningHighBusyWorkerPercentage: 80,  // %
+    alertsWarningHighHeapMemoryUsage: 80,  // %
+    alertsWarningHighAverageAPIRequestLatency: 0.3,  // seconds
+    alertsWarningSlowAverageIQLExecutionTime: 0.1,  // seconds
+
+    // NOTE(review): presumably toggles the logs dashboard - the consumer of this
+    // flag is not in this file; confirm.
+    enableLokiLogs: true,
+  },
+}
@@ -0,0 +1,3 @@
+// Combines the objects exported by the cluster, instance and logs overview
+// dashboard files into one object using `+`.
+(import 'influxdb-cluster-overview.libsonnet') +
+(import 'influxdb-instance-overview.libsonnet') +
+(import 'influxdb-logs-overview.libsonnet')
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+(import 'influxdb-cluster-overview.libsonnet') +`
	`2`	`+(import 'influxdb-instance-overview.libsonnet') +`
	`3`	`+(import 'influxdb-logs-overview.libsonnet')`