grafana
diff --git a/‎apache-solr-mixin/.lint
Lines changed: 46 additions & 0 deletions b/‎apache-solr-mixin/.lint
Lines changed: 46 additions & 0 deletions
diff --git a/‎apache-solr-mixin/Makefile
Lines changed: 34 additions & 0 deletions b/‎apache-solr-mixin/Makefile
Lines changed: 34 additions & 0 deletions
diff --git a/‎apache-solr-mixin/README.md
Lines changed: 141 additions & 0 deletions b/‎apache-solr-mixin/README.md
Lines changed: 141 additions & 0 deletions
diff --git a/‎apache-solr-mixin/alerts/alerts.libsonnet
Lines changed: 142 additions & 0 deletions b/‎apache-solr-mixin/alerts/alerts.libsonnet
Lines changed: 142 additions & 0 deletions
diff --git a/‎apache-solr-mixin/config.libsonnet
Lines changed: 24 additions & 0 deletions b/‎apache-solr-mixin/config.libsonnet
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,46 @@
+# Lint rule exclusions for this mixin. Each key under `exclusions` names a lint
+# rule; `reason` records why it is excluded, and an optional `entries` list
+# narrows the exclusion to specific panels or dashboards.
+exclusions:
+  template-job-rule:
+    reason: "Prometheus datasource variable is being named as prometheus_datasource now while linter expects 'datasource'"
+  panel-datasource-rule:
+    reason: "Loki datasource variable is being named as loki_datasource now while linter expects 'datasource'"
+  template-datasource-rule:
+    reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
+  template-instance-rule:
+    reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
+  target-promql-rule:
+    reason: "Linter does not support selector variable value as a scalar in top-k PromQL queries."
+  template-label-promql-rule:
+    reason: "Defining a selector for the value of top-k requires a predefined label that the linter considers invalid."
+  panel-title-description-rule:
+    reason: "Not required for logs volume"
+  panel-units-rule:
+    reason: "Custom units are used for better user experience in these panels"
+    entries:
+    - panel: "Logs volume"
+    - panel: "Live nodes"
+    - panel: "Zookeeper status"
+    - panel: "Zookeeper ensemble size"
+    - panel: "Shard status"
+    - panel: "Replica status"
+    - panel: "Top cores by update handlers / $__interval"
+    - panel: "Top cores by core errors / $__interval"
+    - panel: "Top nodes by node errors / $__interval"
+    - panel: "Update handlers / $__interval"
+    - panel: "Cache evictions / $__interval"
+    - panel: "Core timeouts / $__interval"
+    - panel: "Node timeouts / $__interval"
+    - panel: "Query error rate"
+    - panel: "Query client errors"
+    - panel: "Connections"
+    - panel: "Threads / $__interval"
+    - panel: "Garbage collections / $__interval"
+    - panel: "File descriptors"
+    - panel: "Requests  / $__interval"
+    - panel: "Responses  / $__interval"
+    - panel: "Dispatches  / $__interval"
+  target-instance-rule:
+    reason: "base_url is used instead of instance because of how cluster metrics are returned."
+    # Key must be spelled `entries` (as in panel-units-rule above) for the
+    # dashboard scoping below to be recognised.
+    entries:
+      - dashboard: "Apache Solr cluster overview"
+      - dashboard: "Apache Solr query performance"
+      - dashboard: "Apache Solr resource monitoring"
@@ -0,0 +1,34 @@
+# Formatter command line shared by the `fmt` and `lint` targets.
+JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 1 --string-style s --comment-style s
+
+# Default goal: make sure dependencies are installed, then generate the
+# dashboards directory and the Prometheus alert rules file.
+.PHONY: all
+all: build dashboards_out prometheus_alerts.yaml
+
+# Runs `jb install` whenever jsonnetfile.json is newer than `vendor`
+# (presumably `jb install` populates the vendor directory — confirm).
+vendor: jsonnetfile.json
+	jb install
+
+# `build` has no recipe of its own; it only requires `vendor` to be up to date.
+.PHONY: build
+build: vendor
+# Reformat every *.libsonnet / *.jsonnet file in place (-i), one file per
+# formatter invocation, skipping anything under a path component named `vendor`.
+.PHONY: fmt
+fmt:
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		xargs -n 1 -- $(JSONNET_FMT) -i
+
+# Check formatting of every non-vendored jsonnet file, then run the mixin linter.
+# For each file the formatter output is diffed against the file on disk. The loop
+# records a failure for ANY differing file and exits with that status, so the
+# target fails even when the badly formatted file is not the last one visited
+# (a bare `while` loop would only report the status of its final `diff`).
+.PHONY: lint
+lint: build
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		{ rc=0; while read -r f; do \
+			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || rc=1; \
+		done; exit $$rc; }
+	mixtool lint mixin.libsonnet
+
+# Generate dashboards into the dashboards_out directory. The target is the
+# directory itself; it is rebuilt when mixin.libsonnet, config.libsonnet or any
+# file directly under dashboards/ is newer than the directory.
+dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*)
+	@mkdir -p dashboards_out
+	mixtool generate dashboards mixin.libsonnet -d dashboards_out
+
+# Generate the Prometheus alert rules file. config.libsonnet is a prerequisite
+# because the alert thresholds are configured there; without it, editing a
+# threshold and re-running `make` would leave a stale rules file.
+prometheus_alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*.libsonnet)
+	mixtool generate alerts mixin.libsonnet -a $@
+
+# Remove the generated artefacts (dashboards directory and alert rules file).
+# `vendor` is not removed by this target.
+.PHONY: clean
+clean:
+	rm -rf dashboards_out prometheus_alerts.yaml
@@ -0,0 +1,141 @@
+# Apache Solr Mixin
+
+The Apache Solr mixin is a set of configurable Grafana dashboards and alerts.
+
+The Apache Solr mixin contains the following dashboards:
+
+- Apache Solr cluster overview
+- Apache Solr query performance
+- Apache Solr resource monitoring
+- Apache Solr logs overview
+
+and the following alerts:
+
+- ApacheSolrZookeeperChangeInEnsembleSize
+- ApacheSolrHighCPUUsageCritical
+- ApacheSolrHighCPUUsageWarning
+- ApacheSolrHighHeapMemoryUsageCritical
+- ApacheSolrHighHeapMemoryUsageWarning
+- ApacheSolrLowCacheHitRatio
+- ApacheSolrHighCoreErrors
+- ApacheSolrHighDocumentIndexing
+
+## Apache Solr Cluster Overview
+
+The Apache Solr cluster overview dashboard provides details on cluster, shard, replica and Zookeeper health as well as top core and error metrics.
+
+![Apache Solr Cluster Overview Dashboard 1](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-cluster-1.png)
+![Apache Solr Cluster Overview Dashboard 2](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-cluster-2.png)
+
+## Apache Solr Query Performance
+
+The Apache Solr query performance dashboard provides details on various query load and latency, update handlers, cache, timeout and error metrics.
+
+![Apache Solr Query Performance Dashboard 1](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-query-performance-1.png)
+![Apache Solr Query Performance Dashboard 2](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-query-performance-2.png)
+![Apache Solr Query Performance Dashboard 3](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-query-performance-3.png)
+
+## Apache Solr Resource Monitoring
+
+The Apache Solr resource monitoring dashboard provides details on connections, threads, core FS usage, as well as JVM and Jetty metrics.
+
+![Apache Solr Resource Monitoring Dashboard 1](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-resource-monitoring-1.png)
+![Apache Solr Resource Monitoring Dashboard 2](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-resource-monitoring-2.png)
+
+## Apache Solr Logs Overview
+
+The Apache Solr logs overview dashboard provides details on slow requests, garbage collection, and error logs. [Promtail and Loki need to be installed](https://grafana.com/docs/loki/latest/installation/) and provisioned for logs with your Grafana instance. The default Apache Solr error log path is `/var/solr/logs/solr.log` for each instance on Linux.
+
+Apache Solr logs are enabled by default in the `config.libsonnet` and can be removed by setting `enableLokiLogs` to `false`. Then run `make` again to regenerate the dashboard:
+
+```
+{
+  _config+:: {
+    enableLokiLogs: false,
+  },
+}
+```
+
+In order for the selectors to work properly for system logs ingested into your logs datasource, please also add the matching `job` and `solr_cluster` labels to the [scrape_configs](https://grafana.com/docs/loki/latest/clients/promtail/configuration/#scrape_configs) so that they match the labels for ingested metrics.
+
+```yaml
+scrape_configs:
+  - job_name: integrations/apache-solr
+    static_configs:
+      - targets: [localhost]
+        labels:
+          job: integrations/apache-solr
+          instance: '<your-instance-name>'
+          solr_cluster: '<your-cluster-name>'
+          __path__: /var/solr/logs/*.log
+    pipeline_stages:
+      - multiline:
+          firstline: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}'
+      - regex:
+          expression: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3} (?P<level>\w+)'
+      - labels:
+          level:
+```
+
+![Apache Solr Logs Overview Dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-logs-overview.png)
+
+## Alerts Overview
+
+
+| Alert                                   | Summary                                                                                                             |
+|-----------------------------------------|---------------------------------------------------------------------------------------------------------------------|
+| ApacheSolrZookeeperChangeInEnsembleSize | Changes in the ZooKeeper ensemble size can affect the stability and performance of the cluster.                     |
+| ApacheSolrHighCPUUsageCritical          | High CPU load can indicate that Solr nodes are under heavy load, potentially impacting performance.                 |
+| ApacheSolrHighCPUUsageWarning           | High CPU load can indicate that Solr nodes are under heavy load, potentially impacting performance.                 |
+| ApacheSolrHighHeapMemoryUsageCritical   | High heap memory usage can lead to garbage collection issues, out-of-memory errors, and overall system instability. |
+| ApacheSolrHighHeapMemoryUsageWarning    | High heap memory usage can lead to garbage collection issues, out-of-memory errors, and overall system instability. |
+| ApacheSolrLowCacheHitRatio              | Low cache hit ratios can lead to increased disk I/O and slower query response times.                                |
+| ApacheSolrHighCoreErrors                | A spike in core errors can indicate serious issues at the core level, affecting data integrity and availability.    |
+| ApacheSolrHighDocumentIndexing          | A sudden spike in document indexing could indicate unintended or malicious bulk updates.                            |
+
+Default thresholds can be configured in `config.libsonnet`:
+
+```js
+{
+  _config+:: {
+    alertsCriticalCPUUsage: 85,
+    alertsWarningCPUUsage: 75,
+    alertsWarningMemoryUsage: 75,
+    alertsCriticalMemoryUsage: 85,
+    alertsWarningCacheUsage: 75,
+    alertsWarningCoreErrors: 15,
+    alertsWarningDocumentIndexing: 30,
+  },
+}
+```
+
+## Install Tools
+
+```bash
+go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
+go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest
+# or in brew: brew install go-jsonnet
+```
+
+For linting and formatting, you would also need `mixtool` and `jsonnetfmt` installed. If you
+have a working Go development environment, it's easiest to run the following:
+
+```bash
+go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
+```
+
+The files in `dashboards_out` need to be imported
+into your Grafana server. The exact details will depend on your environment.
+
+`prometheus_alerts.yaml` needs to be imported into Prometheus.
+
+## Generate Dashboards And Alerts
+
+Edit `config.libsonnet` if required and then build JSON dashboard files for Grafana:
+
+```bash
+make
+```
+
+For more advanced uses of mixins, see
+https://github.com/monitoring-mixins/docs.
@@ -0,0 +1,142 @@
+// Prometheus alerting rules for the Apache Solr mixin.
+//
+// Every `expr` and `description` is a format string applied to $._config, so
+// `%(name)s` is replaced by the matching config field and `%%` yields a
+// literal `%` in the rendered rule. The expressions are written as bare PromQL
+// (no surrounding quotes): a quoted expression would be a PromQL string
+// literal rather than a vector expression.
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'apache-solr',
+        rules: [
+          // Fires when the reported ZooKeeper ensemble size changed within the last 5 minutes.
+          {
+            alert: 'ApacheSolrZookeeperChangeInEnsembleSize',
+            expr: |||
+              changes(solr_zookeeper_ensemble_size[5m]) > 0
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Changes in the ZooKeeper ensemble size can affect the stability and performance of the cluster.',
+              description:
+                (
+                  'Zookeeper host {{$labels.zk_host}} has had an ensemble change of {{ printf "%%.0f" $value }} over the last 5 minutes'
+                ) % $._config,
+            },
+          },
+          // System CPU load (5m average, as a percentage) above the critical threshold.
+          {
+            alert: 'ApacheSolrHighCPUUsageCritical',
+            expr: |||
+              100 * sum without (base_url, item) (avg_over_time(solr_metrics_jvm_os_cpu_load{item="systemCpuLoad"}[5m])) > %(alertsCriticalCPUUsage)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'High CPU load can indicate that Solr nodes are under heavy load, potentially impacting performance.',
+              description:
+                (
+                  '{{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a system CPU load of {{ printf "%%.0f" $value }}%%, which is above the threshold of %(alertsCriticalCPUUsage)s.'
+                ) % $._config,
+            },
+          },
+          // Same expression as the critical CPU alert, compared against the warning threshold.
+          {
+            alert: 'ApacheSolrHighCPUUsageWarning',
+            expr: |||
+              100 * sum without (base_url, item) (avg_over_time(solr_metrics_jvm_os_cpu_load{item="systemCpuLoad"}[5m])) > %(alertsWarningCPUUsage)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'High CPU load can indicate that Solr nodes are under heavy load, potentially impacting performance.',
+              description:
+                (
+                  '{{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a system CPU load of {{ printf "%%.0f" $value }}%%, which is above the threshold of %(alertsWarningCPUUsage)s.'
+                ) % $._config,
+            },
+          },
+          // Used heap as a percentage of max heap; clamp_min guards against division by zero.
+          {
+            alert: 'ApacheSolrHighHeapMemoryUsageCritical',
+            expr: |||
+              100 * sum without(item, base_url)(solr_metrics_jvm_memory_heap_bytes{item="used"}) / clamp_min(sum without(item, base_url)(solr_metrics_jvm_memory_heap_bytes{item="max"}), 1) > %(alertsCriticalMemoryUsage)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'High heap memory usage can lead to garbage collection issues, out-of-memory errors, and overall system instability.',
+              description: |||
+                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had high memory usage of {{ printf "%%.0f" $value }}%%, which is above the threshold of %(alertsCriticalMemoryUsage)s.
+              ||| % $._config,
+            },
+          },
+          // Same expression as the critical heap alert, compared against the warning threshold.
+          {
+            alert: 'ApacheSolrHighHeapMemoryUsageWarning',
+            expr: |||
+              100 * sum without(item, base_url)(solr_metrics_jvm_memory_heap_bytes{item="used"}) / clamp_min(sum without(item, base_url)(solr_metrics_jvm_memory_heap_bytes{item="max"}), 1) > %(alertsWarningMemoryUsage)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'High heap memory usage can lead to garbage collection issues, out-of-memory errors, and overall system instability.',
+              description: |||
+                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had high memory usage of {{ printf "%%.0f" $value }}%%, which is above the threshold of %(alertsWarningMemoryUsage)s.
+              ||| % $._config,
+            },
+          },
+          // Cache hit ratio below the warning threshold. The 10m range selector is
+          // reduced with avg_over_time first, because `sum` aggregates instant
+          // vectors and cannot be applied to a range vector directly.
+          {
+            alert: 'ApacheSolrLowCacheHitRatio',
+            expr: |||
+              100 * sum without(base_url, category, collection, item, replica, shard) (avg_over_time(solr_metrics_core_searcher_cache_ratio{item="hitratio", type=~"documentCache|filterCache|queryResultCache"}[10m])) < %(alertsWarningCacheUsage)s
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Low cache hit ratios can lead to increased disk I/O and slower query response times.',
+              description: |||
+                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a low cache hit ratio of {{ printf "%%.0f" $value }}%% on core {{$labels.core}} of type {{$labels.type}}, which is under the threshold of %(alertsWarningCacheUsage)s.
+              ||| % $._config,
+            },
+          },
+          // 10m increase in core errors relative to the 10m average of the counter, as a percentage.
+          {
+            alert: 'ApacheSolrHighCoreErrors',
+            expr: |||
+              100 * sum without(base_url, category, collection, handler, replica, shard) (increase(solr_metrics_core_errors_total[10m]) / clamp_min(avg_over_time(solr_metrics_core_errors_total[10m]), 1)) > %(alertsWarningCoreErrors)s
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'A spike in core errors can indicate serious issues at the core level, affecting data integrity and availability.',
+              description: |||
+                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a high amount of core errors {{ printf "%%.0f" $value }}%% on core {{$labels.core}}, which is above the threshold of %(alertsWarningCoreErrors)s.
+              ||| % $._config,
+            },
+          },
+          // 15m increase in update-handler adds relative to the 15m average of the counter, as a percentage.
+          {
+            alert: 'ApacheSolrHighDocumentIndexing',
+            expr: |||
+              100 * sum without(base_url, category, collection, handler, replica, shard) (increase(solr_metrics_core_update_handler_adds_total[15m]) / clamp_min(avg_over_time(solr_metrics_core_update_handler_adds_total[15m]), 1)) > %(alertsWarningDocumentIndexing)s
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'A sudden spike in document indexing could indicate unintended or malicious bulk updates.',
+              description: |||
+                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a high document indexing value of {{ printf "%%.0f" $value }}%% on core {{$labels.core}}, which is above the threshold of %(alertsWarningDocumentIndexing)s.
+              ||| % $._config,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
@@ -0,0 +1,24 @@
+// Default configuration for the Apache Solr mixin. Fields are merged into
+// `_config` and may be overridden by consumers of the mixin.
+{
+  _config+:: {
+    // When true, solrSelector additionally filters on the `cluster` label.
+    enableMultiCluster: false,
+    solrSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
+    multiclusterSelector: 'job=~"$job"',
+    filterSelector: 'job=~"integrations/apache-solr"',
+
+    dashboardTags: ['apache-solr-mixin'],
+    dashboardPeriod: 'now-30m',
+    dashboardTimezone: 'default',
+    dashboardRefresh: '1m',
+
+    // alerts thresholds
+    // For each warning/critical pair the critical value must be the higher one,
+    // otherwise the critical alert fires before the warning alert.
+    alertsCriticalCPUUsage: 85,
+    alertsWarningCPUUsage: 75,
+    alertsCriticalMemoryUsage: 85,
+    alertsWarningMemoryUsage: 75,
+    alertsWarningCacheUsage: 75,
+    alertsWarningCoreErrors: 15,
+    alertsWarningDocumentIndexing: 30,
+
+    enableLokiLogs: true,
+  },
+}