Skip to content

Commit 59ab3d1

Browse files
authored
Merge branch 'master' into szuro-patch-1
2 parents 6cbafd7 + 328cd1e commit 59ab3d1

File tree

70 files changed

+35648
-8253
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+35648
-8253
lines changed

apache-solr-mixin/.lint

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Lint rule exclusions for the Apache Solr mixin.
# Each excluded rule carries a `reason`; an optional `entries` list narrows the
# exclusion to specific panels or dashboards.
exclusions:
  template-job-rule:
    reason: "Prometheus datasource variable is being named as prometheus_datasource now while linter expects 'datasource'"
  panel-datasource-rule:
    reason: "Loki datasource variable is being named as loki_datasource now while linter expects 'datasource'"
  template-datasource-rule:
    reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
  template-instance-rule:
    reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
  target-promql-rule:
    reason: "Linter does not support selector variable value as a scalar in top-k PromQL queries."
  template-label-promql-rule:
    reason: "Defining a selector for the value of top-k requires a predefined label that the linter considers invalid."
  panel-title-description-rule:
    reason: "Not required for logs volume"
  panel-units-rule:
    reason: "Custom units are used for better user experience in these panels"
    entries:
      - panel: "Logs volume"
      - panel: "Live nodes"
      - panel: "Zookeeper status"
      - panel: "Zookeeper ensemble size"
      - panel: "Shard status"
      - panel: "Replica status"
      - panel: "Top cores by update handlers / $__interval"
      - panel: "Top cores by core errors / $__interval"
      - panel: "Top nodes by node errors / $__interval"
      - panel: "Update handlers / $__interval"
      - panel: "Cache evictions / $__interval"
      - panel: "Core timeouts / $__interval"
      - panel: "Node timeouts / $__interval"
      - panel: "Query error rate"
      - panel: "Query client errors"
      - panel: "Connections"
      - panel: "Threads / $__interval"
      - panel: "Garbage collections / $__interval"
      - panel: "File descriptors"
      - panel: "Requests / $__interval"
      - panel: "Responses / $__interval"
      - panel: "Dispatches / $__interval"
  target-instance-rule:
    reason: "base_url is used instead of instance because of how cluster metrics are returned."
    # Key must be spelled `entries` (same as panel-units-rule above) for the
    # dashboard list to be recognised.
    entries:
      - dashboard: "Apache Solr cluster overview"
      - dashboard: "Apache Solr query performance"
      - dashboard: "Apache Solr resource monitoring"

apache-solr-mixin/Makefile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Build, format and lint targets for the Apache Solr mixin.
# Tools invoked by the recipes below: jb, jsonnetfmt, mixtool.

JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 1 --string-style s --comment-style s

# Remove a partially written file target (e.g. prometheus_alerts.yaml) when
# its recipe fails, so a broken output never looks up to date.
.DELETE_ON_ERROR:

.PHONY: all
all: build dashboards_out prometheus_alerts.yaml

# Vendored jsonnet dependencies, (re)installed whenever jsonnetfile.json changes.
vendor: jsonnetfile.json
	jb install

.PHONY: build
build: vendor

# Reformat every .libsonnet/.jsonnet file in place, skipping vendor/.
.PHONY: fmt
fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

# Check formatting of every source file, then lint the mixin.
# The loop records a failure for ANY file (not just the last one read) and
# still prints the diff for all of them before failing the recipe.
.PHONY: lint
lint: build
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | { \
		rc=0; \
		while read -r f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || rc=1; \
		done; \
		exit $$rc; \
	}
	mixtool lint mixin.libsonnet

# `vendor` is an order-only prerequisite: it must exist before generation
# (safe under `make -j`) but its timestamp does not force a regeneration.
dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*) | vendor
	@mkdir -p dashboards_out
	mixtool generate dashboards mixin.libsonnet -d dashboards_out
	@# Refresh the directory timestamp so the target is seen as up to date
	@# even when existing dashboard files were only overwritten in place.
	@touch dashboards_out

# Alert thresholds come from config.libsonnet, so it is a prerequisite too.
prometheus_alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*.libsonnet) | vendor
	mixtool generate alerts mixin.libsonnet -a prometheus_alerts.yaml

.PHONY: clean
clean:
	rm -rf dashboards_out prometheus_alerts.yaml

apache-solr-mixin/README.md

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# Apache Solr Mixin
2+
3+
The Apache Solr mixin is a set of configurable Grafana dashboards and alerts.
4+
5+
The Apache Solr mixin contains the following dashboards:
6+
7+
- Apache Solr cluster overview
8+
- Apache Solr query performance
9+
- Apache Solr resource monitoring
10+
- Apache Solr logs overview
11+
12+
and the following alerts:
13+
14+
- ApacheSolrZookeeperChangeInEnsembleSize
15+
- ApacheSolrHighCPUUsageCritical
16+
- ApacheSolrHighCPUUsageWarning
17+
- ApacheSolrHighHeapMemoryUsageCritical
18+
- ApacheSolrHighHeapMemoryUsageWarning
19+
- ApacheSolrLowCacheHitRatio
20+
- ApacheSolrHighCoreErrors
21+
- ApacheSolrHighDocumentIndexing
22+
23+
## Apache Solr Cluster Overview
24+
25+
The Apache Solr cluster overview dashboard provides details on cluster, shard, replica and Zookeeper health as well as top core and error metrics.
26+
27+
![Apache Solr Cluster Overview Dashboard 1](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-cluster-1.png)
28+
![Apache Solr Cluster Overview Dashboard 2](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-cluster-2.png)
29+
30+
## Apache Solr Query Performance
31+
32+
The Apache Solr query performance dashboard provides details on various query load and latency, update handlers, cache, timeout and error metrics.
33+
34+
![Apache Solr Query Performance Dashboard 1](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-query-performance-1.png)
35+
![Apache Solr Query Performance Dashboard 2](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-query-performance-2.png)
36+
![Apache Solr Query Performance Dashboard 3](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-query-performance-3.png)
37+
38+
## Apache Solr Resource Monitoring
39+
40+
The Apache Solr resource monitoring dashboard provides details on connections, threads, core FS usage, as well as JVM and Jetty metrics.
41+
42+
![Apache Solr Resource Monitoring Dashboard 1](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-resource-monitoring-1.png)
43+
![Apache Solr Resource Monitoring Dashboard 2](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-resource-monitoring-2.png)
44+
45+
## Apache Solr Logs Overview
46+
47+
The Apache Solr logs overview dashboard provides details on slow requests, garbage collection, and error logs. [Promtail and Loki need to be installed](https://grafana.com/docs/loki/latest/installation/) and provisioned for logs with your Grafana instance. The default Apache Solr error log path is `/var/solr/logs/solr.log` for each instance on Linux.
48+
49+
Apache Solr logs are enabled by default in the `config.libsonnet` and can be removed by setting `enableLokiLogs` to `false`. Then run `make` again to regenerate the dashboard:
50+
51+
```
52+
{
53+
_config+:: {
54+
enableLokiLogs: false,
55+
},
56+
}
57+
```
58+
59+
For the dashboard selectors to work properly with the logs ingested into your logs datasource, also add the `job` and `solr_cluster` labels to the Promtail [scrape_configs](https://grafana.com/docs/loki/latest/clients/promtail/configuration/#scrape_configs) so that they match the labels on the ingested metrics.
60+
61+
```yaml
62+
scrape_configs:
63+
- job_name: integrations/apache-solr
64+
static_configs:
65+
- targets: [localhost]
66+
labels:
67+
job: integrations/apache-solr
68+
instance: '<your-instance-name>'
69+
solr_cluster: '<your-cluster-name>'
70+
__path__: /var/solr/logs/*.log
71+
pipeline_stages:
72+
- multiline:
73+
firstline: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}'
74+
- regex:
75+
expression: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3} (?P<level>\w+)'
76+
- labels:
77+
level:
78+
```
79+
80+
![Apache Solr Logs Overview Dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/apache-solr/screenshots/apache-solr-logs-overview.png)
81+
82+
## Alerts Overview
83+
84+
85+
| Alert | Summary |
86+
|-----------------------------------------|---------------------------------------------------------------------------------------------------------------------|
87+
| ApacheSolrZookeeperChangeInEnsembleSize | Changes in the ZooKeeper ensemble size can affect the stability and performance of the cluster. |
88+
| ApacheSolrHighCPUUsageCritical | High CPU load can indicate that Solr nodes are under heavy load, potentially impacting performance. |
89+
| ApacheSolrHighCPUUsageWarning | High CPU load can indicate that Solr nodes are under heavy load, potentially impacting performance. |
90+
| ApacheSolrHighHeapMemoryUsageCritical | High heap memory usage can lead to garbage collection issues, out-of-memory errors, and overall system instability. |
91+
| ApacheSolrHighHeapMemoryUsageWarning | High heap memory usage can lead to garbage collection issues, out-of-memory errors, and overall system instability. |
92+
| ApacheSolrLowCacheHitRatio | Low cache hit ratios can lead to increased disk I/O and slower query response times. |
93+
| ApacheSolrHighCoreErrors | A spike in core errors can indicate serious issues at the core level, affecting data integrity and availability. |
94+
| ApacheSolrHighDocumentIndexing | A sudden spike in document indexing could indicate unintended or malicious bulk updates. |
95+
96+
Default thresholds can be configured in `config.libsonnet`
97+
98+
```js
99+
{
100+
_config+:: {
101+
alertsCriticalCPUUsage: 85,
102+
alertsWarningCPUUsage: 75,
103+
alertsWarningMemoryUsage: 85,
104+
alertsCriticalMemoryUsage: 75,
105+
alertsWarningCacheUsage: 75,
106+
alertsWarningCoreErrors: 15,
107+
alertsWarningDocumentIndexing: 30,
108+
},
109+
}
110+
```
111+
112+
## Install Tools
113+
114+
```bash
115+
go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
116+
go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest
117+
# or in brew: brew install go-jsonnet
118+
```
119+
120+
For linting and formatting, you would also need `mixtool` and `jsonnetfmt` installed. If you
121+
have a working Go development environment, it's easiest to run the following:
122+
123+
```bash
124+
go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
125+
```
126+
127+
The files in `dashboards_out` need to be imported
128+
into your Grafana server. The exact details will depend on your environment.
129+
130+
`prometheus_alerts.yaml` needs to be imported into Prometheus.
131+
132+
## Generate Dashboards And Alerts
133+
134+
Edit `config.libsonnet` if required and then build JSON dashboard files for Grafana:
135+
136+
```bash
137+
make
138+
```
139+
140+
For more advanced uses of mixins, see
141+
https://github.com/monitoring-mixins/docs.
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// Prometheus alerting rules for the Apache Solr mixin.
// Thresholds are interpolated from $._config (the alerts* fields).
// Every expression is written as bare PromQL inside the text block: wrapping
// it in quotes would turn the whole expression into a PromQL string literal.
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'apache-solr',
        rules: [
          // Fires when the ZooKeeper ensemble size changed within the last 5 minutes.
          {
            alert: 'ApacheSolrZookeeperChangeInEnsembleSize',
            expr: |||
              changes(solr_zookeeper_ensemble_size[5m]) > 0
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Changes in the ZooKeeper ensemble size can affect the stability and performance of the cluster.',
              description:
                (
                  'Zookeeper host {{$labels.zk_host}} has had an ensemble change of {{ printf "%%.0f" $value }} over the last 5 minutes'
                ) % $._config,
            },
          },
          // System CPU load (as a percentage) above the critical threshold.
          {
            alert: 'ApacheSolrHighCPUUsageCritical',
            expr: |||
              100 * sum without (base_url, item) (avg_over_time(solr_metrics_jvm_os_cpu_load{item="systemCpuLoad"}[5m])) > %(alertsCriticalCPUUsage)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'High CPU load can indicate that Solr nodes are under heavy load, potentially impacting performance.',
              description:
                (
                  '{{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a system CPU load of {{ printf "%%.0f" $value }}%%, which is above the threshold of %(alertsCriticalCPUUsage)s.'
                ) % $._config,
            },
          },
          // System CPU load (as a percentage) above the warning threshold.
          {
            alert: 'ApacheSolrHighCPUUsageWarning',
            expr: |||
              100 * sum without (base_url, item) (avg_over_time(solr_metrics_jvm_os_cpu_load{item="systemCpuLoad"}[5m])) > %(alertsWarningCPUUsage)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'High CPU load can indicate that Solr nodes are under heavy load, potentially impacting performance.',
              description:
                (
                  '{{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a system CPU load of {{ printf "%%.0f" $value }}%%, which is above the threshold of %(alertsWarningCPUUsage)s.'
                ) % $._config,
            },
          },
          // Used/max JVM heap (as a percentage) above the critical threshold;
          // clamp_min guards against division by zero.
          {
            alert: 'ApacheSolrHighHeapMemoryUsageCritical',
            expr: |||
              100 * sum without(item, base_url)(solr_metrics_jvm_memory_heap_bytes{item="used"}) / clamp_min(sum without(item, base_url)(solr_metrics_jvm_memory_heap_bytes{item="max"}), 1) > %(alertsCriticalMemoryUsage)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'High heap memory usage can lead to garbage collection issues, out-of-memory errors, and overall system instability.',
              description: |||
                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had high memory usage of {{ printf "%%.0f" $value }}%%, which is above the threshold of %(alertsCriticalMemoryUsage)s.
              ||| % $._config,
            },
          },
          // Used/max JVM heap (as a percentage) above the warning threshold.
          {
            alert: 'ApacheSolrHighHeapMemoryUsageWarning',
            expr: |||
              100 * sum without(item, base_url)(solr_metrics_jvm_memory_heap_bytes{item="used"}) / clamp_min(sum without(item, base_url)(solr_metrics_jvm_memory_heap_bytes{item="max"}), 1) > %(alertsWarningMemoryUsage)s
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'High heap memory usage can lead to garbage collection issues, out-of-memory errors, and overall system instability.',
              description: |||
                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had high memory usage of {{ printf "%%.0f" $value }}%%, which is above the threshold of %(alertsWarningMemoryUsage)s.
              ||| % $._config,
            },
          },
          // Cache hit ratio (as a percentage) below the warning threshold.
          // The selector is an instant vector: aggregation operators such as
          // `sum` do not accept a range vector; the 10m hold is provided by 'for'.
          {
            alert: 'ApacheSolrLowCacheHitRatio',
            expr: |||
              100 * sum without(base_url, category, collection, item, replica, shard) (solr_metrics_core_searcher_cache_ratio{item="hitratio", type=~"documentCache|filterCache|queryResultCache"}) < %(alertsWarningCacheUsage)s
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Low cache hit ratios can lead to increased disk I/O and slower query response times.',
              description: |||
                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a low cache hit ratio of {{ printf "%%.0f" $value }}%% on core {{$labels.core}} of type {{$labels.type}}, which is under the threshold of %(alertsWarningCacheUsage)s.
              ||| % $._config,
            },
          },
          // Increase in core errors over 10m relative to the 10m average of the
          // counter, as a percentage, above the warning threshold.
          {
            alert: 'ApacheSolrHighCoreErrors',
            expr: |||
              100 * sum without(base_url, category, collection, handler, replica, shard) (increase(solr_metrics_core_errors_total[10m]) / clamp_min(avg_over_time(solr_metrics_core_errors_total[10m]), 1)) > %(alertsWarningCoreErrors)s
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'A spike in core errors can indicate serious issues at the core level, affecting data integrity and availability.',
              description: |||
                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a high amount of core errors {{ printf "%%.0f" $value }}%% on core {{$labels.core}}, which is above the threshold of %(alertsWarningCoreErrors)s.
              ||| % $._config,
            },
          },
          // Increase in update-handler adds over 15m relative to the 15m average
          // of the counter, as a percentage, above the warning threshold.
          {
            alert: 'ApacheSolrHighDocumentIndexing',
            expr: |||
              100 * sum without(base_url, category, collection, handler, replica, shard) (increase(solr_metrics_core_update_handler_adds_total[15m]) / clamp_min(avg_over_time(solr_metrics_core_update_handler_adds_total[15m]), 1)) > %(alertsWarningDocumentIndexing)s
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'A sudden spike in document indexing could indicate unintended or malicious bulk updates.',
              description: |||
                {{$labels.instance}} on cluster {{$labels.solr_cluster}} has had a high document indexing value of {{ printf "%%.0f" $value }}%% on core {{$labels.core}}, which is above the threshold of %(alertsWarningDocumentIndexing)s.
              ||| % $._config,
            },
          },
        ],
      },
    ],
  },
}

apache-solr-mixin/config.libsonnet

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Configuration for the Apache Solr mixin: selectors, dashboard defaults,
// alert thresholds and the Loki logs toggle.
{
  _config+:: {
    // When true, solrSelector additionally filters on the $cluster variable.
    enableMultiCluster: false,
    solrSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
    multiclusterSelector: 'job=~"$job"',
    filterSelector: 'job=~"integrations/apache-solr"',

    dashboardTags: ['apache-solr-mixin'],
    dashboardPeriod: 'now-30m',
    dashboardTimezone: 'default',
    dashboardRefresh: '1m',

    // alerts thresholds (percentages; each is compared against a `100 * ...`
    // expression in the alert rules)
    alertsCriticalCPUUsage: 85,
    alertsWarningCPUUsage: 75,
    // The critical memory threshold must sit above the warning threshold
    // (same ordering as the CPU pair), otherwise the critical alert fires first.
    alertsCriticalMemoryUsage: 85,
    alertsWarningMemoryUsage: 75,
    alertsWarningCacheUsage: 75,  // cache alert fires when the hit ratio drops BELOW this
    alertsWarningCoreErrors: 15,
    alertsWarningDocumentIndexing: 30,

    enableLokiLogs: true,
  },
}

0 commit comments

Comments
 (0)