Skip to content

Commit a6317d0

Browse files
committed
More config details including alerts
1 parent 1a55e83 commit a6317d0

File tree

1 file changed

+153
-3
lines changed

1 file changed

+153
-3
lines changed

README.md

Lines changed: 153 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ Two custom components:
2121
* p4prometheus - This component.
2222
* monitor_metrics.sh - [SDP](https://swarm.workshop.perforce.com/projects/perforce-software-sdp) compatible bash script to generate simple supplementary metrics - [monitor_metrics.sh](https://swarm.workshop.perforce.com/files/guest/perforce_software/sdp/dev/Server/Unix/p4/common/site/bin/monitor_metrics.sh)
2323

24-
Check out the [Prometheus architecture](https://prometheus.io/assets/architecture.png) - the custom components are "Prometheus targets".
24+
Check out the ![Prometheus architecture](https://prometheus.io/assets/architecture.png) - the custom components are "Prometheus targets".
2525

2626
# Grafana Dashboards
2727

@@ -85,7 +85,7 @@ Ensure the above has global read access (perforce user will write files, node_ex
8585

8686
Create service file:
8787

88-
```bash
88+
```ini
8989
cat << EOF > /etc/systemd/system/node_exporter.service
9090
[Unit]
9191
Description=Node Exporter
@@ -156,7 +156,7 @@ As user `root`:
156156

157157
Create service file:
158158

159-
```bash
159+
```ini
160160
cat << EOF > /etc/systemd/system/p4prometheus.service
161161
[Unit]
162162
Description=P4prometheus
@@ -189,3 +189,153 @@ Check that metrics are being written:
189189

190190
cat /hxlogs/metrics/p4_cmds.prom
191191

192+
# Alerting
193+
194+
Alerting is handled by the Prometheus Alertmanager component.
195+
196+
Setup is very similar to the above.
197+
198+
Sample `/etc/systemd/system/alertmanager.service`:
199+
200+
```ini
201+
[Unit]
202+
Description=Alertmanager
203+
Wants=network-online.target
204+
After=network-online.target
205+
206+
[Service]
207+
User=alertmanager
208+
Group=alertmanager
209+
Type=simple
210+
ExecStart=/usr/local/bin/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/var/lib/alertmanager --log.level=debug
211+
212+
[Install]
213+
WantedBy=multi-user.target
214+
```
215+
216+
* create alertmanager user
217+
* create /etc/alertmanager directory
218+
219+
220+
## Prometheus config
221+
222+
```yaml
223+
global:
224+
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
225+
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
226+
# scrape_timeout is set to the global default (10s).
227+
228+
# Alertmanager configuration
229+
alerting:
230+
alertmanagers:
231+
- static_configs:
232+
- targets:
233+
- localhost:9093
234+
235+
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
236+
rule_files:
237+
- "perforce_rules.yml"
238+
239+
# A scrape configuration containing exactly one endpoint to scrape:
240+
# Here it's Prometheus itself.
241+
scrape_configs:
242+
- job_name: 'prometheus'
243+
static_configs:
244+
- targets: ['localhost:9090']
245+
246+
- job_name: 'node_exporter'
247+
static_configs:
248+
- targets: ['p4hms:9100', 'p4main:9100', 'p4_ha:9100']
249+
250+
```
251+
252+
## Alerting rules
253+
254+
This is an example set of alerting rules, saved as `perforce_rules.yml` (the file referenced by `rule_files` in the Prometheus config above).
255+
256+
```yaml
257+
groups:
258+
- name: alert.rules
259+
rules:
260+
- alert: NoLogs
261+
expr: 100 > rate(p4_prom_log_lines_read{sdpinst="1",serverid="master"}[1m])
262+
for: 1m
263+
labels:
264+
severity: "critical"
265+
annotations:
266+
summary: "Endpoint {{ $labels.instance }} too few log lines"
267+
description: "{{ $labels.instance }} of job {{ $labels.job }} has been below target for more than 1 minute."
268+
- alert: ReplicationSlowHA
269+
expr: p4_replica_curr_pos{instance="p4master:9100",job="node_exporter",sdpinst="1",servername="master"} - ignoring(serverid,servername) p4_replica_curr_pos{instance="p4master:9100",job="node_exporter",sdpinst="1",servername="p4d_ha_bos"} > 5e+7
270+
for: 10m
271+
labels:
272+
severity: "warning"
273+
annotations:
274+
summary: "Endpoint {{ $labels.instance }} replication warning"
275+
description: "{{ $labels.instance }} of job {{ $labels.job }} has been above target for more than 10 minutes."
276+
- alert: ReplicationSlowLondon
277+
expr: p4_replica_curr_pos{instance="p4master:9100",job="node_exporter",sdpinst="1",servername="master"} - ignoring(serverid,servername) p4_replica_curr_pos{instance="p4master:9100",job="node_exporter",sdpinst="1",servername="p4d_fr_lon"} > 5e+7
278+
for: 10m
279+
labels:
280+
severity: "warning"
281+
annotations:
282+
summary: "Endpoint {{ $labels.instance }} replication warning"
283+
description: "{{ $labels.instance }} of job {{ $labels.job }} has been above target for more than 10 minutes."
284+
- alert: CheckpointSlow
285+
expr: p4_sdp_checkpoint_duration{sdpinst="1",serverid="master"} > 50 * 60
286+
for: 5m
287+
labels:
288+
severity: "warning"
289+
annotations:
290+
summary: "Endpoint {{ $labels.instance }} checkpoint job duration longer than expected"
291+
description: "{{ $labels.instance }} of job {{ $labels.job }} has been above target for more than 5 minutes."
292+
- alert: CheckpointNotTaken
293+
expr: time() - p4_sdp_checkpoint_log_time{sdpinst="1",serverid="master"} > 25 * 60 * 60
294+
for: 5m
295+
labels:
296+
severity: "warning"
297+
annotations:
298+
summary: "Endpoint {{ $labels.instance }} checkpoint not taken in 25 hours warning"
299+
description: "{{ $labels.instance }} of job {{ $labels.job }} has been above target for more than 5 minutes."
300+
- alert: P4DServiceNotRunning
301+
expr: node_systemd_unit_state{state="active",name="p4d_1.service"} != 1
302+
for: 5m
303+
labels:
304+
severity: "warning"
305+
annotations:
306+
summary: "Endpoint {{ $labels.instance }} p4d service not running"
307+
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for 5 minutes."
308+
- alert: DiskspaceLow
309+
expr: node_filesystem_free_bytes{mountpoint=~"/hx.*"} / node_filesystem_size_bytes{mountpoint=~"/hx.*"} * 100 < 10
310+
for: 5m
311+
labels:
312+
severity: "warning"
313+
annotations:
314+
summary: "Endpoint {{ $labels.instance }} disk space below 10%"
315+
description: "{{ $labels.instance }} of job {{ $labels.job }} has been below limit for 5 minutes."
316+
```
317+
318+
## Alertmanager config
319+
320+
This is an example, assuming simple email and local postfix or equivalent setup - `/etc/alertmanager/alertmanager.yml`
321+
322+
```yaml
323+
global:
324+
smtp_from: alertmanager@perforce.com
325+
smtp_smarthost: localhost:25
326+
smtp_require_tls: false
327+
# Hello is the local machine name
328+
smtp_hello: p4hms
329+
330+
route:
331+
group_by: ['alertname']
332+
group_wait: 30s
333+
group_interval: 5m
334+
repeat_interval: 60m
335+
receiver: mail
336+
337+
receivers:
338+
- name: mail
339+
email_configs:
340+
- to: p4-group@perforce.com
341+
```

0 commit comments

Comments
 (0)