@@ -21,7 +21,7 @@ Two custom components:
2121* p4prometheus - This component.
2222* monitor_metrics.sh - [ SDP] ( https://swarm.workshop.perforce.com/projects/perforce-software-sdp ) compatible bash script to generate simple supplementary metrics - [ monitor_metrics.sh] ( https://swarm.workshop.perforce.com/files/guest/perforce_software/sdp/dev/Server/Unix/p4/common/site/bin/monitor_metrics.sh )
2323
24- Check out the [ Prometheus architecture] ( https://prometheus.io/assets/architecture.png ) - the custom components are "Prometheus targets".
24+ Check out the ! [ Prometheus architecture] ( https://prometheus.io/assets/architecture.png ) - the custom components are "Prometheus targets".
2525
2626# Grafana Dashboards
2727
@@ -85,7 +85,7 @@ Ensure the above has global read access (perforce user will write files, node_ex
8585
8686Create service file:
8787
88- ``` bash
88+ ``` ini
8989cat << EOF > /etc/systemd/system/node_exporter.service
9090[Unit]
9191Description =Node Exporter
@@ -156,7 +156,7 @@ As user `root`:
156156
157157Create service file:
158158
159- ``` bash
159+ ``` ini
160160cat << EOF > /etc/systemd/system/p4prometheus.service
161161[Unit]
162162Description =P4prometheus
@@ -189,3 +189,153 @@ Check that metrics are being written:
189189
190190 cat /hxlogs/metrics/p4_cmds.prom
191191
192+ # Alerting
193+
194+ Alerting is done via Alertmanager.
195+
196+ Setup is very similar to the above.
197+
198+ Sample ` /etc/systemd/system/alertmanager.service ` :
199+
200+ ``` ini
201+ [Unit]
202+ Description =Alertmanager
203+ Wants =network-online.target
204+ After =network-online.target
205+
206+ [Service]
207+ User =alertmanager
208+ Group =alertmanager
209+ Type =simple
210+ ExecStart =/usr/local/bin/alertmanager --config.file =/etc/alertmanager/alertmanager.yml --storage.path =/var/lib/alertmanager --log.level =debug
211+
212+ [Install]
213+ WantedBy =multi-user.target
214+ ```
215+
216+ * create alertmanager user
217+ * create /etc/alertmanager directory
218+
219+
220+ ## Prometheus config
221+
222+ ``` yaml
223+ global :
224+ scrape_interval : 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
225+ evaluation_interval : 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
226+ # scrape_timeout is set to the global default (10s).
227+
228+ # Alertmanager configuration
229+ alerting :
230+ alertmanagers :
231+ - static_configs :
232+ - targets :
233+ - localhost:9093
234+
235+ # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
236+ rule_files :
237+ - " perforce_rules.yml"
238+
239+ # A scrape configuration containing exactly one endpoint to scrape:
240+ # Here it's Prometheus itself.
241+ scrape_configs :
242+ - job_name : ' prometheus'
243+ static_configs :
244+ - targets : ['localhost:9090']
245+
246+ - job_name : ' node_exporter'
247+ static_configs :
248+ - targets : ['p4hms:9100', 'p4main:9100', 'p4_ha:9100']
249+
250+ ```
251+
252+ ## Alerting rules
253+
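254+ This is an example `perforce_rules.yml` (the file named under `rule_files` in the Prometheus config above), assuming simple email and local postfix or equivalent setup for delivering the alerts.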
255+
256+ ``` yaml
257+ groups :
258+ - name : alert.rules
259+ rules :
260+ - alert : NoLogs
261+ expr : 100 > rate(p4_prom_log_lines_read{sdpinst="1",serverid="master"}[1m])
262+ for : 1m
263+ labels :
264+ severity : " critical"
265+ annotations :
266+ summary : " Endpoint {{ $labels.instance }} too few log lines"
267+ description : " {{ $labels.instance }} of job {{ $labels.job }} has been below target for more than 1 minute."
268+ - alert : Replication Slow HA
269+ expr : p4_replica_curr_pos{instance="p4master:9100",job="node_exporter",sdpinst="1",servername="master"} - ignoring(serverid,servername) p4_replica_curr_pos{instance="p4master:9100",job="node_exporter",sdpinst="1",servername="p4d_ha_bos"} > 5e+7
270+ for : 10m
271+ labels :
272+ severity : " warning"
273+ annotations :
274+ summary : " Endpoint {{ $labels.instance }} replication warning"
275+ description : " {{ $labels.instance }} of job {{ $labels.job }} has been above target for more than 10 minutes."
276+ - alert : Replication Slow London
277+ expr : p4_replica_curr_pos{instance="p4master:9100",job="node_exporter",sdpinst="1",servername="master"} - ignoring(serverid,servername) p4_replica_curr_pos{instance="p4master:9100",job="node_exporter",sdpinst="1",servername="p4d_fr_lon"} > 5e+7
278+ for : 10m
279+ labels :
280+ severity : " warning"
281+ annotations :
282+ summary : " Endpoint {{ $labels.instance }} replication warning"
283+ description : " {{ $labels.instance }} of job {{ $labels.job }} has been above target for more than 10 minutes."
284+ - alert : Checkpoint slow
285+ expr : p4_sdp_checkpoint_duration{sdpinst="1",serverid="master"} > 50 * 60
286+ for : 5m
287+ labels :
288+ severity : " warning"
289+ annotations :
290+ summary : " Endpoint {{ $labels.instance }} checkpoint job duration longer than expected"
291+ description : " {{ $labels.instance }} of job {{ $labels.job }} has been above target for more than 5 minutes."
292+ - alert : Checkpoint not taken
293+ expr : time() - p4_sdp_checkpoint_log_time{sdpinst="1",serverid="master"} > 25 * 60 * 60
294+ for : 5m
295+ labels :
296+ severity : " warning"
297+ annotations :
298+ summary : " Endpoint {{ $labels.instance }} checkpoint not taken in 25 hours warning"
299+ description : " {{ $labels.instance }} of job {{ $labels.job }} has been above target for more than 5 minutes."
300+ - alert : P4D service not running
301+ expr : node_systemd_unit_state{state="active",name="p4d_1.service"} != 1
302+ for : 5m
303+ labels :
304+ severity : " warning"
305+ annotations :
306+ summary : " Endpoint {{ $labels.instance }} p4d service not running"
307+ description : " {{ $labels.instance }} of job {{ $labels.job }} has been down for 5 minutes."
308+ - alert : DiskspaceLow
309+ expr : node_filesystem_free_bytes{mountpoint=~"/hx.*"} / node_filesystem_size_bytes{mountpoint=~"/hx.*"} * 100 < 10
310+ for : 5m
311+ labels :
312+ severity : " warning"
313+ annotations :
314+ summary : " Endpoint {{ $labels.instance }} disk space below 10%"
315+ description : " {{ $labels.instance }} of job {{ $labels.job }} has been below limit for 5 minutes."
316+ ```
317+
318+ ## Alertmanager config
319+
320+ This is an example, assuming simple email and local postfix or equivalent setup - ` /etc/alertmanager/alertmanager.yml`
321+
322+ ``` yaml
323+ global:
324+ smtp_from: alertmanager@perforce.com
325+ smtp_smarthost: localhost:25
326+ smtp_require_tls: false
327+ # Hello is the local machine name
328+ smtp_hello: p4hms
329+
330+ route:
331+ group_by: ['alertname']
332+ group_wait: 30s
333+ group_interval: 5m
334+ repeat_interval: 60m
335+ receiver: mail
336+
337+ receivers:
338+ - name: mail
339+ email_configs:
340+ - to: p4-group@perforce.com
341+ ```
0 commit comments