
Commit c3ba755

prometheus for alerts tests (#104)
* it's alive
* lint
1 parent 6da969b commit c3ba755

7 files changed: +324 -1 lines changed

client/prometheus.go

Lines changed: 9 additions & 0 deletions
@@ -97,3 +97,12 @@ func (p *Prometheus) ResourcesSummary() (float64, float64, error) {
     }
     return cpu, mem, nil
 }
+
+// GetAlerts returns all firing alerts
+func (p *Prometheus) GetAlerts() (v1.AlertsResult, error) {
+    alerts, err := p.API.Alerts(context.Background())
+    if err != nil {
+        return v1.AlertsResult{}, err
+    }
+    return alerts, nil
+}
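For orientation, a test could consume the new method roughly like this. This is a minimal sketch, not part of the commit: prom stands for an already-connected Prometheus client from this package (hypothetical variable), and v1 / fmt are the import from client/prometheus.go plus the standard library.

    // List the alerts that are currently firing (sketch).
    result, err := prom.GetAlerts()
    if err != nil {
        return err
    }
    for _, alert := range result.Alerts {
        // AlertStateFiring is defined in client_golang's api/prometheus/v1 package.
        if alert.State == v1.AlertStateFiring {
            fmt.Println("firing:", alert.Labels["alertname"])
        }
    }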

environment/environment_templates.go

Lines changed: 41 additions & 0 deletions
@@ -6,6 +6,7 @@ import (
     "github.com/pkg/errors"
     "github.com/rs/zerolog/log"
     "helm.sh/helm/v3/pkg/chartutil"
+    "io/ioutil"
     "path/filepath"
     "strconv"
     "strings"
@@ -174,6 +175,14 @@ func NewOTPEManifest() *K8sManifest {
         id:             "otpe",
         DeploymentFile: filepath.Join(tools.ProjectRoot, "/environment/templates/otpe-deployment.yml"),
         ServiceFile:    filepath.Join(tools.ProjectRoot, "/environment/templates/otpe-service.yml"),
+        SetValuesFunc: func(manifest *K8sManifest) error {
+            manifest.values["clusterURL"] = fmt.Sprintf(
+                "%s:%d",
+                manifest.Service.Spec.ClusterIP,
+                manifest.Service.Spec.Ports[0].Port,
+            )
+            return nil
+        },
     }
 }
 
@@ -202,6 +211,25 @@ func NewMockserverHelmChart() *HelmChart {
     return chart
 }
 
+// NewPrometheusManifest creates new k8s manifest for prometheus
+func NewPrometheusManifest() *K8sManifest {
+    rulesFilePath := filepath.Join(tools.ProjectRoot, "/environment/templates/prometheus/rules/ocr.rules.yml")
+    content, err := ioutil.ReadFile(rulesFilePath)
+    if err != nil {
+        return nil
+    }
+    return &K8sManifest{
+        id:             "prometheus",
+        DeploymentFile: filepath.Join(tools.ProjectRoot, "/environment/templates/prometheus/prometheus-deployment.yml"),
+        ServiceFile:    filepath.Join(tools.ProjectRoot, "/environment/templates/prometheus/prometheus-service.yml"),
+        ConfigMapFile:  filepath.Join(tools.ProjectRoot, "/environment/templates/prometheus/prometheus-config-map.yml"),
+
+        values: map[string]interface{}{
+            "ocrRulesYml": string(content),
+        },
+    }
+}
+
 // NewHardhatManifest is the k8s manifest that when used will deploy hardhat to an environment
 func NewHardhatManifest() *K8sManifest {
     return &K8sManifest{
@@ -502,3 +530,16 @@ func OtpeGroup() K8sEnvSpecInit {
         return "envName", specs
     }
 }
+
+// PrometheusGroup contains manifests for prometheus
+func PrometheusGroup() K8sEnvSpecInit {
+    return func(config *config.NetworkConfig) (string, K8sEnvSpecs) {
+        var specs K8sEnvSpecs
+        prometheusDependencyGroup := &K8sManifestGroup{
+            id:        "PrometheusDependencyGroup",
+            manifests: []K8sEnvResource{NewPrometheusManifest()},
+        }
+        specs = append(specs, prometheusDependencyGroup)
+        return "", specs
+    }
+}
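One ordering detail worth noting: the Prometheus config map below references .Values.OTPEDependencyGroup.otpe.clusterURL, which only exists after the OTPE manifest's SetValuesFunc has run, so OtpeGroup has to be deployed before PrometheusGroup. A minimal sketch of the intended usage (env is a hypothetical stand-in for a deployed environment, mirroring the alerts test at the end of this commit):

    // Deploy OTPE first so its clusterURL value is available to the
    // Prometheus config-map template, then deploy Prometheus itself.
    if err := env.DeploySpecs(environment.OtpeGroup()); err != nil {
        return err
    }
    if err := env.DeploySpecs(environment.PrometheusGroup()); err != nil {
        return err
    }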
environment/templates/prometheus/prometheus-config-map.yml

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config-map
+  objectmeta:
+    name: prometheus-config-map
+data:
+  prometheus.yml: |
+    global:
+      scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
+      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
+    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+    rule_files:
+      - "ocr.rules.yml"
+    scrape_configs:
+      - job_name: 'prometheus'
+        # metrics_path defaults to '/metrics'
+        # scheme defaults to 'http'.
+        static_configs:
+          - targets: ['localhost:9090']
+      - job_name: 'otpe'
+        static_configs:
+          - targets: ['{{ .Values.OTPEDependencyGroup.otpe.clusterURL }}']
+
+  ocr.rules.yml: |
+    {{ .Values.PrometheusDependencyGroup.prometheus.ocrRulesYml }}
environment/templates/prometheus/prometheus-deployment.yml

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+metadata:
+  name: prometheus
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    objectmeta:
+      labels:
+        app: prometheus
+    spec:
+      containers:
+        - name: prometheus
+          image: prom/prometheus
+          args:
+            - "--config.file=/etc/prometheus/prometheus.yml"
+            - "--storage.tsdb.path=/prometheus/"
+          ports:
+            - containerPort: 9090
+          volumeMounts:
+            - name: prometheus-config-volume
+              mountPath: /etc/prometheus/
+            - name: prometheus-storage-volume
+              mountPath: /prometheus/
+      volumes:
+        - name: prometheus-config-volume
+          configMap:
+            defaultMode: 420
+            name: prometheus-config-map
+        - name: prometheus-storage-volume
+          emptyDir: { }
+
environment/templates/prometheus/prometheus-service.yml

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+metadata:
+  name: prometheus
+spec:
+  ports:
+    - name: "9090"
+      port: 9090
+      targetPort: 9090
+  selector:
+    app: prometheus
+  type: ClusterIP
environment/templates/prometheus/rules/ocr.rules.yml

Lines changed: 201 additions & 0 deletions
@@ -0,0 +1,201 @@
+groups:
+  - name: OTPE
+    interval: 15s
+    rules:
+      - alert: Config Digest Duplicate
+        expr: otpe_config_digest_duplicates_total != 0
+        labels:
+          severity: critical
+          team: incident-response
+        annotations:
+          summary: Something really unexpected happened. Let Lorenz know.
+
+  - name: Telemetry Ingestion
+    interval: 15s
+    rules:
+      - record: sum:ocr_telemetry_ingested_total
+        expr: sum without (contract, oracle) (ocr_telemetry_ingested_total)
+      - record: bool:telemetry_down
+        expr: (rate (sum:ocr_telemetry_ingested_total[1m])) == bool 0
+      - alert: Telemetry Down (infra)
+        expr: bool:telemetry_down == 1
+        for: 5m
+        labels:
+          severity: critical
+          team: infra
+        annotations:
+          summary: OTPE is not receiving any telemetry at all.
+
+      - alert: Telemetry Down (o11y)
+        expr: bool:telemetry_down == 1
+        for: 5m
+        labels:
+          severity: critical
+          team: monitoring
+        annotations:
+          summary: OTPE is not receiving any telemetry at all.
+
+  - name: Contract Configuration
+    interval: 15s
+    rules:
+      - record: bool:contract_oracle_active
+        # use max_over_time to be resistant to exporter restarts/glitches
+        expr: max_over_time(ocr_contract_oracle_active[1m]) > bool 0
+      - record: bool:contract_active
+        expr: sum without(oracle) (bool:contract_oracle_active) > bool 0
+      - record: bool:oracle_active
+        expr: sum without(contract) (bool:contract_oracle_active) > bool 0
+
+  - name: Oracle & Feed
+    interval: 15s
+    rules:
+      - record: bool:oracle_feed_telemetry_down
+        expr: (rate(ocr_telemetry_ingested_total[2m]) == bool 0) * bool:contract_oracle_active
+      - record: bool:oracle_feed_blind
+        # TODO: It would be better to make this based on a rate of a total
+        expr: (max_over_time(ocr_telemetry_message_report_req_observation_included[2m]) == bool 0) * bool:contract_oracle_active
+
+  - name: Oracle
+    interval: 15s
+    rules:
+      - record: bool:oracle_blind
+        expr: min without(contract) (bool:oracle_feed_blind) * bool:oracle_active
+      - record: bool:oracle_blind_except_telemetry_down
+        expr: bool:oracle_blind * ignoring (oracle) group_left() (1 - bool:telemetry_down)
+
+      # Oracle Blind EXCEPT Telemetry Down
+      - alert: No observations from an OCR oracle
+        expr: bool:oracle_blind_except_telemetry_down == 1
+        for: 3m
+        labels:
+          severity: critical
+          team: incident-response
+        annotations:
+          summary: Oracle has made no observations {{ $labels.oracle }}. Perhaps the oracle is down or having data source issues? Reach out to the node op.
+      - record: bool:oracle_telemetry_down
+        expr: min without(contract) (bool:oracle_feed_telemetry_down) * bool:oracle_active
+      - record: bool:oracle_telemetry_down_except_telemetry_down
+        expr: bool:oracle_telemetry_down * ignoring (oracle) group_left() (1 - bool:telemetry_down)
+
+      # Oracle Telemetry Down EXCEPT Telemetry Down
+      - alert: No telemetry from an OCR oracle
+        expr: bool:oracle_telemetry_down_except_telemetry_down == 1
+        for: 20m
+        labels:
+          severity: critical
+          team: incident-response
+        annotations:
+          summary: Not receiving any telemetry for {{ $labels.oracle }}. Perhaps the oracle is down or having issues with the telemetry transport? Reach out to the node op.
+
+  - name: Feed
+    interval: 15s
+    rules:
+      - record: bool:feed_telemetry_down
+        expr: min without(oracle) (bool:oracle_feed_telemetry_down) * bool:contract_active
+      - record: bool:feed_telemetry_down_except_telemetry_down
+        expr: bool:feed_telemetry_down * ignoring (contract) group_left() (1 - bool:telemetry_down)
+
+      # Feed Telemetry Down EXCEPT Telemetry Down
+      - alert: No telemetry on an OCR feed
+        expr: bool:feed_telemetry_down_except_telemetry_down == 1
+        for: 4m
+        labels:
+          severity: critical
+          team: incident-response
+        annotations:
+          summary: Not receiving any telemetry for {{ $labels.contract }}. Are all nodes down or not sending telemetry?
+      - record: bool:feed_stalled
+        expr: (rate(ocr_telemetry_feed_agreed_epoch[5m]) == bool 0) * bool:contract_active
+      - record: bool:feed_stalled_except_telemetry_down
+        expr: bool:feed_stalled * (1 - bool:feed_telemetry_down)
+
+      # Alert if no new round seen after 90 seconds, unless feed fails to report or OTPE is not receving any telememtry at all
+      - alert: Rounds have stopped progressing on an OCR feed
+        expr:
+          (
+            (sum(rate(ocr_telemetry_epoch_round[10m])) by (contract, job, cluster, instance) < 1./90 == bool 0)
+            * bool:contract_active
+            * (1-bool:feed_telemetry_down)
+          ) == 1
+        labels:
+          severity: critical
+          team: incident-response
+        annotations:
+          summary: New rounds are not being created on feed {{ $labels.contract }} at the expected rate. Maybe the feed has stalled. Reach out to node operators to corroborate this. If they are not seeing any runs, escalate and consider failing over to FM if you cannot resolve this quickly.
+
+      # Feed Stalled EXCEPT Feed Telemetry Down
+      - alert: Epochs have stopped progressing on an OCR feed
+        expr: bool:feed_stalled_except_telemetry_down == 1
+        for: 5m
+        labels:
+          severity: critical
+          team: incident-response
+        annotations:
+          summary: New epochs are not being created on feed {{ $labels.contract }} at the expected rate. Maybe the feed has stalled. Reach out to node operators to corroborate this. If they are not seeing any runs, escalate and consider failing over to FM if you cannot resolve this quickly.
+      # This is not particularly actionable, so commenting out for now. We can think about improved versions later.
+      # - record: bool:feed_fast_epochs
+      #   expr: (rate(ocr_telemetry_feed_agreed_epoch[6m]) > bool 3/(ocr_contract_config_r_max * ocr_contract_config_delta_round_seconds)) * bool:contract_active
+      # - alert: Feed Fast Epochs
+      #   expr: bool:feed_fast_epochs == 1
+      #   for: 3m
+      #   labels:
+      #     severity: critical
+      #     slack_channel: ocr-telemetry-beta-group
+      #   annotations:
+      #     summary: Feed is moving through epochs much faster than expected {{ $labels.contract }}. Perhaps a few nodes are down?
+      - record: bool:feed_close_to_reporting_failure
+        expr: (max_over_time(ocr_telemetry_feed_message_report_req_size[2m]) < bool 2*ocr_contract_config_f+1 + 2) * bool:contract_active
+      - record: bool:feed_close_to_reporting_failure_except_feed_telemetry_down
+        expr: bool:feed_close_to_reporting_failure * (1 - bool:feed_telemetry_down)
+
+      # Feed Close To Reporting Failure EXCEPT Feed Telemetry Down
+      - alert: OCR feed close to reporting failure
+        expr: bool:feed_close_to_reporting_failure_except_feed_telemetry_down == 1
+        for: 3m
+        labels:
+          severity: critical
+          team: incident-response
+        annotations:
+          summary: Feed is within two oracles of reporting failure {{ $labels.contract }}. Reach out to node ops that are having issues asap and consider replacing them.
+      - record: bool:feed_reporting_failure
+        expr: (rate(ocr_telemetry_feed_message_report_req_total[4m]) == bool 0) * bool:contract_active
+      - record: bool:feed_reporting_failure_except_feed_telemetry_down
+        expr: bool:feed_reporting_failure * (1 - bool:feed_telemetry_down)
+
+      # Feed Reporting Failure EXCEPT Feed Telemetry Down
+      - alert: OCR feed reporting failure
+        expr: bool:feed_reporting_failure_except_feed_telemetry_down == 1
+        for: 4m
+        labels:
+          severity: critical
+          team: incident-response
+        annotations:
+          summary: Feed is experiencing reporting failure {{ $labels.contract }}! Reach out to node ops to confirm and consider failing over to FluxMonitor.
+
+  - name: Oracle & Feed Except Oracle
+    interval: 15s
+    rules:
+      # Currently not useful due to unreliable telemetry ingestion
+      # - record: bool:oracle_feed_telemetry_down_except_oracle_telemetry_down_except_feed_telemetry_down
+      #   expr: (bool:oracle_feed_telemetry_down * ignoring (contract) group_left() (1 - bool:oracle_telemetry_down)) * ignoring (oracle) group_left() (1 - bool:feed_telemetry_down)
+      # - alert: Oracle & Feed Telemetry Down EXCEPT Oracle Telemetry Down EXCEPT Feed Telemetry Down
+      #   expr: bool:oracle_feed_telemetry_down_except_oracle_telemetry_down_except_feed_telemetry_down == 1
+      #   for: 30m
+      #   labels:
+      #     severity: warning
+      #     slack_channel: ocr-telemetry-beta-group
+      #   annotations:
+      #     summary: Not receiving any telemetry from oracle {{ $labels.oracle }} on feed {{ $labels.contract }}. Reach out to the node op.
+      - record: bool:oracle_feed_blind_except_oracle_blind_except_feed_reporting_failure_except_feed_telemetry_down
+        expr: (bool:oracle_feed_blind * ignoring (contract) group_left() (1 - bool:oracle_blind)) * ignoring (oracle) group_left() (1 - bool:feed_reporting_failure) * ignoring (oracle) group_left() (1 - bool:feed_telemetry_down)
+
+      # Oracle & Feed Blind EXCEPT Oracle Blind EXCEPT Feed Reporting Failure EXCEPT Feed Telemetry Down
+      - alert: Oracle not making observations on an OCR feed
+        expr: bool:oracle_feed_blind_except_oracle_blind_except_feed_reporting_failure_except_feed_telemetry_down == 1
+        for: 10m
+        labels:
+          severity: warning
+          team: incident-response
+        annotations:
+          summary: Oracle {{ $labels.oracle }} is able to make observations, yet I'm not receiving any observations from it on feed {{ $labels.contract }}. Perhaps a data source issue? Reach out to the node op.
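If a test needs to confirm that Prometheus actually loaded these rule groups, a helper in the same style as GetAlerts could be added to client/prometheus.go. This is a hypothetical sketch, not part of the commit; it relies on the Rules endpoint of client_golang's api/prometheus/v1 API that the client already wraps:

    // GetRuleGroupNames returns the names of all rule groups currently
    // loaded by Prometheus (sketch).
    func (p *Prometheus) GetRuleGroupNames() ([]string, error) {
        result, err := p.API.Rules(context.Background())
        if err != nil {
            return nil, err
        }
        var names []string
        for _, group := range result.Groups {
            names = append(names, group.Name)
        }
        return names, nil
    }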

suite/alerts/alerts_test.go

Lines changed: 4 additions & 1 deletion
@@ -9,7 +9,7 @@ import (
 
 var _ = Describe("Alerts suite", func() {
     Describe("Alerts", func() {
-        It("Deploys the alerts stack up to OTPE", func() {
+        It("Deploys the alerts stack up to Prometheus", func() {
             i := &testcommon.OCRSetupInputs{}
             testcommon.DeployOCRForEnv(i, "basic-chainlink", environment.NewChainlinkClusterForAlertsTesting(5))
             testcommon.SetupOCRTest(i)
@@ -18,6 +18,9 @@ var _ = Describe("Alerts suite", func() {
 
             err := i.SuiteSetup.Env.DeploySpecs(environment.OtpeGroup())
             Expect(err).ShouldNot(HaveOccurred())
+
+            err = i.SuiteSetup.Env.DeploySpecs(environment.PrometheusGroup())
+            Expect(err).ShouldNot(HaveOccurred())
         })
     })
 })
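With the Prometheus group deployed, a natural follow-up (not part of this commit, shown only as a hedged sketch) would be to poll GetAlerts until an expected alert fires, for example with Gomega's Eventually; prom is a hypothetical client.Prometheus connected to the deployed service, and the timeout/interval values are illustrative:

    Eventually(func() bool {
        alerts, err := prom.GetAlerts()
        if err != nil {
            return false
        }
        for _, a := range alerts.Alerts {
            if a.State == v1.AlertStateFiring {
                return true
            }
        }
        return false
    }, "5m", "15s").Should(BeTrue())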
