Skip to content

Commit c4d3d3c

Browse files
committed
Updated alerts that may have issues with their thresholds
1 parent f17ee7f commit c4d3d3c

File tree

5 files changed

+333
-26
lines changed

5 files changed

+333
-26
lines changed

samples/alerts/cas/cas-thread-count-high.yaml

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,77 @@ groups:
1111
CAS thread count is higher than 400. May indicate overloaded CAS
1212
server.
1313
summary: CAS is using more than 400 threads.
14-
condition: A
14+
condition: C
1515
data:
1616
- datasourceUid: prometheus
1717
model:
18-
expr: cas_thread_count > 400
18+
expr: cas_thread_count
1919
instant: true
20+
refId: A
2021
refId: A
2122
relativeTimeRange:
2223
from: 300
2324
to: 0
25+
- datasourceUid: __expr__
26+
model:
27+
conditions:
28+
- evaluator:
29+
params: []
30+
type: gt
31+
operator:
32+
type: and
33+
query:
34+
params:
35+
- B
36+
reducer:
37+
params: []
38+
type: last
39+
type: query
40+
datasource:
41+
type: __expr__
42+
uid: __expr__
43+
expression: A
44+
intervalMs: 1000
45+
maxDataPoints: 43200
46+
reducer: last
47+
refId: B
48+
type: reduce
49+
refId: B
50+
relativeTimeRange:
51+
from: 300
52+
to: 0
53+
- datasourceUid: __expr__
54+
model:
55+
conditions:
56+
- evaluator:
57+
params:
58+
- 400
59+
type: gt
60+
operator:
61+
type: and
62+
query:
63+
params:
64+
- C
65+
reducer:
66+
params: []
67+
type: last
68+
type: query
69+
datasource:
70+
type: __expr__
71+
uid: __expr__
72+
expression: B
73+
intervalMs: 1000
74+
maxDataPoints: 43200
75+
refId: C
76+
type: threshold
77+
refId: C
78+
relativeTimeRange:
79+
from: 300
80+
to: 0
81+
execErrState: Error
2482
for: 5m
83+
isPaused: false
2584
labels:
2685
severity: warning
86+
noDataState: NoData
2787
uid: cas_thread_count
Lines changed: 68 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,87 @@
11
apiVersion: 1
22
groups:
33
- interval: 5m
4-
name: SAS Viya Alerts
54
folder: Database Alerts
5+
name: SAS Viya Alerts
66
orgId: 1
77
rules:
88
- title: PostgreSQL Connection Utilization High
99
annotations:
10-
description: PostgreSQL database connection usage is above 85% of max connections.
11-
summary: Database is nearing connection limit.
12-
condition: A
10+
description: PostgreSQL connection utilization is high (>85%). May exhaust available connections.
11+
summary: PostgreSQL connection utilization is high.
12+
condition: C
1313
data:
1414
- datasourceUid: prometheus
1515
model:
16-
expr: (pg_stat_activity_count / pg_settings_max_connections) * 100 > 85
16+
expr: pg_stat_activity_count / pg_settings_max_connections * 100
1717
instant: true
18+
intervalMs: 1000
19+
maxDataPoints: 43200
20+
refId: A
1821
refId: A
1922
relativeTimeRange:
2023
from: 300
2124
to: 0
25+
- datasourceUid: __expr__
26+
model:
27+
conditions:
28+
- evaluator:
29+
params: []
30+
type: gt
31+
operator:
32+
type: and
33+
query:
34+
params:
35+
- B
36+
reducer:
37+
params: []
38+
type: last
39+
type: query
40+
datasource:
41+
type: __expr__
42+
uid: __expr__
43+
expression: A
44+
intervalMs: 1000
45+
maxDataPoints: 43200
46+
reducer: last
47+
refId: B
48+
type: reduce
49+
refId: B
50+
relativeTimeRange:
51+
from: 300
52+
to: 0
53+
- datasourceUid: __expr__
54+
model:
55+
conditions:
56+
- evaluator:
57+
params:
58+
- 85
59+
type: gt
60+
operator:
61+
type: and
62+
query:
63+
params:
64+
- C
65+
reducer:
66+
params: []
67+
type: last
68+
type: query
69+
datasource:
70+
type: __expr__
71+
uid: __expr__
72+
expression: B
73+
intervalMs: 1000
74+
maxDataPoints: 43200
75+
refId: C
76+
type: threshold
77+
refId: C
78+
relativeTimeRange:
79+
from: 300
80+
to: 0
81+
execErrState: Error
2282
for: 5m
83+
isPaused: false
2384
labels:
2485
severity: warning
25-
uid: postgres_connection_utilization
86+
noDataState: NoData
87+
uid: postgresql_connection_utilization

samples/alerts/other/nfs-share-high-usage.yaml

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,82 @@ groups:
1111
summary:
1212
NFS share > 85% full. Typically, it is due to users filling their home
1313
directory or backups.
14-
condition: A
14+
condition: C
1515
data:
1616
- datasourceUid: prometheus
1717
model:
18-
expr:
19-
"((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=\"cas-default-data\"\
20-
}\n - kubelet_volume_stats_available_bytes{persistentvolumeclaim=\"cas-default-data\"\
21-
})\n / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=\"cas-default-data\"\
22-
}) * 100 > 85"
18+
expr: |
19+
(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="cas-default-data"}
20+
- kubelet_volume_stats_available_bytes{persistentvolumeclaim="cas-default-data"})
21+
/ kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="cas-default-data"} * 100
2322
instant: true
23+
intervalMs: 1000
24+
maxDataPoints: 43200
25+
refId: A
2426
refId: A
2527
relativeTimeRange:
2628
from: 21600
2729
to: 0
30+
- datasourceUid: __expr__
31+
model:
32+
conditions:
33+
- evaluator:
34+
params: []
35+
type: gt
36+
operator:
37+
type: and
38+
query:
39+
params:
40+
- B
41+
reducer:
42+
params: []
43+
type: last
44+
type: query
45+
datasource:
46+
type: __expr__
47+
uid: __expr__
48+
expression: A
49+
intervalMs: 1000
50+
maxDataPoints: 43200
51+
reducer: last
52+
refId: B
53+
type: reduce
54+
refId: B
55+
relativeTimeRange:
56+
from: 21600
57+
to: 0
58+
- datasourceUid: __expr__
59+
model:
60+
conditions:
61+
- evaluator:
62+
params:
63+
- 85
64+
type: gt
65+
operator:
66+
type: and
67+
query:
68+
params:
69+
- C
70+
reducer:
71+
params: []
72+
type: last
73+
type: query
74+
datasource:
75+
type: __expr__
76+
uid: __expr__
77+
expression: B
78+
intervalMs: 1000
79+
maxDataPoints: 43200
80+
refId: C
81+
type: threshold
82+
refId: C
83+
relativeTimeRange:
84+
from: 21600
85+
to: 0
86+
execErrState: Error
2887
for: 5m
88+
isPaused: false
2989
labels:
3090
severity: warning
91+
noDataState: NoData
3192
uid: nfs_share_usage

samples/alerts/platform/high-viya-api-latency.yaml

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,79 @@ groups:
1111
Viya service API response latency is high (95th percentile over
1212
1s).
1313
summary: Degraded performance on Viya APIs.
14-
condition: A
14+
condition: C
1515
data:
1616
- datasourceUid: prometheus
1717
model:
18-
expr:
19-
histogram_quantile(0.95, sum(rate(http_server_requests_duration_seconds_bucket{job=~"sas-.*"}[5m]))
20-
by (le)) > 1
18+
expr: histogram_quantile(0.95, sum(rate(http_server_requests_duration_seconds_bucket{job=~"sas-.*"}[5m])) by (le))
2119
instant: true
20+
intervalMs: 1000
21+
maxDataPoints: 43200
22+
refId: A
2223
refId: A
2324
relativeTimeRange:
2425
from: 300
2526
to: 0
27+
- datasourceUid: __expr__
28+
model:
29+
conditions:
30+
- evaluator:
31+
params: []
32+
type: gt
33+
operator:
34+
type: and
35+
query:
36+
params:
37+
- B
38+
reducer:
39+
params: []
40+
type: last
41+
type: query
42+
datasource:
43+
type: __expr__
44+
uid: __expr__
45+
expression: A
46+
intervalMs: 1000
47+
maxDataPoints: 43200
48+
reducer: last
49+
refId: B
50+
type: reduce
51+
refId: B
52+
relativeTimeRange:
53+
from: 300
54+
to: 0
55+
- datasourceUid: __expr__
56+
model:
57+
conditions:
58+
- evaluator:
59+
params:
60+
- 1
61+
type: gt
62+
operator:
63+
type: and
64+
query:
65+
params:
66+
- C
67+
reducer:
68+
params: []
69+
type: last
70+
type: query
71+
datasource:
72+
type: __expr__
73+
uid: __expr__
74+
expression: B
75+
intervalMs: 1000
76+
maxDataPoints: 43200
77+
refId: C
78+
type: threshold
79+
refId: C
80+
relativeTimeRange:
81+
from: 300
82+
to: 0
83+
execErrState: Error
2684
for: 5m
85+
isPaused: false
2786
labels:
2887
severity: warning
88+
noDataState: NoData
2989
uid: viya_api_latency

0 commit comments

Comments
 (0)