You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: csp-mixin/alerts/gcp-alerts.yml
+144Lines changed: 144 additions & 0 deletions
Original file line number
Diff line number
Diff line change
@@ -14,3 +14,147 @@ groups:
14
14
summary: 'CPU utilization is too high.'
15
15
description: 'The VM {{ $labels.instance_name }} is under heavy load and may become unresponsive.'
16
16
dashboard_uid: 'f115fe73641347c43415535d77e2dc0f'
17
+
18
+
- name: gcp
19
+
rules:
20
+
- alert: GcpCEHighIOLatency
21
+
expr: |
22
+
avg by (job,project_id,instance_id)(stackdriver_gce_instance_compute_googleapis_com_instance_disk_average_io_latency{job=~".+",project_id=~".+",instance_id=~".+"}) > 5000
23
+
for: 5m
24
+
keep_firing_for: 10m
25
+
labels:
26
+
severity: critical
27
+
service: 'Compute Engine'
28
+
namespace: cloud-provider-gcp
29
+
annotations:
30
+
summary: 'IO latency is too high.'
31
+
description: 'Check for I/O bottlenecks and upgrade to SSD if necessary.'
32
+
dashboard_uid: 'f115fe73641347c43415535d77e2dc0f'
33
+
34
+
- name: gcp
35
+
rules:
36
+
- alert: GcpCloudSQLHighCpu
37
+
expr: |
38
+
100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_cpu_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 80
39
+
for: 5m
40
+
keep_firing_for: 10m
41
+
labels:
42
+
severity: critical
43
+
service: 'Cloud SQL'
44
+
namespace: cloud-provider-gcp
45
+
annotations:
46
+
summary: 'CPU utilization is too high.'
47
+
description: 'Check for high CPU queries and optimize them, or scale up the instance if sustained high usage.'
48
+
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
49
+
50
+
- name: gcp
51
+
rules:
52
+
- alert: GcpCloudSQLMemoryUsage
53
+
expr: |
54
+
100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_memory_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 85
55
+
for: 5m
56
+
keep_firing_for: 10m
57
+
labels:
58
+
severity: critical
59
+
service: 'Cloud SQL'
60
+
namespace: cloud-provider-gcp
61
+
annotations:
62
+
summary: 'Memory utilization is too high.'
63
+
description: 'Review high-memory queries or add more memory to the instance.'
64
+
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
65
+
66
+
- name: gcp
67
+
rules:
68
+
- alert: GcpCloudSQLDiskUsage
69
+
expr: |
70
+
100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_disk_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 85
71
+
for: 5m
72
+
keep_firing_for: 10m
73
+
labels:
74
+
severity: critical
75
+
service: 'Cloud SQL'
76
+
namespace: cloud-provider-gcp
77
+
annotations:
78
+
summary: 'Disk utilization is too high.'
79
+
description: 'Delete or archive unused data, or increase disk size.'
80
+
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
81
+
82
+
- name: gcp
83
+
rules:
84
+
- alert: GcpCloudSQLActiveConnections
85
+
expr: |
86
+
avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_threads{thread_kind="THREADS_CONNECTED", job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 0.9 * avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_max_connections{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"})
87
+
for: 5m
88
+
keep_firing_for: 10m
89
+
labels:
90
+
severity: critical
91
+
service: 'Cloud SQL'
92
+
namespace: cloud-provider-gcp
93
+
annotations:
94
+
summary: 'Too many active connections.'
95
+
description: 'Investigate connection pooling settings and connection management in your application.'
96
+
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
97
+
98
+
- name: gcp
99
+
rules:
100
+
- alert: GcpCloudSQLAbortedConnections
101
+
expr: |
102
+
sum by(job, instance, project_id)(rate(stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_aborted_connects_count[5m])) > 5
103
+
for: 5m
104
+
keep_firing_for: 10m
105
+
labels:
106
+
severity: critical
107
+
service: 'Cloud SQL'
108
+
namespace: cloud-provider-gcp
109
+
annotations:
110
+
summary: 'More than 5 aborted connections per second, averaged over 5 minutes.'
111
+
description: 'Verify credentials and network settings; check for firewall rules blocking connections.'
112
+
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
113
+
114
+
- name: gcp
115
+
rules:
116
+
- alert: GcpCloudSQLLagSecondsBehindMaster
117
+
expr: |
118
+
avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_replication_seconds_behind_master) > 5
119
+
for: 5m
120
+
keep_firing_for: 10m
121
+
labels:
122
+
severity: warning
123
+
service: 'Cloud SQL'
124
+
namespace: cloud-provider-gcp
125
+
annotations:
126
+
summary: 'More than 5 seconds lag between read replica and primary.'
127
+
description: 'Check network latency between primary and replica; adjust configurations to optimize replication.'
128
+
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
129
+
130
+
- name: gcp
131
+
rules:
132
+
- alert: GcpPubSubNumUndeliveredMessages
133
+
expr: |
134
+
avg by (job,project_id,instance)(stackdriver_pubsub_subscription_pubsub_googleapis_com_subscription_num_undelivered_messages{job=~".+",project_id=~".+",instance=~".+"}) > 1000
135
+
for: 5m
136
+
keep_firing_for: 10m
137
+
labels:
138
+
severity: warning
139
+
service: 'Pub/Sub'
140
+
namespace: cloud-provider-gcp
141
+
annotations:
142
+
summary: 'More than 1000 undelivered messages.'
143
+
description: 'Scale up subscribers or adjust message processing capacity.'
144
+
dashboard_uid: '2abad1eb5e4873b95e9176e7ef10a30c'
145
+
146
+
- name: gcp
147
+
rules:
148
+
- alert: GcpPubSubUnackedMessageAge
149
+
expr: |
150
+
avg by (job,project_id,instance)(stackdriver_pubsub_subscription_pubsub_googleapis_com_subscription_oldest_unacked_message_age{job=~".+",project_id=~".+",instance=~".+"}) > 60
151
+
for: 5m
152
+
keep_firing_for: 10m
153
+
labels:
154
+
severity: warning
155
+
service: 'Pub/Sub'
156
+
namespace: cloud-provider-gcp
157
+
annotations:
158
+
summary: 'Oldest unacknowledged message is more than 60 seconds old.'
159
+
description: 'Investigate and speed up message processing; ensure consumers can handle the load.'
0 commit comments