Skip to content

Commit c511b7b

Browse files
committed
csp-mixin(gcp): Add alerts for ce, cloud sql, pubsub
1 parent 51cad60 commit c511b7b

File tree

1 file changed

+144
-0
lines changed

1 file changed

+144
-0
lines changed

csp-mixin/alerts/gcp-alerts.yml

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,147 @@ groups:
1414
summary: 'CPU utilization is too high.'
1515
description: 'The VM {{ $labels.instance_name }} is under heavy load and may become unresponsive.'
1616
dashboard_uid: 'f115fe73641347c43415535d77e2dc0f'
17+
18+
# Compute Engine disk I/O latency alert.
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-ce-high-io-latency
  rules:
    - alert: GcpCEHighIOLatency
      # Average disk I/O latency per (job, project_id, instance_id) above 5000.
      # The `=~".+"` matchers drop series where any grouping label is empty.
      expr: |
        avg by (job,project_id,instance_id)(stackdriver_gce_instance_compute_googleapis_com_instance_disk_average_io_latency{job=~".+",project_id=~".+",instance_id=~".+"}) > 5000
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: critical
        service: 'Compute Engine'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'IO latency is too high.'
        description: 'Check for I/O bottlenecks and upgrade to SSD if necessary.'
        dashboard_uid: 'f115fe73641347c43415535d77e2dc0f'
33+
34+
# Cloud SQL CPU utilization alert.
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-cloudsql-high-cpu
  rules:
    - alert: GcpCloudSQLHighCpu
      # Average CPU utilization per (job, project_id, instance, database_id),
      # scaled by 100 and compared against 80 (presumably the raw metric is a
      # 0-1 fraction, making this an 80% threshold; confirm against the exporter).
      expr: |
        100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_cpu_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 80
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: critical
        service: 'Cloud SQL'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'CPU utilization is too high.'
        description: 'Check for high CPU queries and optimize them, or scale up the instance if sustained high usage.'
        dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
49+
50+
# Cloud SQL memory utilization alert.
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-cloudsql-memory-usage
  rules:
    - alert: GcpCloudSQLMemoryUsage
      # Average memory utilization per (job, project_id, instance, database_id),
      # scaled by 100 and compared against 85 (presumably the raw metric is a
      # 0-1 fraction, making this an 85% threshold; confirm against the exporter).
      expr: |
        100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_memory_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 85
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: critical
        service: 'Cloud SQL'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'Memory utilization is too high.'
        description: 'Review high-memory queries or add more memory to the instance.'
        dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
65+
66+
# Cloud SQL disk utilization alert.
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-cloudsql-disk-usage
  rules:
    - alert: GcpCloudSQLDiskUsage
      # Average disk utilization per (job, project_id, instance, database_id),
      # scaled by 100 and compared against 85 (presumably the raw metric is a
      # 0-1 fraction, making this an 85% threshold; confirm against the exporter).
      expr: |
        100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_disk_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 85
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: critical
        service: 'Cloud SQL'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'Disk utilization is too high.'
        description: 'Delete or archive unused data, or increase disk size.'
        dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
81+
82+
# Cloud SQL connection saturation alert (uses the MySQL-specific thread and
# max-connections metrics, so it only evaluates for series exposing them).
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-cloudsql-active-connections
  rules:
    - alert: GcpCloudSQLActiveConnections
      # Connected threads exceed 90% of max_connections. Both sides are
      # aggregated by the same label set so the comparison matches one-to-one.
      expr: |
        avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_threads{thread_kind="THREADS_CONNECTED", job=~".+",project_id=~".+",instance=~".+", database_id=~".+"})
        > 0.9 * avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_max_connections{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"})
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: critical
        service: 'Cloud SQL'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'Too many active connections.'
        description: 'Investigate connection pooling settings and connection management in your application.'
        dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
97+
98+
# Cloud SQL aborted-connection alert (MySQL-specific metric).
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-cloudsql-aborted-connections
  rules:
    - alert: GcpCloudSQLAbortedConnections
      # increase() yields the number of aborted connects over the 5m window,
      # which is what the summary describes. rate() would be a per-second
      # average, making "> 5" mean more than 5 aborts every second.
      expr: |
        sum by(job, instance, project_id)(increase(stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_aborted_connects_count[5m])) > 5
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: critical
        service: 'Cloud SQL'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'More than 5 failed connections in 5 minutes'
        description: 'Verify credentials and network settings; check for firewall rules blocking connections.'
        dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
113+
114+
# Cloud SQL replication lag alert (MySQL-specific metric).
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-cloudsql-replication-lag
  rules:
    - alert: GcpCloudSQLLagSecondsBehindMaster
      # Average seconds_behind_master per (job, project_id, instance,
      # database_id) above 5.
      expr: |
        avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_replication_seconds_behind_master) > 5
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: warning
        service: 'Cloud SQL'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'More than 5 seconds lag between read replica and primary.'
        description: 'Check network latency between primary and replica; adjust configurations to optimize replication.'
        dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'
129+
130+
# Pub/Sub backlog alert.
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-pubsub-num-undelivered-messages
  rules:
    - alert: GcpPubSubNumUndeliveredMessages
      # Average undelivered-message count per (job, project_id, instance)
      # above 1000.
      # NOTE(review): this averages every series sharing those three labels;
      # confirm whether a per-subscription label should be in the grouping.
      expr: |
        avg by (job,project_id,instance)(stackdriver_pubsub_subscription_pubsub_googleapis_com_subscription_num_undelivered_messages{job=~".+",project_id=~".+",instance=~".+"}) > 1000
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: warning
        service: 'Pub/Sub'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'More than 1000 messages.'
        description: 'Scale up subscribers or adjust message processing capacity.'
        dashboard_uid: '2abad1eb5e4873b95e9176e7ef10a30c'
145+
146+
# Pub/Sub oldest-unacked-message age alert.
# Prometheus requires rule group names to be unique within a file, hence the
# alert-specific group name instead of a shared `gcp`.
- name: gcp-pubsub-unacked-message-age
  rules:
    - alert: GcpPubSubUnackedMessageAge
      # Average oldest-unacked-message age per (job, project_id, instance)
      # above 60 (seconds, per the summary annotation).
      # NOTE(review): this averages every series sharing those three labels;
      # confirm whether a per-subscription label should be in the grouping.
      expr: |
        avg by (job,project_id,instance)(stackdriver_pubsub_subscription_pubsub_googleapis_com_subscription_oldest_unacked_message_age{job=~".+",project_id=~".+",instance=~".+"}) > 60
      # Condition must hold for 5m before firing; the alert is then held for
      # 10m after the condition clears to avoid flapping.
      for: 5m
      keep_firing_for: 10m
      labels:
        severity: warning
        service: 'Pub/Sub'
        namespace: cloud-provider-gcp
      annotations:
        summary: 'Unacknowledged messages for more than 60 seconds.'
        description: 'Investigate and speed up message processing; ensure consumers can handle the load.'
        dashboard_uid: '2abad1eb5e4873b95e9176e7ef10a30c'

0 commit comments

Comments
 (0)