@@ -13,7 +13,8 @@ class Replication(Plugin):
1313 AgentPluginType = "pg"
1414 # key: (macro, value)
1515 plugin_macros = {
16- "critical_lag_seconds" : [("macro" , "{$CRITICAL_LAG_SECONDS}" ), ("value" , 60 * 5 )]
16+ "critical_lag_seconds" : [("macro" , "{$CRITICAL_LAG_SECONDS}" ), ("value" , 60 * 5 )],
17+ "critical_bytes_held_by_none_active_slot" : [("macro" , "{$CRITICAL_BYTES_HELD_BY_NON_ACTIVE_SLOT}" ), ("value" , 1024 * 1024 * 1024 )]
1718 }
1819
1920 # get time of replication lag
@@ -30,8 +31,15 @@ class Replication(Plugin):
3031 WHERE active = 'false';
3132 """
3233
34+ query_bytes_held_by_non_active_slot = """
35+ SELECT slot_name, coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::bigint, 0) AS wal_size_bytes
36+ FROM pg_replication_slots
37+ WHERE active = 'false';
38+ """
39+
3340 # discovery rule key for the name of each replica
3441 key_lsn_replication_discovery = "pgsql.replication.discovery{0}"
42+ key_replication_non_active_slots_discovery = "pgsql.replication.non_active_slots_discovery{0}"
3543 key_total_lag = "pgsql.replication.total_lag{0}"
3644 # for PG 10 and higher
3745 key_flush = "pgsql.replication.flush_lag{0}"
@@ -42,6 +50,7 @@ class Replication(Plugin):
4250
4351 key_replication = "pgsql.replication_lag{0}"
4452 key_non_active_slots = "pgsql.replication.non_active_slots{0}"
53+ key_non_active_slots_held_bytes = "pgsql.replication.non_active_slots_held_bytes{0}"
4554
4655 def run (self , zbx ):
4756
@@ -79,6 +88,14 @@ def run(self, zbx):
7988 zbx .send ("pgsql.replication.replay_lag[{0}]" .format (info [0 ]), float (info [5 ]))
8089 zbx .send ("pgsql.replication.discovery[]" , zbx .json ({"data" : lags }))
8190 del lags
91+ bytes_held_by_non_active_slot = Pooler .run_sql_type ("wal_held_bytes_master" , args = [])
92+ if bytes_held_by_non_active_slot :
93+ discovery = []
94+ for info in bytes_held_by_non_active_slot :
95+ discovery .append ({"{#NON_ACTIVE_SLOT_NAME}" : info [0 ]})
96+ zbx .send ("pgsql.replication.non_active_slots_held_bytes[{0}]" .format (info [0 ]), int (info [1 ]))
97+ zbx .send ("pgsql.replication.non_active_slots_discovery[]" , zbx .json ({"data" : discovery }))
98+ del discovery
8299 elif Pooler .is_superuser () or Pooler .is_bootstraped ():
83100 result_lags = Pooler .run_sql_type ("wal_lag_lsn" , args = [" " , "xlog" , "location" ])
84101 if result_lags :
@@ -90,7 +107,15 @@ def run(self, zbx):
90107 del lags
91108 else :
92109 self .disable_and_exit_if_not_superuser ()
93-
110+ else :
111+ bytes_held_by_non_active_slot = Pooler .run_sql_type ("wal_held_bytes_replica" , args = [])
112+ if bytes_held_by_non_active_slot :
113+ discovery = []
114+ for info in bytes_held_by_non_active_slot :
115+ discovery .append ({"{#NON_ACTIVE_SLOT_NAME}" : info [0 ]})
116+ zbx .send ("pgsql.replication.non_active_slots_held_bytes[{0}]" .format (info [0 ]), int (info [1 ]))
117+ zbx .send ("pgsql.replication.non_active_slots_discovery[]" , zbx .json ({"data" : discovery }))
118+ del discovery
94119 non_active_slots = Pooler .query (self .query_non_active_slots )
95120 zbx .send (self .key_non_active_slots .format ("[]" ), int (non_active_slots [0 ][0 ]))
96121
@@ -132,7 +157,8 @@ def triggers(self, template, dashboard=False):
132157 }) + template .trigger ({
133158 "name" : "PostgreSQL Replication: number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})" ,
134159 "expression" : "{#TEMPLATE:" + self .right_type (self .key_non_active_slots ) + ".last()}>" + str (
135- NUMBER_NON_ACTIVE_SLOTS )
160+ NUMBER_NON_ACTIVE_SLOTS ),
161+ "status" : 1
136162 })
137163 return triggers
138164
@@ -198,7 +224,42 @@ def discovery_rules(self, template, dashboard=False):
198224 ]
199225 }
200226 ]
201- return template .discovery_rule (rule = rule , conditions = conditions , items = items , graphs = graphs )
227+ active_slots_discovery_rule = template .discovery_rule (rule = rule , conditions = conditions , items = items , graphs = graphs )
228+
229+ rule = {
230+ "name" : "PostgreSQL Replication: Non Active Slots Discovery" ,
231+ "key" : self .key_replication_non_active_slots_discovery .format ("[{0}]" .format (self .Macros [self .Type ]))
232+ }
233+ if Plugin .old_zabbix :
234+ conditions = []
235+ rule ["filter" ] = "{#NON_ACTIVE_SLOT_NAME}:.*"
236+ else :
237+ conditions = [{
238+ "condition" : [
239+ {"macro" : "{#NON_ACTIVE_SLOT_NAME}" ,
240+ "value" : ".*" ,
241+ "operator" : 8 ,
242+ "formulaid" : "A" }
243+ ]
244+ }]
245+ items = [
246+ {"key" : self .right_type (self .key_non_active_slots_held_bytes , var_discovery = "{#NON_ACTIVE_SLOT_NAME}," ),
247+ "name" : "PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}" ,
248+ "value_type" : Plugin .VALUE_TYPE .numeric_float ,
249+ "delay" : self .plugin_config ("interval" ),
250+ "drawtype" : 2 }
251+ ]
252+ graphs = []
253+ triggers = [
254+ {
255+ "name" : "PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})" ,
256+ "expression" : "{#TEMPLATE:" + self .right_type (self .key_non_active_slots_held_bytes , var_discovery = "{#NON_ACTIVE_SLOT_NAME}," ) + ".last()}>" +
257+ self .plugin_macros ["critical_bytes_held_by_none_active_slot" ][0 ][1 ]
258+ }
259+ ]
260+ non_active_slots_discovery_rule = template .discovery_rule (rule = rule , conditions = conditions , items = items , graphs = graphs , triggers = triggers )
261+
262+ return active_slots_discovery_rule + non_active_slots_discovery_rule
202263
203264 def keys_and_queries (self , template_zabbix ):
204265 result = []
0 commit comments