Skip to content

Commit 526c407

Browse files
committed
Write create & delete sla events on config dumps
1 parent a60ebc5 commit 526c407

File tree

6 files changed

+282
-3
lines changed

6 files changed

+282
-3
lines changed

cmd/icingadb/main.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,10 +167,15 @@ func run() int {
167167
synctx, cancelSynctx := context.WithCancel(ha.Environment().NewContext(hactx))
168168
g, synctx := errgroup.WithContext(synctx)
169169
// WaitGroups for initial synchronization.
170-
// Runtime updates must wait for initial synchronization to complete.
170+
// Runtime updates and history pipelines must wait for the initial synchronization to
171+
// complete by waiting for the `initConfigSyncDone` channel to be closed.
171172
configInitSync := sync.WaitGroup{}
172173
stateInitSync := &sync.WaitGroup{}
173174

175+
// A channel used to notify both the runtime updates and history pipelines workers
176+
// about the successful initial config sync completion including the SLA lifecycles.
177+
initConfigSyncDone := make(chan struct{})
178+
174179
// Clear the runtime update streams before starting anything else (rather than after the sync),
175180
// otherwise updates may be lost.
176181
runtimeConfigUpdateStreams, runtimeStateUpdateStreams, err := rt.ClearStreams(synctx)
@@ -243,9 +248,19 @@ func run() int {
243248
})
244249

245250
g.Go(func() error {
251+
// Unblock the runtime updates and history pipelines workers.
252+
defer close(initConfigSyncDone)
253+
254+
// Wait for the actual initial config sync to finish before syncing the SLA lifecycles.
246255
configInitSync.Wait()
247256
telemetry.OngoingSyncStartMilli.Store(0)
248257

258+
logger.Info("Syncing Host and Service initial SLA lifecycle")
259+
260+
if err := icingadb.SyncCheckablesSlaLifecycle(synctx, db); err != nil {
261+
return err
262+
}
263+
249264
syncEnd := time.Now()
250265
elapsed := syncEnd.Sub(syncStart)
251266
logger := logs.GetChildLogger("config-sync")
@@ -279,7 +294,8 @@ func run() int {
279294
})
280295

281296
g.Go(func() error {
282-
configInitSync.Wait()
297+
// Wait for the initial config sync including the SLA lifecycles to finish!
298+
<-initConfigSyncDone
283299

284300
if err := synctx.Err(); err != nil {
285301
return err
@@ -304,7 +320,7 @@ func run() int {
304320

305321
g.Go(func() error {
306322
// Wait for config and state sync to avoid putting additional pressure on the database.
307-
configInitSync.Wait()
323+
<-initConfigSyncDone
308324
stateInitSync.Wait()
309325

310326
if err := synctx.Err(); err != nil {

pkg/icingadb/sla_lifecycle.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@ package icingadb
22

33
import (
44
"context"
5+
"fmt"
6+
"github.com/icinga/icinga-go-library/backoff"
57
"github.com/icinga/icinga-go-library/database"
8+
"github.com/icinga/icinga-go-library/retry"
69
"github.com/icinga/icinga-go-library/types"
710
v1 "github.com/icinga/icingadb/pkg/icingadb/v1"
811
"github.com/pkg/errors"
912
"golang.org/x/sync/errgroup"
1013
"time"
1114
)
1215

16+
// slaLifecycleTable defines the table name of v1.SlaLifecycle type.
17+
var slaLifecycleTable = database.TableName(v1.NewSlaLifecycle())
18+
1319
// CreateSlaLifecyclesFromCheckables transforms the given checkables to sla lifecycle struct
1420
// and streams them into a returned channel.
1521
func CreateSlaLifecyclesFromCheckables(
@@ -68,3 +74,50 @@ func CreateSlaLifecyclesFromCheckables(
6874

6975
return slaLifecycles
7076
}
77+
78+
// SyncCheckablesSlaLifecycle inserts one `create_time` sla lifecycle entry for each of the checkables from
79+
// the `host` and `service` tables and updates the `delete_time` of each of the sla lifecycle entries whose
80+
// host/service IDs cannot be found in the `host/service` tables.
81+
//
82+
// It's unlikely, but when a given Checkable doesn't already have a `create_time` entry in the database, the update
83+
// query won't update anything. Likewise, the insert statements may also become a no-op if the Checkables already
84+
// have a `create_time` entry with `delete_time = 0`.
85+
//
86+
// This function retries any database errors for at least `5m` before giving up and failing with an error.
87+
func SyncCheckablesSlaLifecycle(ctx context.Context, db *database.DB) error {
88+
hostInsertStmtFmt := `
89+
INSERT INTO %[1]s (id, environment_id, host_id, create_time)
90+
SELECT id, environment_id, id, %[2]d AS create_time
91+
FROM host WHERE NOT EXISTS(SELECT 1 FROM %[1]s WHERE service_id IS NULL AND delete_time = 0 AND host_id = host.id)`
92+
93+
hostUpdateStmtFmt := `
94+
UPDATE %[1]s SET delete_time = %[2]d
95+
WHERE service_id IS NULL AND delete_time = 0 AND NOT EXISTS(SELECT 1 FROM host WHERE host.id = %[1]s.id)`
96+
97+
serviceInsertStmtFmt := `
98+
INSERT INTO %[1]s (id, environment_id, host_id, service_id, create_time)
99+
SELECT id, environment_id, host_id, id, %[2]d AS create_time
100+
FROM service WHERE NOT EXISTS(SELECT 1 FROM %[1]s WHERE delete_time = 0 AND service_id = service.id)`
101+
102+
serviceUpdateStmtFmt := `
103+
UPDATE %[1]s SET delete_time = %[2]d
104+
WHERE delete_time = 0 AND service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE service.id = %[1]s.id)`
105+
106+
return retry.WithBackoff(
107+
ctx,
108+
func(context.Context) error {
109+
eventTime := time.Now().UnixMilli()
110+
for _, queryFmt := range []string{hostInsertStmtFmt, hostUpdateStmtFmt, serviceInsertStmtFmt, serviceUpdateStmtFmt} {
111+
query := fmt.Sprintf(queryFmt, slaLifecycleTable, eventTime)
112+
if _, err := db.ExecContext(ctx, query); err != nil {
113+
return database.CantPerformQuery(err, query)
114+
}
115+
}
116+
117+
return nil
118+
},
119+
retry.Retryable,
120+
backoff.NewExponentialWithJitter(1*time.Millisecond, 1*time.Second),
121+
db.GetDefaultRetrySettings(),
122+
)
123+
}

schema/mysql/schema.sql

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1334,6 +1334,20 @@ CREATE TABLE sla_history_downtime (
13341334
INDEX idx_sla_history_downtime_env_downtime_end (environment_id, downtime_end) COMMENT 'Filter for sla history retention'
13351335
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
13361336

1337+
CREATE TABLE sla_lifecycle (
1338+
id binary(20) NOT NULL COMMENT 'host.id if service_id is NULL otherwise service.id',
1339+
environment_id binary(20) NOT NULL COMMENT 'environment.id',
1340+
host_id binary(20) NOT NULL COMMENT 'host.id (may reference already deleted hosts)',
1341+
service_id binary(20) DEFAULT NULL COMMENT 'service.id (may reference already deleted services)',
1342+
1343+
-- Conceptually these columns are nullable, but since delete_time is part of the composite primary key and primary
1344+
-- key columns must not be nullable, `0` is used instead of NULL to represent an unset timestamp.
1345+
create_time bigint unsigned NOT NULL DEFAULT 0 COMMENT 'unix timestamp the event occurred',
1346+
delete_time bigint unsigned NOT NULL DEFAULT 0 COMMENT 'unix timestamp the delete event occurred',
1347+
1348+
PRIMARY KEY (id, delete_time)
1349+
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
1350+
13371351
CREATE TABLE icingadb_schema (
13381352
id int unsigned NOT NULL AUTO_INCREMENT,
13391353
version smallint unsigned NOT NULL,

schema/mysql/upgrades/1.3.0.sql

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,91 @@ ALTER TABLE checkcommand_envvar MODIFY COLUMN envvar_key varchar(255) NOT NULL;
1818
ALTER TABLE eventcommand_envvar MODIFY COLUMN envvar_key varchar(255) NOT NULL;
1919
ALTER TABLE notificationcommand_envvar MODIFY COLUMN envvar_key varchar(255) NOT NULL;
2020

21+
CREATE TABLE sla_lifecycle (
22+
id binary(20) NOT NULL COMMENT 'host.id if service_id is NULL otherwise service.id',
23+
environment_id binary(20) NOT NULL COMMENT 'environment.id',
24+
host_id binary(20) NOT NULL COMMENT 'host.id (may reference already deleted hosts)',
25+
service_id binary(20) DEFAULT NULL COMMENT 'service.id (may reference already deleted services)',
26+
27+
-- Conceptually these columns are nullable, but since delete_time is part of the composite primary key and primary
28+
-- key columns must not be nullable, `0` is used instead of NULL to represent an unset timestamp.
29+
create_time bigint unsigned NOT NULL DEFAULT 0 COMMENT 'unix timestamp the event occurred',
30+
delete_time bigint unsigned NOT NULL DEFAULT 0 COMMENT 'unix timestamp the delete event occurred',
31+
32+
PRIMARY KEY (id, delete_time)
33+
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
34+
35+
-- Insert a sla lifecycle create_time entry for all existing hosts with the LEAST timestamp found in either
36+
-- the sla_history_state or sla_history_downtime table, otherwise fallback to the current Unix timestamp.
37+
INSERT INTO sla_lifecycle (id, environment_id, host_id, create_time)
38+
SELECT host.id,
39+
host.environment_id,
40+
host.id,
41+
-- In MySQL/MariaDB, LEAST() returns NULL if either event_time or downtime_start is NULL, which is not
42+
-- desirable for our use cases. So we need to work around this behaviour by nesting some COALESCE() calls.
43+
COALESCE(LEAST(COALESCE(MIN(event_time), MIN(downtime_start)), COALESCE(MIN(downtime_start), MIN(event_time))), UNIX_TIMESTAMP() * 1000) AS create_time
44+
FROM host
45+
LEFT JOIN sla_history_state shs on host.id = shs.host_id AND shs.service_id IS NULL
46+
LEFT JOIN sla_history_downtime shd on host.id = shd.host_id AND shd.service_id IS NULL
47+
GROUP BY host.id, host.environment_id
48+
ON DUPLICATE KEY UPDATE sla_lifecycle.id = sla_lifecycle.id;
49+
50+
-- Insert a sla lifecycle deleted entry for all not existing hosts with the GREATEST timestamp
51+
-- found in either the sla_history_state or sla_history_downtime table.
52+
INSERT INTO sla_lifecycle (id, environment_id, host_id, delete_time)
53+
SELECT host_id AS id,
54+
environment_id,
55+
host_id,
56+
MAX(event_time) AS delete_time
57+
FROM (SELECT host_id, environment_id, MAX(event_time) AS event_time
58+
FROM sla_history_state
59+
WHERE service_id IS NULL AND NOT EXISTS(SELECT 1 FROM host WHERE id = host_id)
60+
GROUP BY host_id, environment_id
61+
UNION ALL
62+
SELECT host_id, environment_id, MAX(downtime_end) AS event_time
63+
FROM sla_history_downtime
64+
WHERE service_id IS NULL AND NOT EXISTS(SELECT 1 FROM host WHERE id = host_id)
65+
GROUP BY host_id, environment_id
66+
) AS deleted_hosts
67+
GROUP BY host_id, environment_id HAVING MAX(event_time) IS NOT NULL
68+
ON DUPLICATE KEY UPDATE sla_lifecycle.id = sla_lifecycle.id;
69+
70+
-- Insert a sla lifecycle create_time entry for all existing services with the LEAST timestamp found in either
71+
-- the sla_history_state or sla_history_downtime table, otherwise fallback to the current Unix timestamp.
72+
INSERT INTO sla_lifecycle (id, environment_id, host_id, service_id, create_time)
73+
SELECT service.id,
74+
service.environment_id,
75+
service.host_id,
76+
service.id,
77+
-- In MySQL/MariaDB, LEAST() returns NULL if either event_time or downtime_start is NULL, which is not
78+
-- desirable for our use cases. So we need to work around this behaviour by nesting some COALESCE() calls.
79+
COALESCE(LEAST(COALESCE(MIN(event_time), MIN(downtime_start)), COALESCE(MIN(downtime_start), MIN(event_time))), UNIX_TIMESTAMP() * 1000) AS create_time
80+
FROM service
81+
LEFT JOIN sla_history_state shs on service.id = shs.service_id
82+
LEFT JOIN sla_history_downtime shd on service.id = shd.service_id
83+
GROUP BY service.id, service.host_id, service.environment_id
84+
ON DUPLICATE KEY UPDATE sla_lifecycle.id = sla_lifecycle.id;
85+
86+
-- Insert a sla lifecycle deleted entry for all services that no longer exist with the GREATEST timestamp
87+
-- found in either the sla_history_state or sla_history_downtime table.
88+
INSERT INTO sla_lifecycle (id, environment_id, host_id, service_id, delete_time)
89+
SELECT service_id AS id,
90+
environment_id,
91+
host_id,
92+
service_id,
93+
MAX(event_time) AS delete_time
94+
FROM (SELECT service_id, environment_id, host_id, MAX(event_time) AS event_time
95+
FROM sla_history_state
96+
WHERE service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE id = service_id)
97+
GROUP BY service_id, environment_id, host_id
98+
UNION ALL
99+
SELECT service_id, environment_id, host_id, MAX(downtime_end) AS event_time
100+
FROM sla_history_downtime
101+
WHERE service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE id = service_id)
102+
GROUP BY service_id, environment_id, host_id
103+
) AS deleted_services
104+
GROUP BY service_id, environment_id, host_id HAVING MAX(event_time) IS NOT NULL
105+
ON DUPLICATE KEY UPDATE sla_lifecycle.id = sla_lifecycle.id;
106+
21107
INSERT INTO icingadb_schema (version, timestamp)
22108
VALUES (6, UNIX_TIMESTAMP() * 1000);

schema/pgsql/schema.sql

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2171,6 +2171,27 @@ COMMENT ON COLUMN sla_history_downtime.downtime_id IS 'downtime.id (may referenc
21712171
COMMENT ON COLUMN sla_history_downtime.downtime_start IS 'start time of the downtime';
21722172
COMMENT ON COLUMN sla_history_downtime.downtime_end IS 'end time of the downtime';
21732173

2174+
CREATE TABLE sla_lifecycle (
2175+
id bytea20 NOT NULL,
2176+
environment_id bytea20 NOT NULL,
2177+
host_id bytea20 NOT NULL,
2178+
service_id bytea20 DEFAULT NULL,
2179+
2180+
-- Conceptually these columns are nullable, but since delete_time is part of the composite primary key and primary
2181+
-- key columns must not be nullable, `0` is used instead of NULL to represent an unset timestamp.
2182+
create_time biguint NOT NULL DEFAULT 0,
2183+
delete_time biguint NOT NULL DEFAULT 0,
2184+
2185+
CONSTRAINT pk_sla_lifecycle PRIMARY KEY (id, delete_time)
2186+
);
2187+
2188+
COMMENT ON COLUMN sla_lifecycle.id IS 'host.id if service_id is NULL otherwise service.id';
2189+
COMMENT ON COLUMN sla_lifecycle.environment_id IS 'environment.id';
2190+
COMMENT ON COLUMN sla_lifecycle.host_id IS 'host.id (may reference already deleted hosts)';
2191+
COMMENT ON COLUMN sla_lifecycle.service_id IS 'service.id (may reference already deleted services)';
2192+
COMMENT ON COLUMN sla_lifecycle.create_time IS 'unix timestamp the event occurred';
2193+
COMMENT ON COLUMN sla_lifecycle.delete_time IS 'unix timestamp the delete event occurred';
2194+
21742195
CREATE SEQUENCE icingadb_schema_id_seq;
21752196

21762197
CREATE TABLE icingadb_schema (

schema/pgsql/upgrades/1.3.0.sql

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,94 @@ ALTER TABLE checkcommand_envvar ALTER COLUMN envvar_key TYPE varchar(255);
1515
ALTER TABLE eventcommand_envvar ALTER COLUMN envvar_key TYPE varchar(255);
1616
ALTER TABLE notificationcommand_envvar ALTER COLUMN envvar_key TYPE varchar(255);
1717

18+
CREATE TABLE sla_lifecycle (
19+
id bytea20 NOT NULL,
20+
environment_id bytea20 NOT NULL,
21+
host_id bytea20 NOT NULL,
22+
service_id bytea20 DEFAULT NULL,
23+
24+
-- Conceptually these columns are nullable, but since delete_time is part of the composite primary key and primary
25+
-- key columns must not be nullable, `0` is used instead of NULL to represent an unset timestamp.
26+
create_time biguint NOT NULL DEFAULT 0,
27+
delete_time biguint NOT NULL DEFAULT 0,
28+
29+
CONSTRAINT pk_sla_lifecycle PRIMARY KEY (id, delete_time)
30+
);
31+
32+
COMMENT ON COLUMN sla_lifecycle.id IS 'host.id if service_id is NULL otherwise service.id';
33+
COMMENT ON COLUMN sla_lifecycle.environment_id IS 'environment.id';
34+
COMMENT ON COLUMN sla_lifecycle.host_id IS 'host.id (may reference already deleted hosts)';
35+
COMMENT ON COLUMN sla_lifecycle.service_id IS 'service.id (may reference already deleted services)';
36+
COMMENT ON COLUMN sla_lifecycle.create_time IS 'unix timestamp the event occurred';
37+
COMMENT ON COLUMN sla_lifecycle.delete_time IS 'unix timestamp the delete event occurred';
38+
39+
-- Insert a sla lifecycle create_time entry for all existing hosts with the LEAST timestamp found in either
40+
-- the sla_history_state or sla_history_downtime table, otherwise fallback to the current Unix timestamp.
41+
INSERT INTO sla_lifecycle (id, environment_id, host_id, create_time)
42+
SELECT host.id,
43+
host.environment_id,
44+
host.id,
45+
COALESCE(LEAST(MIN(event_time), MIN(downtime_start)), EXTRACT(EPOCH FROM now()) * 1000) AS create_time
46+
FROM host
47+
LEFT JOIN sla_history_state shs on host.id = shs.host_id AND shs.service_id IS NULL
48+
LEFT JOIN sla_history_downtime shd on host.id = shd.host_id AND shd.service_id IS NULL
49+
GROUP BY host.id, host.environment_id
50+
ON CONFLICT ON CONSTRAINT pk_sla_lifecycle DO NOTHING;
51+
52+
-- Insert a sla lifecycle deleted entry for all not existing hosts with the GREATEST timestamp
53+
-- found in either the sla_history_state or sla_history_downtime table.
54+
INSERT INTO sla_lifecycle (id, environment_id, host_id, delete_time)
55+
SELECT host_id AS id,
56+
environment_id,
57+
host_id,
58+
MAX(event_time) AS delete_time
59+
FROM (SELECT host_id, environment_id, MAX(event_time) AS event_time
60+
FROM sla_history_state
61+
WHERE service_id IS NULL AND NOT EXISTS(SELECT 1 FROM host WHERE id = host_id)
62+
GROUP BY host_id, environment_id
63+
UNION ALL
64+
SELECT host_id, environment_id, MAX(downtime_end) AS event_time
65+
FROM sla_history_downtime
66+
WHERE service_id IS NULL AND NOT EXISTS(SELECT 1 FROM host WHERE id = host_id)
67+
GROUP BY host_id, environment_id
68+
) AS deleted_hosts
69+
GROUP BY host_id, environment_id HAVING MAX(event_time) IS NOT NULL
70+
ON CONFLICT ON CONSTRAINT pk_sla_lifecycle DO NOTHING;
71+
72+
-- Insert a sla lifecycle create_time entry for all existing services with the LEAST timestamp found in either
73+
-- the sla_history_state or sla_history_downtime table, otherwise fallback to the current Unix timestamp.
74+
INSERT INTO sla_lifecycle (id, environment_id, host_id, service_id, create_time)
75+
SELECT service.id,
76+
service.environment_id,
77+
service.host_id,
78+
service.id,
79+
COALESCE(LEAST(MIN(event_time), MIN(downtime_start)), EXTRACT(EPOCH FROM now()) * 1000) AS create_time
80+
FROM service
81+
LEFT JOIN sla_history_state shs on service.id = shs.service_id
82+
LEFT JOIN sla_history_downtime shd on service.id = shd.service_id
83+
GROUP BY service.id, service.host_id, service.environment_id
84+
ON CONFLICT ON CONSTRAINT pk_sla_lifecycle DO NOTHING;
85+
86+
-- Insert a sla lifecycle deleted entry for all services that no longer exist with the GREATEST timestamp
87+
-- found in either the sla_history_state or sla_history_downtime table.
88+
INSERT INTO sla_lifecycle (id, environment_id, host_id, service_id, delete_time)
89+
SELECT service_id AS id,
90+
environment_id,
91+
host_id,
92+
service_id,
93+
MAX(event_time) AS delete_time
94+
FROM (SELECT service_id, environment_id, host_id, MAX(event_time) AS event_time
95+
FROM sla_history_state
96+
WHERE service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE id = service_id)
97+
GROUP BY service_id, environment_id, host_id
98+
UNION ALL
99+
SELECT service_id, environment_id, host_id, MAX(downtime_end) AS event_time
100+
FROM sla_history_downtime
101+
WHERE service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE id = service_id)
102+
GROUP BY service_id, environment_id, host_id
103+
) AS deleted_services
104+
GROUP BY service_id, environment_id, host_id HAVING MAX(event_time) IS NOT NULL
105+
ON CONFLICT ON CONSTRAINT pk_sla_lifecycle DO NOTHING;
106+
18107
INSERT INTO icingadb_schema (version, timestamp)
19108
VALUES (4, extract(epoch from now()) * 1000);

0 commit comments

Comments
 (0)