Skip to content

Commit 0a12572

Browse files
committed
feat(checks): add state events for timeline history
1 parent 9d83214 commit 0a12572

File tree

12 files changed

+181
-72
lines changed

12 files changed

+181
-72
lines changed

examples/checks/api-checks.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ checks:
99
jitter: 2s
1010
retries: 3
1111
failure_threshold: 3
12-
success_threshold: 1
1312
allowed_buckets: [ minute, hour ]
1413
tags: [ public, critical ]
1514
spec:
@@ -38,7 +37,6 @@ checks:
3837
jitter: 2s
3938
retries: 3
4039
failure_threshold: 3
41-
success_threshold: 1
4240
allowed_buckets: [ minute, hour ]
4341
tags: [ public, critical ]
4442
spec:
@@ -58,7 +56,6 @@ checks:
5856
jitter: 2s
5957
retries: 3
6058
failure_threshold: 3
61-
success_threshold: 1
6259
allowed_buckets: [ minute, hour ]
6360
tags: [ internal, critical ]
6461
spec:
@@ -82,7 +79,6 @@ checks:
8279
jitter: 2s
8380
retries: 3
8481
failure_threshold: 3
85-
success_threshold: 1
8682
allowed_buckets: [ minute, hour ]
8783
tags: [ public, critical ]
8884
spec:
@@ -103,7 +99,6 @@ checks:
10399
jitter: 10s
104100
retries: 2
105101
failure_threshold: 2
106-
success_threshold: 1
107102
allowed_buckets: [ hour, day ]
108103
tags: [ public, security ]
109104
spec:

internal/app/manager.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,6 @@ func sameCheckFields(a, b config.CheckFields) bool {
388388
a.Jitter == b.Jitter &&
389389
a.Retries == b.Retries &&
390390
a.FailureThreshold == b.FailureThreshold &&
391-
a.SuccessThreshold == b.SuccessThreshold &&
392391
a.Interval == b.Interval &&
393392
slices.Equal(a.AllowedBuckets, b.AllowedBuckets) &&
394393
a.Enabled == b.Enabled

internal/config/config.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -179,16 +179,16 @@ func decodeTypedCheck[T any](raw Check) (TypedCheck[T], error) {
179179
}
180180

181181
func appendTypedCheck[T any](dst map[string]TypedCheck[T], raw Check) error {
182-
if _, found := dst[raw.Name]; found {
183-
return fmt.Errorf("duplicate check name: %s", raw.Name)
182+
if _, found := dst[raw.ID]; found {
183+
return fmt.Errorf("duplicate check id: %s", raw.ID)
184184
}
185185

186186
typedCheck, err := decodeTypedCheck[T](raw)
187187
if err != nil {
188188
return err
189189
}
190190

191-
dst[raw.Name] = typedCheck
191+
dst[raw.ID] = typedCheck
192192

193193
return nil
194194
}

internal/config/model.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ type CheckFields struct {
7676
Jitter time.Duration `yaml:"jitter" json:"jitter" validate:"gte=0"`
7777
Retries int `yaml:"retries" json:"retries" validate:"required,gte=0"`
7878
FailureThreshold int `yaml:"failure_threshold" json:"failure_threshold" validate:"required,gte=1"`
79-
SuccessThreshold int `yaml:"success_threshold" json:"success_threshold" validate:"required,gte=1"`
8079
Interval time.Duration `yaml:"interval" json:"interval" validate:"required,gt=0ms"`
8180
Timeout time.Duration `yaml:"timeout" json:"timeout" validate:"required,gt=0ms"`
8281
Enabled bool `yaml:"enabled" json:"enabled"`

internal/executor.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ func (c *CheckExec) Execute(
4747
ServiceID: result.ServiceID,
4848
CheckType: result.CheckType,
4949
FailureThreshold: c.cfg.FailureThreshold,
50-
SuccessThreshold: c.cfg.SuccessThreshold,
5150
}
5251

5352
if err := c.handler.Handle(ctx, policy, result); err != nil {

internal/model/result.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ type CheckPolicy struct {
7070
ServiceID string
7171
CheckType config.CheckType
7272
FailureThreshold int
73-
SuccessThreshold int
7473
}
7574

7675
type CheckExecutionRecord struct {

internal/repository/check/execution.go

Lines changed: 50 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -236,61 +236,56 @@ func (e *ExecutionCheck) ListExecutionTimeline(
236236

237237
query := `
238238
WITH params AS (
239-
SELECT
240-
$1::text AS service_id,
241-
$2::text AS check_id,
242-
$3::timestamptz AS from_ts,
243-
$4::timestamptz AS to_ts,
244-
$5::bigint AS bucket_step_us,
245-
$6::bigint AS stale_after_us
246-
),
247-
buckets AS (
248-
SELECT
249-
gs AS bucket_start,
250-
gs + (p.bucket_step_us * interval '1 microsecond') AS bucket_end
251-
FROM params p,
252-
LATERAL generate_series(
253-
p.from_ts,
254-
p.to_ts,
255-
p.bucket_step_us * interval '1 microsecond'
256-
) AS gs
257-
),
258-
last_execution_per_bucket AS (
259-
SELECT
260-
b.bucket_start,
261-
b.bucket_end,
262-
e.finished_at AS last_observed_at,
263-
e.status AS last_execution_status
264-
FROM buckets b
265-
CROSS JOIN params p
266-
LEFT JOIN LATERAL (
267-
SELECT
268-
ce.finished_at,
269-
ce.status
270-
FROM pulse.check_executions ce
271-
WHERE ce.service_id = p.service_id
272-
AND ce.check_id = p.check_id
273-
AND ce.finished_at <= b.bucket_end
274-
ORDER BY ce.finished_at DESC
275-
LIMIT 1
276-
) e ON TRUE
277-
)
278-
SELECT
279-
bucket_start,
280-
bucket_end,
281-
last_observed_at,
282-
last_execution_status,
283-
CASE
284-
WHEN last_observed_at IS NULL THEN 'unknown'::pulse.check_state_status
285-
WHEN bucket_end - last_observed_at > (
286-
(SELECT stale_after_us FROM params) * interval '1 microsecond'
287-
) THEN 'unknown'::pulse.check_state_status
288-
WHEN last_execution_status = 'success' THEN 'healthy'::pulse.check_state_status
289-
WHEN last_execution_status = 'failure' THEN 'unhealthy'::pulse.check_state_status
290-
ELSE 'unknown'::pulse.check_state_status
291-
END AS timeline_state
292-
FROM last_execution_per_bucket
293-
ORDER BY bucket_start
239+
SELECT
240+
$1::text AS service_id,
241+
$2::text AS check_id,
242+
$3::timestamptz AS from_ts,
243+
$4::timestamptz AS to_ts,
244+
$5::bigint AS bucket_step_us,
245+
$6::bigint AS stale_after_us
246+
),
247+
buckets AS (
248+
SELECT
249+
gs AS bucket_start,
250+
LEAST(
251+
gs + (p.bucket_step_us * interval '1 microsecond'),
252+
p.to_ts
253+
) AS bucket_end
254+
FROM params p,
255+
LATERAL generate_series(
256+
p.from_ts,
257+
p.to_ts,
258+
p.bucket_step_us * interval '1 microsecond'
259+
) AS gs
260+
WHERE gs < p.to_ts
261+
)
262+
SELECT
263+
b.bucket_start,
264+
b.bucket_end,
265+
last_event.observed_at AS last_observed_at,
266+
last_event.last_status AS last_execution_status,
267+
CASE
268+
WHEN last_event.observed_at IS NULL THEN 'unknown'::pulse.check_state_status
269+
WHEN b.bucket_end - last_event.observed_at > (
270+
(SELECT stale_after_us FROM params) * interval '1 microsecond'
271+
) THEN 'unknown'::pulse.check_state_status
272+
ELSE last_event.status
273+
END AS timeline_state
274+
FROM buckets b
275+
LEFT JOIN LATERAL (
276+
SELECT
277+
e.observed_at,
278+
e.last_status,
279+
e.status
280+
FROM pulse.check_state_events e
281+
CROSS JOIN params p
282+
WHERE e.service_id = p.service_id
283+
AND e.check_id = p.check_id
284+
AND e.observed_at <= b.bucket_end
285+
ORDER BY e.observed_at DESC, e.id DESC
286+
LIMIT 1
287+
) last_event ON TRUE
288+
ORDER BY b.bucket_start
294289
`
295290

296291
rows, err := e.db.Query(

internal/repository/check/interfaces.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
type CheckStateRepository interface {
1010
GetCheckState(context.Context, string, string) (*model.CheckState, error)
1111
UpsertCheckState(context.Context, *model.CheckState) error
12+
AddCheckStateEvent(context.Context, *model.CheckState) error
1213
ListCheckStatesByService(context.Context, string) ([]model.CheckState, error)
1314
}
1415

internal/repository/check/state.go

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,79 @@ ON CONFLICT (check_id, service_id) DO UPDATE SET
184184
return err
185185
}
186186

187+
func (s *StateCheck) AddCheckStateEvent(
188+
ctx context.Context,
189+
state *model.CheckState,
190+
) error {
191+
if state == nil {
192+
return errors.New("check state is nil")
193+
}
194+
195+
query := `
196+
INSERT INTO pulse.check_state_events(
197+
execution_id,
198+
check_id,
199+
service_id,
200+
check_type,
201+
status,
202+
last_status,
203+
last_error_kind,
204+
last_error_message,
205+
last_duration,
206+
last_details,
207+
last_success_at,
208+
last_failure_at,
209+
consecutive_successes,
210+
consecutive_failures,
211+
observed_at
212+
) VALUES (
213+
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15
214+
)
215+
`
216+
217+
var (
218+
rawDetails []byte
219+
rawErrorKind *string
220+
rawErrorMessage *string
221+
)
222+
223+
if state.LastDetails != nil {
224+
data, err := json.Marshal(state.LastDetails)
225+
if err != nil {
226+
return err
227+
}
228+
rawDetails = data
229+
}
230+
231+
if state.LastErrorKind != e.ErrNone {
232+
rawErrorKind = new(string(state.LastErrorKind))
233+
}
234+
235+
if state.LastErrorMessage != "" {
236+
rawErrorMessage = &state.LastErrorMessage
237+
}
238+
239+
_, err := s.db.Exec(ctx, query,
240+
state.LastExecutionID,
241+
state.CheckID,
242+
state.ServiceID,
243+
state.CheckType,
244+
state.Status,
245+
state.LastStatus,
246+
rawErrorKind,
247+
rawErrorMessage,
248+
state.LastDuration.Microseconds(),
249+
rawDetails,
250+
state.LastSuccessAt,
251+
state.LastFailureAt,
252+
state.ConsecutiveSuccesses,
253+
state.ConsecutiveFailures,
254+
state.UpdatedAt,
255+
)
256+
257+
return err
258+
}
259+
187260
func (s *StateCheck) ListCheckStatesByService(
188261
ctx context.Context,
189262
serviceID string,

internal/services/check/handler.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,13 @@ func (h *Handler) Handle(
3838
return err
3939
}
4040

41-
return repo.UpsertCheckState(ctx, new(transition(policy, currentState, result)))
41+
nextState := new(transition(policy, currentState, result))
42+
43+
if err = repo.AddCheckStateEvent(ctx, nextState); err != nil {
44+
return err
45+
}
46+
47+
return repo.UpsertCheckState(ctx, nextState)
4248
},
4349
)
4450

0 commit comments

Comments
 (0)