Skip to content

Commit 43b99ac

Browse files
fix(check-runner): improve error aggregation for multiple check failures (#463)
* fix(check-runner): improve error aggregation for multiple check failures * fix(check-runner): remove not needed channels * fix(check-runner): use timeout as referece
1 parent 2ec6234 commit 43b99ac

File tree

2 files changed

+172
-63
lines changed

2 files changed

+172
-63
lines changed

cmd/beekeeper/cmd/check.go

Lines changed: 4 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import (
66
"strings"
77
"time"
88

9-
"github.com/ethersphere/beekeeper/pkg/beekeeper"
9+
"github.com/ethersphere/beekeeper/pkg/check"
1010
"github.com/ethersphere/beekeeper/pkg/config"
1111
"github.com/ethersphere/beekeeper/pkg/metrics"
1212
"github.com/ethersphere/beekeeper/pkg/tracing"
@@ -91,61 +91,9 @@ func (c *command) initCheckCmd() error {
9191
GethURL: c.globalConfig.GetString(optionNameGethURL),
9292
}
9393

94-
// run checks
95-
for _, checkName := range checks {
96-
checkName = strings.TrimSpace(checkName)
97-
// get configuration
98-
checkConfig, ok := c.config.Checks[checkName]
99-
if !ok {
100-
return fmt.Errorf("check '%s' doesn't exist", checkName)
101-
}
102-
103-
// choose check type
104-
check, ok := config.Checks[checkConfig.Type]
105-
if !ok {
106-
return fmt.Errorf("check %s not implemented", checkConfig.Type)
107-
}
108-
109-
// create check options
110-
o, err := check.NewOptions(checkGlobalConfig, checkConfig)
111-
if err != nil {
112-
return fmt.Errorf("creating check %s options: %w", checkName, err)
113-
}
114-
115-
// create check
116-
chk := check.NewAction(c.log)
117-
if r, ok := chk.(metrics.Reporter); ok && metricsEnabled {
118-
metrics.RegisterCollectors(metricsPusher, r.Report()...)
119-
}
120-
chk = beekeeper.NewActionMiddleware(tracer, chk, checkName)
121-
122-
checkCtx, cancelCheck := createChildContext(ctx, checkConfig.Timeout)
123-
defer cancelCheck()
124-
125-
c.log.Infof("running check: %s", checkName)
126-
c.log.Debugf("check options: %+v", o)
127-
128-
ch := make(chan error, 1)
129-
go func() {
130-
ch <- chk.Run(checkCtx, cluster, o)
131-
close(ch)
132-
}()
133-
134-
select {
135-
case <-checkCtx.Done():
136-
deadline, ok := checkCtx.Deadline()
137-
if ok {
138-
return fmt.Errorf("running check %s: %w: deadline %v", checkName, checkCtx.Err(), deadline)
139-
}
140-
return fmt.Errorf("check %s failed due to: %w", checkName, checkCtx.Err())
141-
case err = <-ch:
142-
if err != nil {
143-
return fmt.Errorf("check %s failed with error: %w", checkName, err)
144-
}
145-
c.log.Infof("%s check completed successfully", checkName)
146-
}
147-
}
148-
return nil
94+
checkRunner := check.NewCheckRunner(checkGlobalConfig, c.config.Checks, cluster, metricsPusher, tracer, c.log)
95+
96+
return checkRunner.Run(ctx, checks)
14997
},
15098
PreRunE: c.preRunE,
15199
}
@@ -162,10 +110,3 @@ func (c *command) initCheckCmd() error {
162110

163111
return nil
164112
}
165-
166-
func createChildContext(ctx context.Context, timeout *time.Duration) (context.Context, context.CancelFunc) {
167-
if timeout != nil {
168-
return context.WithTimeout(ctx, *timeout)
169-
}
170-
return context.WithCancel(ctx)
171-
}

pkg/check/runner.go

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
package check
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"io"
7+
"strings"
8+
"time"
9+
10+
"github.com/ethersphere/beekeeper/pkg/beekeeper"
11+
"github.com/ethersphere/beekeeper/pkg/config"
12+
"github.com/ethersphere/beekeeper/pkg/logging"
13+
"github.com/ethersphere/beekeeper/pkg/metrics"
14+
"github.com/ethersphere/beekeeper/pkg/orchestration"
15+
"github.com/opentracing/opentracing-go"
16+
"github.com/prometheus/client_golang/prometheus/push"
17+
)
18+
19+
type CheckRunner struct {
20+
globalConfig config.CheckGlobalConfig
21+
checks map[string]config.Check
22+
cluster orchestration.Cluster
23+
metricsPusher *push.Pusher
24+
tracer opentracing.Tracer
25+
logger logging.Logger
26+
}
27+
28+
func NewCheckRunner(
29+
globalConfig config.CheckGlobalConfig,
30+
checks map[string]config.Check,
31+
cluster orchestration.Cluster,
32+
metricsPusher *push.Pusher,
33+
tracer opentracing.Tracer,
34+
logger logging.Logger,
35+
) *CheckRunner {
36+
if logger == nil {
37+
logger = logging.New(io.Discard, 0)
38+
}
39+
return &CheckRunner{
40+
globalConfig: globalConfig,
41+
checks: checks,
42+
cluster: cluster,
43+
metricsPusher: metricsPusher,
44+
tracer: tracer,
45+
logger: logger,
46+
}
47+
}
48+
49+
func (c *CheckRunner) Run(ctx context.Context, checks []string) error {
50+
if len(checks) == 0 {
51+
return nil
52+
}
53+
54+
validatedChecks := make([]checkRun, 0, len(checks))
55+
56+
// validate and prepare checks
57+
for _, checkName := range checks {
58+
checkName = strings.TrimSpace(checkName)
59+
// get configuration
60+
checkConfig, ok := c.checks[checkName]
61+
if !ok {
62+
return fmt.Errorf("check '%s' doesn't exist", checkName)
63+
}
64+
65+
// choose checkType type
66+
checkType, ok := config.Checks[checkConfig.Type]
67+
if !ok {
68+
return fmt.Errorf("check %s not implemented", checkConfig.Type)
69+
}
70+
71+
// create check options
72+
o, err := checkType.NewOptions(c.globalConfig, checkConfig)
73+
if err != nil {
74+
return fmt.Errorf("creating check %s options: %w", checkName, err)
75+
}
76+
77+
// create check action
78+
chk := checkType.NewAction(c.logger)
79+
if r, ok := chk.(metrics.Reporter); ok && c.metricsPusher != nil {
80+
metrics.RegisterCollectors(c.metricsPusher, r.Report()...)
81+
}
82+
chk = beekeeper.NewActionMiddleware(c.tracer, chk, checkName)
83+
84+
// append to validated checks
85+
validatedChecks = append(validatedChecks, checkRun{
86+
name: checkName,
87+
typeName: checkConfig.Type,
88+
action: chk,
89+
options: o,
90+
timeout: checkConfig.Timeout,
91+
})
92+
}
93+
94+
type errorCheck struct {
95+
check string
96+
err error
97+
}
98+
99+
var errors []errorCheck
100+
101+
for _, check := range validatedChecks {
102+
c.logger.WithField("type", check.typeName).Infof("running check: %s", check.name)
103+
c.logger.Debugf("check options: %+v", check.options)
104+
105+
if err := check.Run(ctx, c.cluster); err != nil {
106+
c.logger.WithField("type", check.typeName).Errorf("check '%s' failed: %v", check.name, err)
107+
errors = append(errors, errorCheck{
108+
check: check.name,
109+
err: fmt.Errorf("check %s failed: %w", check.name, err),
110+
})
111+
} else {
112+
c.logger.WithField("type", check.typeName).Infof("%s check completed successfully", check.name)
113+
}
114+
}
115+
116+
if len(errors) == 1 {
117+
return errors[0].err
118+
} else if len(errors) > 1 {
119+
var errStrings []string
120+
for _, e := range errors {
121+
errStrings = append(errStrings, fmt.Sprintf("[%s]: {%v}", e.check, e.err))
122+
c.logger.Errorf("%s: %v", e.check, e.err)
123+
}
124+
return fmt.Errorf("multiple checks failed: %s", strings.Join(errStrings, "; "))
125+
}
126+
127+
return nil
128+
}
129+
130+
type checkRun struct {
131+
name string
132+
typeName string
133+
action beekeeper.Action
134+
options interface{}
135+
timeout *time.Duration
136+
}
137+
138+
func (c *checkRun) Run(ctx context.Context, cluster orchestration.Cluster) error {
139+
checkCtx, cancelCheck := createChildContext(ctx, c.timeout)
140+
defer cancelCheck()
141+
142+
ch := make(chan error, 1)
143+
go func() {
144+
ch <- c.action.Run(checkCtx, cluster, c.options)
145+
close(ch)
146+
}()
147+
148+
select {
149+
case <-checkCtx.Done():
150+
deadline, ok := checkCtx.Deadline()
151+
if ok {
152+
return fmt.Errorf("%w: deadline %v", checkCtx.Err(), deadline)
153+
}
154+
return checkCtx.Err()
155+
case err := <-ch:
156+
if err != nil {
157+
return err
158+
}
159+
return nil
160+
}
161+
}
162+
163+
func createChildContext(ctx context.Context, timeout *time.Duration) (context.Context, context.CancelFunc) {
164+
if timeout != nil {
165+
return context.WithTimeout(ctx, *timeout)
166+
}
167+
return context.WithCancel(ctx)
168+
}

0 commit comments

Comments
 (0)