Skip to content

Commit 8fa0a08

Browse files
authored
Merge pull request #7 from chainstack/starter_errortrack
starter with errortrack flag
2 parents d4d96da + 272a18b commit 8fa0a08

File tree

4 files changed

+244
-1
lines changed

4 files changed

+244
-1
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ COPY --from=builder /app/build/bin/state /usr/local/bin/state
8181
COPY --from=builder /app/build/bin/txpool /usr/local/bin/txpool
8282
COPY --from=builder /app/build/bin/verkle /usr/local/bin/verkle
8383
COPY --from=builder /app/build/bin/caplin-phase1 /usr/local/bin/caplin-phase1
84+
COPY --from=builder /app/build/bin/starter /usr/local/bin/starter
8485

8586

8687

@@ -109,4 +110,3 @@ LABEL org.label-schema.build-date=$BUILD_DATE \
109110
org.label-schema.vendor="Torquem" \
110111
org.label-schema.version=$VERSION
111112

112-
ENTRYPOINT ["erigon"]

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ geth: erigon
105105
erigon: go-version erigon.cmd
106106
@rm -f $(GOBIN)/tg # Remove old binary to prevent confusion where users still use it because of the scripts
107107

108+
COMMANDS += starter
108109
COMMANDS += devnet
109110
COMMANDS += erigon-el-mock
110111
COMMANDS += downloader

cmd/starter/README.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Starter
2+
3+
Starter is an additional utility for tracking errors in the logs of the main process. If the given substring is found in the logs N or more times within the given period of time, starter restarts the process.
4+
5+
If starter receives a signal to stop, it will stop the main process and exit.
6+
7+
## Build
8+
9+
```go build -o /build/bin/starter cmd/starter/main.go```
10+
11+
or
12+
13+
```make starter```
14+
15+
## Usage
16+
Each `--errortrack` flag must be followed by exactly 3 arguments; the remaining arguments form the command for the child process.
17+
18+
```starter [--errortrack error_substring error_limit time_window] <command> [args...]```
19+
20+
- `error_substring` - substring to search in the logs
21+
- `error_limit` - number of occurrences of the substring within the time window that triggers a restart
22+
- `time_window` - time window for error searching, in Go duration syntax (e.g. 60s, 1m, 1h; use 24h for one day, since a `d` unit is not supported)
23+
- `command` - command to run
24+
- `[args...]` - arguments for the command
25+
## Example
26+
This command will restart erigon if the substring 'No block bodies' is found in the logs 50 or more times within 5 minutes:
27+
28+
```starter --errortrack 'No block bodies' 50 5m erigon --datadir /data --private.api.addr```
29+
30+
It is possible to track multiple errors by using `--errortrack` flag multiple times.
31+
32+
```starter --errortrack 'No block bodies' 50 5m --errortrack 'No peers' 10 60s erigon --datadir /data --private.api.addr```

cmd/starter/main.go

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
package main
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"io"
7+
"os"
8+
"os/exec"
9+
"os/signal"
10+
"sort"
11+
"strconv"
12+
"strings"
13+
"sync"
14+
"syscall"
15+
"time"
16+
)
17+
18+
// ErrorTracker records when a given substring was seen in the child's output
// and decides whether the occurrences are frequent enough to warrant a restart.
type ErrorTracker struct {
	// Substring is the text searched for in each output line.
	Substring string
	// Limit is the number of occurrences that must fall within Window to
	// trigger a restart. A non-positive Limit never triggers a restart.
	Limit int
	// Window is the maximum time span allowed between the oldest and the
	// newest of the last Limit occurrences.
	Window time.Duration
	// Errors holds the timestamps of the most recent occurrences, oldest
	// first. AddError keeps it trimmed to roughly Limit entries.
	Errors []time.Time
}

// AddError records an occurrence at time t and drops the oldest recorded
// occurrence once more than Limit timestamps are held.
func (et *ErrorTracker) AddError(t time.Time) {
	et.Errors = append(et.Errors, t)
	if len(et.Errors) > et.Limit {
		et.Errors = et.Errors[1:]
	}
}

// ShouldRestartProcess reports whether at least Limit occurrences are recorded
// and the oldest and newest of them are no more than Window apart.
func (et *ErrorTracker) ShouldRestartProcess() bool {
	// With Limit <= 0 AddError leaves Errors empty, so indexing below would
	// panic; treat such a tracker as disabled instead.
	if et.Limit <= 0 || len(et.Errors) < et.Limit {
		return false
	}

	oldest := et.Errors[0]
	newest := et.Errors[len(et.Errors)-1]
	return newest.Sub(oldest) <= et.Window
}
41+
42+
func parseErrorTrackers() ([]ErrorTracker, []string) {
43+
trackers := make([]ErrorTracker, 0)
44+
args := os.Args[1:]
45+
maxArg := -1
46+
47+
for i, arg := range args {
48+
if arg == "--errortrack" {
49+
if i+3 >= len(args) {
50+
log("Invalid --errortrack parameters")
51+
os.Exit(1)
52+
}
53+
54+
substring := args[i+1]
55+
limit, err := strconv.Atoi(args[i+2])
56+
if err != nil {
57+
log("Error parsing limit: %v", err)
58+
os.Exit(1)
59+
}
60+
61+
window, err := time.ParseDuration(args[i+3])
62+
if err != nil {
63+
log("Error parsing time window: %v", err)
64+
os.Exit(1)
65+
}
66+
67+
trackers = append(trackers, ErrorTracker{Substring: substring, Limit: limit, Window: window})
68+
if i+3 > maxArg {
69+
maxArg = i + 3
70+
}
71+
}
72+
}
73+
74+
if maxArg >= len(args)-1 {
75+
log("No command provided")
76+
os.Exit(1)
77+
}
78+
79+
cmdArgs := args[maxArg+1:]
80+
return trackers, cmdArgs
81+
}
82+
83+
// log prints a printf-style message to standard output, prefixed with the
// "[starter]" tag and the current time formatted as [MM-DD|hh:mm:ss.mmm],
// and terminated by a newline.
func log(format string, a ...interface{}) {
	stamp := time.Now().Format("[01-02|15:04:05.000]")
	fmt.Printf("[starter] "+stamp+" "+format+"\n", a...)
}
86+
87+
func main() {
88+
trackers, cmdArgs := parseErrorTrackers()
89+
if len(cmdArgs) == 0 {
90+
log("No command provided")
91+
os.Exit(1)
92+
}
93+
94+
cmdName := cmdArgs[0]
95+
cmdArgs = cmdArgs[1:]
96+
97+
// Set up signal handling
98+
sigChan := make(chan os.Signal, 1)
99+
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
100+
restartChan := make(chan struct{}, 2)
101+
exitChan := make(chan struct{}, 2)
102+
var cmd *exec.Cmd
103+
var stdoutPipe, stderrPipe io.ReadCloser
104+
var wg sync.WaitGroup
105+
restartInit := false
106+
107+
scanAndTrackErrors := func(scanner *bufio.Scanner, trackers []ErrorTracker, lastwriter io.Writer, restartChan, exitChan chan struct{}) {
108+
109+
for scanner.Scan() {
110+
111+
line := scanner.Text()
112+
113+
fmt.Fprintln(lastwriter, line)
114+
115+
if restartInit {
116+
continue
117+
}
118+
119+
for i := range trackers {
120+
if strings.Contains(line, trackers[i].Substring) {
121+
trackers[i].AddError(time.Now())
122+
123+
if len(trackers[i].Errors) == 1 {
124+
log("Find error substring %q", trackers[i].Substring)
125+
} else if len(trackers[i].Errors) < trackers[i].Limit {
126+
log("Find error substring %q, %d errors in %s", trackers[i].Substring, len(trackers[i].Errors), trackers[i].Errors[len(trackers[i].Errors)-1].Sub(trackers[i].Errors[0]).Round(time.Second))
127+
} else {
128+
// find the number of errors in the last window (last is the most recent)
129+
n := sort.Search(len(trackers[i].Errors), func(j int) bool {
130+
return trackers[i].Errors[len(trackers[i].Errors)-1].Sub(trackers[i].Errors[j]) <= trackers[i].Window
131+
})
132+
log("Find error substring %q, %d errors in %s (%d errors in last %s)", trackers[i].Substring, len(trackers[i].Errors),
133+
trackers[i].Errors[len(trackers[i].Errors)-1].Sub(trackers[i].Errors[0]).Round(time.Second),
134+
len(trackers[i].Errors)-n, trackers[i].Window.Round(time.Second),
135+
)
136+
}
137+
138+
if trackers[i].ShouldRestartProcess() && !restartInit {
139+
log("Restarting process due to error limit for substring %q", trackers[i].Substring)
140+
restartInit = true
141+
restartChan <- struct{}{}
142+
}
143+
}
144+
}
145+
}
146+
if !restartInit {
147+
exitChan <- struct{}{}
148+
}
149+
}
150+
151+
runCmd := func() {
152+
cmd = exec.Command(cmdName, cmdArgs...)
153+
stdoutPipe, _ = cmd.StdoutPipe()
154+
stderrPipe, _ = cmd.StderrPipe()
155+
restartInit = false
156+
157+
wg.Add(2)
158+
go func() {
159+
defer wg.Done()
160+
scanner := bufio.NewScanner(stdoutPipe)
161+
scanAndTrackErrors(scanner, trackers, os.Stdout, restartChan, exitChan)
162+
}()
163+
164+
go func() {
165+
defer wg.Done()
166+
scanner := bufio.NewScanner(stderrPipe)
167+
scanAndTrackErrors(scanner, trackers, os.Stderr, restartChan, exitChan)
168+
}()
169+
170+
log("Starting process %q", cmdName)
171+
cmd.Start()
172+
173+
}
174+
175+
runCmd()
176+
177+
for {
178+
select {
179+
case sig := <-sigChan:
180+
log("Received signal %v", sig)
181+
switch sig {
182+
case syscall.SIGINT, syscall.SIGTERM:
183+
log("Stopping %q process", cmdName)
184+
cmd.Process.Signal(sig)
185+
cmd.Wait()
186+
wg.Wait()
187+
time.Sleep(100 * time.Millisecond)
188+
os.Exit(0)
189+
}
190+
case <-restartChan:
191+
log("Too many errors, restarting process")
192+
if err := cmd.Process.Signal(syscall.SIGINT); err != nil {
193+
log("Error sending signal to process: %v", err)
194+
}
195+
cmd.Wait()
196+
wg.Wait()
197+
log("%q exited, restarting", cmdName)
198+
for i := range trackers {
199+
trackers[i].Errors = []time.Time{}
200+
}
201+
runCmd()
202+
case <-exitChan:
203+
log("%q exited, exiting", cmdName)
204+
cmd.Wait()
205+
wg.Wait()
206+
time.Sleep(100 * time.Millisecond)
207+
os.Exit(0)
208+
}
209+
}
210+
}

0 commit comments

Comments
 (0)