Skip to content

Commit 958814d

Browse files
authored
Allow Endpoint service to keep running if its install exit code is non-fatal (#9320)
* Allow services to keep running if install exit code is non-fatal * Improve logging * Adding a mock binary * Adding test * Adding missing headers * Adding CHANGELOG entry * Add error assertions * Use npipe.Dial if OS is Windows * Add .exe extension to mock binary file name * Log mock binary filename in test * Add nil guard
1 parent a555980 commit 958814d

File tree

7 files changed

+220
-3
lines changed

7 files changed

+220
-3
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: bug-fix
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Treat exit code 28 from Endpoint binary as non-fatal
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodate a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
#description:
20+
21+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
22+
component: elastic-agent
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/9320
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
32+
#issue: https://github.com/owner/repo/1234

pkg/component/runtime/service.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ import (
99
"errors"
1010
"fmt"
1111
"net/url"
12+
"os/exec"
1213
"runtime"
14+
"slices"
1315
"time"
1416

1517
"github.com/kardianos/service"
@@ -239,6 +241,18 @@ func (s *serviceRuntime) Run(ctx context.Context, comm Communicator) (err error)
239241
// Start service
240242
err = s.start(ctx)
241243
if err != nil {
244+
// If the error is due to a non-fatal exit code, continue running the service
245+
var exitErr *exec.ExitError
246+
if errors.As(err, &exitErr) {
247+
exitCode := exitErr.ExitCode()
248+
s.log.Debugf("service %s start failed with exit code %d, err = %s", s.name(), exitCode, err)
249+
if s.comp.InputSpec.Spec.Service.Operations.Install != nil &&
250+
slices.Contains(s.comp.InputSpec.Spec.Service.Operations.Install.NonFatalExitCodes, exitCode) {
251+
s.log.Warnf("exit code %d is non-fatal, continuing to run...", exitCode)
252+
continue
253+
}
254+
}
255+
242256
cisStop()
243257
break
244258
}

pkg/component/runtime/service_test.go

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,30 @@
55
package runtime
66

77
import (
8+
"context"
9+
"fmt"
10+
"net"
811
"net/url"
12+
"os"
13+
"os/exec"
14+
"path/filepath"
15+
"runtime"
16+
"strconv"
17+
"strings"
918
"testing"
19+
"time"
20+
21+
"github.com/elastic/elastic-agent-libs/api/npipe"
22+
23+
"go.uber.org/zap/zapcore"
24+
25+
"github.com/stretchr/testify/require"
1026

1127
"github.com/elastic/elastic-agent-client/v7/pkg/client"
1228
"github.com/elastic/elastic-agent-client/v7/pkg/proto"
1329
"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
1430
"github.com/elastic/elastic-agent/pkg/component"
31+
"github.com/elastic/elastic-agent/pkg/core/logger/loggertest"
1532

1633
"github.com/google/go-cmp/cmp"
1734
"github.com/google/go-cmp/cmp/cmpopts"
@@ -255,3 +272,122 @@ func TestGetConnInfoServerAddress(t *testing.T) {
255272
})
256273
}
257274
}
275+
276+
// TestCISKeepsRunningOnNonFatalExitCodeFromStart tests that the connection info
277+
// server keeps running when starting a service component results in a non-fatal
278+
// exit code.
279+
func TestCISKeepsRunningOnNonFatalExitCodeFromStart(t *testing.T) {
280+
log, logObs := loggertest.New("test")
281+
const nonFatalExitCode = 99
282+
const cisPort = 9999
283+
const cisSocket = ".teaci.sock"
284+
285+
// Make an Endpoint component for testing
286+
endpoint := makeEndpointComponent(t, map[string]interface{}{})
287+
endpoint.InputSpec.Spec.Service = &component.ServiceSpec{
288+
CPort: cisPort,
289+
CSocket: cisSocket,
290+
Log: nil,
291+
Operations: component.ServiceOperationsSpec{
292+
Check: &component.ServiceOperationsCommandSpec{},
293+
Install: &component.ServiceOperationsCommandSpec{
294+
NonFatalExitCodes: []int{nonFatalExitCode},
295+
},
296+
},
297+
Timeouts: component.ServiceTimeoutSpec{},
298+
}
299+
300+
// Create binary mocking Endpoint such that executing it will return
301+
// the non-fatal exit code from the spec above.
302+
endpoint.InputSpec.BinaryPath = mockEndpointBinary(t, nonFatalExitCode)
303+
endpoint.InputSpec.BinaryName = "endpoint"
304+
305+
t.Logf("mock binary path: %s\n", endpoint.InputSpec.BinaryPath)
306+
307+
// Create new service runtime with component
308+
service, err := newServiceRuntime(endpoint, log, true)
309+
require.NoError(t, err)
310+
311+
ctx, cancel := context.WithCancel(context.Background())
312+
defer cancel()
313+
comm := newMockCommunicator("")
314+
315+
// Observe component state
316+
go func() {
317+
for {
318+
select {
319+
case <-ctx.Done():
320+
return
321+
case <-service.ch:
322+
}
323+
}
324+
}()
325+
326+
// Run the service
327+
go func() {
328+
err := service.Run(ctx, comm)
329+
require.EqualError(t, err, context.Canceled.Error())
330+
}()
331+
332+
service.actionCh <- actionModeSigned{
333+
actionMode: actionStart,
334+
}
335+
336+
// Check that connection info server is still running and that we see the
337+
// warning log message about Endpoint's install operation failing with a non-fatal exit
338+
// code but the service runtime continuing to run.
339+
cisAddr, err := getConnInfoServerAddress(runtime.GOOS, true, cisPort, cisSocket)
340+
require.NoError(t, err)
341+
342+
parsedCISAddr, err := url.Parse(cisAddr)
343+
require.NoError(t, err)
344+
345+
expectedWarnLogMsg := fmt.Sprintf("exit code %d is non-fatal, continuing to run...", nonFatalExitCode)
346+
require.Eventually(t, func() bool {
347+
if runtime.GOOS != "windows" {
348+
_, err = net.Dial("unix", parsedCISAddr.Host+parsedCISAddr.Path)
349+
} else {
350+
if strings.HasPrefix(cisAddr, "npipe:///") {
351+
path := strings.TrimPrefix(cisAddr, "npipe:///")
352+
cisAddr = `\\.\pipe\` + path
353+
}
354+
_, err = npipe.Dial(cisAddr)("", "")
355+
}
356+
357+
if err != nil {
358+
t.Logf("Connection info server is not running: %v", err)
359+
return false
360+
}
361+
362+
logs := logObs.TakeAll()
363+
for _, l := range logs {
364+
t.Logf("[%s] %s", l.Level, l.Message)
365+
if l.Level == zapcore.WarnLevel && l.Message == expectedWarnLogMsg {
366+
return true
367+
}
368+
}
369+
370+
return false
371+
}, 2*time.Second, 200*time.Millisecond)
372+
}
373+
374+
func mockEndpointBinary(t *testing.T, exitCode int) string {
375+
// Build a mock Endpoint binary that can return a specific exit code.
376+
outPath := filepath.Join(t.TempDir(), "mock_endpoint")
377+
if runtime.GOOS == "windows" {
378+
outPath += ".exe"
379+
}
380+
381+
cmd := exec.Command(
382+
"go", "build",
383+
"-o", outPath,
384+
"-ldflags", "-X 'main.ExitCode="+strconv.Itoa(exitCode)+"'",
385+
"testdata/exitcode/main.go",
386+
)
387+
cmd.Stdout = os.Stdout
388+
cmd.Stderr = os.Stderr
389+
err := cmd.Run()
390+
require.NoError(t, err)
391+
392+
return outPath
393+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
2+
// or more contributor license agreements. Licensed under the Elastic License 2.0;
3+
// you may not use this file except in compliance with the Elastic License 2.0.
4+
5+
package main
6+
7+
// This package contains a go program that exits with an exit code.
8+
// The desired exit code must be set at build time using
9+
// go build -ldflags='-X main.ExitCode=<code>'.
10+
// The resulting binary can be used in tests to simulate an
11+
// Agent-managed component, e.g. Endpoint, that exits with a specific
12+
// exit code.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
2+
// or more contributor license agreements. Licensed under the Elastic License 2.0;
3+
// you may not use this file except in compliance with the Elastic License 2.0.
4+
5+
package main
6+
7+
import (
8+
"os"
9+
"strconv"
10+
)
11+
12+
var ExitCode = "0" // string so it can be set at build time
13+
14+
func main() {
15+
exitCode, err := strconv.Atoi(ExitCode)
16+
if err != nil {
17+
exitCode = -1
18+
}
19+
os.Exit(exitCode)
20+
}

pkg/component/spec.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ type ServiceOperationsSpec struct {
134134

135135
// ServiceOperationsCommandSpec is the specification for execution of binaries to perform the check, install, and uninstall.
136136
type ServiceOperationsCommandSpec struct {
137-
Args []string `config:"args,omitempty" yaml:"args,omitempty"`
138-
Env []CommandEnvSpec `config:"env,omitempty" yaml:"env,omitempty"`
139-
Timeout time.Duration `config:"timeout,omitempty" yaml:"timeout,omitempty"`
137+
Args []string `config:"args,omitempty" yaml:"args,omitempty"`
138+
Env []CommandEnvSpec `config:"env,omitempty" yaml:"env,omitempty"`
139+
Timeout time.Duration `config:"timeout,omitempty" yaml:"timeout,omitempty"`
140+
NonFatalExitCodes []int `config:"non_fatal_exit_codes,omitempty" yaml:"non_fatal_exit_codes,omitempty"`
140141
}

specs/endpoint-security.spec.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ inputs:
4545
- "--resources"
4646
- "endpoint-security-resources.zip"
4747
timeout: 600s
48+
non_fatal_exit_codes:
49+
- 28 # tamper protection enabled but could not (re)install
4850
uninstall:
4951
args:
5052
- "uninstall"

0 commit comments

Comments
 (0)