Skip to content

Commit 1b75441

Browse files
authored
remove selfmon (#86)
1 parent c2f18f7 commit 1b75441

File tree

21 files changed

+56
-659
lines changed

21 files changed

+56
-659
lines changed

Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ unit-test:
2525
go test -race -count=1 -timeout 240s -cover ./logs/... \
2626
./manager/node/... \
2727
./manager/workload/... \
28-
./selfmon/... \
2928
./types/... \
3029
./utils/...
3130

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,4 +68,4 @@ Make sure you can clone code by ssh protocol because libgit2 ask for it. So you
6868
<cli_execute_path> container deploy -pod <pod_name> --entry agent --network <network_name> --deploy-method fill --image <projecteru2/agent>|<your_own_image> --count 1 --file <agent_config_yaml>:/etc/eru/agent.yaml [--cpu 0.3 | --mem 1024000000] http://bit.ly/EruAgent
6969
```
7070

71-
Now you will find agent was started in each node, and monitor containers status include itself.
71+
Now you will find that the agent has been started on each node and monitors the status of containers, including its own.

agent.go

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package main
22

33
import (
4+
"context"
45
"fmt"
56
"math/rand"
67
"os"
@@ -12,7 +13,6 @@ import (
1213
"github.com/projecteru2/agent/api"
1314
"github.com/projecteru2/agent/manager/node"
1415
"github.com/projecteru2/agent/manager/workload"
15-
"github.com/projecteru2/agent/selfmon"
1616
"github.com/projecteru2/agent/types"
1717
"github.com/projecteru2/agent/utils"
1818
"github.com/projecteru2/agent/version"
@@ -56,17 +56,12 @@ func serve(c *cli.Context) error {
5656
utils.WritePid(config.PidFile)
5757
defer os.Remove(config.PidFile)
5858

59-
if c.Bool("selfmon") {
60-
mon, err := selfmon.New(c.Context, config)
61-
if err != nil {
62-
return err
63-
}
64-
return mon.Run(c.Context)
65-
}
66-
67-
ctx, cancel := signal.NotifyContext(c.Context, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
59+
ctx, cancel := context.WithCancel(c.Context)
6860
defer cancel()
6961

62+
signalChan := make(chan os.Signal, 1)
63+
signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGUSR1)
64+
7065
errChan := make(chan error, 2)
7166
defer close(errChan)
7267

@@ -103,10 +98,18 @@ func serve(c *cli.Context) error {
10398
go func() {
10499
select {
105100
case <-ctx.Done():
106-
log.Info("[agent] Agent caught system signal, exiting")
101+
log.Info("[agent] Agent exiting")
107102
case <-errChan:
108103
log.Info("[agent] got err, exiting")
109104
cancel()
105+
case sig := <-signalChan:
106+
log.Infof("[agent] Agent caught system signal %v", sig)
107+
if sig != syscall.SIGUSR1 {
108+
if err := nodeManager.Exit(); err != nil {
109+
log.Errorf("[agent] node manager exits with err: %v", err)
110+
}
111+
}
112+
cancel()
110113
}
111114
}()
112115

@@ -234,17 +237,6 @@ func main() {
234237
Usage: "change hostname",
235238
EnvVars: []string{"ERU_HOSTNAME"},
236239
},
237-
&cli.BoolFlag{
238-
Name: "selfmon",
239-
Value: false,
240-
Usage: "run this agent as a selfmon daemon",
241-
},
242-
&cli.StringFlag{
243-
Name: "kv",
244-
Value: "",
245-
Usage: "kv type",
246-
EnvVars: []string{"ERU_AGENT_KV"},
247-
},
248240
&cli.BoolFlag{
249241
Name: "check-only-mine",
250242
Value: false,

agent.yaml.sample

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@ store: grpc
1212
# This option is not required as the default value is "docker".
1313
runtime: docker
1414

15-
# kv defines the type of kv store.
16-
# This option is not required as the default value is "etcd".
17-
kv: etcd
18-
1915
# core defines the address of eru-core component.
2016
# This option is not required as the default value is "127.0.0.1:5001".
2117
core:
@@ -24,10 +20,8 @@ core:
2420

2521
# heartbeat_interval defines the interval for eru-agent to
2622
# report health status of the node to eru-core.
27-
# This option is not required, and is only useful when enabling
28-
# selfmon mode.
2923
# If you don't want eru-agent to report this status, set it to 0.
30-
# The default value of this option is 0.
24+
# The default value of this option is 60.
3125
heartbeat_interval: 120
3226

3327
# auth defines the authentication values for eru-core.
@@ -117,41 +111,13 @@ log:
117111
#
118112
# healthcheck.cache_ttl defines how long eru-agent will cache an unchanged status locally.
119113
# This is only used when selfmon mode is switched on. The default value is 300 (in seconds).
120-
#
121-
# healthcheck.enable_selfmon defines whether selfmon is switched on.
122-
# This should be true if there's at least one eru-agent is in mode selfmon.
123-
# When this is true, healthcheck.enable_selfmon and heartbeat_interval is meaningless.
124114
healthcheck:
125115
interval: 120
126116
timeout: 10
127117
cache_ttl: 300
128-
enable_selfmon: false
129118

130119
# global_connection_timeout defines the timeout for eru-agent other than healthcheck.
131120
# E.g. the timeout for reporting action of eru-agent, or the timeout for eru-agent to
132121
# connect to docker.
133122
# The default value is "5s"; note the trailing "s" (seconds).
134-
global_connection_timeout: 15s
135-
136-
# ha_keepalive_interval defines the time interval for sending heartbeat
137-
# when selfmon maintains its own active state.
138-
# The default value is "16s", note that "s" in the end.
139-
ha_keepalive_interval: 16s
140-
141-
# etcd defines the etcd configuration.
142-
# This option is required and has no default value.
143-
# If you don't plan to run this eru-agent in selfmon mode,
144-
# you can give a mocked value e.g. 127.0.0.1:1111,
145-
# this value won't be used to connect, it's only to pass
146-
# the validation of this option (it's tricky).
147-
# Will plan to improve this in next release.
148-
#
149-
# etcd.machines defines the addresses of etcd machines.
150-
#
151-
# etcd.prefix defines the prefix for eru-agents in selfmon mode.
152-
# This prefix should be the same for all eru-agents in selfmon mode,
153-
# and also distinguished for different ERU clusters.
154-
etcd:
155-
machines:
156-
- 127.0.0.1:2379
157-
prefix: /agent-selfmon
123+
global_connection_timeout: 15s

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ require (
1212
github.com/go-ole/go-ole v0.0.0-20180213002836-a1ec82a652eb // indirect
1313
github.com/jinzhu/configor v1.2.1
1414
github.com/patrickmn/go-cache v2.1.0+incompatible
15-
github.com/pkg/errors v0.9.1
15+
github.com/pkg/errors v0.9.1 // indirect
1616
github.com/projecteru2/core v0.0.0-20211021040158-0be8dbadbc55
1717
github.com/projecteru2/libyavirt v0.0.0-20211014062234-66e6f24ab6d1
1818
github.com/prometheus/client_golang v1.11.0

manager/node/manager.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,22 +78,24 @@ func (m *Manager) Run(ctx context.Context) error {
7878
log.Info("[NodeManager] start node status heartbeat")
7979
go m.heartbeat(ctx)
8080

81-
// wait for signal
8281
<-ctx.Done()
83-
return m.exit()
82+
log.Info("[NodeManager] exiting")
83+
return nil
8484
}
8585

86-
func (m *Manager) exit() error {
86+
// Exit removes this node's status from the store, using a fresh context with the global connection timeout.
87+
func (m *Manager) Exit() error {
8788
log.Info("[NodeManager] exiting")
88-
log.Infof("[NodeManager] mark node %s as down", m.config.HostName)
89+
log.Infof("[NodeManager] remove node status of %s", m.config.HostName)
8990

9091
// ctx is now canceled. use a new context.
9192
var err error
9293
utils.WithTimeout(context.TODO(), m.config.GlobalConnectionTimeout, func(ctx context.Context) {
93-
err = m.store.SetNode(ctx, m.config.HostName, false)
94+
// remove node status
95+
err = m.store.SetNodeStatus(ctx, -1)
9496
})
9597
if err != nil {
96-
log.Errorf("[NodeManager] failed to mark the node %s as down, err: %s", m.config.HostName, err)
98+
log.Errorf("[NodeManager] failed to remove node status of %v, err: %s", m.config.HostName, err)
9799
return err
98100
}
99101
return nil

manager/node/manager_test.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,9 @@ func newMockNodeManager(t *testing.T) *Manager {
2323
Stdout: true,
2424
},
2525
HealthCheck: types.HealthCheckConfig{
26-
Interval: 10,
27-
Timeout: 5,
28-
CacheTTL: 300,
29-
EnableSelfmon: true,
26+
Interval: 10,
27+
Timeout: 5,
28+
CacheTTL: 300,
3029
},
3130
GlobalConnectionTimeout: 5 * time.Second,
3231
}

manager/workload/manager_test.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,9 @@ func newMockWorkloadManager(t *testing.T) *Manager {
2323
Stdout: true,
2424
},
2525
HealthCheck: types.HealthCheckConfig{
26-
Interval: 10,
27-
Timeout: 5,
28-
CacheTTL: 300,
29-
EnableSelfmon: true,
26+
Interval: 10,
27+
Timeout: 5,
28+
CacheTTL: 300,
3029
},
3130
GlobalConnectionTimeout: 5 * time.Second,
3231
}

runtime/mocks/Runtime.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

selfmon/node.go

Lines changed: 0 additions & 104 deletions
This file was deleted.

0 commit comments

Comments
 (0)