Skip to content

Commit 655a0af

Browse files
gianniLeslVishnuKarthikRavindran
authored andcommitted
Updated agent worker to exit if termination channel not established
1 parent 461af16 commit 655a0af

File tree

2 files changed

+145
-4
lines changed

2 files changed

+145
-4
lines changed

agent/ipc/messagebus/respondent.go

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ import (
2828
_ "go.nanomsg.org/mangos/v3/transport/ipc"
2929
)
3030

31+
const (
32+
recvErrSleepTime = 30 * time.Second
33+
maxRecvErrCount = 5
34+
)
35+
3136
// IMessageBus is the interface for process the core agent broadcast request
3237
type IMessageBus interface {
3338
ProcessHealthRequest()
@@ -93,12 +98,25 @@ func (bus *MessageBus) ProcessHealthRequest() {
9398
}
9499

95100
log.Infof("Start to listen to Core Agent health channel")
101+
errRecvCount := 0
102+
96103
for {
97104
var request *message.Message
98105
if msg, err = bus.healthChannel.Recv(); err != nil {
99-
log.Errorf("Failed to receive from health channel: %s", err.Error())
106+
errRecvCount++
107+
log.Errorf("failed to receive from health channel: %s", err.Error())
108+
if errRecvCount >= maxRecvErrCount {
109+
// Agent core will still consider worker healthy as the ssm-agent-worker exists in the system process tree
110+
log.Errorf("failed to receive from agent core health channel %v times. Stopping health ipc listener", errRecvCount)
111+
return
112+
}
113+
114+
log.Debugf("Retrying receive from core agent health channel in %v seconds", recvErrSleepTime.Seconds())
115+
bus.sleepFunc(recvErrSleepTime)
100116
continue
101117
}
118+
119+
errRecvCount = 0
102120
log.Debugf("Received health request from core agent %s", string(msg))
103121

104122
if err = json.Unmarshal(msg, &request); err != nil {
@@ -158,15 +176,33 @@ func (bus *MessageBus) ProcessTerminationRequest() {
158176

159177
log.Infof("Start to listen to Core Agent termination channel")
160178
bus.terminationChannelConnected <- true
179+
errRecvCount := 0
161180

162181
for {
163182
var request *message.Message
164183
if msg, err = bus.terminationChannel.Recv(); err != nil {
165-
log.Errorf("cannot recv: %s", err.Error())
184+
log.Errorf("cannot receive message from core agent termination channel: %s", err.Error())
185+
errRecvCount++
186+
if errRecvCount >= maxRecvErrCount {
187+
// Consider communication channel to agent core to be broken
188+
// When ssm-agent-worker exits agent-ssm-agent (agent core) creates new worker with updated communication channel
189+
log.Errorf("failed to receive message from core agent termination channel %v times. Initiating worker shutdown", errRecvCount)
190+
// Unblock main() function to allow agent worker to exit
191+
bus.terminationRequestChannel <- true
192+
close(bus.terminationRequestChannel)
193+
// exit termination channel goroutine
194+
break
195+
}
196+
197+
log.Debugf("Retrying receive from core agent termination channel in %v seconds", recvErrSleepTime.Seconds())
198+
bus.sleepFunc(recvErrSleepTime)
166199
continue
167200
}
201+
168202
log.Infof("Received termination message from core agent %s", string(msg))
203+
errRecvCount = 0
169204
if err = json.Unmarshal(msg, &request); err != nil {
205+
// Incoming message is dropped and not retried
170206
log.Errorf("failed to unmarshal message: %s", err.Error())
171207
continue
172208
}
@@ -180,10 +216,12 @@ func (bus *MessageBus) ProcessTerminationRequest() {
180216
message.LongRunning,
181217
os.Getpid(),
182218
true); err != nil {
219+
// Response message is dropped and not retried
183220
log.Errorf("failed to create termination response: %s", err.Error())
184221
}
185222

186223
if err = bus.terminationChannel.Send(result); err != nil {
224+
// Response message is dropped and not retried
187225
log.Errorf("failed to send termination response: %s", err.Error())
188226
continue
189227
}

agent/ipc/messagebus/respondent_test.go

Lines changed: 105 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import (
2424
"github.com/aws/amazon-ssm-agent/agent/log"
2525
contextmocks "github.com/aws/amazon-ssm-agent/agent/mocks/context"
2626
logmocks "github.com/aws/amazon-ssm-agent/agent/mocks/log"
27-
channel "github.com/aws/amazon-ssm-agent/common/channel"
27+
"github.com/aws/amazon-ssm-agent/common/channel"
2828
channelmocks "github.com/aws/amazon-ssm-agent/common/channel/mocks"
2929
"github.com/aws/amazon-ssm-agent/common/message"
3030
"github.com/stretchr/testify/mock"
@@ -66,6 +66,109 @@ func (suite *MessageBusTestSuite) SetupTest() {
6666
}
6767
}
6868

69+
func (suite *MessageBusTestSuite) TestProcessHealthRequest_Successful() {
70+
// Arrange
71+
suite.mockHealthChannel.On("IsChannelInitialized").Return(true).Once()
72+
suite.mockHealthChannel.On("IsDialSuccessful").Return(true).Once()
73+
suite.mockHealthChannel.On("Close").Return(nil).Once()
74+
request := message.CreateHealthRequest()
75+
requestString, _ := jsonutil.Marshal(request)
76+
suite.mockHealthChannel.On("Recv").Return([]byte(requestString), nil).Once()
77+
suite.mockHealthChannel.On("Send", mock.Anything).Return(nil)
78+
// Kills the infinite loop
79+
suite.mockHealthChannel.On("Recv").Return(nil, fmt.Errorf("failed to receive message on channel")).Times(maxRecvErrCount)
80+
81+
// Act
82+
suite.messageBus.ProcessHealthRequest()
83+
84+
// Assert
85+
suite.mockHealthChannel.AssertExpectations(suite.T())
86+
}
87+
88+
func (suite *MessageBusTestSuite) TestProcessHealthRequest_RecvError() {
89+
// Arrange
90+
suite.mockHealthChannel.On("IsChannelInitialized").Return(true).Once()
91+
suite.mockHealthChannel.On("IsDialSuccessful").Return(true).Once()
92+
suite.mockHealthChannel.On("Close").Return(nil).Once()
93+
suite.mockHealthChannel.On("Recv").Return(nil, fmt.Errorf("failed to receive message on channel")).Times(maxRecvErrCount)
94+
95+
// Act
96+
suite.messageBus.ProcessHealthRequest()
97+
98+
// Assert
99+
suite.mockHealthChannel.AssertExpectations(suite.T())
100+
}
101+
102+
func (suite *MessageBusTestSuite) TestProcessHealthRequest_RecvErrorCount_Resets() {
103+
// Arrange
104+
suite.mockHealthChannel.On("IsChannelInitialized").Return(true).Once()
105+
suite.mockHealthChannel.On("IsDialSuccessful").Return(true).Once()
106+
suite.mockHealthChannel.On("Close").Return(nil).Once()
107+
suite.mockHealthChannel.On("Recv").Return(nil, fmt.Errorf("failed to receive message on channel")).Times(maxRecvErrCount - 1)
108+
request := message.CreateHealthRequest()
109+
requestString, _ := jsonutil.Marshal(request)
110+
suite.mockHealthChannel.On("Recv").Return([]byte(requestString), nil).Once()
111+
suite.mockHealthChannel.On("Send", mock.Anything).Return(nil)
112+
// Kills the infinite loop
113+
suite.mockHealthChannel.On("Recv").Return(nil, fmt.Errorf("failed to receive message on channel")).Times(maxRecvErrCount)
114+
115+
// Act
116+
suite.messageBus.ProcessHealthRequest()
117+
118+
// Assert
119+
suite.mockHealthChannel.AssertExpectations(suite.T())
120+
}
121+
122+
func (suite *MessageBusTestSuite) TestProcessTerminationRequest_Error() {
123+
suite.mockTerminateChannel.On("IsDialSuccessful").Return(true).Once()
124+
suite.mockTerminateChannel.On("IsChannelInitialized").Return(true).Once()
125+
suite.mockTerminateChannel.On("Close").Return(nil).Once()
126+
suite.mockTerminateChannel.On("Recv").Return(nil, fmt.Errorf("failed to receive message on channel")).Times(maxRecvErrCount)
127+
128+
suite.messageBus.ProcessTerminationRequest()
129+
130+
suite.mockTerminateChannel.AssertExpectations(suite.T())
131+
132+
// Assert termination channel connected and that a termination message is sent
133+
suite.Assertions.Equal(true, <-suite.messageBus.GetTerminationChannelConnectedChan())
134+
suite.Assertions.Equal(true, <-suite.messageBus.GetTerminationRequestChan())
135+
}
136+
137+
func (suite *MessageBusTestSuite) TestProcessTerminationRequest_RecvRetried() {
138+
suite.mockTerminateChannel.On("IsDialSuccessful").Return(true).Once()
139+
suite.mockTerminateChannel.On("IsChannelInitialized").Return(true).Once()
140+
suite.mockTerminateChannel.On("Close").Return(nil).Once()
141+
suite.mockTerminateChannel.On("Recv").Return(nil, fmt.Errorf("failed to receive message on channel")).Times(maxRecvErrCount - 1)
142+
request := message.CreateTerminateWorkerRequest()
143+
requestString, _ := jsonutil.Marshal(request)
144+
suite.mockTerminateChannel.On("Recv").Return([]byte(requestString), nil)
145+
suite.mockTerminateChannel.On("Send", mock.Anything).Return(nil)
146+
suite.messageBus.ProcessTerminationRequest()
147+
suite.mockTerminateChannel.AssertExpectations(suite.T())
148+
149+
// Assert termination channel connected and that a termination message is sent
150+
suite.Assertions.Equal(true, <-suite.messageBus.GetTerminationChannelConnectedChan())
151+
suite.Assertions.Equal(true, <-suite.messageBus.GetTerminationRequestChan())
152+
}
153+
154+
func (suite *MessageBusTestSuite) TestProcessTerminationRequest_RecvRetriesReset() {
155+
suite.mockTerminateChannel.On("IsDialSuccessful").Return(true).Once()
156+
suite.mockTerminateChannel.On("IsChannelInitialized").Return(true).Once()
157+
suite.mockTerminateChannel.On("Close").Return(nil).Once()
158+
suite.mockTerminateChannel.On("Recv").Return(nil, fmt.Errorf("failed to receive message on channel")).Times(maxRecvErrCount - 1)
159+
suite.mockTerminateChannel.On("Recv").Return([]byte("not valid json message"), nil).Once()
160+
request := message.CreateTerminateWorkerRequest()
161+
requestString, _ := jsonutil.Marshal(request)
162+
suite.mockTerminateChannel.On("Recv").Return([]byte(requestString), nil).Once()
163+
suite.mockTerminateChannel.On("Send", mock.Anything).Return(nil)
164+
suite.messageBus.ProcessTerminationRequest()
165+
suite.mockTerminateChannel.AssertExpectations(suite.T())
166+
167+
// Assert termination channel connected and that a termination message is sent
168+
suite.Assertions.Equal(true, <-suite.messageBus.GetTerminationChannelConnectedChan())
169+
suite.Assertions.Equal(true, <-suite.messageBus.GetTerminationRequestChan())
170+
}
171+
69172
// Execute the test suite
70173
func TestMessageBusTestSuite(t *testing.T) {
71174
suite.Run(t, new(MessageBusTestSuite))
@@ -113,7 +216,7 @@ func (suite *MessageBusTestSuite) TestProcessTerminationRequest_SuccessfulConnec
113216
suite.mockTerminateChannel.On("Recv").Return([]byte(requestString), nil)
114217
suite.mockTerminateChannel.On("Send", mock.Anything).Return(nil)
115218

116-
// Fourth call to isConnect succeeds, fourth call is for defer where it will call close
219+
// Fourth call to IsChannelInitialized succeeds, fourth call is for defer where it will call close
117220
suite.mockTerminateChannel.On("IsChannelInitialized").Return(true).Once()
118221
suite.mockTerminateChannel.On("Close").Return(nil).Once()
119222

0 commit comments

Comments
 (0)