Skip to content

Commit 057843e

Browse files
[7.17] Handling exceptions on watcher reload (#105442) (#106209)
1 parent 2e97844 commit 057843e

File tree

5 files changed

+134
-11
lines changed

5 files changed

+134
-11
lines changed

docs/changelog/105442.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 105442
2+
summary: Handling exceptions on watcher reload
3+
area: Watcher
4+
type: bug
5+
issues:
6+
- 69842

x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherLifeCycleService.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,9 @@ public void clusterChanged(ClusterChangedEvent event) {
166166
if (watcherService.validate(event.state())) {
167167
previousShardRoutings.set(localAffectedShardRoutings);
168168
if (state.get() == WatcherState.STARTED) {
169-
watcherService.reload(event.state(), "new local watcher shard allocation ids");
169+
watcherService.reload(event.state(), "new local watcher shard allocation ids", (exception) -> {
170+
clearAllocationIds(); // will cause reload again
171+
});
170172
} else if (isStoppedOrStopping) {
171173
this.state.set(WatcherState.STARTING);
172174
watcherService.start(event.state(), () -> this.state.set(WatcherState.STARTED), (exception) -> {

x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherService.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ void stopExecutor() {
202202
* Reload the watcher service. This does not switch the state from stopped to started; it just keeps going.
203203
* @param state cluster state, which is needed to find out about local shards
204204
*/
205-
void reload(ClusterState state, String reason) {
205+
void reload(ClusterState state, String reason, Consumer<Exception> exceptionConsumer) {
206206
boolean hasValidWatcherTemplates = WatcherIndexTemplateRegistry.validate(state);
207207
if (hasValidWatcherTemplates == false) {
208208
logger.warn("missing watcher index templates");
@@ -222,7 +222,10 @@ void reload(ClusterState state, String reason) {
222222
int cancelledTaskCount = executionService.clearExecutionsAndQueue(() -> {});
223223
logger.info("reloading watcher, reason [{}], cancelled [{}] queued tasks", reason, cancelledTaskCount);
224224

225-
executor.execute(wrapWatcherService(() -> reloadInner(state, reason, false), e -> logger.error("error reloading watcher", e)));
225+
executor.execute(wrapWatcherService(() -> reloadInner(state, reason, false), e -> {
226+
logger.error("error reloading watcher", e);
227+
exceptionConsumer.accept(e);
228+
}));
226229
}
227230

228231
/**

x-pack/plugin/watcher/src/test/java/org/elasticsearch/xpack/watcher/WatcherLifeCycleServiceTests.java

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,91 @@ public void testExceptionOnStart() {
252252
assertThat(lifeCycleService.getState().get(), equalTo(WatcherState.STARTED));
253253
}
254254

255+
public void testReloadWithIdenticalRoutingTable() {
256+
/*
257+
* This tests that an identical routing table causes a reload only once.
258+
*/
259+
startWatcher();
260+
261+
ClusterChangedEvent[] events = masterChangeScenario();
262+
assertThat(events[1].previousState(), equalTo(events[0].state()));
263+
assertFalse(events[1].routingTableChanged());
264+
265+
for (ClusterChangedEvent event : events) {
266+
when(watcherService.validate(event.state())).thenReturn(true);
267+
lifeCycleService.clusterChanged(event);
268+
}
269+
// reload should occur on the first event
270+
verify(watcherService).reload(eq(events[0].state()), anyString(), any());
271+
// but it shouldn't on the second event unless routing table changes
272+
verify(watcherService, never()).reload(eq(events[1].state()), anyString(), any());
273+
}
274+
275+
public void testReloadWithIdenticalRoutingTableAfterException() {
276+
/*
277+
* This tests that even an identical routing table causes a reload again if an exception (for example, a timeout while loading
278+
* watches) interrupted the previous one.
279+
*/
280+
startWatcher();
281+
282+
ClusterChangedEvent[] events = masterChangeScenario();
283+
assertThat(events[1].previousState(), equalTo(events[0].state()));
284+
assertFalse(events[1].routingTableChanged());
285+
286+
// simulate exception on the first event
287+
doAnswer(invocation -> {
288+
Consumer<Exception> exceptionConsumer = invocation.getArgument(2);
289+
exceptionConsumer.accept(new ElasticsearchTimeoutException(new TimeoutException("Artificial timeout")));
290+
return null;
291+
}).when(watcherService).reload(eq(events[0].state()), anyString(), any());
292+
293+
for (ClusterChangedEvent event : events) {
294+
when(watcherService.validate(event.state())).thenReturn(true);
295+
lifeCycleService.clusterChanged(event);
296+
}
297+
// reload should occur on the first event but it fails
298+
verify(watcherService).reload(eq(events[0].state()), anyString(), any());
299+
// reload should occur again on the second event because the previous one failed
300+
verify(watcherService).reload(eq(events[1].state()), anyString(), any());
301+
}
302+
303+
private ClusterChangedEvent[] masterChangeScenario() {
304+
DiscoveryNodes nodes = new DiscoveryNodes.Builder().localNodeId("node_1").add(newNode("node_1")).add(newNode("node_2")).build();
305+
306+
Index index = new Index(Watch.INDEX, "uuid");
307+
IndexRoutingTable.Builder indexRoutingTableBuilder = IndexRoutingTable.builder(index);
308+
indexRoutingTableBuilder.addShard(
309+
TestShardRouting.newShardRouting(new ShardId(index, 0), "node_1", true, ShardRoutingState.STARTED)
310+
);
311+
RoutingTable routingTable = RoutingTable.builder().add(indexRoutingTableBuilder.build()).build();
312+
313+
IndexMetadata.Builder indexMetadataBuilder = IndexMetadata.builder(Watch.INDEX)
314+
.settings(settings(Version.CURRENT).put(IndexMetadata.INDEX_FORMAT_SETTING.getKey(), 6)) // the internal index format,
315+
// required
316+
.numberOfShards(1)
317+
.numberOfReplicas(0);
318+
Metadata metadata = Metadata.builder()
319+
.put(IndexTemplateMetadata.builder(HISTORY_TEMPLATE_NAME).patterns(randomIndexPatterns()))
320+
.put(indexMetadataBuilder)
321+
.build();
322+
323+
ClusterState emptyState = ClusterState.builder(new ClusterName("my-cluster")).nodes(nodes).metadata(metadata).build();
324+
ClusterState stateWithMasterNode1 = ClusterState.builder(new ClusterName("my-cluster"))
325+
.nodes(DiscoveryNodes.builder(nodes).masterNodeId("node_1"))
326+
.metadata(metadata)
327+
.routingTable(routingTable)
328+
.build();
329+
ClusterState stateWithMasterNode2 = ClusterState.builder(new ClusterName("my-cluster"))
330+
.nodes(DiscoveryNodes.builder(nodes).masterNodeId("node_2"))
331+
.metadata(metadata)
332+
.routingTable(routingTable)
333+
.build();
334+
335+
return new ClusterChangedEvent[] {
336+
new ClusterChangedEvent("any", stateWithMasterNode1, emptyState),
337+
new ClusterChangedEvent("any", stateWithMasterNode2, stateWithMasterNode1) };
338+
}
339+
255340
public void testNoLocalShards() {
256341
Index watchIndex = new Index(Watch.INDEX, "foo");
257342
ShardId shardId = new ShardId(watchIndex, 0);
@@ -301,7 +386,7 @@ public void testNoLocalShards() {
301386
when(watcherService.validate(eq(clusterStateWithLocalShards))).thenReturn(true);
302387
when(watcherService.validate(eq(clusterStateWithoutLocalShards))).thenReturn(false);
303388
lifeCycleService.clusterChanged(new ClusterChangedEvent("any", clusterStateWithLocalShards, clusterStateWithoutLocalShards));
304-
verify(watcherService, times(1)).reload(eq(clusterStateWithLocalShards), eq("new local watcher shard allocation ids"));
389+
verify(watcherService, times(1)).reload(eq(clusterStateWithLocalShards), eq("new local watcher shard allocation ids"), any());
305390
verify(watcherService, times(1)).validate(eq(clusterStateWithLocalShards));
306391
verifyNoMoreInteractions(watcherService);
307392

@@ -386,12 +471,12 @@ public void testReplicaWasAddedOrRemoved() {
386471

387472
when(watcherService.validate(eq(firstEvent.state()))).thenReturn(true);
388473
lifeCycleService.clusterChanged(firstEvent);
389-
verify(watcherService).reload(eq(firstEvent.state()), anyString());
474+
verify(watcherService).reload(eq(firstEvent.state()), anyString(), any());
390475

391476
reset(watcherService);
392477
when(watcherService.validate(eq(secondEvent.state()))).thenReturn(true);
393478
lifeCycleService.clusterChanged(secondEvent);
394-
verify(watcherService).reload(eq(secondEvent.state()), anyString());
479+
verify(watcherService).reload(eq(secondEvent.state()), anyString(), any());
395480
}
396481

397482
// make sure that cluster state changes can be processed on nodes that do not hold data
@@ -457,7 +542,7 @@ public void testNonDataNode() {
457542

458543
lifeCycleService.clusterChanged(new ClusterChangedEvent("any", currentState, previousState));
459544
verify(watcherService, times(0)).pauseExecution(any());
460-
verify(watcherService, times(0)).reload(any(), any());
545+
verify(watcherService, times(0)).reload(any(), any(), any());
461546
}
462547

463548
public void testThatMissingWatcherIndexMetadataOnlyResetsOnce() {
@@ -490,7 +575,7 @@ public void testThatMissingWatcherIndexMetadataOnlyResetsOnce() {
490575

491576
// first add the shard allocation ids, by going from empty cs to CS with watcher index
492577
lifeCycleService.clusterChanged(new ClusterChangedEvent("any", clusterStateWithWatcherIndex, clusterStateWithoutWatcherIndex));
493-
verify(watcherService).reload(eq(clusterStateWithWatcherIndex), anyString());
578+
verify(watcherService).reload(eq(clusterStateWithWatcherIndex), anyString(), any());
494579

495580
// now remove watches index, and ensure that pausing is only called once, no matter how often called (i.e. each CS update)
496581
lifeCycleService.clusterChanged(new ClusterChangedEvent("any", clusterStateWithoutWatcherIndex, clusterStateWithWatcherIndex));
@@ -629,7 +714,7 @@ public void testWatcherReloadsOnNodeOutageWithWatcherShard() {
629714
when(watcherService.validate(any())).thenReturn(true);
630715
ClusterChangedEvent event = new ClusterChangedEvent("whatever", currentState, previousState);
631716
lifeCycleService.clusterChanged(event);
632-
verify(watcherService).reload(eq(event.state()), anyString());
717+
verify(watcherService).reload(eq(event.state()), anyString(), any());
633718
}
634719

635720
private void startWatcher() {
@@ -658,7 +743,7 @@ private void startWatcher() {
658743

659744
lifeCycleService.clusterChanged(new ClusterChangedEvent("foo", state, emptyState));
660745
assertThat(lifeCycleService.getState().get(), is(WatcherState.STARTED));
661-
verify(watcherService, times(1)).reload(eq(state), anyString());
746+
verify(watcherService, times(1)).reload(eq(state), anyString(), any());
662747
assertThat(lifeCycleService.shardRoutings(), hasSize(1));
663748

664749
// reset the mock, the user has to mock everything themselves again

x-pack/plugin/watcher/src/test/java/org/elasticsearch/xpack/watcher/WatcherServiceTests.java

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
import static org.mockito.Mockito.doAnswer;
7979
import static org.mockito.Mockito.mock;
8080
import static org.mockito.Mockito.never;
81+
import static org.mockito.Mockito.spy;
8182
import static org.mockito.Mockito.verify;
8283
import static org.mockito.Mockito.when;
8384

@@ -348,12 +349,38 @@ void stopExecutor() {}
348349
ClusterState.Builder csBuilder = new ClusterState.Builder(new ClusterName("_name"));
349350
csBuilder.metadata(Metadata.builder());
350351

351-
service.reload(csBuilder.build(), "whatever");
352+
service.reload(csBuilder.build(), "whatever", exception -> {});
352353
verify(executionService).clearExecutionsAndQueue(any());
353354
verify(executionService, never()).pause(any());
354355
verify(triggerService).pauseExecution();
355356
}
356357

358+
// the trigger service should not start unless watches are loaded successfully
359+
public void testReloadingWatcherDoesNotStartTriggerServiceIfFailingToLoadWatches() {
360+
ExecutionService executionService = mock(ExecutionService.class);
361+
TriggerService triggerService = mock(TriggerService.class);
362+
WatcherService service = new WatcherService(
363+
Settings.EMPTY,
364+
triggerService,
365+
mock(TriggeredWatchStore.class),
366+
executionService,
367+
mock(WatchParser.class),
368+
client,
369+
EsExecutors.DIRECT_EXECUTOR_SERVICE
370+
) {
371+
@Override
372+
void stopExecutor() {}
373+
};
374+
375+
ClusterState.Builder csBuilder = new ClusterState.Builder(new ClusterName("_name"));
376+
Metadata metadata = spy(Metadata.builder().build());
377+
when(metadata.getIndicesLookup()).thenThrow(RuntimeException.class); // simulate exception in WatcherService's private loadWatches()
378+
379+
service.reload(csBuilder.metadata(metadata).build(), "whatever", exception -> {});
380+
verify(triggerService).pauseExecution();
381+
verify(triggerService, never()).start(any());
382+
}
383+
357384
private static DiscoveryNode newNode() {
358385
return new DiscoveryNode(
359386
"node",

0 commit comments

Comments
 (0)