Commit 0f65d79

[Transform] Retry when failing to start indexer (#132048)
Bug: when the indexer state fails to save during a cluster state update, the transform is stuck in STOPPING and cannot be restarted unless the user force-stops the transform to delete the task.

Fix: the task now retries starting the indexer until the cluster state update succeeds.

Notes:
- users can cancel the retry by force-stopping the transform (see the example below)
- the retry is displayed in the UI as "degraded", with a message explaining why the transform is restarting
- the transform now displays as STARTING rather than STOPPING until it successfully starts
- the retry is audited, so it shows up in the Messages tab of the UI
- the retry timer is randomly selected between 45s and 90s, which helps during rolling restarts on clusters with a large number of transforms

Fix #128221
1 parent 4d1b7a6 commit 0f65d79
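
As called out in the notes, the retries can be cancelled by force-stopping the transform. Below is a minimal, hypothetical example using the low-level Java REST client; the host, port, and transform id (my-transform) are placeholders, and the elasticsearch-rest-client dependency is assumed to be on the classpath.

    import org.apache.http.HttpHost;
    import org.elasticsearch.client.Request;
    import org.elasticsearch.client.Response;
    import org.elasticsearch.client.RestClient;

    // Hypothetical example: cancel the startup retries by force-stopping the
    // transform, as the audit message added in this change suggests.
    public class ForceStopTransform {
        public static void main(String[] args) throws Exception {
            try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
                Request stop = new Request("POST", "/_transform/my-transform/_stop");
                stop.addParameter("force", "true"); // force stop also removes the stuck/retrying task
                Response response = client.performRequest(stop);
                System.out.println(response.getStatusLine());
            }
        }
    }

This issues the same call that the audited retry warning points at: _transform/<id>/_stop?force.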

File tree

5 files changed (+122 lines, -34 lines)


docs/changelog/132048.yaml

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+pr: 132048
+summary: Fix stuck in STOPPING by retrying the startup task indefinitely until it succeeds
+area: Transform
+type: bug
+issues:
+ - 128221

x-pack/plugin/transform/src/main/java/org/elasticsearch/xpack/transform/action/TransportGetTransformStatsAction.java

Lines changed: 2 additions & 0 deletions

@@ -275,6 +275,8 @@ static TransformStats deriveStats(TransformTask transformTask, @Nullable Transfo
         } else if (derivedState.equals(TransformStats.State.STARTED) && transformTask.getContext().isWaitingForIndexToUnblock()) {
             derivedState = TransformStats.State.WAITING;
             reason = Strings.isNullOrEmpty(reason) ? "transform is paused while destination index is blocked" : reason;
+        } else if (derivedState.equals(TransformStats.State.STOPPING) && transformTask.isRetryingStartup()) {
+            derivedState = TransformStats.State.STARTED;
         }
         return new TransformStats(
             transformTask.getTransformId(),

x-pack/plugin/transform/src/main/java/org/elasticsearch/xpack/transform/transforms/TransformPersistentTasksExecutor.java

Lines changed: 63 additions & 19 deletions

@@ -25,9 +25,12 @@
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.routing.IndexRoutingTable;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.Randomness;
 import org.elasticsearch.common.ValidationException;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.Strings;
+import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.core.Tuple;
 import org.elasticsearch.persistent.AllocatedPersistentTask;
 import org.elasticsearch.persistent.PersistentTaskState;

@@ -225,22 +228,22 @@ protected void nodeOperation(AllocatedPersistentTask task, @Nullable TransformTa
         final SetOnce<TransformState> stateHolder = new SetOnce<>();

         // <8> log the start result
-        ActionListener<StartTransformAction.Response> startTaskListener = ActionListener.wrap(
-            response -> logger.info("[{}] successfully completed and scheduled task in node operation", transformId),
-            failure -> {
-                // If the transform is failed then there is no need to log an error on every node restart as the error had already been
-                // logged when the transform first failed.
-                boolean logErrorAsInfo = failure instanceof CannotStartFailedTransformException;
-                auditor.audit(
-                    logErrorAsInfo ? INFO : ERROR,
-                    transformId,
-                    "Failed to start transform. Please stop and attempt to start again. Failure: " + failure.getMessage()
-                );
-                logger.atLevel(logErrorAsInfo ? Level.INFO : Level.ERROR)
-                    .withThrowable(failure)
-                    .log("[{}] Failed to start task in node operation", transformId);
-            }
-        );
+        ActionListener<StartTransformAction.Response> startTaskListener = ActionListener.wrap(response -> {
+            logger.info("[{}] successfully completed and scheduled task in node operation", transformId);
+            transformServices.scheduler().registerTransform(params, buildTask);
+        }, failure -> {
+            // If the transform is failed then there is no need to log an error on every node restart as the error had already been
+            // logged when the transform first failed.
+            boolean logErrorAsInfo = failure instanceof CannotStartFailedTransformException;
+            auditor.audit(
+                logErrorAsInfo ? INFO : ERROR,
+                transformId,
+                "Failed to start transform. Please stop and attempt to start again. Failure: " + failure.getMessage()
+            );
+            logger.atLevel(logErrorAsInfo ? Level.INFO : Level.ERROR)
+                .withThrowable(failure)
+                .log("[{}] Failed to start task in node operation", transformId);
+        });

         // <7> load next checkpoint
         ActionListener<TransformCheckpoint> getTransformNextCheckpointListener = ActionListener.wrap(nextCheckpoint -> {

@@ -259,7 +262,7 @@ protected void nodeOperation(AllocatedPersistentTask task, @Nullable TransformTa
             final long lastCheckpoint = stateHolder.get().getCheckpoint();
             final AuthorizationState authState = stateHolder.get().getAuthState();

-            startTask(buildTask, indexerBuilder, authState, lastCheckpoint, startTaskListener);
+            startTask(buildTask, params, indexerBuilder, authState, lastCheckpoint, startTaskListener);
         }, error -> {
             // TODO: do not use the same error message as for loading the last checkpoint
             String msg = TransformMessages.getMessage(TransformMessages.FAILED_TO_LOAD_TRANSFORM_CHECKPOINT, transformId);

@@ -326,7 +329,7 @@ protected void nodeOperation(AllocatedPersistentTask task, @Nullable TransformTa
                 markAsFailed(buildTask, error, msg);
             } else {
                 logger.trace("[{}] No stats found (new transform), starting the task", transformId);
-                startTask(buildTask, indexerBuilder, null, null, startTaskListener);
+                startTask(buildTask, params, indexerBuilder, null, null, startTaskListener);
             }
         }
         );

@@ -485,17 +488,58 @@ private ActionListener<Boolean> retryListener(TransformTask task) {

     private void startTask(
         TransformTask buildTask,
+        TransformTaskParams params,
         ClientTransformIndexerBuilder indexerBuilder,
         AuthorizationState authState,
         Long previousCheckpoint,
         ActionListener<StartTransformAction.Response> listener
     ) {
+        // if we fail the first request, we are going to start retrying until we succeed. when start fails, it is because the cluster state
+        // is not handling updates yet, but the cluster will eventually recover on its own.
+        var startRetriesOnFirstFailureListener = listener.delegateResponse((l, e) -> {
+            // copy the params but replace the frequency, this is to prevent every transform from starting and retrying every second,
+            // potentially sending many cluster state updates at once. instead, add randomness to spread out the retry requests after the
+            // first retry
+            var retryTimer = TimeValue.timeValueSeconds(45 + Randomness.get().nextInt(15, 45));
+            var paramsWithExtendedTimer = new TransformTaskParams(
+                params.getId(),
+                params.getVersion(),
+                params.from(),
+                retryTimer,
+                params.requiresRemote()
+            );
+            logger.debug("Failed to start Transform, retrying in [{}] seconds.", retryTimer.seconds());
+            // tell the user when and why the retries are happening and how to stop them
+            // force stopping will eventually deregister this retry task from the scheduler
+            auditor.warning(
+                params.getId(),
+                Strings.format(
+                    "Failed while starting Transform. Automatically retrying every [%s] seconds. "
+                        + "To cancel retries, use [_transform/%s/_stop?force] to force stop this transform. Failure: [%s]",
+                    retryTimer.seconds(),
+                    params.getId(),
+                    e.getMessage()
+                )
+            );
+            var scheduler = transformServices.scheduler();
+            scheduler.registerTransform(
+                paramsWithExtendedTimer,
+                new TransformRetryableStartUpListener<>(
+                    paramsWithExtendedTimer.getId(),
+                    ll -> buildTask.start(previousCheckpoint, ll),
+                    ActionListener.runBefore(l, () -> scheduler.deregisterTransform(paramsWithExtendedTimer.getId())),
+                    ActionListener.noop(),
+                    () -> true,
+                    buildTask.getContext()
+                )
+            );
+        });
         // switch the threadpool to generic, because the caller is on the system_read threadpool
         threadPool.generic().execute(() -> {
             buildTask.initializeIndexer(indexerBuilder);
             buildTask.setAuthState(authState);
             // TransformTask#start will fail if the task state is FAILED
-            buildTask.setNumFailureRetries(numFailureRetries).start(previousCheckpoint, listener);
+            buildTask.setNumFailureRetries(numFailureRetries).start(previousCheckpoint, startRetriesOnFirstFailureListener);
         });
     }
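
For readers skimming the change above: on the first start failure, the listener re-registers the task with the Transform scheduler at a jittered interval and keeps retrying until a start attempt succeeds, at which point the retry task is deregistered. The sketch below restates that flow with plain JDK types only; RetryingStarter, StartAction, and the ScheduledExecutorService are hypothetical stand-ins for transformServices.scheduler(), TransformRetryableStartUpListener, and TransformTask#start, not the actual API.

    import java.util.Random;
    import java.util.concurrent.Executors;
    import java.util.concurrent.ScheduledExecutorService;
    import java.util.concurrent.TimeUnit;
    import java.util.function.Consumer;

    // Simplified, hypothetical stand-in for the retry-on-first-failure flow in startTask().
    final class RetryingStarter {

        /** Reports success or failure exactly once per attempt, like an ActionListener. */
        interface StartAction {
            void start(Runnable onSuccess, Consumer<Exception> onFailure);
        }

        private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        private final Random random = new Random();

        void startWithRetry(StartAction action, Runnable onStarted) {
            // First attempt runs immediately; only a failure registers the retry task.
            action.start(onStarted, firstFailure -> {
                // Jittered interval, mirroring 45 + Randomness.get().nextInt(15, 45) seconds in the diff.
                long delaySeconds = 45 + random.nextInt(15, 45);
                scheduler.scheduleWithFixedDelay(
                    () -> action.start(() -> {
                        onStarted.run();
                        // "Deregister" the retry task once a start attempt finally succeeds.
                        scheduler.shutdown();
                    }, retryFailure -> {
                        // Still failing: keep the scheduled retries in place, as the real code
                        // does until the cluster state update can be applied (or a force stop).
                    }),
                    delaySeconds,
                    delaySeconds,
                    TimeUnit.SECONDS
                );
            });
        }
    }

The jitter (a 45-second base plus a random component) is what spreads the retries out during a rolling restart, so a large number of transforms does not hit the cluster state with simultaneous updates.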

x-pack/plugin/transform/src/main/java/org/elasticsearch/xpack/transform/transforms/TransformTask.java

Lines changed: 5 additions & 6 deletions

@@ -302,18 +302,13 @@ void start(Long startingCheckpoint, ActionListener<StartTransformAction.Response
         // we could not read the previous state information from said index.
         persistStateToClusterState(state, ActionListener.wrap(task -> {
             auditor.info(transform.getId(), "Updated transform state to [" + state.getTaskState() + "].");
-            transformScheduler.registerTransform(transform, this);
             listener.onResponse(new StartTransformAction.Response(true));
         }, exc -> {
-            auditor.warning(
-                transform.getId(),
-                "Failed to persist to cluster state while marking task as started. Failure: " + exc.getMessage()
-            );
             logger.error(() -> format("[%s] failed updating state to [%s].", getTransformId(), state), exc);
             getIndexer().stop();
             listener.onFailure(
                 new ElasticsearchException(
-                    "Error while updating state for transform [" + transform.getId() + "] to [" + state.getIndexerState() + "].",
+                    "Error while updating state for transform [" + transform.getId() + "] to [" + TransformTaskState.STARTED + "].",
                     exc
                 )
             );

@@ -595,6 +590,10 @@ public void onCancelled() {
         }
     }

+    public boolean isRetryingStartup() {
+        return getContext().getStartUpFailureCount() > 0;
+    }
+
     TransformTask setNumFailureRetries(int numFailureRetries) {
         context.setNumFailureRetries(numFailureRetries);
         return this;

x-pack/plugin/transform/src/test/java/org/elasticsearch/xpack/transform/transforms/TransformPersistentTasksExecutorTests.java

Lines changed: 46 additions & 9 deletions

@@ -43,6 +43,7 @@
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.xpack.core.indexing.IndexerState;
 import org.elasticsearch.xpack.core.transform.TransformConfigVersion;
+import org.elasticsearch.xpack.core.transform.action.StartTransformAction;
 import org.elasticsearch.xpack.core.transform.transforms.AuthorizationState;
 import org.elasticsearch.xpack.core.transform.transforms.TransformConfig;
 import org.elasticsearch.xpack.core.transform.transforms.TransformConfigTests;

@@ -75,12 +76,16 @@
 import java.util.concurrent.atomic.AtomicBoolean;

 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.startsWith;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.anyInt;
 import static org.mockito.ArgumentMatchers.argThat;
+import static org.mockito.ArgumentMatchers.assertArg;
+import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.ArgumentMatchers.isNull;
 import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;

@@ -375,7 +380,7 @@ public void testNodeOperation() {
         var taskExecutor = buildTaskExecutor(transformServices(transformsConfigManager, transformScheduler));

         var transformId = "testNodeOperation";
-        var params = mockTaskParams(transformId);
+        var params = taskParams(transformId);

         putTransformConfiguration(transformsConfigManager, transformId);
         var task = mockTransformTask();

@@ -391,7 +396,7 @@ private void putTransformConfiguration(TransformConfigManager configManager, Str
         );
     }

-    public void testNodeOperationStartupRetry() throws Exception {
+    public void testNodeOperationStartupRetryWithGetConfigFailure() throws Exception {
         var failFirstCall = new AtomicBoolean(true);
         var transformsConfigManager = new InMemoryTransformConfigManager() {
             @Override

@@ -407,8 +412,8 @@ public void getTransformConfiguration(String transformId, ActionListener<Transfo
         var transformScheduler = new TransformScheduler(Clock.systemUTC(), threadPool, fastRetry(), TimeValue.ZERO);
         var taskExecutor = buildTaskExecutor(transformServices(transformsConfigManager, transformScheduler));

-        var transformId = "testNodeOperationStartupRetry";
-        var params = mockTaskParams(transformId);
+        var transformId = "testNodeOperationStartupRetryWithGetConfigFailure";
+        var params = taskParams(transformId);
         putTransformConfiguration(transformsConfigManager, transformId);

         var task = mockTransformTask();

@@ -426,16 +431,48 @@ public void getTransformConfiguration(String transformId, ActionListener<Transfo
         verify(task).start(isNull(), any());
     }

+    public void testNodeOperationStartupRetryWithStartFailure() throws Exception {
+        var failFirstCall = new AtomicBoolean(true);
+        var transformsConfigManager = new InMemoryTransformConfigManager();
+
+        var transformScheduler = new TransformScheduler(Clock.systemUTC(), threadPool, fastRetry(), TimeValue.ZERO);
+        var transformServices = transformServices(transformsConfigManager, transformScheduler);
+        var taskExecutor = buildTaskExecutor(transformServices);
+
+        var transformId = "testNodeOperationStartupRetryWithStartFailure";
+        var params = taskParams(transformId);
+        putTransformConfiguration(transformsConfigManager, transformId);
+
+        var task = mockTransformTask();
+        doAnswer(ans -> {
+            ActionListener<StartTransformAction.Response> listener = ans.getArgument(1);
+            if (failFirstCall.compareAndSet(true, false)) {
+                listener.onFailure(new IllegalStateException("ahhhh"));
+            } else {
+                listener.onResponse(new StartTransformAction.Response(true));
+            }
+            return Void.TYPE;
+        }).when(task).start(any(), any());
+        taskExecutor.nodeOperation(task, params, mock());
+
+        // skip waiting for the scheduler to run the task a second time and just rerun it now
+        transformScheduler.scheduleNow(transformId);
+
+        verify(task, times(2)).start(isNull(), any());
+        assertThat(transformScheduler.getStats().peekTransformName(), equalTo(transformId));
+        verify(transformServices.auditor()).warning(
+            eq(transformId),
+            assertArg(message -> assertThat(message, startsWith("Failed while starting Transform. Automatically retrying")))
+        );
+    }
+
     private Settings fastRetry() {
         // must be >= [1s]
         return Settings.builder().put(Transform.SCHEDULER_FREQUENCY.getKey(), TimeValue.timeValueSeconds(1)).build();
     }

-    private TransformTaskParams mockTaskParams(String transformId) {
-        var params = mock(TransformTaskParams.class);
-        when(params.getId()).thenReturn(transformId);
-        when(params.getFrequency()).thenReturn(TimeValue.timeValueSeconds(1));
-        return params;
+    private TransformTaskParams taskParams(String transformId) {
+        return new TransformTaskParams(transformId, TransformConfigVersion.CURRENT, null, TimeValue.timeValueSeconds(1), false);
     }

     private TransformTask mockTransformTask() {
