-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Don't run async actions when ILM is stopped #133683
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
3b77520
930155f
6aadbb3
26b3dc4
370d966
0ed022c
ebe7f23
80f5e81
02e978b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -184,11 +184,14 @@ void onMaster(ClusterState clusterState) { | |
maybeScheduleJob(); | ||
|
||
for (var projectId : clusterState.metadata().projects().keySet()) { | ||
onMaster(clusterState.projectState(projectId)); | ||
maybeRunAsyncActions(clusterState.projectState(projectId)); | ||
} | ||
} | ||
|
||
void onMaster(ProjectState state) { | ||
/** | ||
* Kicks off any async actions that may not have been run due to either master failover or ILM being manually stopped. | ||
*/ | ||
private void maybeRunAsyncActions(ProjectState state) { | ||
final ProjectMetadata projectMetadata = state.metadata(); | ||
final IndexLifecycleMetadata currentMetadata = projectMetadata.custom(IndexLifecycleMetadata.TYPE); | ||
if (currentMetadata != null) { | ||
|
@@ -198,67 +201,51 @@ void onMaster(ProjectState state) { | |
} | ||
|
||
boolean safeToStop = true; // true until proven false by a run policy | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The changes in this method aren't stricly necessary, I just took this opportunity to clean this method up a bit, as it was becoming hard to read. If there are concerns with this refactor, I can revert these stylistic changes and stick to the bug fix. |
||
// If we just became master, we need to kick off any async actions that | ||
// may have not been run due to master rollover | ||
for (IndexMetadata idxMeta : projectMetadata.indices().values()) { | ||
if (projectMetadata.isIndexManagedByILM(idxMeta)) { | ||
String policyName = idxMeta.getLifecyclePolicyName(); | ||
final LifecycleExecutionState lifecycleState = idxMeta.getLifecycleExecutionState(); | ||
StepKey stepKey = Step.getCurrentStepKey(lifecycleState); | ||
|
||
try { | ||
if (OperationMode.STOPPING == currentMode) { | ||
if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED.contains(stepKey.name())) { | ||
logger.info( | ||
"waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]", | ||
idxMeta.getIndex().getName(), | ||
policyName, | ||
stepKey.name() | ||
); | ||
lifecycleRunner.maybeRunAsyncAction(state, idxMeta, policyName, stepKey); | ||
// ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop | ||
safeToStop = false; | ||
} else { | ||
logger.info( | ||
"skipping policy execution of step [{}] for index [{}] with policy [{}]" + " because ILM is stopping", | ||
stepKey == null ? "n/a" : stepKey.name(), | ||
idxMeta.getIndex().getName(), | ||
policyName | ||
); | ||
} | ||
} else { | ||
lifecycleRunner.maybeRunAsyncAction(state, idxMeta, policyName, stepKey); | ||
} | ||
} catch (Exception e) { | ||
if (logger.isTraceEnabled()) { | ||
logger.warn( | ||
() -> format( | ||
"async action execution failed during master election trigger" | ||
+ " for index [%s] with policy [%s] in step [%s], lifecycle state: [%s]", | ||
idxMeta.getIndex().getName(), | ||
policyName, | ||
stepKey, | ||
lifecycleState.asMap() | ||
), | ||
e | ||
); | ||
} else { | ||
logger.warn( | ||
() -> format( | ||
"async action execution failed during master election trigger" | ||
+ " for index [%s] with policy [%s] in step [%s]", | ||
idxMeta.getIndex().getName(), | ||
policyName, | ||
stepKey | ||
), | ||
e | ||
); | ||
if (projectMetadata.isIndexManagedByILM(idxMeta) == false) { | ||
continue; | ||
} | ||
String policyName = idxMeta.getLifecyclePolicyName(); | ||
final LifecycleExecutionState lifecycleState = idxMeta.getLifecycleExecutionState(); | ||
StepKey stepKey = Step.getCurrentStepKey(lifecycleState); | ||
|
||
} | ||
// Don't rethrow the exception, we don't want a failure for one index to be | ||
// called to cause actions not to be triggered for further indices | ||
try { | ||
if (currentMode == OperationMode.RUNNING) { | ||
lifecycleRunner.maybeRunAsyncAction(state, idxMeta, policyName, stepKey); | ||
continue; | ||
} | ||
if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED.contains(stepKey.name())) { | ||
|
||
logger.info( | ||
"waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]", | ||
idxMeta.getIndex().getName(), | ||
policyName, | ||
stepKey.name() | ||
); | ||
lifecycleRunner.maybeRunAsyncAction(state, idxMeta, policyName, stepKey); | ||
// ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop | ||
safeToStop = false; | ||
} else { | ||
logger.info( | ||
"skipping policy execution of step [{}] for index [{}] with policy [{}]" + " because ILM is stopping", | ||
stepKey == null ? "n/a" : stepKey.name(), | ||
idxMeta.getIndex().getName(), | ||
policyName | ||
); | ||
} | ||
} catch (Exception e) { | ||
String format = format( | ||
"async action execution failed during master election trigger for index [%s] with policy [%s] in step [%s]", | ||
idxMeta.getIndex().getName(), | ||
policyName, | ||
stepKey | ||
); | ||
if (logger.isTraceEnabled()) { | ||
format += format(", lifecycle state: [%s]", lifecycleState.asMap()); | ||
} | ||
logger.warn(format, e); | ||
|
||
|
||
// Don't rethrow the exception, we don't want a failure for one index to be | ||
// called to cause actions not to be triggered for further indices | ||
} | ||
} | ||
|
||
|
@@ -333,6 +320,20 @@ public void clusterChanged(ClusterChangedEvent event) { | |
cancelJob(); | ||
policyRegistry.clear(); | ||
} | ||
} else if (this.isMaster) { | ||
// If we are the master and we were before, check if any projects changed their ILM mode from non-RUNNING to RUNNING. | ||
// If so, kick off any async actions that may not have run while not in RUNNING mode. | ||
for (ProjectMetadata project : event.state().metadata().projects().values()) { | ||
final var previousProject = event.previousState().metadata().projects().get(project.id()); | ||
if (previousProject == null || project == previousProject) { | ||
continue; | ||
} | ||
final OperationMode currentMode = currentILMMode(project); | ||
final OperationMode previousMode = currentILMMode(previousProject); | ||
if (currentMode == OperationMode.RUNNING && previousMode != OperationMode.RUNNING) { | ||
maybeRunAsyncActions(event.state().projectState(project.id())); | ||
} | ||
} | ||
} | ||
|
||
// if we're the master, then process deleted indices and trigger policies | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any suggestions for improved assertions are welcome. Adding a
Thread.sleep
here would increase our confidence, but still wouldn't make any guarantee.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we write a test that wraps the
Client
passed toIndexLifecycleService
with a wrapper that asserts that we never perform some particular ILM action?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added a test to
IndexLifecycleRunnerTests
in 370d966 that verifies the action isn't executed when ILM is stopped. Let me know if that's what you had in mind. I chose for testingIndexLifecycleRunner#maybeRunAsyncAction
instead ofIndexLifecycleService#maybeRunAsyncAction
, as the latter is only used by APIs and the former is also used internally by ILM.