@@ -184,88 +184,78 @@ void onMaster(ClusterState clusterState) {
184
184
maybeScheduleJob ();
185
185
186
186
for (var projectId : clusterState .metadata ().projects ().keySet ()) {
187
- onMaster (clusterState .projectState (projectId ));
187
+ maybeRunAsyncActions (clusterState .projectState (projectId ));
188
188
}
189
189
}
190
190
191
- void onMaster (ProjectState state ) {
191
+ /**
192
+ * Kicks off any async actions that may not have been run due to either master failover or ILM being manually stopped.
193
+ */
194
+ private void maybeRunAsyncActions (ProjectState state ) {
192
195
final ProjectMetadata projectMetadata = state .metadata ();
193
196
final IndexLifecycleMetadata currentMetadata = projectMetadata .custom (IndexLifecycleMetadata .TYPE );
194
- if (currentMetadata != null ) {
195
- OperationMode currentMode = currentILMMode (projectMetadata );
196
- if (OperationMode .STOPPED .equals (currentMode )) {
197
- return ;
198
- }
199
-
200
- boolean safeToStop = true ; // true until proven false by a run policy
201
-
202
- // If we just became master, we need to kick off any async actions that
203
- // may have not been run due to master rollover
204
- for (IndexMetadata idxMeta : projectMetadata .indices ().values ()) {
205
- if (projectMetadata .isIndexManagedByILM (idxMeta )) {
206
- String policyName = idxMeta .getLifecyclePolicyName ();
207
- final LifecycleExecutionState lifecycleState = idxMeta .getLifecycleExecutionState ();
208
- StepKey stepKey = Step .getCurrentStepKey (lifecycleState );
209
-
210
- try {
211
- if (OperationMode .STOPPING == currentMode ) {
212
- if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED .contains (stepKey .name ())) {
213
- logger .info (
214
- "waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]" ,
215
- idxMeta .getIndex ().getName (),
216
- policyName ,
217
- stepKey .name ()
218
- );
219
- lifecycleRunner .maybeRunAsyncAction (state , idxMeta , policyName , stepKey );
220
- // ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop
221
- safeToStop = false ;
222
- } else {
223
- logger .info (
224
- "skipping policy execution of step [{}] for index [{}] with policy [{}]" + " because ILM is stopping" ,
225
- stepKey == null ? "n/a" : stepKey .name (),
226
- idxMeta .getIndex ().getName (),
227
- policyName
228
- );
229
- }
230
- } else {
231
- lifecycleRunner .maybeRunAsyncAction (state , idxMeta , policyName , stepKey );
232
- }
233
- } catch (Exception e ) {
234
- if (logger .isTraceEnabled ()) {
235
- logger .warn (
236
- () -> format (
237
- "async action execution failed during master election trigger"
238
- + " for index [%s] with policy [%s] in step [%s], lifecycle state: [%s]" ,
239
- idxMeta .getIndex ().getName (),
240
- policyName ,
241
- stepKey ,
242
- lifecycleState .asMap ()
243
- ),
244
- e
245
- );
246
- } else {
247
- logger .warn (
248
- () -> format (
249
- "async action execution failed during master election trigger"
250
- + " for index [%s] with policy [%s] in step [%s]" ,
251
- idxMeta .getIndex ().getName (),
252
- policyName ,
253
- stepKey
254
- ),
255
- e
256
- );
197
+ if (currentMetadata == null ) {
198
+ return ;
199
+ }
200
+ OperationMode currentMode = currentILMMode (projectMetadata );
201
+ if (OperationMode .STOPPED .equals (currentMode )) {
202
+ return ;
203
+ }
257
204
258
- }
259
- // Don't rethrow the exception, we don't want a failure for one index to be
260
- // called to cause actions not to be triggered for further indices
261
- }
262
- }
205
+ boolean safeToStop = true ; // true until proven false by a run policy
206
+ for (IndexMetadata idxMeta : projectMetadata .indices ().values ()) {
207
+ if (projectMetadata .isIndexManagedByILM (idxMeta ) == false ) {
208
+ continue ;
263
209
}
210
+ String policyName = idxMeta .getLifecyclePolicyName ();
211
+ final LifecycleExecutionState lifecycleState = idxMeta .getLifecycleExecutionState ();
212
+ StepKey stepKey = Step .getCurrentStepKey (lifecycleState );
213
+
214
+ try {
215
+ if (currentMode == OperationMode .RUNNING ) {
216
+ lifecycleRunner .maybeRunAsyncAction (state , idxMeta , policyName , stepKey );
217
+ continue ;
218
+ }
219
+ // We only get here if ILM is in STOPPING mode. In that case, we need to check if there is any index that is in a step
220
+ // that we can't stop ILM in. If there is, we don't stop ILM yet.
221
+ if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED .contains (stepKey .name ())) {
222
+ logger .info (
223
+ "waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]" ,
224
+ idxMeta .getIndex ().getName (),
225
+ policyName ,
226
+ stepKey .name ()
227
+ );
228
+ lifecycleRunner .maybeRunAsyncAction (state , idxMeta , policyName , stepKey );
229
+ // ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop
230
+ safeToStop = false ;
231
+ } else {
232
+ logger .info (
233
+ "skipping policy execution of step [{}] for index [{}] with policy [{}]" + " because ILM is stopping" ,
234
+ stepKey == null ? "n/a" : stepKey .name (),
235
+ idxMeta .getIndex ().getName (),
236
+ policyName
237
+ );
238
+ }
239
+ } catch (Exception e ) {
240
+ String logMessage = format (
241
+ "async action execution failed during master election trigger for index [%s] with policy [%s] in step [%s]" ,
242
+ idxMeta .getIndex ().getName (),
243
+ policyName ,
244
+ stepKey
245
+ );
246
+ if (logger .isTraceEnabled ()) {
247
+ logMessage += format (", lifecycle state: [%s]" , lifecycleState .asMap ());
248
+ }
249
+ logger .warn (logMessage , e );
264
250
265
- if ( safeToStop && OperationMode . STOPPING == currentMode ) {
266
- stopILM ( state . projectId ());
251
+ // Don't rethrow the exception: a failure for one index must not
252
+ // prevent actions from being triggered for the remaining indices
267
253
}
268
254
}
255
+
256
+ if (safeToStop && OperationMode .STOPPING == currentMode ) {
257
+ stopILM (state .projectId ());
258
+ }
269
259
}
270
260
271
261
private void stopILM (ProjectId projectId ) {
@@ -333,6 +323,20 @@ public void clusterChanged(ClusterChangedEvent event) {
333
323
cancelJob ();
334
324
policyRegistry .clear ();
335
325
}
326
+ } else if (this .isMaster ) {
327
+ // If we are the master and we were before, check if any projects changed their ILM mode from non-RUNNING to RUNNING.
328
+ // If so, kick off any async actions that may not have run while not in RUNNING mode.
329
+ for (ProjectMetadata project : event .state ().metadata ().projects ().values ()) {
330
+ final var previousProject = event .previousState ().metadata ().projects ().get (project .id ());
331
+ if (previousProject == null || project == previousProject ) {
332
+ continue ;
333
+ }
334
+ final OperationMode currentMode = currentILMMode (project );
335
+ final OperationMode previousMode = currentILMMode (previousProject );
336
+ if (currentMode == OperationMode .RUNNING && previousMode != OperationMode .RUNNING ) {
337
+ maybeRunAsyncActions (event .state ().projectState (project .id ()));
338
+ }
339
+ }
336
340
}
337
341
338
342
// if we're the master, then process deleted indices and trigger policies
0 commit comments