5353import static org .elasticsearch .health .HealthStatus .YELLOW ;
5454import static org .elasticsearch .xpack .core .ilm .LifecycleOperationMetadata .currentILMMode ;
5555import static org .elasticsearch .xpack .ilm .IlmHealthIndicatorService .RuleConfig .Builder .actionRule ;
56- import static org .elasticsearch .xpack .ilm .IlmHealthIndicatorService .StepRule .stepRule ;
56+ import static org .elasticsearch .xpack .ilm .IlmHealthIndicatorService .StepRule .stepRuleFullChecks ;
57+ import static org .elasticsearch .xpack .ilm .IlmHealthIndicatorService .StepRule .stepRuleOnlyCheckRetries ;
5758
5859/**
5960 * This indicator reports health for index lifecycle management component.
@@ -103,13 +104,14 @@ public class IlmHealthIndicatorService implements HealthIndicatorService {
103104 );
104105
105106 private static final TimeValue ONE_DAY = TimeValue .timeValueDays (1 );
107+ private static final long MAX_RETRIES = 100 ;
106108
107109 static final Map <String , RuleConfig > RULES_BY_ACTION_CONFIG = Map .of (
108110 RolloverAction .NAME ,
109111 actionRule (RolloverAction .NAME ).stepRules (
110- stepRule (WaitForActiveShardsStep .NAME , ONE_DAY ),
111- stepRule (WaitForRolloverReadyStep .NAME , ONE_DAY ),
112- stepRule (RolloverStep .NAME , ONE_DAY )
112+ stepRuleFullChecks (WaitForActiveShardsStep .NAME , ONE_DAY , MAX_RETRIES ),
113+ stepRuleOnlyCheckRetries (WaitForRolloverReadyStep .NAME , MAX_RETRIES ),
114+ stepRuleFullChecks (RolloverStep .NAME , ONE_DAY , MAX_RETRIES )
113115 ),
114116 //
115117 MigrateAction .NAME ,
@@ -118,32 +120,27 @@ public class IlmHealthIndicatorService implements HealthIndicatorService {
118120 SearchableSnapshotAction .NAME ,
119121 actionRule (SearchableSnapshotAction .NAME ).maxTimeOnAction (ONE_DAY )
120122 .stepRules (
121- stepRule (WaitForDataTierStep .NAME , ONE_DAY ),
122- stepRule (WaitForIndexColorStep .NAME , ONE_DAY ),
123- // The no-follower step is added here because an `UnfollowAction` is added before the `shrinkAction` in the follower cluster
124- stepRule (WaitForNoFollowersStep .NAME , ONE_DAY )
123+ stepRuleFullChecks (WaitForDataTierStep .NAME , ONE_DAY , MAX_RETRIES ),
124+ stepRuleFullChecks (WaitForIndexColorStep .NAME , ONE_DAY , MAX_RETRIES ),
125+ stepRuleOnlyCheckRetries (WaitForNoFollowersStep .NAME , MAX_RETRIES )
125126 ),
126127 //
127128 DeleteAction .NAME ,
128- actionRule (DeleteAction .NAME ).stepRules (stepRule (DeleteStep .NAME , ONE_DAY )),
129+ actionRule (DeleteAction .NAME ).stepRules (stepRuleFullChecks (DeleteStep .NAME , ONE_DAY , MAX_RETRIES )),
129130 //
130131 ShrinkAction .NAME ,
131132 actionRule (ShrinkAction .NAME ).maxTimeOnAction (ONE_DAY )
132- .stepRules (
133- // The no-follower step is added here because an `unfollowAction` is added before the `shrinkAction` in the follower
134- // cluster.
135- stepRule (WaitForNoFollowersStep .NAME , ONE_DAY )
136- ),
133+ .stepRules (stepRuleOnlyCheckRetries (WaitForNoFollowersStep .NAME , MAX_RETRIES )),
137134 //
138135 AllocateAction .NAME ,
139136 actionRule (AllocateAction .NAME ).maxTimeOnAction (ONE_DAY ).noStepRules (),
140137 //
141138 ForceMergeAction .NAME ,
142139 actionRule (ForceMergeAction .NAME ).maxTimeOnAction (ONE_DAY )
143140 .stepRules (
144- stepRule (WaitForIndexColorStep .NAME , ONE_DAY ),
145- stepRule (ForceMergeStep .NAME , ONE_DAY ),
146- stepRule (SegmentCountStep .NAME , ONE_DAY )
141+ stepRuleFullChecks (WaitForIndexColorStep .NAME , ONE_DAY , MAX_RETRIES ),
142+ stepRuleFullChecks (ForceMergeStep .NAME , ONE_DAY , MAX_RETRIES ),
143+ stepRuleFullChecks (SegmentCountStep .NAME , ONE_DAY , MAX_RETRIES )
147144 )
148145 //
149146 // The next rule has to be commented because of this issue https://github.com/elastic/elasticsearch/issues/96705
@@ -409,17 +406,33 @@ public boolean test(Long now, IndexMetadata indexMetadata) {
409406 * @param maxTimeOn Maximum time that an index should spend on this step.
410407 * @param maxRetries Maximum number of times that a step should be retried.
411408 */
412- public record StepRule (String step , TimeValue maxTimeOn , long maxRetries ) implements RuleConfig {
413- static StepRule stepRule (String name , TimeValue maxTimeOn ) {
414- return new StepRule (name , maxTimeOn , 100 );
409+ public record StepRule (String step , TimeValue maxTimeOn , Long maxRetries ) implements RuleConfig {
410+
411+ public StepRule {
412+ if (maxTimeOn == null && maxRetries == null ) {
413+ throw new IllegalArgumentException ("At least one of [maxTimeOne or maxRetries] must be defined." );
414+ }
415+ }
416+
417+ static StepRule stepRuleFullChecks (String name , TimeValue maxTimeOn , long maxRetries ) {
418+ return new StepRule (name , maxTimeOn , maxRetries );
419+ }
420+
421+ static StepRule stepRuleOnlyCheckPassedTime (String name , TimeValue maxTimeOn ) {
422+ return new StepRule (name , maxTimeOn , null );
423+ }
424+
425+ static StepRule stepRuleOnlyCheckRetries (String name , long maxRetries ) {
426+ return new StepRule (name , null , maxRetries );
415427 }
416428
417429 @ Override
418430 public boolean test (Long now , IndexMetadata indexMetadata ) {
419431 var failedStepRetryCount = indexMetadata .getLifecycleExecutionState ().failedStepRetryCount ();
420432 return step .equals (indexMetadata .getLifecycleExecutionState ().step ())
421- && (maxTimeOn .compareTo (RuleConfig .getElapsedTime (now , indexMetadata .getLifecycleExecutionState ().stepTime ())) < 0
422- || (failedStepRetryCount != null && failedStepRetryCount > maxRetries ));
433+ && (maxTimeOn != null
434+ && maxTimeOn .compareTo (RuleConfig .getElapsedTime (now , indexMetadata .getLifecycleExecutionState ().stepTime ())) < 0
435+ || (maxRetries != null && failedStepRetryCount != null && failedStepRetryCount > maxRetries ));
423436 }
424437 }
425438}
0 commit comments