Skip to content

Commit 4e4b95d

Browse files
author
Pablo Alcantar Morales
authored
Makes StepRule more configurable (#96855)
Allows having different StepRule shapes: - only check maxRetries - only check maxTimeOn - check both adding specific factory methods
1 parent 994e927 commit 4e4b95d

File tree

2 files changed

+40
-26
lines changed

2 files changed

+40
-26
lines changed

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IlmHealthIndicatorService.java

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@
5353
import static org.elasticsearch.health.HealthStatus.YELLOW;
5454
import static org.elasticsearch.xpack.core.ilm.LifecycleOperationMetadata.currentILMMode;
5555
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.RuleConfig.Builder.actionRule;
56-
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.StepRule.stepRule;
56+
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.StepRule.stepRuleFullChecks;
57+
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.StepRule.stepRuleOnlyCheckRetries;
5758

5859
/**
5960
* This indicator reports health for index lifecycle management component.
@@ -103,13 +104,14 @@ public class IlmHealthIndicatorService implements HealthIndicatorService {
103104
);
104105

105106
private static final TimeValue ONE_DAY = TimeValue.timeValueDays(1);
107+
private static final long MAX_RETRIES = 100;
106108

107109
static final Map<String, RuleConfig> RULES_BY_ACTION_CONFIG = Map.of(
108110
RolloverAction.NAME,
109111
actionRule(RolloverAction.NAME).stepRules(
110-
stepRule(WaitForActiveShardsStep.NAME, ONE_DAY),
111-
stepRule(WaitForRolloverReadyStep.NAME, ONE_DAY),
112-
stepRule(RolloverStep.NAME, ONE_DAY)
112+
stepRuleFullChecks(WaitForActiveShardsStep.NAME, ONE_DAY, MAX_RETRIES),
113+
stepRuleOnlyCheckRetries(WaitForRolloverReadyStep.NAME, MAX_RETRIES),
114+
stepRuleFullChecks(RolloverStep.NAME, ONE_DAY, MAX_RETRIES)
113115
),
114116
//
115117
MigrateAction.NAME,
@@ -118,32 +120,27 @@ public class IlmHealthIndicatorService implements HealthIndicatorService {
118120
SearchableSnapshotAction.NAME,
119121
actionRule(SearchableSnapshotAction.NAME).maxTimeOnAction(ONE_DAY)
120122
.stepRules(
121-
stepRule(WaitForDataTierStep.NAME, ONE_DAY),
122-
stepRule(WaitForIndexColorStep.NAME, ONE_DAY),
123-
// The no-follower step is added here because an `UnfollowAction` is added before the `shrinkAction` in the follower cluster
124-
stepRule(WaitForNoFollowersStep.NAME, ONE_DAY)
123+
stepRuleFullChecks(WaitForDataTierStep.NAME, ONE_DAY, MAX_RETRIES),
124+
stepRuleFullChecks(WaitForIndexColorStep.NAME, ONE_DAY, MAX_RETRIES),
125+
stepRuleOnlyCheckRetries(WaitForNoFollowersStep.NAME, MAX_RETRIES)
125126
),
126127
//
127128
DeleteAction.NAME,
128-
actionRule(DeleteAction.NAME).stepRules(stepRule(DeleteStep.NAME, ONE_DAY)),
129+
actionRule(DeleteAction.NAME).stepRules(stepRuleFullChecks(DeleteStep.NAME, ONE_DAY, MAX_RETRIES)),
129130
//
130131
ShrinkAction.NAME,
131132
actionRule(ShrinkAction.NAME).maxTimeOnAction(ONE_DAY)
132-
.stepRules(
133-
// The no-follower step is added here because an `unfollowAction` is added before the `shrinkAction` in the follower
134-
// cluster.
135-
stepRule(WaitForNoFollowersStep.NAME, ONE_DAY)
136-
),
133+
.stepRules(stepRuleOnlyCheckRetries(WaitForNoFollowersStep.NAME, MAX_RETRIES)),
137134
//
138135
AllocateAction.NAME,
139136
actionRule(AllocateAction.NAME).maxTimeOnAction(ONE_DAY).noStepRules(),
140137
//
141138
ForceMergeAction.NAME,
142139
actionRule(ForceMergeAction.NAME).maxTimeOnAction(ONE_DAY)
143140
.stepRules(
144-
stepRule(WaitForIndexColorStep.NAME, ONE_DAY),
145-
stepRule(ForceMergeStep.NAME, ONE_DAY),
146-
stepRule(SegmentCountStep.NAME, ONE_DAY)
141+
stepRuleFullChecks(WaitForIndexColorStep.NAME, ONE_DAY, MAX_RETRIES),
142+
stepRuleFullChecks(ForceMergeStep.NAME, ONE_DAY, MAX_RETRIES),
143+
stepRuleFullChecks(SegmentCountStep.NAME, ONE_DAY, MAX_RETRIES)
147144
)
148145
//
149146
// The next rule has to be commented because of this issue https://github.com/elastic/elasticsearch/issues/96705
@@ -409,17 +406,33 @@ public boolean test(Long now, IndexMetadata indexMetadata) {
409406
* @param maxTimeOn Maximum time that an index should spend on this step.
410407
* @param maxRetries Maximum number of times that a step should be retried.
411408
*/
412-
public record StepRule(String step, TimeValue maxTimeOn, long maxRetries) implements RuleConfig {
413-
static StepRule stepRule(String name, TimeValue maxTimeOn) {
414-
return new StepRule(name, maxTimeOn, 100);
409+
public record StepRule(String step, TimeValue maxTimeOn, Long maxRetries) implements RuleConfig {
410+
411+
public StepRule {
412+
if (maxTimeOn == null && maxRetries == null) {
413+
throw new IllegalArgumentException("At least one of [maxTimeOne or maxRetries] must be defined.");
414+
}
415+
}
416+
417+
static StepRule stepRuleFullChecks(String name, TimeValue maxTimeOn, long maxRetries) {
418+
return new StepRule(name, maxTimeOn, maxRetries);
419+
}
420+
421+
static StepRule stepRuleOnlyCheckPassedTime(String name, TimeValue maxTimeOn) {
422+
return new StepRule(name, maxTimeOn, null);
423+
}
424+
425+
static StepRule stepRuleOnlyCheckRetries(String name, long maxRetries) {
426+
return new StepRule(name, null, maxRetries);
415427
}
416428

417429
@Override
418430
public boolean test(Long now, IndexMetadata indexMetadata) {
419431
var failedStepRetryCount = indexMetadata.getLifecycleExecutionState().failedStepRetryCount();
420432
return step.equals(indexMetadata.getLifecycleExecutionState().step())
421-
&& (maxTimeOn.compareTo(RuleConfig.getElapsedTime(now, indexMetadata.getLifecycleExecutionState().stepTime())) < 0
422-
|| (failedStepRetryCount != null && failedStepRetryCount > maxRetries));
433+
&& (maxTimeOn != null
434+
&& maxTimeOn.compareTo(RuleConfig.getElapsedTime(now, indexMetadata.getLifecycleExecutionState().stepTime())) < 0
435+
|| (maxRetries != null && failedStepRetryCount != null && failedStepRetryCount > maxRetries));
423436
}
424437
}
425438
}

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/RuleConfigTests.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ public void testActionRuleConfig() {
4545
public void testStepRuleConfig() {
4646
var stepName = randomAlphaOfLength(30);
4747
var maxTimeOn = TimeValue.parseTimeValue(randomTimeValue(), "");
48-
var maxRetries = randomIntBetween(11, 100);
48+
var maxRetries = randomLongBetween(11, 100);
4949
var rule = new IlmHealthIndicatorService.StepRule(stepName, maxTimeOn, maxRetries);
5050
var now = System.currentTimeMillis();
5151

@@ -98,13 +98,14 @@ public void testRuleConfigBuilder() {
9898
var now = System.currentTimeMillis();
9999
var lastExecutionTime = System.currentTimeMillis() - TimeValue.timeValueDays(2).millis();
100100
var maxTimeOnStep = TimeValue.timeValueDays(1);
101+
var maxRetries = randomLongBetween(10, 1000);
101102
var expectedAction = "some-action";
102103
var rules = IlmHealthIndicatorService.RuleConfig.Builder.actionRule(expectedAction)
103104
.maxTimeOnAction(TimeValue.timeValueDays(1))
104105
.stepRules(
105-
IlmHealthIndicatorService.StepRule.stepRule("step-1", maxTimeOnStep),
106-
IlmHealthIndicatorService.StepRule.stepRule("step-2", maxTimeOnStep),
107-
IlmHealthIndicatorService.StepRule.stepRule("step-3", maxTimeOnStep)
106+
IlmHealthIndicatorService.StepRule.stepRuleFullChecks("step-1", maxTimeOnStep, maxRetries),
107+
IlmHealthIndicatorService.StepRule.stepRuleFullChecks("step-2", maxTimeOnStep, maxRetries),
108+
IlmHealthIndicatorService.StepRule.stepRuleFullChecks("step-3", maxTimeOnStep, maxRetries)
108109
);
109110

110111
// An unknown action should not satisfy the conditions

0 commit comments

Comments
 (0)