Skip to content

Commit 5d48ded

Browse files
authored
Improve SLM Health Indicator to cover missing snapshot (#121370)
Currently the SLM health indicator in health report turns YELLOW when snapshots fail for a number of times. However, the SLM health indicator stays GREEN if snapshot is not completed (no success or failure) for a long time. This change adds a new optional setting unhealthy_if_no_snapshot_within to SLM policy, that sets a time threshold. If the SLM policy has not had a successful snapshot for longer than the threshold, the SLM health indicator will turn YELLOW.
1 parent 542c5d9 commit 5d48ded

File tree

9 files changed

+930
-153
lines changed

9 files changed

+930
-153
lines changed

docs/changelog/121370.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 121370
2+
summary: Improve SLM Health Indicator to cover missing snapshot
3+
area: ILM+SLM
4+
type: enhancement
5+
issues: []

libs/core/src/main/java/org/elasticsearch/core/TimeValue.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ public class TimeValue implements Comparable<TimeValue> {
2323
public static final TimeValue MAX_VALUE = new TimeValue(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
2424
public static final TimeValue THIRTY_SECONDS = new TimeValue(30, TimeUnit.SECONDS);
2525
public static final TimeValue ONE_MINUTE = new TimeValue(1, TimeUnit.MINUTES);
26+
public static final TimeValue ONE_HOUR = new TimeValue(1, TimeUnit.HOURS);
2627

2728
private static final long C0 = 1L;
2829
private static final long C1 = C0 * 1000L;

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ static TransportVersion def(int id) {
189189
public static final TransportVersion ESQL_PROFILE_ASYNC_NANOS = def(9_007_00_0);
190190
public static final TransportVersion ESQL_LOOKUP_JOIN_SOURCE_TEXT = def(9_008_0_00);
191191
public static final TransportVersion REMOVE_ALL_APPLICABLE_SELECTOR = def(9_009_0_00);
192+
public static final TransportVersion SLM_UNHEALTHY_IF_NO_SNAPSHOT_WITHIN = def(9_010_0_00);
192193

193194
/*
194195
* STOP! READ THIS FIRST! No, really,

test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1459,7 +1459,8 @@ public static void assertBusy(CheckedRunnable<Exception> codeBlock) throws Excep
14591459
}
14601460

14611461
/**
1462-
* Runs the code block for the provided interval, waiting for no assertions to trip.
1462+
* Runs the code block for the provided interval, waiting for no assertions to trip. Retries on AssertionError
1463+
* with exponential backoff until provided time runs out
14631464
*/
14641465
public static void assertBusy(CheckedRunnable<Exception> codeBlock, long maxWaitTime, TimeUnit unit) throws Exception {
14651466
long maxTimeInMillis = TimeUnit.MILLISECONDS.convert(maxWaitTime, unit);

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotLifecyclePolicy.java

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
package org.elasticsearch.xpack.core.slm;
99

10+
import org.elasticsearch.TransportVersions;
1011
import org.elasticsearch.action.ActionRequestValidationException;
1112
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotRequest;
1213
import org.elasticsearch.cluster.SimpleDiffable;
@@ -52,12 +53,14 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle
5253
private final Map<String, Object> configuration;
5354
private final SnapshotRetentionConfiguration retentionPolicy;
5455
private final boolean isCronSchedule;
56+
private final TimeValue unhealthyIfNoSnapshotWithin;
5557

5658
private static final ParseField NAME = new ParseField("name");
5759
private static final ParseField SCHEDULE = new ParseField("schedule");
5860
private static final ParseField REPOSITORY = new ParseField("repository");
5961
private static final ParseField CONFIG = new ParseField("config");
6062
private static final ParseField RETENTION = new ParseField("retention");
63+
private static final ParseField UNHEALTHY_IF_NO_SNAPSHOT_WITHIN = new ParseField("unhealthy_if_no_snapshot_within");
6164
private static final String METADATA_FIELD_NAME = "metadata";
6265

6366
@SuppressWarnings("unchecked")
@@ -70,7 +73,8 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle
7073
String repo = (String) a[2];
7174
Map<String, Object> config = (Map<String, Object>) a[3];
7275
SnapshotRetentionConfiguration retention = (SnapshotRetentionConfiguration) a[4];
73-
return new SnapshotLifecyclePolicy(id, name, schedule, repo, config, retention);
76+
TimeValue unhealthyIfNoSnapshotWithin = (TimeValue) a[5];
77+
return new SnapshotLifecyclePolicy(id, name, schedule, repo, config, retention, unhealthyIfNoSnapshotWithin);
7478
}
7579
);
7680

@@ -80,6 +84,11 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle
8084
PARSER.declareString(ConstructingObjectParser.constructorArg(), REPOSITORY);
8185
PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.map(), CONFIG);
8286
PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), SnapshotRetentionConfiguration::parse, RETENTION);
87+
PARSER.declareString(
88+
ConstructingObjectParser.optionalConstructorArg(),
89+
value -> TimeValue.parseTimeValue(value, UNHEALTHY_IF_NO_SNAPSHOT_WITHIN.getPreferredName()),
90+
UNHEALTHY_IF_NO_SNAPSHOT_WITHIN
91+
);
8392
}
8493

8594
public SnapshotLifecyclePolicy(
@@ -89,13 +98,26 @@ public SnapshotLifecyclePolicy(
8998
final String repository,
9099
@Nullable final Map<String, Object> configuration,
91100
@Nullable final SnapshotRetentionConfiguration retentionPolicy
101+
) {
102+
this(id, name, schedule, repository, configuration, retentionPolicy, null);
103+
}
104+
105+
public SnapshotLifecyclePolicy(
106+
final String id,
107+
final String name,
108+
final String schedule,
109+
final String repository,
110+
@Nullable final Map<String, Object> configuration,
111+
@Nullable final SnapshotRetentionConfiguration retentionPolicy,
112+
@Nullable final TimeValue unhealthyIfNoSnapshotWithin
92113
) {
93114
this.id = Objects.requireNonNull(id, "policy id is required");
94115
this.name = Objects.requireNonNull(name, "policy snapshot name is required");
95116
this.schedule = Objects.requireNonNull(schedule, "policy schedule is required");
96117
this.repository = Objects.requireNonNull(repository, "policy snapshot repository is required");
97118
this.configuration = configuration;
98119
this.retentionPolicy = retentionPolicy;
120+
this.unhealthyIfNoSnapshotWithin = unhealthyIfNoSnapshotWithin;
99121
this.isCronSchedule = isCronSchedule(schedule);
100122
}
101123

@@ -106,6 +128,9 @@ public SnapshotLifecyclePolicy(StreamInput in) throws IOException {
106128
this.repository = in.readString();
107129
this.configuration = in.readGenericMap();
108130
this.retentionPolicy = in.readOptionalWriteable(SnapshotRetentionConfiguration::new);
131+
this.unhealthyIfNoSnapshotWithin = in.getTransportVersion().onOrAfter(TransportVersions.SLM_UNHEALTHY_IF_NO_SNAPSHOT_WITHIN)
132+
? in.readOptionalTimeValue()
133+
: null;
109134
this.isCronSchedule = isCronSchedule(schedule);
110135
}
111136

@@ -135,6 +160,11 @@ public SnapshotRetentionConfiguration getRetentionPolicy() {
135160
return this.retentionPolicy;
136161
}
137162

163+
@Nullable
164+
public TimeValue getUnhealthyIfNoSnapshotWithin() {
165+
return this.unhealthyIfNoSnapshotWithin;
166+
}
167+
138168
boolean isCronSchedule() {
139169
return this.isCronSchedule;
140170
}
@@ -236,6 +266,7 @@ public ActionRequestValidationException validate() {
236266

237267
// Schedule validation
238268
// n.b. there's more validation beyond this in SnapshotLifecycleService#validateMinimumInterval
269+
boolean canValidateUnhealthyIfNoSnapshotWithin = false; // true if schedule is syntactically valid
239270
if (Strings.hasText(schedule) == false) {
240271
err.addValidationError("invalid schedule [" + schedule + "]: must not be empty");
241272
} else {
@@ -244,13 +275,35 @@ public ActionRequestValidationException validate() {
244275
if (intervalTimeValue.millis() == 0) {
245276
err.addValidationError("invalid schedule [" + schedule + "]: time unit must be at least 1 millisecond");
246277
}
278+
canValidateUnhealthyIfNoSnapshotWithin = true;
247279
} catch (IllegalArgumentException e1) {
248280
if (isCronSchedule(schedule) == false) {
249281
err.addValidationError("invalid schedule [" + schedule + "]: must be a valid cron expression or time unit");
282+
} else {
283+
canValidateUnhealthyIfNoSnapshotWithin = true;
250284
}
251285
}
252286
}
253287

288+
// validate unhealthyIfNoSnapshotWithin if schedule is syntactically valid
289+
if (canValidateUnhealthyIfNoSnapshotWithin) {
290+
TimeValue snapshotInterval = calculateNextInterval(Clock.systemUTC());
291+
if (unhealthyIfNoSnapshotWithin != null
292+
&& snapshotInterval.duration() > 0
293+
&& unhealthyIfNoSnapshotWithin.compareTo(snapshotInterval) < 0) {
294+
err.addValidationError(
295+
"invalid unhealthy_if_no_snapshot_within ["
296+
+ unhealthyIfNoSnapshotWithin.getStringRep()
297+
+ "]: "
298+
+ "time is too short, expecting at least more than the interval between snapshots ["
299+
+ snapshotInterval.toHumanReadableString(2)
300+
+ "] for schedule ["
301+
+ schedule
302+
+ "]"
303+
);
304+
}
305+
}
306+
254307
if (configuration != null && configuration.containsKey(METADATA_FIELD_NAME)) {
255308
if (configuration.get(METADATA_FIELD_NAME) instanceof Map == false) {
256309
err.addValidationError(
@@ -297,7 +350,7 @@ public ActionRequestValidationException validate() {
297350
err.addValidationError("invalid repository name [" + repository + "]: cannot be empty");
298351
}
299352

300-
return err.validationErrors().size() == 0 ? null : err;
353+
return err.validationErrors().isEmpty() ? null : err;
301354
}
302355

303356
private Map<String, Object> addPolicyNameToMetadata(final Map<String, Object> metadata) {
@@ -339,6 +392,9 @@ public void writeTo(StreamOutput out) throws IOException {
339392
out.writeString(this.repository);
340393
out.writeGenericMap(this.configuration);
341394
out.writeOptionalWriteable(this.retentionPolicy);
395+
if (out.getTransportVersion().onOrAfter(TransportVersions.SLM_UNHEALTHY_IF_NO_SNAPSHOT_WITHIN)) {
396+
out.writeOptionalTimeValue(this.unhealthyIfNoSnapshotWithin);
397+
}
342398
}
343399

344400
@Override
@@ -353,13 +409,16 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
353409
if (this.retentionPolicy != null) {
354410
builder.field(RETENTION.getPreferredName(), this.retentionPolicy);
355411
}
412+
if (this.unhealthyIfNoSnapshotWithin != null) {
413+
builder.field(UNHEALTHY_IF_NO_SNAPSHOT_WITHIN.getPreferredName(), this.unhealthyIfNoSnapshotWithin);
414+
}
356415
builder.endObject();
357416
return builder;
358417
}
359418

360419
@Override
361420
public int hashCode() {
362-
return Objects.hash(id, name, schedule, repository, configuration, retentionPolicy);
421+
return Objects.hash(id, name, schedule, repository, configuration, retentionPolicy, unhealthyIfNoSnapshotWithin);
363422
}
364423

365424
@Override
@@ -377,7 +436,8 @@ public boolean equals(Object obj) {
377436
&& Objects.equals(schedule, other.schedule)
378437
&& Objects.equals(repository, other.repository)
379438
&& Objects.equals(configuration, other.configuration)
380-
&& Objects.equals(retentionPolicy, other.retentionPolicy);
439+
&& Objects.equals(retentionPolicy, other.retentionPolicy)
440+
&& Objects.equals(unhealthyIfNoSnapshotWithin, other.unhealthyIfNoSnapshotWithin);
381441
}
382442

383443
@Override

0 commit comments

Comments
 (0)