Skip to content
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2189670
SLM WIP
samxbr Jan 31, 2025
64b10ee
add constructor
samxbr Jan 31, 2025
dc371ff
Merge branch 'main' into feature/slm-health
samxbr Jan 31, 2025
a80fb70
remove last_successful_snapshot_timestamp
samxbr Jan 31, 2025
1f35dd0
[CI] Auto commit changes from spotless
Jan 31, 2025
7bc8e1c
add more
samxbr Feb 3, 2025
acf81e8
Merge branch 'main' into feature/slm-health
samxbr Feb 3, 2025
5cbde97
[CI] Auto commit changes from spotless
Feb 3, 2025
b6498b9
Merge branch 'main' into feature/slm-health
samxbr Feb 4, 2025
366bf48
Add integ tests
samxbr Feb 4, 2025
791b591
[CI] Auto commit changes from spotless
Feb 4, 2025
6cf8da2
rename
samxbr Feb 4, 2025
94f6356
rename
samxbr Feb 4, 2025
c18641f
[CI] Auto commit changes from spotless
Feb 4, 2025
fc8694b
Update docs/changelog/121370.yaml
samxbr Feb 4, 2025
17471af
Merge branch 'main' into feature/slm-health
samxbr Feb 4, 2025
22623c5
Update test/framework/src/main/java/org/elasticsearch/test/ESTestCase…
samxbr Feb 5, 2025
bafd842
Merge branch 'main' into feature/slm-health
samxbr Feb 5, 2025
691a4dc
Merge branch 'main' into feature/slm-health
samxbr Feb 10, 2025
96bdf38
rename to unhealthyIfNoSnapshotWithin
samxbr Feb 10, 2025
fd96eea
[CI] Auto commit changes from spotless
Feb 10, 2025
467e930
Update help url
samxbr Feb 10, 2025
331bd31
Merge branch 'main' into feature/slm-health
samxbr Feb 10, 2025
22a65fd
Merge branch 'main' into feature/slm-health
samxbr Feb 12, 2025
de772a0
minor
samxbr Feb 12, 2025
c86f2da
[CI] Auto commit changes from spotless
Feb 12, 2025
d844589
change from comments
samxbr Feb 13, 2025
14fa3ce
Merge branch 'main' into feature/slm-health
samxbr Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/121370.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 121370
summary: Improve SLM Health Indicator to cover missing snapshot
area: ILM+SLM
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public class TimeValue implements Comparable<TimeValue> {
public static final TimeValue MAX_VALUE = new TimeValue(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
public static final TimeValue THIRTY_SECONDS = new TimeValue(30, TimeUnit.SECONDS);
public static final TimeValue ONE_MINUTE = new TimeValue(1, TimeUnit.MINUTES);
public static final TimeValue ONE_HOUR = new TimeValue(1, TimeUnit.HOURS);

private static final long C0 = 1L;
private static final long C1 = C0 * 1000L;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ static TransportVersion def(int id) {
public static final TransportVersion ESQL_DRIVER_TASK_DESCRIPTION = def(9_005_0_00);
public static final TransportVersion ESQL_RETRY_ON_SHARD_LEVEL_FAILURE = def(9_006_0_00);
public static final TransportVersion ESQL_PROFILE_ASYNC_NANOS = def(9_007_00_0);

public static final TransportVersion SLM_UNHEALTHY_IF_NO_SNAPSHOT_WITHIN = def(9_008_0_00);
/*
* STOP! READ THIS FIRST! No, really,
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1459,7 +1459,8 @@ public static void assertBusy(CheckedRunnable<Exception> codeBlock) throws Excep
}

/**
* Runs the code block for the provided interval, waiting for no assertions to trip.
* Runs the code block for the provided interval, waiting for no assertions to trip. Retries on AssertionError
* with exponential backoff until provided time runs out
Comment on lines -1462 to +1463
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change really necessary? I don't really feel like this change is super valuable (it's not wrong, just not absolutely necessary) and since it's the only change in this file, I'd prefer to revert it to reduce the scope of the PR a bit.

Copy link
Contributor Author

@samxbr samxbr Feb 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think adding the comment makes it clearer that it will retry until timeout, I wasn't sure about whether it will retry or not from the original comment, and had to read the code. Maybe it's just me though

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm yeah that's why I said you're not wrong. I'm just not a huge fan of PRs changing unrelated files. I prefer PRs to have a more defined scope to make them easier to read and for commit history to be a bit more understandable.

Copy link
Contributor Author

@samxbr samxbr Feb 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get what you mean, and I would usually agree with you for changes that are less trivial than this. It's pretty common for people to find opportunities for tiny improvements while working on unrelated code. I think it's adding unnecessary friction if we were to split every tiny improvements to separate PRs (like a PR just to change this line of comment). To me it's more important to continuously make tiny improvements to the code base than trying to keep every PR within its scope.

That being said, I agree with you 100% on keeping PR in scope. It's just this one feels too trivial to be "out-of-scope".

*/
public static void assertBusy(CheckedRunnable<Exception> codeBlock, long maxWaitTime, TimeUnit unit) throws Exception {
long maxTimeInMillis = TimeUnit.MILLISECONDS.convert(maxWaitTime, unit);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

package org.elasticsearch.xpack.core.slm;

import org.elasticsearch.TransportVersions;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotRequest;
import org.elasticsearch.cluster.SimpleDiffable;
Expand All @@ -20,6 +21,7 @@
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.snapshots.SnapshotsService;
import org.elasticsearch.xcontent.ConstructingObjectParser;
import org.elasticsearch.xcontent.ObjectParser;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.ToXContentObject;
import org.elasticsearch.xcontent.XContentBuilder;
Expand Down Expand Up @@ -52,12 +54,14 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle
private final Map<String, Object> configuration;
private final SnapshotRetentionConfiguration retentionPolicy;
private final boolean isCronSchedule;
private final TimeValue unhealthyIfNoSnapshotWithin;

private static final ParseField NAME = new ParseField("name");
private static final ParseField SCHEDULE = new ParseField("schedule");
private static final ParseField REPOSITORY = new ParseField("repository");
private static final ParseField CONFIG = new ParseField("config");
private static final ParseField RETENTION = new ParseField("retention");
private static final ParseField UNHEALTHY_IF_NO_SNAPSHOT_WITHIN = new ParseField("unhealthy_if_no_snapshot_within");
private static final String METADATA_FIELD_NAME = "metadata";

@SuppressWarnings("unchecked")
Expand All @@ -70,7 +74,8 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle
String repo = (String) a[2];
Map<String, Object> config = (Map<String, Object>) a[3];
SnapshotRetentionConfiguration retention = (SnapshotRetentionConfiguration) a[4];
return new SnapshotLifecyclePolicy(id, name, schedule, repo, config, retention);
TimeValue unhealthyIfNoSnapshotWithin = (TimeValue) a[5];
return new SnapshotLifecyclePolicy(id, name, schedule, repo, config, retention, unhealthyIfNoSnapshotWithin);
}
);

Expand All @@ -80,6 +85,12 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle
PARSER.declareString(ConstructingObjectParser.constructorArg(), REPOSITORY);
PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.map(), CONFIG);
PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), SnapshotRetentionConfiguration::parse, RETENTION);
PARSER.declareField(
ConstructingObjectParser.optionalConstructorArg(),
(p, c) -> TimeValue.parseTimeValue(p.text(), UNHEALTHY_IF_NO_SNAPSHOT_WITHIN.getPreferredName()),
UNHEALTHY_IF_NO_SNAPSHOT_WITHIN,
ObjectParser.ValueType.STRING
);
}

public SnapshotLifecyclePolicy(
Expand All @@ -89,13 +100,26 @@ public SnapshotLifecyclePolicy(
final String repository,
@Nullable final Map<String, Object> configuration,
@Nullable final SnapshotRetentionConfiguration retentionPolicy
) {
this(id, name, schedule, repository, configuration, retentionPolicy, null);
}

public SnapshotLifecyclePolicy(
final String id,
final String name,
final String schedule,
final String repository,
@Nullable final Map<String, Object> configuration,
@Nullable final SnapshotRetentionConfiguration retentionPolicy,
@Nullable final TimeValue unhealthyIfNoSnapshotWithin
) {
this.id = Objects.requireNonNull(id, "policy id is required");
this.name = Objects.requireNonNull(name, "policy snapshot name is required");
this.schedule = Objects.requireNonNull(schedule, "policy schedule is required");
this.repository = Objects.requireNonNull(repository, "policy snapshot repository is required");
this.configuration = configuration;
this.retentionPolicy = retentionPolicy;
this.unhealthyIfNoSnapshotWithin = unhealthyIfNoSnapshotWithin;
this.isCronSchedule = isCronSchedule(schedule);
}

Expand All @@ -106,6 +130,9 @@ public SnapshotLifecyclePolicy(StreamInput in) throws IOException {
this.repository = in.readString();
this.configuration = in.readGenericMap();
this.retentionPolicy = in.readOptionalWriteable(SnapshotRetentionConfiguration::new);
this.unhealthyIfNoSnapshotWithin = in.getTransportVersion().onOrAfter(TransportVersions.SLM_UNHEALTHY_IF_NO_SNAPSHOT_WITHIN)
? in.readOptionalTimeValue()
: null;
this.isCronSchedule = isCronSchedule(schedule);
}

Expand Down Expand Up @@ -135,6 +162,11 @@ public SnapshotRetentionConfiguration getRetentionPolicy() {
return this.retentionPolicy;
}

@Nullable
public TimeValue getUnhealthyIfNoSnapshotWithin() {
return this.unhealthyIfNoSnapshotWithin;
}

boolean isCronSchedule() {
return this.isCronSchedule;
}
Expand Down Expand Up @@ -236,6 +268,7 @@ public ActionRequestValidationException validate() {

// Schedule validation
// n.b. there's more validation beyond this in SnapshotLifecycleService#validateMinimumInterval
boolean canValidateUnhealthyIfNoSnapshotWithin = false; // true if schedule is syntactically valid
if (Strings.hasText(schedule) == false) {
err.addValidationError("invalid schedule [" + schedule + "]: must not be empty");
} else {
Expand All @@ -244,13 +277,35 @@ public ActionRequestValidationException validate() {
if (intervalTimeValue.millis() == 0) {
err.addValidationError("invalid schedule [" + schedule + "]: time unit must be at least 1 millisecond");
}
canValidateUnhealthyIfNoSnapshotWithin = true;
} catch (IllegalArgumentException e1) {
if (isCronSchedule(schedule) == false) {
err.addValidationError("invalid schedule [" + schedule + "]: must be a valid cron expression or time unit");
} else {
canValidateUnhealthyIfNoSnapshotWithin = true;
}
}
}

// validate unhealthyIfNoSnapshotWithin if schedule is syntactically valid
if (canValidateUnhealthyIfNoSnapshotWithin) {
TimeValue snapshotInterval = calculateNextInterval(Clock.systemUTC());
if (unhealthyIfNoSnapshotWithin != null
&& snapshotInterval.duration() > 0
&& unhealthyIfNoSnapshotWithin.compareTo(snapshotInterval) < 0) {
err.addValidationError(
"invalid unhealthyIfNoSnapshotWithin ["
+ unhealthyIfNoSnapshotWithin.getStringRep()
+ "]: "
+ "time is too short, expecting at least more than the interval between snapshots ["
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm on the fence about this validation, on one hand, should we actually prevent someone from making the threshold too small, given that they can always manually execute the SLM policy to take a snapshot? I'm not sure it's worth it. What do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that this validation is not strictly necessary, the worse case without it is that the indicator will always be YELLOW if the threshold is set too small.

I am slightly leaning towards keeping it, my philosophy here is to providing quick feedback for user over letting them find out through health indicator. I can't think of a use case where user would set this threshold smaller than the snapshot interval. I also can't think of a drawback for keeping this validation, it's ok if user occasionally execute SLM manually, it just resets the threshold time, and doesn't defeat the purpose of this setting. We can definitely remove this validation if it can have potential negative impact I didn't think of.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm leaning towards keeping the validation as well. I also don't really see a reason why we would allow users to do this, but I do see value in avoiding support tickets where users to do this. Plus, we get unhealthyIfNoSnapshotWithin > 0 validation for free.

+ snapshotInterval.toHumanReadableString(2)
+ "] for schedule ["
+ schedule
+ "]"
);
}
}

if (configuration != null && configuration.containsKey(METADATA_FIELD_NAME)) {
if (configuration.get(METADATA_FIELD_NAME) instanceof Map == false) {
err.addValidationError(
Expand Down Expand Up @@ -297,7 +352,7 @@ public ActionRequestValidationException validate() {
err.addValidationError("invalid repository name [" + repository + "]: cannot be empty");
}

return err.validationErrors().size() == 0 ? null : err;
return err.validationErrors().isEmpty() ? null : err;
}

private Map<String, Object> addPolicyNameToMetadata(final Map<String, Object> metadata) {
Expand Down Expand Up @@ -339,6 +394,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeString(this.repository);
out.writeGenericMap(this.configuration);
out.writeOptionalWriteable(this.retentionPolicy);
if (out.getTransportVersion().onOrAfter(TransportVersions.SLM_UNHEALTHY_IF_NO_SNAPSHOT_WITHIN)) {
out.writeOptionalTimeValue(this.unhealthyIfNoSnapshotWithin);
}
}

@Override
Expand All @@ -353,13 +411,16 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
if (this.retentionPolicy != null) {
builder.field(RETENTION.getPreferredName(), this.retentionPolicy);
}
if (this.unhealthyIfNoSnapshotWithin != null) {
builder.field(UNHEALTHY_IF_NO_SNAPSHOT_WITHIN.getPreferredName(), this.unhealthyIfNoSnapshotWithin);
}
builder.endObject();
return builder;
}

@Override
public int hashCode() {
return Objects.hash(id, name, schedule, repository, configuration, retentionPolicy);
return Objects.hash(id, name, schedule, repository, configuration, retentionPolicy, unhealthyIfNoSnapshotWithin);
}

@Override
Expand All @@ -377,7 +438,8 @@ public boolean equals(Object obj) {
&& Objects.equals(schedule, other.schedule)
&& Objects.equals(repository, other.repository)
&& Objects.equals(configuration, other.configuration)
&& Objects.equals(retentionPolicy, other.retentionPolicy);
&& Objects.equals(retentionPolicy, other.retentionPolicy)
&& Objects.equals(unhealthyIfNoSnapshotWithin, other.unhealthyIfNoSnapshotWithin);
}

@Override
Expand Down
Loading