-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Improve SLM Health Indicator to cover missing snapshot #121370
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 26 commits
2189670
64b10ee
dc371ff
a80fb70
1f35dd0
7bc8e1c
acf81e8
5cbde97
b6498b9
366bf48
791b591
6cf8da2
94f6356
c18641f
fc8694b
17471af
22623c5
bafd842
691a4dc
96bdf38
fd96eea
467e930
331bd31
22a65fd
de772a0
c86f2da
d844589
14fa3ce
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| pr: 121370 | ||
| summary: Improve SLM Health Indicator to cover missing snapshot | ||
| area: ILM+SLM | ||
| type: enhancement | ||
| issues: [] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,7 @@ | |
|
|
||
| package org.elasticsearch.xpack.core.slm; | ||
|
|
||
| import org.elasticsearch.TransportVersions; | ||
| import org.elasticsearch.action.ActionRequestValidationException; | ||
| import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotRequest; | ||
| import org.elasticsearch.cluster.SimpleDiffable; | ||
|
|
@@ -52,12 +53,14 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle | |
| private final Map<String, Object> configuration; | ||
| private final SnapshotRetentionConfiguration retentionPolicy; | ||
| private final boolean isCronSchedule; | ||
| private final TimeValue unhealthyIfNoSnapshotWithin; | ||
|
|
||
| private static final ParseField NAME = new ParseField("name"); | ||
| private static final ParseField SCHEDULE = new ParseField("schedule"); | ||
| private static final ParseField REPOSITORY = new ParseField("repository"); | ||
| private static final ParseField CONFIG = new ParseField("config"); | ||
| private static final ParseField RETENTION = new ParseField("retention"); | ||
| private static final ParseField UNHEALTHY_IF_NO_SNAPSHOT_WITHIN = new ParseField("unhealthy_if_no_snapshot_within"); | ||
| private static final String METADATA_FIELD_NAME = "metadata"; | ||
|
|
||
| @SuppressWarnings("unchecked") | ||
|
|
@@ -70,7 +73,8 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle | |
| String repo = (String) a[2]; | ||
| Map<String, Object> config = (Map<String, Object>) a[3]; | ||
| SnapshotRetentionConfiguration retention = (SnapshotRetentionConfiguration) a[4]; | ||
| return new SnapshotLifecyclePolicy(id, name, schedule, repo, config, retention); | ||
| TimeValue unhealthyIfNoSnapshotWithin = (TimeValue) a[5]; | ||
| return new SnapshotLifecyclePolicy(id, name, schedule, repo, config, retention, unhealthyIfNoSnapshotWithin); | ||
| } | ||
| ); | ||
|
|
||
|
|
@@ -80,6 +84,11 @@ public class SnapshotLifecyclePolicy implements SimpleDiffable<SnapshotLifecycle | |
| PARSER.declareString(ConstructingObjectParser.constructorArg(), REPOSITORY); | ||
| PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.map(), CONFIG); | ||
| PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), SnapshotRetentionConfiguration::parse, RETENTION); | ||
| PARSER.declareString( | ||
| ConstructingObjectParser.optionalConstructorArg(), | ||
| value -> TimeValue.parseTimeValue(value, UNHEALTHY_IF_NO_SNAPSHOT_WITHIN.getPreferredName()), | ||
| UNHEALTHY_IF_NO_SNAPSHOT_WITHIN | ||
| ); | ||
| } | ||
|
|
||
| public SnapshotLifecyclePolicy( | ||
|
|
@@ -89,13 +98,26 @@ public SnapshotLifecyclePolicy( | |
| final String repository, | ||
| @Nullable final Map<String, Object> configuration, | ||
| @Nullable final SnapshotRetentionConfiguration retentionPolicy | ||
| ) { | ||
| this(id, name, schedule, repository, configuration, retentionPolicy, null); | ||
| } | ||
|
|
||
| public SnapshotLifecyclePolicy( | ||
| final String id, | ||
| final String name, | ||
| final String schedule, | ||
| final String repository, | ||
| @Nullable final Map<String, Object> configuration, | ||
| @Nullable final SnapshotRetentionConfiguration retentionPolicy, | ||
| @Nullable final TimeValue unhealthyIfNoSnapshotWithin | ||
| ) { | ||
| this.id = Objects.requireNonNull(id, "policy id is required"); | ||
| this.name = Objects.requireNonNull(name, "policy snapshot name is required"); | ||
| this.schedule = Objects.requireNonNull(schedule, "policy schedule is required"); | ||
| this.repository = Objects.requireNonNull(repository, "policy snapshot repository is required"); | ||
| this.configuration = configuration; | ||
| this.retentionPolicy = retentionPolicy; | ||
| this.unhealthyIfNoSnapshotWithin = unhealthyIfNoSnapshotWithin; | ||
| this.isCronSchedule = isCronSchedule(schedule); | ||
| } | ||
|
|
||
|
|
@@ -106,6 +128,9 @@ public SnapshotLifecyclePolicy(StreamInput in) throws IOException { | |
| this.repository = in.readString(); | ||
| this.configuration = in.readGenericMap(); | ||
| this.retentionPolicy = in.readOptionalWriteable(SnapshotRetentionConfiguration::new); | ||
| this.unhealthyIfNoSnapshotWithin = in.getTransportVersion().onOrAfter(TransportVersions.SLM_UNHEALTHY_IF_NO_SNAPSHOT_WITHIN) | ||
| ? in.readOptionalTimeValue() | ||
| : null; | ||
| this.isCronSchedule = isCronSchedule(schedule); | ||
| } | ||
|
|
||
|
|
@@ -135,6 +160,11 @@ public SnapshotRetentionConfiguration getRetentionPolicy() { | |
| return this.retentionPolicy; | ||
| } | ||
|
|
||
| @Nullable | ||
| public TimeValue getUnhealthyIfNoSnapshotWithin() { | ||
| return this.unhealthyIfNoSnapshotWithin; | ||
| } | ||
|
|
||
| boolean isCronSchedule() { | ||
| return this.isCronSchedule; | ||
| } | ||
|
|
@@ -236,6 +266,7 @@ public ActionRequestValidationException validate() { | |
|
|
||
| // Schedule validation | ||
| // n.b. there's more validation beyond this in SnapshotLifecycleService#validateMinimumInterval | ||
| boolean canValidateUnhealthyIfNoSnapshotWithin = false; // true if schedule is syntactically valid | ||
| if (Strings.hasText(schedule) == false) { | ||
| err.addValidationError("invalid schedule [" + schedule + "]: must not be empty"); | ||
| } else { | ||
|
|
@@ -244,13 +275,35 @@ public ActionRequestValidationException validate() { | |
| if (intervalTimeValue.millis() == 0) { | ||
| err.addValidationError("invalid schedule [" + schedule + "]: time unit must be at least 1 millisecond"); | ||
| } | ||
| canValidateUnhealthyIfNoSnapshotWithin = true; | ||
| } catch (IllegalArgumentException e1) { | ||
| if (isCronSchedule(schedule) == false) { | ||
| err.addValidationError("invalid schedule [" + schedule + "]: must be a valid cron expression or time unit"); | ||
| } else { | ||
| canValidateUnhealthyIfNoSnapshotWithin = true; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // validate unhealthyIfNoSnapshotWithin if schedule is syntactically valid | ||
| if (canValidateUnhealthyIfNoSnapshotWithin) { | ||
| TimeValue snapshotInterval = calculateNextInterval(Clock.systemUTC()); | ||
| if (unhealthyIfNoSnapshotWithin != null | ||
| && snapshotInterval.duration() > 0 | ||
| && unhealthyIfNoSnapshotWithin.compareTo(snapshotInterval) < 0) { | ||
| err.addValidationError( | ||
| "invalid unhealthy_if_no_snapshot_within [" | ||
| + unhealthyIfNoSnapshotWithin.getStringRep() | ||
| + "]: " | ||
| + "time is too short, expecting at least more than the interval between snapshots [" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm on the fence about this validation, on one hand, should we actually prevent someone from making the threshold too small, given that they can always manually execute the SLM policy to take a snapshot? I'm not sure it's worth it. What do you think?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that this validation is not strictly necessary, the worse case without it is that the indicator will always be YELLOW if the threshold is set too small. I am slightly leaning towards keeping it, my philosophy here is to providing quick feedback for user over letting them find out through health indicator. I can't think of a use case where user would set this threshold smaller than the snapshot interval. I also can't think of a drawback for keeping this validation, it's ok if user occasionally execute SLM manually, it just resets the threshold time, and doesn't defeat the purpose of this setting. We can definitely remove this validation if it can have potential negative impact I didn't think of.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm leaning towards keeping the validation as well. I also don't really see a reason why we would allow users to do this, but I do see value in avoiding support tickets where users to do this. Plus, we get |
||
| + snapshotInterval.toHumanReadableString(2) | ||
| + "] for schedule [" | ||
| + schedule | ||
| + "]" | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| if (configuration != null && configuration.containsKey(METADATA_FIELD_NAME)) { | ||
| if (configuration.get(METADATA_FIELD_NAME) instanceof Map == false) { | ||
| err.addValidationError( | ||
|
|
@@ -297,7 +350,7 @@ public ActionRequestValidationException validate() { | |
| err.addValidationError("invalid repository name [" + repository + "]: cannot be empty"); | ||
| } | ||
|
|
||
| return err.validationErrors().size() == 0 ? null : err; | ||
| return err.validationErrors().isEmpty() ? null : err; | ||
| } | ||
|
|
||
| private Map<String, Object> addPolicyNameToMetadata(final Map<String, Object> metadata) { | ||
|
|
@@ -339,6 +392,9 @@ public void writeTo(StreamOutput out) throws IOException { | |
| out.writeString(this.repository); | ||
| out.writeGenericMap(this.configuration); | ||
| out.writeOptionalWriteable(this.retentionPolicy); | ||
| if (out.getTransportVersion().onOrAfter(TransportVersions.SLM_UNHEALTHY_IF_NO_SNAPSHOT_WITHIN)) { | ||
| out.writeOptionalTimeValue(this.unhealthyIfNoSnapshotWithin); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -353,13 +409,16 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws | |
| if (this.retentionPolicy != null) { | ||
| builder.field(RETENTION.getPreferredName(), this.retentionPolicy); | ||
| } | ||
| if (this.unhealthyIfNoSnapshotWithin != null) { | ||
| builder.field(UNHEALTHY_IF_NO_SNAPSHOT_WITHIN.getPreferredName(), this.unhealthyIfNoSnapshotWithin); | ||
| } | ||
| builder.endObject(); | ||
| return builder; | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(id, name, schedule, repository, configuration, retentionPolicy); | ||
| return Objects.hash(id, name, schedule, repository, configuration, retentionPolicy, unhealthyIfNoSnapshotWithin); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -377,7 +436,8 @@ public boolean equals(Object obj) { | |
| && Objects.equals(schedule, other.schedule) | ||
| && Objects.equals(repository, other.repository) | ||
| && Objects.equals(configuration, other.configuration) | ||
| && Objects.equals(retentionPolicy, other.retentionPolicy); | ||
| && Objects.equals(retentionPolicy, other.retentionPolicy) | ||
| && Objects.equals(unhealthyIfNoSnapshotWithin, other.unhealthyIfNoSnapshotWithin); | ||
| } | ||
|
|
||
| @Override | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this change really necessary? I don't really feel like this change is super valuable (it's not wrong, just not absolutely necessary) and since it's the only change in this file, I'd prefer to revert it to reduce the scope of the PR a bit.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think adding the comment makes it clearer that it will retry until timeout, I wasn't sure about whether it will retry or not from the original comment, and had to read the code. Maybe it's just me though
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm yeah that's why I said you're not wrong. I'm just not a huge fan of PRs changing unrelated files. I prefer PRs to have a more defined scope to make them easier to read and for commit history to be a bit more understandable.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I get what you mean, and I would usually agree with you for changes that are less trivial than this. It's pretty common for people to find opportunities for tiny improvements while working on unrelated code. I think it's adding unnecessary friction if we were to split every tiny improvements to separate PRs (like a PR just to change this line of comment). To me it's more important to continuously make tiny improvements to the code base than trying to keep every PR within its scope.
That being said, I agree with you 100% on keeping PR in scope. It's just this one feels too trivial to be "out-of-scope".