Skip to content

Commit 87192fb

Browse files
authored
Fix NPE when the start time of the last successful snapshot run is unknown (#95356)
1 parent efcdbbc commit 87192fb

File tree

3 files changed

+89
-17
lines changed

3 files changed

+89
-17
lines changed

docs/changelog/95356.yaml

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
pr: 95356
2+
summary: Fix NPE when the start time of the last successful snapshot run is unknown
3+
area: Health
4+
type: bug
5+
issues: []

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SlmHealthIndicatorService.java

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
import java.time.ZoneOffset;
2727
import java.util.Collection;
2828
import java.util.Collections;
29+
import java.util.Comparator;
2930
import java.util.LinkedHashMap;
3031
import java.util.List;
3132
import java.util.Map;
@@ -133,6 +134,7 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
133134
.values()
134135
.stream()
135136
.filter(metadata -> snapshotFailuresExceedWarningCount(failedSnapshotWarnThreshold, metadata))
137+
.sorted(Comparator.comparing(SnapshotLifecyclePolicyMetadata::getName))
136138
.toList();
137139

138140
if (unhealthyPolicies.size() > 0) {
@@ -153,9 +155,10 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
153155
+ policy.getName()
154156
+ "] had ["
155157
+ policy.getInvocationsSinceLastSuccess()
156-
+ "] repeated failures without successful execution since ["
157-
+ FORMATTER.formatMillis(policy.getLastSuccess().getSnapshotStartTimestamp())
158-
+ "]"
158+
+ "] repeated failures without successful execution"
159+
+ (policy.getLastSuccess() != null && policy.getLastSuccess().getSnapshotStartTimestamp() != null
160+
? " since [" + FORMATTER.formatMillis(policy.getLastSuccess().getSnapshotStartTimestamp()) + "]"
161+
: "")
159162
)
160163
.collect(Collectors.joining("\n"));
161164
String cause = (unhealthyPolicies.size() > 1

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SlmHealthIndicatorServiceTests.java

Lines changed: 78 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -145,7 +145,7 @@ public void testIsGreenWhenPoliciesHaveFailedForLessThanWarningThreshold() {
145145
var clusterState = createClusterStateWith(
146146
new SnapshotLifecycleMetadata(
147147
createSlmPolicyWithInvocations(
148-
snapshotInvocation(execTime, execTime + 1000L),
148+
snapshotInvocation(randomBoolean() ? null : execTime, execTime + 1000L),
149149
snapshotInvocation(null, execTime + window + 1000L),
150150
randomLongBetween(0, 4)
151151
),
@@ -173,13 +173,50 @@ public void testIsGreenWhenPoliciesHaveFailedForLessThanWarningThreshold() {
173173
public void testIsYellowWhenPoliciesHaveFailedForMoreThanWarningThreshold() {
174174
long execTime = System.currentTimeMillis();
175175
long window = TimeUnit.HOURS.toMillis(24) + 5000L; // 24 hours and some extra room.
176-
long failedInvocations = randomLongBetween(5L, Long.MAX_VALUE);
176+
long failedInvocations1 = randomLongBetween(5L, Long.MAX_VALUE);
177+
long failedInvocations2 = randomLongBetween(5L, Long.MAX_VALUE);
178+
long failedInvocations3 = randomLongBetween(5L, Long.MAX_VALUE);
177179
var clusterState = createClusterStateWith(
178180
new SnapshotLifecycleMetadata(
179-
createSlmPolicyWithInvocations(
180-
snapshotInvocation(execTime, execTime + 1000L),
181-
snapshotInvocation(null, execTime + window + 1000L),
182-
failedInvocations
181+
Map.of(
182+
"test-policy",
183+
SnapshotLifecyclePolicyMetadata.builder()
184+
.setPolicy(new SnapshotLifecyclePolicy("policy-id-1", "test-policy", "", "test-repository", null, null))
185+
.setVersion(1L)
186+
.setModifiedDate(System.currentTimeMillis())
187+
.setLastSuccess(snapshotInvocation(execTime, execTime + 1000L))
188+
.setLastFailure(snapshotInvocation(null, execTime + window + 1000L))
189+
.setInvocationsSinceLastSuccess(failedInvocations1)
190+
.build(),
191+
"test-policy-without-any-success",
192+
SnapshotLifecyclePolicyMetadata.builder()
193+
.setPolicy(
194+
new SnapshotLifecyclePolicy("policy-id-2", "test-policy-without-any-success", "", "test-repository", null, null)
195+
)
196+
.setVersion(1L)
197+
.setModifiedDate(System.currentTimeMillis())
198+
.setLastSuccess(null)
199+
.setLastFailure(snapshotInvocation(null, execTime + window + 1000L))
200+
.setInvocationsSinceLastSuccess(failedInvocations2)
201+
.build(),
202+
"test-policy-without-success-start-time",
203+
SnapshotLifecyclePolicyMetadata.builder()
204+
.setPolicy(
205+
new SnapshotLifecyclePolicy(
206+
"policy-id-3",
207+
"test-policy-without-success-start-time",
208+
"",
209+
"test-repository",
210+
null,
211+
null
212+
)
213+
)
214+
.setVersion(1L)
215+
.setModifiedDate(System.currentTimeMillis())
216+
.setLastSuccess(snapshotInvocation(null, execTime))
217+
.setLastFailure(snapshotInvocation(null, execTime + window + 1000L))
218+
.setInvocationsSinceLastSuccess(failedInvocations3)
219+
.build()
183220
),
184221
RUNNING,
185222
null
@@ -194,15 +231,27 @@ public void testIsYellowWhenPoliciesHaveFailedForMoreThanWarningThreshold() {
194231
new HealthIndicatorResult(
195232
NAME,
196233
YELLOW,
197-
"Encountered [1] unhealthy snapshot lifecycle management policies.",
234+
"Encountered [3] unhealthy snapshot lifecycle management policies.",
198235
new SimpleHealthIndicatorDetails(
199236
Map.of(
200237
"slm_status",
201238
RUNNING,
202239
"policies",
203-
1,
240+
3,
204241
"unhealthy_policies",
205-
Map.of("count", 1, "invocations_since_last_success", Map.of("test-policy", failedInvocations))
242+
Map.of(
243+
"count",
244+
3,
245+
"invocations_since_last_success",
246+
Map.of(
247+
"test-policy",
248+
failedInvocations1,
249+
"test-policy-without-any-success",
250+
failedInvocations2,
251+
"test-policy-without-success-start-time",
252+
failedInvocations3
253+
)
254+
)
206255
)
207256
),
208257
Collections.singletonList(
@@ -218,15 +267,30 @@ public void testIsYellowWhenPoliciesHaveFailedForMoreThanWarningThreshold() {
218267
List.of(
219268
new Diagnosis(
220269
SlmHealthIndicatorService.checkRecentlyFailedSnapshots(
221-
"An automated snapshot policy is unhealthy:\n"
270+
"Several automated snapshot policies are unhealthy:\n"
222271
+ "- [test-policy] had ["
223-
+ failedInvocations
272+
+ failedInvocations1
224273
+ "] repeated failures without successful execution since ["
225274
+ FORMATTER.formatMillis(execTime)
226-
+ "]",
227-
"Check the snapshot lifecycle policy for detailed failure info:\n- GET /_slm/policy/policy-id?human"
275+
+ "]\n"
276+
+ "- [test-policy-without-any-success] had ["
277+
+ failedInvocations2
278+
+ "] repeated failures without successful execution\n"
279+
+ "- [test-policy-without-success-start-time] had ["
280+
+ failedInvocations3
281+
+ "] repeated failures without successful execution",
282+
"Check the snapshot lifecycle policies for detailed failure info:\n"
283+
+ "- GET /_slm/policy/policy-id-1?human\n"
284+
+ "- GET /_slm/policy/policy-id-2?human\n"
285+
+ "- GET /_slm/policy/policy-id-3?human"
286+
228287
),
229-
List.of(new Diagnosis.Resource(Type.SLM_POLICY, List.of("test-policy")))
288+
List.of(
289+
new Diagnosis.Resource(
290+
Type.SLM_POLICY,
291+
List.of("test-policy", "test-policy-without-any-success", "test-policy-without-success-start-time")
292+
)
293+
)
230294
)
231295
)
232296
)

0 commit comments

Comments
 (0)