15
15
import org .elasticsearch .action .admin .cluster .snapshots .restore .TransportRestoreSnapshotAction ;
16
16
import org .elasticsearch .action .admin .cluster .snapshots .status .SnapshotStatus ;
17
17
import org .elasticsearch .action .admin .cluster .snapshots .status .SnapshotsStatusResponse ;
18
+ import org .elasticsearch .action .support .master .AcknowledgedResponse ;
18
19
import org .elasticsearch .cluster .SnapshotsInProgress ;
19
20
import org .elasticsearch .cluster .health .ClusterHealthStatus ;
20
21
import org .elasticsearch .cluster .routing .UnassignedInfo ;
@@ -72,6 +73,7 @@ public class SLMSnapshotBlockingIntegTests extends AbstractSnapshotIntegTestCase
72
73
private static final String NEVER_EXECUTE_CRON_SCHEDULE = "* * * 31 FEB ? *" ;
73
74
74
75
static final String REPO = "my-repo" ;
76
+ List <String > masterNodeNames = null ;
75
77
List <String > dataNodeNames = null ;
76
78
77
79
@ Override
@@ -85,7 +87,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
85
87
@ Before
86
88
public void ensureClusterNodes () {
87
89
logger .info ("--> starting enough nodes to ensure we have enough to safely stop for tests" );
88
- internalCluster ().startMasterOnlyNodes (2 );
90
+ masterNodeNames = internalCluster ().startMasterOnlyNodes (2 );
89
91
dataNodeNames = internalCluster ().startDataOnlyNodes (2 );
90
92
ensureGreen ();
91
93
}
@@ -329,6 +331,185 @@ public void testRetentionWithMultipleRepositories() throws Exception {
329
331
testUnsuccessfulSnapshotRetention (randomBoolean ());
330
332
}
331
333
334
+ // Test that SLM stats and lastSuccess/lastFailure are correctly updated with master shutdown
335
+ public void testSLMWithMasterShutdown () throws Exception {
336
+ final String indexName = "test" ;
337
+ final String policyName = "test-policy" ;
338
+ int clusterSize = masterNodeNames .size () + dataNodeNames .size ();
339
+ indexRandomDocs (indexName , 20 );
340
+ createRepository (REPO , "mock" );
341
+
342
+ createSnapshotPolicy (
343
+ policyName ,
344
+ "snap" ,
345
+ NEVER_EXECUTE_CRON_SCHEDULE ,
346
+ REPO ,
347
+ indexName ,
348
+ true ,
349
+ false ,
350
+ new SnapshotRetentionConfiguration (TimeValue .ZERO , null , null )
351
+ );
352
+
353
+ // block snapshot from completing
354
+ blockMasterFromFinalizingSnapshotOnIndexFile (REPO );
355
+
356
+ // first SLM execution
357
+ final String snapshotName = executePolicy (policyName );
358
+ final String initialMaster = internalCluster ().getMasterName ();
359
+ waitForBlock (initialMaster , REPO );
360
+
361
+ // restart master
362
+ internalCluster ().restartNode (initialMaster );
363
+ ensureStableCluster (clusterSize );
364
+ awaitNoMoreRunningOperations ();
365
+
366
+ // ensure snapshot is completed successfully after master failover
367
+ assertBusy (() -> {
368
+ final SnapshotInfo snapshotInfo ;
369
+ try {
370
+ GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin ().prepareGetSnapshots (TEST_REQUEST_TIMEOUT , REPO )
371
+ .setSnapshots (snapshotName )
372
+ .get ();
373
+ snapshotInfo = snapshotsStatusResponse .getSnapshots ().get (0 );
374
+ } catch (SnapshotMissingException sme ) {
375
+ throw new AssertionError (sme );
376
+ }
377
+ assertEquals (SnapshotState .SUCCESS , snapshotInfo .state ());
378
+ }, 30L , TimeUnit .SECONDS );
379
+ assertSnapshotSuccessful (snapshotName );
380
+
381
+ // the SLM policy metadata has not been updated due to master shutdown
382
+ assertBusy (() -> {
383
+ SnapshotLifecyclePolicyItem policy = client ().execute (
384
+ GetSnapshotLifecycleAction .INSTANCE ,
385
+ new GetSnapshotLifecycleAction .Request (TEST_REQUEST_TIMEOUT , TEST_REQUEST_TIMEOUT , policyName )
386
+ ).get ().getPolicies ().getFirst ();
387
+ assertNull (policy .getLastSuccess ());
388
+ assertNull (policy .getLastFailure ());
389
+ assertEquals (0 , policy .getPolicyStats ().getSnapshotFailedCount ());
390
+ assertEquals (0 , policy .getPolicyStats ().getSnapshotTakenCount ());
391
+ });
392
+
393
+ // 2nd SLM execution, it should pick up the last missing stats
394
+ String snapshotSecond = executePolicy (policyName );
395
+
396
+ awaitNoMoreRunningOperations ();
397
+ assertSnapshotSuccessful (snapshotSecond );
398
+
399
+ // stats should have 2 successful snapshots, 1 from the new snapshot and 1 from previous success
400
+ assertBusy (() -> {
401
+ SnapshotLifecyclePolicyItem policy = client ().execute (
402
+ GetSnapshotLifecycleAction .INSTANCE ,
403
+ new GetSnapshotLifecycleAction .Request (TEST_REQUEST_TIMEOUT , TEST_REQUEST_TIMEOUT , policyName )
404
+ ).get ().getPolicies ().getFirst ();
405
+ assertNull (policy .getLastFailure ());
406
+ assertNotNull (policy .getLastSuccess ());
407
+ assertEquals (snapshotSecond , policy .getLastSuccess ().getSnapshotName ());
408
+ assertEquals (0 , policy .getPolicyStats ().getSnapshotFailedCount ());
409
+ assertEquals (2 , policy .getPolicyStats ().getSnapshotTakenCount ());
410
+ });
411
+ }
412
+
413
+ public void testSLMWithMasterShutdownAndDeletedSnapshot () throws Exception {
414
+ final String indexName = "test" ;
415
+ final String policyName = "test-policy" ;
416
+ int clusterSize = masterNodeNames .size () + dataNodeNames .size ();
417
+ indexRandomDocs (indexName , 20 );
418
+ createRepository (REPO , "mock" );
419
+
420
+ createSnapshotPolicy (
421
+ policyName ,
422
+ "snap" ,
423
+ NEVER_EXECUTE_CRON_SCHEDULE ,
424
+ REPO ,
425
+ indexName ,
426
+ true ,
427
+ false ,
428
+ new SnapshotRetentionConfiguration (TimeValue .ZERO , null , null )
429
+ );
430
+
431
+ // block snapshot from completing
432
+ blockMasterFromFinalizingSnapshotOnIndexFile (REPO );
433
+
434
+ // first SLM execution
435
+ final String snapshotName = executePolicy (policyName );
436
+ final String initialMaster = internalCluster ().getMasterName ();
437
+ waitForBlock (initialMaster , REPO );
438
+
439
+ // restart master
440
+ internalCluster ().restartNode (initialMaster );
441
+ ensureStableCluster (clusterSize );
442
+ awaitNoMoreRunningOperations ();
443
+
444
+ // ensure snapshot is completed successfully after master failover
445
+ assertBusy (() -> {
446
+ final SnapshotInfo snapshotInfo ;
447
+ try {
448
+ GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin ().prepareGetSnapshots (TEST_REQUEST_TIMEOUT , REPO )
449
+ .setSnapshots (snapshotName )
450
+ .get ();
451
+ snapshotInfo = snapshotsStatusResponse .getSnapshots ().get (0 );
452
+ } catch (SnapshotMissingException sme ) {
453
+ throw new AssertionError (sme );
454
+ }
455
+ assertEquals (SnapshotState .SUCCESS , snapshotInfo .state ());
456
+ }, 30L , TimeUnit .SECONDS );
457
+ assertSnapshotSuccessful (snapshotName );
458
+
459
+ // the SLM policy metadata has not been updated due to master shutdown
460
+ assertBusy (() -> {
461
+ SnapshotLifecyclePolicyItem policy = client ().execute (
462
+ GetSnapshotLifecycleAction .INSTANCE ,
463
+ new GetSnapshotLifecycleAction .Request (TEST_REQUEST_TIMEOUT , TEST_REQUEST_TIMEOUT , policyName )
464
+ ).get ().getPolicies ().getFirst ();
465
+ assertNull (policy .getLastSuccess ());
466
+ assertNull (policy .getLastFailure ());
467
+ assertEquals (0 , policy .getPolicyStats ().getSnapshotFailedCount ());
468
+ assertEquals (0 , policy .getPolicyStats ().getSnapshotTakenCount ());
469
+ });
470
+
471
+ // delete the snapshot, simulate missing snapshot from repo
472
+ assertBusy (() -> {
473
+ AcknowledgedResponse response = clusterAdmin ().prepareDeleteSnapshot (TEST_REQUEST_TIMEOUT , REPO , snapshotName ).get ();
474
+ assertTrue (response .isAcknowledged ());
475
+ });
476
+
477
+ // 2nd SLM execution, it should pick up the last missing stats
478
+ String snapshotSecond = executePolicy (policyName );
479
+
480
+ awaitNoMoreRunningOperations ();
481
+ assertSnapshotSuccessful (snapshotSecond );
482
+
483
+ // stats should have 1 successful and 1 failed snapshot, the deleted snapshot is inferred failure
484
+ assertBusy (() -> {
485
+ SnapshotLifecyclePolicyItem policy = client ().execute (
486
+ GetSnapshotLifecycleAction .INSTANCE ,
487
+ new GetSnapshotLifecycleAction .Request (TEST_REQUEST_TIMEOUT , TEST_REQUEST_TIMEOUT , policyName )
488
+ ).get ().getPolicies ().getFirst ();
489
+ assertNotNull (policy .getLastSuccess ());
490
+ assertEquals (snapshotSecond , policy .getLastSuccess ().getSnapshotName ());
491
+ assertNotNull (policy .getLastFailure ());
492
+ assertEquals (snapshotName , policy .getLastFailure ().getSnapshotName ());
493
+ assertEquals (1 , policy .getPolicyStats ().getSnapshotFailedCount ());
494
+ assertEquals (1 , policy .getPolicyStats ().getSnapshotTakenCount ());
495
+ });
496
+ }
497
+
498
+ private void assertSnapshotSuccessful (String snapshotName ) throws Exception {
499
+ assertBusy (() -> {
500
+ final SnapshotInfo snapshotInfo ;
501
+ try {
502
+ GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin ().prepareGetSnapshots (TEST_REQUEST_TIMEOUT , REPO )
503
+ .setSnapshots (snapshotName )
504
+ .get ();
505
+ snapshotInfo = snapshotsStatusResponse .getSnapshots ().get (0 );
506
+ } catch (SnapshotMissingException sme ) {
507
+ throw new AssertionError (sme );
508
+ }
509
+ assertEquals (SnapshotState .SUCCESS , snapshotInfo .state ());
510
+ });
511
+ }
512
+
332
513
private void testUnsuccessfulSnapshotRetention (boolean partialSuccess ) throws Exception {
333
514
final String indexName = "test-idx" ;
334
515
final String policyId = "test-policy" ;
0 commit comments