From 6fa041c59232c128fc10c6e0155ff483238b5e93 Mon Sep 17 00:00:00 2001 From: John Verwolf Date: Tue, 11 Feb 2025 14:58:01 -0800 Subject: [PATCH 1/6] Retry on createIndex failure --- docs/changelog/122326.yaml | 5 + .../migration/FeatureMigrationIT.java | 1 - .../upgrades/SystemIndexMigrator.java | 97 +++++++++---------- 3 files changed, 52 insertions(+), 51 deletions(-) create mode 100644 docs/changelog/122326.yaml diff --git a/docs/changelog/122326.yaml b/docs/changelog/122326.yaml new file mode 100644 index 0000000000000..91c71041d58fc --- /dev/null +++ b/docs/changelog/122326.yaml @@ -0,0 +1,5 @@ +pr: 122326 +summary: System Index Migration Failure Results in a Non-Recoverable State +area: Infra/Core +type: bug +issues: [] diff --git a/modules/reindex/src/internalClusterTest/java/org/elasticsearch/migration/FeatureMigrationIT.java b/modules/reindex/src/internalClusterTest/java/org/elasticsearch/migration/FeatureMigrationIT.java index ee95ce5513820..efca437d14eb4 100644 --- a/modules/reindex/src/internalClusterTest/java/org/elasticsearch/migration/FeatureMigrationIT.java +++ b/modules/reindex/src/internalClusterTest/java/org/elasticsearch/migration/FeatureMigrationIT.java @@ -277,7 +277,6 @@ public void testMigrateIndexWithWriteBlock() throws Exception { }); } - @AwaitsFix(bugUrl = "ES-10666") // This test uncovered an existing issue public void testIndexBlockIsRemovedWhenAliasRequestFails() throws Exception { createSystemIndexForDescriptor(INTERNAL_UNMANAGED); ensureGreen(); diff --git a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java index cdd466c567e8b..094c64067e394 100644 --- a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java +++ b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java @@ -207,49 +207,14 @@ public void run(SystemIndexMigrationTaskState taskState) { } // Kick off our callback "loop" - finishIndexAndLoop calls back into prepareNextIndex - cleanUpPreviousMigration( - taskState, - clusterState, - state -> prepareNextIndex(state, state2 -> migrateSingleIndex(state2, this::finishIndexAndLoop), stateFeatureName) - ); - } - - private void cleanUpPreviousMigration( - SystemIndexMigrationTaskState taskState, - ClusterState currentState, - Consumer listener - ) { logger.debug("cleaning up previous migration, task state: [{}]", taskState == null ? "null" : Strings.toString(taskState)); - if (taskState != null && taskState.getCurrentIndex() != null) { - SystemIndexMigrationInfo migrationInfo; - try { - migrationInfo = SystemIndexMigrationInfo.fromTaskState( - taskState, - systemIndices, - currentState.metadata(), - indexScopedSettings - ); - } catch (Exception e) { - markAsFailed(e); - return; - } - final String newIndexName = migrationInfo.getNextIndexName(); - logger.info("removing index [{}] from previous incomplete migration", newIndexName); - - migrationInfo.createClient(baseClient) - .admin() - .indices() - .prepareDelete(newIndexName) - .execute(ActionListener.wrap(ackedResponse -> { - if (ackedResponse.isAcknowledged()) { - logger.debug("successfully removed index [{}]", newIndexName); - clearResults(clusterService, ActionListener.wrap(listener::accept, this::markAsFailed)); - } - }, this::markAsFailed)); - } else { - logger.debug("no incomplete index to remove"); - clearResults(clusterService, ActionListener.wrap(listener::accept, this::markAsFailed)); - } + clearResults( + clusterService, + ActionListener.wrap( + state -> prepareNextIndex(state2 -> migrateSingleIndex(state2, this::finishIndexAndLoop), stateFeatureName), + this::markAsFailed + ) + ); } private void finishIndexAndLoop(BulkByScrollResponse bulkResponse) { @@ -289,11 +254,7 @@ private void finishIndexAndLoop(BulkByScrollResponse bulkResponse) { }, this::markAsFailed) ); } else { - prepareNextIndex( - clusterService.state(), - state2 -> migrateSingleIndex(state2, this::finishIndexAndLoop), - lastMigrationInfo.getFeatureName() - ); + prepareNextIndex(state2 -> migrateSingleIndex(state2, this::finishIndexAndLoop), lastMigrationInfo.getFeatureName()); } } @@ -303,7 +264,6 @@ private void recordIndexMigrationSuccess(SystemIndexMigrationInfo lastMigrationI SingleFeatureMigrationResult.success(), ActionListener.wrap(state -> { prepareNextIndex( - state, clusterState -> migrateSingleIndex(clusterState, this::finishIndexAndLoop), lastMigrationInfo.getFeatureName() ); @@ -312,7 +272,7 @@ private void recordIndexMigrationSuccess(SystemIndexMigrationInfo lastMigrationI updateTask.submit(clusterService); } - private void prepareNextIndex(ClusterState clusterState, Consumer listener, String lastFeatureName) { + private void prepareNextIndex(Consumer listener, String lastFeatureName) { synchronized (migrationQueue) { assert migrationQueue != null; if (migrationQueue.isEmpty()) { @@ -424,7 +384,7 @@ private void migrateSingleIndex(ClusterState clusterState, Consumer innerListener = ActionListener.wrap(listener::accept, this::markAsFailed); try { - createIndex(migrationInfo, innerListener.delegateFailureAndWrap((delegate, shardsAcknowledgedResponse) -> { + createIndexRetryOnFailure(migrationInfo, innerListener.delegateFailureAndWrap((delegate, shardsAcknowledgedResponse) -> { logger.debug( "while migrating [{}] , got create index response: [{}]", oldIndexName, @@ -508,7 +468,22 @@ private void migrateSingleIndex(ClusterState clusterState, Consumer innerListener = ActionListener.wrap(listener::accept, this::markAsFailed); + private void retryAfterFailureHandler( + ActionListener listener, + Consumer> retryableAction, + Consumer failureHandler + ) { + retryableAction.accept(ActionListener.wrap(listener::onResponse, e -> { + logger.error("error occurred while executing retryable action", e); + failureHandler.accept(e); + listener.onFailure(e); + })); + } + private void createIndex(SystemIndexMigrationInfo migrationInfo, ActionListener listener) { + logger.info("creating new system index [{}] from feature [{}]", migrationInfo.getNextIndexName(), migrationInfo.getFeatureName()); + final CreateIndexClusterStateUpdateRequest createRequest = new CreateIndexClusterStateUpdateRequest( "migrate-system-index", migrationInfo.getNextIndexName(), @@ -534,6 +509,27 @@ private void createIndex(SystemIndexMigrationInfo migrationInfo, ActionListener< ); } + private void createIndexRetryOnFailure(SystemIndexMigrationInfo migrationInfo, ActionListener listener) { + createIndex(migrationInfo, ActionListener.wrap(listener::onResponse, e -> { + logger.warn("createIndex failed, retrying after removing index [{}] from previous attempt", migrationInfo.getNextIndexName()); + deleteIndex(migrationInfo, ActionListener.wrap(cleanupResponse -> createIndex(migrationInfo, listener), e2 -> { + logger.warn("createIndex failed after retrying, aborting", e2); + listener.onFailure(e2); + })); + })); + } + + private void deleteIndex(SystemIndexMigrationInfo migrationInfo, ActionListener listener) { + logger.info("removing index [{}] from feature [{}]", migrationInfo.getNextIndexName(), migrationInfo.getFeatureName()); + String newIndexName = migrationInfo.getNextIndexName(); + baseClient.admin().indices().prepareDelete(newIndexName).execute(ActionListener.wrap(ackedResponse -> { + if (ackedResponse.isAcknowledged()) { + logger.info("successfully removed index [{}]", newIndexName); + listener.onResponse(ackedResponse); + } + }, listener::onFailure)); + } + private void setAliasAndRemoveOldIndex(SystemIndexMigrationInfo migrationInfo, ActionListener listener) { final IndicesAliasesRequestBuilder aliasesRequest = migrationInfo.createClient(baseClient).admin().indices().prepareAliases(); aliasesRequest.removeIndex(migrationInfo.getCurrentIndexName()); @@ -639,6 +635,7 @@ public void markAsFailed(Exception e) { String indexName = Optional.ofNullable(migrationInfo).map(SystemIndexMigrationInfo::getCurrentIndexName).orElse(""); MigrationResultsUpdateTask.upsert( + // this is for the Custom Metadata, not the Persistent Task featureName, SingleFeatureMigrationResult.failure(indexName, e), ActionListener.wrap(state -> super.markAsFailed(e), exception -> super.markAsFailed(e)) From abf0f97cc40b63b46cd1dc06ec32c1351e8e7ee0 Mon Sep 17 00:00:00 2001 From: John Verwolf Date: Tue, 11 Feb 2025 15:42:48 -0800 Subject: [PATCH 2/6] Remove comment --- .../java/org/elasticsearch/upgrades/SystemIndexMigrator.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java index 094c64067e394..86a2da9252ee2 100644 --- a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java +++ b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java @@ -635,7 +635,6 @@ public void markAsFailed(Exception e) { String indexName = Optional.ofNullable(migrationInfo).map(SystemIndexMigrationInfo::getCurrentIndexName).orElse(""); MigrationResultsUpdateTask.upsert( - // this is for the Custom Metadata, not the Persistent Task featureName, SingleFeatureMigrationResult.failure(indexName, e), ActionListener.wrap(state -> super.markAsFailed(e), exception -> super.markAsFailed(e)) From aa2b112bb4fc68d378d97367a1d6afad58e6f329 Mon Sep 17 00:00:00 2001 From: John Verwolf Date: Wed, 12 Feb 2025 07:23:14 -0800 Subject: [PATCH 3/6] remove dead code --- .../elasticsearch/upgrades/SystemIndexMigrator.java | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java index 86a2da9252ee2..404a27bd87af9 100644 --- a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java +++ b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java @@ -468,19 +468,6 @@ private void migrateSingleIndex(ClusterState clusterState, Consumer innerListener = ActionListener.wrap(listener::accept, this::markAsFailed); - private void retryAfterFailureHandler( - ActionListener listener, - Consumer> retryableAction, - Consumer failureHandler - ) { - retryableAction.accept(ActionListener.wrap(listener::onResponse, e -> { - logger.error("error occurred while executing retryable action", e); - failureHandler.accept(e); - listener.onFailure(e); - })); - } - private void createIndex(SystemIndexMigrationInfo migrationInfo, ActionListener listener) { logger.info("creating new system index [{}] from feature [{}]", migrationInfo.getNextIndexName(), migrationInfo.getFeatureName()); From 98cabbd5025740b6bb5b45b6fb4583f66c583c4b Mon Sep 17 00:00:00 2001 From: John Verwolf Date: Wed, 12 Feb 2025 07:42:35 -0800 Subject: [PATCH 4/6] Add additional failure branch --- .../java/org/elasticsearch/upgrades/SystemIndexMigrator.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java index 404a27bd87af9..1dcf165fbcc19 100644 --- a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java +++ b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java @@ -513,6 +513,8 @@ private void deleteIndex(SystemIndexMigrationInfo migrationInfo, ActionListe if (ackedResponse.isAcknowledged()) { logger.info("successfully removed index [{}]", newIndexName); listener.onResponse(ackedResponse); + } else { + listener.onFailure(new ElasticsearchException("Failed to acknowledge index deletion for [" + newIndexName + "]")); } }, listener::onFailure)); } From ed44d0dd27dc0003bc1f092666bf1edbbd95446f Mon Sep 17 00:00:00 2001 From: John Verwolf Date: Wed, 12 Feb 2025 13:56:47 -0800 Subject: [PATCH 5/6] PR feedback: Fix log line --- .../upgrades/SystemIndexMigrator.java | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java index 1dcf165fbcc19..2283de76ba07e 100644 --- a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java +++ b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java @@ -497,11 +497,18 @@ private void createIndex(SystemIndexMigrationInfo migrationInfo, ActionListener< } private void createIndexRetryOnFailure(SystemIndexMigrationInfo migrationInfo, ActionListener listener) { - createIndex(migrationInfo, ActionListener.wrap(listener::onResponse, e -> { + createIndex(migrationInfo, listener.delegateResponse((l, e) -> { logger.warn("createIndex failed, retrying after removing index [{}] from previous attempt", migrationInfo.getNextIndexName()); - deleteIndex(migrationInfo, ActionListener.wrap(cleanupResponse -> createIndex(migrationInfo, listener), e2 -> { - logger.warn("createIndex failed after retrying, aborting", e2); - listener.onFailure(e2); + deleteIndex(migrationInfo, ActionListener.wrap(cleanupResponse -> createIndex(migrationInfo, l.delegateResponse((l3, e3) -> { + logger.error( + "createIndex failed after retrying, aborting; index [{}] will be left in an inconsistent state", + migrationInfo.getNextIndexName(), + e3 + ); + l.onFailure(e3); + })), e2 -> { + logger.error("deleteIndex failed after retrying, aborting", e2); + l.onFailure(e2); })); })); } From b1d8538e8737749420d74b7fc84fdb37dea2f0a7 Mon Sep 17 00:00:00 2001 From: John Verwolf Date: Thu, 13 Feb 2025 10:38:28 -0800 Subject: [PATCH 6/6] Fix logger usage --- .../java/org/elasticsearch/upgrades/SystemIndexMigrator.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java index 2283de76ba07e..9947606470178 100644 --- a/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java +++ b/server/src/main/java/org/elasticsearch/upgrades/SystemIndexMigrator.java @@ -501,13 +501,12 @@ private void createIndexRetryOnFailure(SystemIndexMigrationInfo migrationInfo, A logger.warn("createIndex failed, retrying after removing index [{}] from previous attempt", migrationInfo.getNextIndexName()); deleteIndex(migrationInfo, ActionListener.wrap(cleanupResponse -> createIndex(migrationInfo, l.delegateResponse((l3, e3) -> { logger.error( - "createIndex failed after retrying, aborting; index [{}] will be left in an inconsistent state", - migrationInfo.getNextIndexName(), + "createIndex failed after retrying, aborting system index migration. index: " + migrationInfo.getNextIndexName(), e3 ); l.onFailure(e3); })), e2 -> { - logger.error("deleteIndex failed after retrying, aborting", e2); + logger.error("deleteIndex failed, aborting system index migration. index: " + migrationInfo.getNextIndexName(), e2); l.onFailure(e2); })); }));