Skip to content

Commit 0dd66e5

Browse files
authored
Fix built-in roles sync losing updates (#142433) (#142891)
Built-in role sync requests arriving while another sync was in progress were silently discarded. This caused roles to go missing when cluster change events fired in quick succession during bootstrap. Replace the synchronizationInProgress AtomicBoolean with a lock-free AtomicInteger state machine (RolesSync) that tracks idle, syncing, and syncing_pending states. When a sync completes and updates are pending, it automatically retries with the latest roles.
1 parent 2da7492 commit 0dd66e5

File tree

4 files changed

+164
-23
lines changed

4 files changed

+164
-23
lines changed

docs/changelog/142433.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
area: Security
2+
issues: []
3+
pr: 142433
4+
summary: Fix built-in roles sync to retry on lock contention instead of silently discarding
5+
pending updates
6+
type: bug

x-pack/plugin/security/src/main/java/org/elasticsearch/xpack/security/support/QueryableBuiltInRolesSynchronizer.java

Lines changed: 79 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@
5252
import java.util.Objects;
5353
import java.util.Set;
5454
import java.util.concurrent.Executor;
55-
import java.util.concurrent.atomic.AtomicBoolean;
5655
import java.util.concurrent.atomic.AtomicInteger;
5756

5857
import static java.util.stream.Collectors.toMap;
@@ -128,7 +127,7 @@ public void taskSucceeded(MarkRolesAsSyncedTask task, Map<String, String> value)
128127
private final QueryableBuiltInRoles.Provider rolesProvider;
129128
private final NativeRolesStore nativeRolesStore;
130129
private final Executor executor;
131-
private final AtomicBoolean synchronizationInProgress = new AtomicBoolean(false);
130+
private final RolesSync sync = new RolesSync();
132131

133132
private volatile boolean securityIndexDeleted = false;
134133

@@ -215,33 +214,91 @@ public void clusterChanged(ClusterChangedEvent event) {
215214
* @return {@code true} if the synchronization of built-in roles is in progress, {@code false} otherwise
216215
*/
217216
public boolean isSynchronizationInProgress() {
218-
return synchronizationInProgress.get();
217+
return sync.inProgress();
219218
}
220219

221220
private void syncBuiltInRoles(final QueryableBuiltInRoles roles) {
222-
if (synchronizationInProgress.compareAndSet(false, true)) {
223-
try {
224-
final Map<String, String> indexedRolesDigests = readIndexedBuiltInRolesDigests(clusterService.state());
225-
if (roles.rolesDigest().equals(indexedRolesDigests)) {
226-
logger.debug("Security index already contains the latest built-in roles indexed, skipping roles synchronization");
221+
if (sync.startSync()) {
222+
doSyncBuiltInRoles(roles);
223+
}
224+
}
225+
226+
private void doSyncBuiltInRoles(final QueryableBuiltInRoles roles) {
227+
try {
228+
final Map<String, String> indexedRolesDigests = readIndexedBuiltInRolesDigests(clusterService.state());
229+
if (roles.rolesDigest().equals(indexedRolesDigests)) {
230+
logger.debug("Security index already contains the latest built-in roles indexed, skipping roles synchronization");
231+
resetFailedSyncAttempts();
232+
endSyncAndRetryIfNeeded();
233+
} else {
234+
executor.execute(() -> applyRoleChanges(indexedRolesDigests, roles, ActionListener.wrap(v -> {
235+
logger.info("Successfully synced [{}] built-in roles to .security index", roles.roleDescriptors().size());
227236
resetFailedSyncAttempts();
228-
synchronizationInProgress.set(false);
237+
endSyncAndRetryIfNeeded();
238+
}, e -> {
239+
handleException(e);
240+
endSyncAndRetryIfNeeded();
241+
})));
242+
}
243+
} catch (Exception e) {
244+
logger.error("Failed to sync built-in roles", e);
245+
failedSyncAttempts.incrementAndGet();
246+
endSyncAndRetryIfNeeded();
247+
}
248+
}
249+
250+
private void endSyncAndRetryIfNeeded() {
251+
if (sync.endSync()) {
252+
boolean shouldRetry = false;
253+
try {
254+
shouldRetry = shouldSyncBuiltInRoles(clusterService.state());
255+
} catch (Exception e) {
256+
logger.warn("Failed to evaluate retry conditions for built-in roles synchronization", e);
257+
}
258+
if (shouldRetry) {
259+
logger.debug("Retrying synchronization of built-in roles due to pending changes");
260+
doSyncBuiltInRoles(rolesProvider.getRoles());
261+
} else {
262+
endSyncAndRetryIfNeeded();
263+
}
264+
}
265+
}
266+
267+
static class RolesSync {
268+
private static final int IDLE = 0;
269+
private static final int RUNNING = 1;
270+
private static final int RUNNING_PENDING = 2;
271+
272+
private final AtomicInteger state = new AtomicInteger(IDLE);
273+
274+
boolean startSync() {
275+
while (true) {
276+
int s = state.get();
277+
if (s == IDLE) {
278+
if (state.compareAndSet(IDLE, RUNNING)) return true;
279+
} else if (s == RUNNING) {
280+
if (state.compareAndSet(RUNNING, RUNNING_PENDING)) return false;
229281
} else {
230-
executor.execute(() -> doSyncBuiltinRoles(indexedRolesDigests, roles, ActionListener.wrap(v -> {
231-
logger.info("Successfully synced [{}] built-in roles to .security index", roles.roleDescriptors().size());
232-
resetFailedSyncAttempts();
233-
synchronizationInProgress.set(false);
234-
}, e -> {
235-
handleException(e);
236-
synchronizationInProgress.set(false);
237-
})));
282+
return false; // already RUNNING_PENDING
238283
}
239-
} catch (Exception e) {
240-
logger.error("Failed to sync built-in roles", e);
241-
failedSyncAttempts.incrementAndGet();
242-
synchronizationInProgress.set(false);
243284
}
244285
}
286+
287+
boolean endSync() {
288+
while (true) {
289+
int s = state.get();
290+
assert s != IDLE : "endSync should only be called when a sync is in progress";
291+
if (s == RUNNING_PENDING) {
292+
if (state.compareAndSet(RUNNING_PENDING, RUNNING)) return true;
293+
} else {
294+
if (state.compareAndSet(RUNNING, IDLE)) return false;
295+
}
296+
}
297+
}
298+
299+
boolean inProgress() {
300+
return state.get() != IDLE;
301+
}
245302
}
246303

247304
private void handleException(Exception e) {
@@ -356,7 +413,7 @@ private boolean shouldSyncBuiltInRoles(final ClusterState state) {
356413
return true;
357414
}
358415

359-
private void doSyncBuiltinRoles(
416+
private void applyRoleChanges(
360417
final Map<String, String> indexedRolesDigests,
361418
final QueryableBuiltInRoles roles,
362419
final ActionListener<Void> listener

x-pack/plugin/security/src/test/java/org/elasticsearch/test/SecurityIntegTestCase.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
package org.elasticsearch.test;
88

99
import org.elasticsearch.ResourceAlreadyExistsException;
10+
import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
11+
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
1012
import org.elasticsearch.action.admin.cluster.node.info.NodeInfo;
1113
import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
1214
import org.elasticsearch.action.admin.cluster.node.info.PluginsAndModules;
@@ -53,6 +55,7 @@
5355
import static org.elasticsearch.xpack.core.security.authc.support.UsernamePasswordToken.basicAuthHeaderValue;
5456
import static org.elasticsearch.xpack.security.support.SecuritySystemIndices.SECURITY_MAIN_ALIAS;
5557
import static org.hamcrest.Matchers.hasItem;
58+
import static org.hamcrest.Matchers.is;
5659

5760
/**
5861
* Base class to run tests against a cluster with X-Pack installed and security enabled.
@@ -446,7 +449,12 @@ protected void createSecurityIndexWithWaitForActiveShards() {
446449
try {
447450
client.admin().indices().create(createIndexRequest).actionGet();
448451
} catch (ResourceAlreadyExistsException e) {
449-
logger.info("Security index already exists, ignoring.", e);
452+
logger.info("Security index already exists, waiting for it to become available.", e);
453+
ClusterHealthRequest healthRequest = new ClusterHealthRequest(TEST_REQUEST_TIMEOUT, SECURITY_MAIN_ALIAS).waitForActiveShards(
454+
ActiveShardCount.ALL
455+
);
456+
ClusterHealthResponse healthResponse = client.admin().cluster().health(healthRequest).actionGet();
457+
assertThat(healthResponse.isTimedOut(), is(false));
450458
}
451459
}
452460

x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/support/QueryableBuiltInRolesSynchronizerTests.java

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,76 @@ private void mockNativeRolesStoreWithFailure(
692692
.deleteRoles(eq(rolesToDelete), eq(WriteRequest.RefreshPolicy.IMMEDIATE), eq(false), any(ActionListener.class));
693693
}
694694

695+
public void testRolesSyncBasicAcquireRelease() {
696+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
697+
assertFalse(sync.inProgress());
698+
699+
assertTrue(sync.startSync());
700+
assertTrue(sync.inProgress());
701+
702+
assertFalse(sync.endSync());
703+
assertFalse(sync.inProgress());
704+
}
705+
706+
public void testRolesSyncContentionSetsPending() {
707+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
708+
assertTrue(sync.startSync());
709+
710+
// Second caller is rejected but marks pending
711+
assertFalse(sync.startSync());
712+
assertTrue(sync.inProgress());
713+
714+
// endSync sees pending, clears it, keeps lock held
715+
assertTrue(sync.endSync());
716+
assertTrue(sync.inProgress());
717+
718+
// No more pending — release
719+
assertFalse(sync.endSync());
720+
assertFalse(sync.inProgress());
721+
}
722+
723+
public void testRolesSyncMultiplePendingCoalesce() {
724+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
725+
assertTrue(sync.startSync());
726+
727+
// Multiple concurrent arrivals all set pending, but only one retry results
728+
assertFalse(sync.startSync());
729+
assertFalse(sync.startSync());
730+
assertFalse(sync.startSync());
731+
732+
assertTrue(sync.endSync()); // one retry
733+
assertFalse(sync.endSync()); // done
734+
assertFalse(sync.inProgress());
735+
}
736+
737+
public void testRolesSyncPendingDuringRetry() {
738+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
739+
assertTrue(sync.startSync());
740+
assertFalse(sync.startSync()); // pending
741+
742+
assertTrue(sync.endSync()); // retry (lock still held)
743+
744+
// New arrival during retry
745+
assertFalse(sync.startSync());
746+
747+
assertTrue(sync.endSync()); // second retry
748+
assertFalse(sync.endSync()); // done
749+
assertFalse(sync.inProgress());
750+
}
751+
752+
public void testRolesSyncReacquireAfterFullRelease() {
753+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
754+
assertTrue(sync.startSync());
755+
assertFalse(sync.endSync());
756+
assertFalse(sync.inProgress());
757+
758+
// Can acquire again after full release
759+
assertTrue(sync.startSync());
760+
assertTrue(sync.inProgress());
761+
assertFalse(sync.endSync());
762+
assertFalse(sync.inProgress());
763+
}
764+
695765
private static ClusterState.Builder createClusterStateWithOpenSecurityIndex() {
696766
return createClusterState(
697767
TestRestrictedIndices.INTERNAL_SECURITY_MAIN_INDEX_7,

0 commit comments

Comments
 (0)