Skip to content

Commit 840cd2a

Browse files
authored
Fix built-in roles sync losing updates (elastic#142433) (elastic#142892)
Built-in role sync requests arriving while another sync was in progress were silently discarded. This caused roles to go missing when cluster change events fired in quick succession during bootstrap. Replace the synchronizationInProgress AtomicBoolean with a lock-free AtomicInteger state machine (RolesSync) that tracks idle, syncing, and syncing_pending states. When a sync completes and updates are pending, it automatically retries with the latest roles.
1 parent b5ee69d commit 840cd2a

File tree

4 files changed

+164
-23
lines changed

4 files changed

+164
-23
lines changed

docs/changelog/142433.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
area: Security
2+
issues: []
3+
pr: 142433
4+
summary: Fix built-in roles sync to retry on lock contention instead of silently discarding
5+
pending updates
6+
type: bug

x-pack/plugin/security/src/main/java/org/elasticsearch/xpack/security/support/QueryableBuiltInRolesSynchronizer.java

Lines changed: 79 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@
5454
import java.util.Objects;
5555
import java.util.Set;
5656
import java.util.concurrent.Executor;
57-
import java.util.concurrent.atomic.AtomicBoolean;
5857
import java.util.concurrent.atomic.AtomicInteger;
5958

6059
import static java.util.stream.Collectors.toMap;
@@ -130,7 +129,7 @@ public void taskSucceeded(MarkRolesAsSyncedTask task, Map<String, String> value)
130129
private final QueryableBuiltInRoles.Provider rolesProvider;
131130
private final NativeRolesStore nativeRolesStore;
132131
private final Executor executor;
133-
private final AtomicBoolean synchronizationInProgress = new AtomicBoolean(false);
132+
private final RolesSync sync = new RolesSync();
134133

135134
private volatile boolean securityIndexDeleted = false;
136135

@@ -217,33 +216,91 @@ public void clusterChanged(ClusterChangedEvent event) {
217216
* @return {@code true} if the synchronization of built-in roles is in progress, {@code false} otherwise
218217
*/
219218
public boolean isSynchronizationInProgress() {
220-
return synchronizationInProgress.get();
219+
return sync.inProgress();
221220
}
222221

223222
private void syncBuiltInRoles(final QueryableBuiltInRoles roles) {
224-
if (synchronizationInProgress.compareAndSet(false, true)) {
225-
try {
226-
final Map<String, String> indexedRolesDigests = readIndexedBuiltInRolesDigests(clusterService.state());
227-
if (roles.rolesDigest().equals(indexedRolesDigests)) {
228-
logger.debug("Security index already contains the latest built-in roles indexed, skipping roles synchronization");
223+
if (sync.startSync()) {
224+
doSyncBuiltInRoles(roles);
225+
}
226+
}
227+
228+
private void doSyncBuiltInRoles(final QueryableBuiltInRoles roles) {
229+
try {
230+
final Map<String, String> indexedRolesDigests = readIndexedBuiltInRolesDigests(clusterService.state());
231+
if (roles.rolesDigest().equals(indexedRolesDigests)) {
232+
logger.debug("Security index already contains the latest built-in roles indexed, skipping roles synchronization");
233+
resetFailedSyncAttempts();
234+
endSyncAndRetryIfNeeded();
235+
} else {
236+
executor.execute(() -> applyRoleChanges(indexedRolesDigests, roles, ActionListener.wrap(v -> {
237+
logger.info("Successfully synced [{}] built-in roles to .security index", roles.roleDescriptors().size());
229238
resetFailedSyncAttempts();
230-
synchronizationInProgress.set(false);
239+
endSyncAndRetryIfNeeded();
240+
}, e -> {
241+
handleException(e);
242+
endSyncAndRetryIfNeeded();
243+
})));
244+
}
245+
} catch (Exception e) {
246+
logger.error("Failed to sync built-in roles", e);
247+
failedSyncAttempts.incrementAndGet();
248+
endSyncAndRetryIfNeeded();
249+
}
250+
}
251+
252+
private void endSyncAndRetryIfNeeded() {
253+
if (sync.endSync()) {
254+
boolean shouldRetry = false;
255+
try {
256+
shouldRetry = shouldSyncBuiltInRoles(clusterService.state());
257+
} catch (Exception e) {
258+
logger.warn("Failed to evaluate retry conditions for built-in roles synchronization", e);
259+
}
260+
if (shouldRetry) {
261+
logger.debug("Retrying synchronization of built-in roles due to pending changes");
262+
doSyncBuiltInRoles(rolesProvider.getRoles());
263+
} else {
264+
endSyncAndRetryIfNeeded();
265+
}
266+
}
267+
}
268+
269+
static class RolesSync {
270+
private static final int IDLE = 0;
271+
private static final int RUNNING = 1;
272+
private static final int RUNNING_PENDING = 2;
273+
274+
private final AtomicInteger state = new AtomicInteger(IDLE);
275+
276+
boolean startSync() {
277+
while (true) {
278+
int s = state.get();
279+
if (s == IDLE) {
280+
if (state.compareAndSet(IDLE, RUNNING)) return true;
281+
} else if (s == RUNNING) {
282+
if (state.compareAndSet(RUNNING, RUNNING_PENDING)) return false;
231283
} else {
232-
executor.execute(() -> doSyncBuiltinRoles(indexedRolesDigests, roles, ActionListener.wrap(v -> {
233-
logger.info("Successfully synced [{}] built-in roles to .security index", roles.roleDescriptors().size());
234-
resetFailedSyncAttempts();
235-
synchronizationInProgress.set(false);
236-
}, e -> {
237-
handleException(e);
238-
synchronizationInProgress.set(false);
239-
})));
284+
return false; // already RUNNING_PENDING
240285
}
241-
} catch (Exception e) {
242-
logger.error("Failed to sync built-in roles", e);
243-
failedSyncAttempts.incrementAndGet();
244-
synchronizationInProgress.set(false);
245286
}
246287
}
288+
289+
boolean endSync() {
290+
while (true) {
291+
int s = state.get();
292+
assert s != IDLE : "endSync should only be called when a sync is in progress";
293+
if (s == RUNNING_PENDING) {
294+
if (state.compareAndSet(RUNNING_PENDING, RUNNING)) return true;
295+
} else {
296+
if (state.compareAndSet(RUNNING, IDLE)) return false;
297+
}
298+
}
299+
}
300+
301+
boolean inProgress() {
302+
return state.get() != IDLE;
303+
}
247304
}
248305

249306
private void handleException(Exception e) {
@@ -371,7 +428,7 @@ private boolean shouldSyncBuiltInRoles(final ClusterState state) {
371428
return true;
372429
}
373430

374-
private void doSyncBuiltinRoles(
431+
private void applyRoleChanges(
375432
final Map<String, String> indexedRolesDigests,
376433
final QueryableBuiltInRoles roles,
377434
final ActionListener<Void> listener

x-pack/plugin/security/src/test/java/org/elasticsearch/test/SecurityIntegTestCase.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
package org.elasticsearch.test;
88

99
import org.elasticsearch.ResourceAlreadyExistsException;
10+
import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
11+
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
1012
import org.elasticsearch.action.admin.cluster.node.info.NodeInfo;
1113
import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
1214
import org.elasticsearch.action.admin.cluster.node.info.PluginsAndModules;
@@ -53,6 +55,7 @@
5355
import static org.elasticsearch.xpack.core.security.authc.support.UsernamePasswordToken.basicAuthHeaderValue;
5456
import static org.elasticsearch.xpack.security.support.SecuritySystemIndices.SECURITY_MAIN_ALIAS;
5557
import static org.hamcrest.Matchers.hasItem;
58+
import static org.hamcrest.Matchers.is;
5659

5760
/**
5861
* Base class to run tests against a cluster with X-Pack installed and security enabled.
@@ -444,7 +447,12 @@ protected void createSecurityIndexWithWaitForActiveShards() {
444447
try {
445448
client.admin().indices().create(createIndexRequest).actionGet();
446449
} catch (ResourceAlreadyExistsException e) {
447-
logger.info("Security index already exists, ignoring.", e);
450+
logger.info("Security index already exists, waiting for it to become available.", e);
451+
ClusterHealthRequest healthRequest = new ClusterHealthRequest(TEST_REQUEST_TIMEOUT, SECURITY_MAIN_ALIAS).waitForActiveShards(
452+
ActiveShardCount.ALL
453+
);
454+
ClusterHealthResponse healthResponse = client.admin().cluster().health(healthRequest).actionGet();
455+
assertThat(healthResponse.isTimedOut(), is(false));
448456
}
449457
}
450458

x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/support/QueryableBuiltInRolesSynchronizerTests.java

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,76 @@ private void mockNativeRolesStoreWithFailure(
683683
.deleteRoles(eq(rolesToDelete), eq(WriteRequest.RefreshPolicy.IMMEDIATE), eq(false), any(ActionListener.class));
684684
}
685685

686+
public void testRolesSyncBasicAcquireRelease() {
687+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
688+
assertFalse(sync.inProgress());
689+
690+
assertTrue(sync.startSync());
691+
assertTrue(sync.inProgress());
692+
693+
assertFalse(sync.endSync());
694+
assertFalse(sync.inProgress());
695+
}
696+
697+
public void testRolesSyncContentionSetsPending() {
698+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
699+
assertTrue(sync.startSync());
700+
701+
// Second caller is rejected but marks pending
702+
assertFalse(sync.startSync());
703+
assertTrue(sync.inProgress());
704+
705+
// endSync sees pending, clears it, keeps lock held
706+
assertTrue(sync.endSync());
707+
assertTrue(sync.inProgress());
708+
709+
// No more pending — release
710+
assertFalse(sync.endSync());
711+
assertFalse(sync.inProgress());
712+
}
713+
714+
public void testRolesSyncMultiplePendingCoalesce() {
715+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
716+
assertTrue(sync.startSync());
717+
718+
// Multiple concurrent arrivals all set pending, but only one retry results
719+
assertFalse(sync.startSync());
720+
assertFalse(sync.startSync());
721+
assertFalse(sync.startSync());
722+
723+
assertTrue(sync.endSync()); // one retry
724+
assertFalse(sync.endSync()); // done
725+
assertFalse(sync.inProgress());
726+
}
727+
728+
public void testRolesSyncPendingDuringRetry() {
729+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
730+
assertTrue(sync.startSync());
731+
assertFalse(sync.startSync()); // pending
732+
733+
assertTrue(sync.endSync()); // retry (lock still held)
734+
735+
// New arrival during retry
736+
assertFalse(sync.startSync());
737+
738+
assertTrue(sync.endSync()); // second retry
739+
assertFalse(sync.endSync()); // done
740+
assertFalse(sync.inProgress());
741+
}
742+
743+
public void testRolesSyncReacquireAfterFullRelease() {
744+
var sync = new QueryableBuiltInRolesSynchronizer.RolesSync();
745+
assertTrue(sync.startSync());
746+
assertFalse(sync.endSync());
747+
assertFalse(sync.inProgress());
748+
749+
// Can acquire again after full release
750+
assertTrue(sync.startSync());
751+
assertTrue(sync.inProgress());
752+
assertFalse(sync.endSync());
753+
assertFalse(sync.inProgress());
754+
}
755+
686756
private static ClusterState.Builder createClusterStateWithOpenSecurityIndex() {
687757
return SecurityIndexManagerTests.createClusterState(
688758
TestRestrictedIndices.INTERNAL_SECURITY_MAIN_INDEX_7,

0 commit comments

Comments
 (0)