Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/135271.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 135271
summary: Add role stats to `_security/stats`
area: Authorization
type: enhancement
issues: []
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
9170000
2 changes: 1 addition & 1 deletion server/src/main/resources/transport/upper_bounds/9.2.csv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
inference_api_openai_embeddings_headers,9169000
roles_security_stats,9170000
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,74 @@

package org.elasticsearch.xpack.core.security.action.stats;

import org.elasticsearch.TransportVersion;
import org.elasticsearch.action.support.nodes.BaseNodeResponse;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.xcontent.ToXContentObject;
import org.elasticsearch.xcontent.XContentBuilder;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.Objects;

public class GetSecurityStatsNodeResponse extends BaseNodeResponse implements ToXContentObject {

private static final TransportVersion ROLES_SECURITY_STATS = TransportVersion.fromName("roles_security_stats");

@Nullable
private final Map<String, Object> rolesStoreStats;

public GetSecurityStatsNodeResponse(final StreamInput in) throws IOException {
super(in);
this.rolesStoreStats = in.getTransportVersion().supports(ROLES_SECURITY_STATS) ? in.readGenericMap() : null;
}

public GetSecurityStatsNodeResponse(final DiscoveryNode node) {
public GetSecurityStatsNodeResponse(final DiscoveryNode node, @Nullable final Map<String, Object> rolesStoreStats) {
super(node);
this.rolesStoreStats = rolesStoreStats;
}

@Override
public void writeTo(final StreamOutput out) throws IOException {
super.writeTo(out);
if (out.getTransportVersion().supports(ROLES_SECURITY_STATS)) {
out.writeGenericMap(rolesStoreStats);
}
}

@Override
public XContentBuilder toXContent(final XContentBuilder builder, final Params params) throws IOException {
if (rolesStoreStats != null) {
builder.field("roles", rolesStoreStats);
}
return builder;
}

@Override
public boolean equals(Object o) {
return this == o
|| (o instanceof GetSecurityStatsNodeResponse that
&& Objects.equals(getNode(), that.getNode())
&& Objects.equals(rolesStoreStats, that.rolesStoreStats));
}

@Override
public int hashCode() {
return Objects.hash(getNode(), rolesStoreStats);
}

// for testing
@Nullable
Map<String, Object> getRolesStoreStats() {
return rolesStoreStats == null ? null : Collections.unmodifiableMap(rolesStoreStats);
}

// for testing
DiscoveryNode getDiscoveryNode() {
return getNode();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.core.security.action.stats;

import org.elasticsearch.cluster.node.DiscoveryNodeUtils;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.test.AbstractWireSerializingTestCase;

import java.io.IOException;
import java.util.Map;

public class GetSecurityStatsNodeResponseTests extends AbstractWireSerializingTestCase<GetSecurityStatsNodeResponse> {

@Override
protected Writeable.Reader<GetSecurityStatsNodeResponse> instanceReader() {
return GetSecurityStatsNodeResponse::new;
}

@Override
protected GetSecurityStatsNodeResponse createTestInstance() {
return new GetSecurityStatsNodeResponse(
DiscoveryNodeUtils.create(randomUUID()),
randomBoolean() ? null : Map.of("key", randomUUID())
);
}

@Override
protected GetSecurityStatsNodeResponse mutateInstance(GetSecurityStatsNodeResponse instance) throws IOException {
return switch (randomIntBetween(0, 1)) {
case 0 -> new GetSecurityStatsNodeResponse(DiscoveryNodeUtils.create(randomUUID()), instance.getRolesStoreStats());
case 1 -> new GetSecurityStatsNodeResponse(instance.getDiscoveryNode(), Map.of("key", randomUUID()));
default -> throw new IllegalStateException("Unexpected value");
};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
*/
package org.elasticsearch.xpack.security.action.stats;

import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.FailedNodeException;
import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.nodes.TransportNodesAction;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.injection.guice.Inject;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.threadpool.ThreadPool;
Expand All @@ -21,9 +23,12 @@
import org.elasticsearch.xpack.core.security.action.stats.GetSecurityStatsNodeResponse;
import org.elasticsearch.xpack.core.security.action.stats.GetSecurityStatsNodesRequest;
import org.elasticsearch.xpack.core.security.action.stats.GetSecurityStatsNodesResponse;
import org.elasticsearch.xpack.security.authz.store.CompositeRolesStore;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;

public class TransportSecurityStatsAction extends TransportNodesAction<
GetSecurityStatsNodesRequest,
Expand All @@ -32,12 +37,16 @@ public class TransportSecurityStatsAction extends TransportNodesAction<
GetSecurityStatsNodeResponse,
Void> {

@Nullable
private final CompositeRolesStore rolesStore;

@Inject
public TransportSecurityStatsAction(
ThreadPool threadPool,
ClusterService clusterService,
TransportService transportService,
ActionFilters actionFilters
ActionFilters actionFilters,
CompositeRolesStore rolesStore
) {
super(
GetSecurityStatsAction.INSTANCE.name(),
Expand All @@ -47,6 +56,7 @@ public TransportSecurityStatsAction(
GetSecurityStatsNodeRequest::new,
threadPool.executor(ThreadPool.Names.MANAGEMENT)
);
this.rolesStore = rolesStore;
}

@Override
Expand All @@ -70,6 +80,12 @@ protected GetSecurityStatsNodeResponse newNodeResponse(final StreamInput in, fin

@Override
protected GetSecurityStatsNodeResponse nodeOperation(final GetSecurityStatsNodeRequest request, final Task task) {
return new GetSecurityStatsNodeResponse(clusterService.localNode());
final CompletableFuture<Map<String, Object>> rolesStatsFuture = new CompletableFuture<>();
if (rolesStore == null) {
rolesStatsFuture.complete(null);
} else {
rolesStore.usageStats(ActionListener.wrap(rolesStatsFuture::complete, rolesStatsFuture::completeExceptionally));
}
return new GetSecurityStatsNodeResponse(clusterService.localNode(), rolesStatsFuture.join());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will block a transport thread and NativeRolesStore.usageStats is potentially expensive.

Rather than using a future, we should override nodeOperationAsync instead and make use of the listener.

Copy link
Contributor Author

@szybia szybia Sep 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, unless i'm missing something, from my reading of the code and testing, nodeOperation doesn't run on transport threads, but the threadpool that's defined at the top (management thread pool):

    @Override
    protected GetSecurityStatsNodeResponse nodeOperation(final GetSecurityStatsNodeRequest request, final Task task) {
        if (Transports.isTransportThread(Thread.currentThread())) {
            LOG.info("szy: transport thread");
            throw new IllegalArgumentException("transport thread");
        } else {
            LOG.info("szy: not transport thread");
        }
[2025-09-24T13:48:33,922][INFO ][o.e.x.s.a.s.TransportSecurityStatsAction] [runTask-0] szy: not transport thread: elasticsearch[runTask-0][management][T#1]
[2025-09-24T13:48:33,925][INFO ][o.e.x.s.a.s.TransportSecurityStatsAction] [runTask-1] szy: not transport thread: elasticsearch[runTask-1][management][T#2]
[2025-09-24T13:48:33,924][INFO ][o.e.x.s.a.s.TransportSecurityStatsAction] [runTask-2] szy: not transport thread: elasticsearch[runTask-2][management][T#2]

i also think this is backed up by the fact that there are no overrides in ES of nodeOperationAsync, everything overrides nodeOperation, and assuming some of those do heavy things, and if they were in transport this would've been spotted

based on javadoc, my final understanding of nodeOperationAsync is that it's just to get access to the listener, maybe leading to cleaner code, but i've tried it out and don't see the hype over what i currently have:

    @Override
    protected GetSecurityStatsNodeResponse nodeOperation(final GetSecurityStatsNodeRequest request, final Task task) {
        return null; // never called because we override nodeOperationAsync
    }

    @Override
    protected void nodeOperationAsync(final GetSecurityStatsNodeRequest request,
                                      final Task task,
                                      final ActionListener<GetSecurityStatsNodeResponse> listener) {
        if (rolesStore == null) {
            listener.onResponse(new GetSecurityStatsNodeResponse(clusterService.localNode(), null));
        } else {
            rolesStore.usageStats(
                ActionListener.wrap(
                    stats -> listener.onResponse(new GetSecurityStatsNodeResponse(clusterService.localNode(), stats)),
                    listener::onFailure
                )
            );
        }
    }

thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right it will be a management thread not a transport thread, but we still end up blocking that thread while doing the work on another thread pool.

That risks contention on the management pool, potentially even with deadlocks if the management pool is completely full with threads that are waiting for work from other threads.

my final understanding of nodeOperationAsync is that it's just to get access to the listener, maybe leading to cleaner code

Not for cleaner code, but so you can perform asynchronous operations. You need the listener because you can't produce the required return value on the current thread, you have to return it asynchronously and for that you need the listener (because blocking on a future isn't something we want to do).

To my knowledge there are no other TransportNodesAction that need to do asynchronous work in the node operation which is why there are no implementations of nodeOperationAsync

However, that opens the question of whether we actually should be doing this work on every node. I hadn't really thought about that when I agreed with using the existing usage stats.
The file and dls stats make sense in a per-node response, but the native stats are inherently cluster-wide stats - they're returning information from the security index - and it doesn't make sense to calculate and return the same stats on every node in the cluster.

We should probably rethink that.

Our options (long term, not necessarily to be handled this PR):

  1. Accept that it's OK for every node to calculate the same stats, and stick with what we have. There are two issues here:
    1. It's potentially a performance issue (not necessarily for native role stats, but if we continue down this path it could be)
    2. There's the chance for timing issues and having nodes return different stats. That's pretty confusing - what does it mean for 1 node to have 134 native roles and another node to have 135 native rules?
  2. Calculate them once, but slot that identical value into the JSON response for each node, that overcomes the two issues above, but at the cost of:
    1. It's just weird to pretend these are node stats if they're cluster stats
    2. Now we have the problem of hiding (lying) if 1 node actually has an incorrect view of the stats (e.g. a caching bug). The response would tell you that the node has the same view of the data as all other nodes, but that's not actually the case.
  3. Move some stats to the top level of the JSON (outside of "nodes"). That's harder for client to consume, but more accurate (we could calculate those stats on the coordinating node for the request).

I think option 3 is best, but maybe Data Management has a view on how stats APIs ought to work in these sorts of cases.

Copy link
Contributor Author

@szybia szybia Sep 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

very fair points

after having a think, given i'm trying to ship DLS stats in 9.2 (FF 30th Sep)

i think best plan of attack is to just ship the DLS stats which don't seem to be contentious

and then have a think about structure for everything else

i've updated the code

  • it should be extensible to all your ideas above
  • follows _xpack/usage pattern
  • shouldn't be blocking now so i've left it as nodeOperation

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re: long term options: I had a quick sanity check conversation with @dakrone, and after that conversation I feel safe saying that for the moment that list of options looks good enough that we aren't trapped in some bad state if we release this with only the dls cache stats now and then worry about how to handle (for example) the native stats later.

}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
"Security stats returns empty response":
"Security stats returns a number of stats":
- requires:
cluster_features: [ "security_stats_endpoint" ]
reason: Introduced in 9.2
Expand All @@ -9,4 +9,4 @@

- set:
nodes._arbitrary_key_: node_id
- length: { nodes.$node_id: 0 }
- length: { nodes.$node_id: 1 }
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
"Security stats return roles stats":
- requires:
cluster_features: [ "security_stats_endpoint" ]
reason: Introduced in 9.2

- do:
security.get_stats: {}

- set:
nodes._arbitrary_key_: node_id
- gte: { nodes.$node_id.roles.dls.bit_set_cache.count: 0 }
- gte: { nodes.$node_id.roles.file.size: 0 }
- gte: { nodes.$node_id.roles.native.size: 0 }