3636import org .slf4j .Logger ;
3737import org .slf4j .LoggerFactory ;
3838
39+ /**
40+ * This command returns stats about the Overseer, the cluster state updater and collection API activity occurring
41+ * <b>within the current Overseer node</b> (this is important because distributed operations occurring on other nodes
42+ * are <b>not included</b> in these stats, for example distributed cluster state updates or Per Replica States updates).<p>
43+ *
44+ * The structure of the returned results is as follows:
45+ * <ul>
46+ * <li><b>{@code leader}:</b> {@code ID} of the current overseer leader node</li>
47+ * <li><b>{@code overseer_queue_size}:</b> count of entries in the {@code /overseer/queue} Zookeeper queue/directory</li>
48+ * <li><b>{@code overseer_work_queue_size}:</b> count of entries in the {@code /overseer/queue-work} Zookeeper queue/directory</li>
49+ * <li><b>{@code overseer_collection_queue_size}:</b> count of entries in the {@code /overseer/collection-queue-work}
50+ * Zookeeper queue/directory</li>
51+ * <li><b>{@code overseer_operations}:</b> map (of maps) of success and error counts for operations. The operations
52+ * (keys) tracked in this map are:
53+ * <ul>
54+ * <li>{@code am_i_leader} (Overseer checking it is still the elected Overseer as it processes cluster state update
55+ * messages)</li>
56+ * <li>{@code configset_}<i>{@code <config set operation>}</i> (from
57+ * {@link org.apache.solr.handler.admin.ConfigSetsHandler.ConfigSetOperation})</li>
58+ * <li>Cluster state change operation names from {@link org.apache.solr.common.params.CollectionParams.CollectionAction}
59+ * (not all of them!) and {@link org.apache.solr.cloud.overseer.OverseerAction} (the complete list: {@code create},
60+ * {@code delete}, {@code createshard}, {@code deleteshard}, {@code addreplica}, {@code addreplicaprop}, {@code deletereplicaprop},
61+ * {@code balanceshardunique}, {@code modifycollection}, {@code state}, {@code leader}, {@code deletecore}, {@code addroutingrule},
62+ * {@code removeroutingrule}, {@code updateshardstate}, {@code downnode} and {@code quit}, with this last one unlikely
63+ * to be observed since the Overseer is exiting right away)</li>
64+ * <li>{@code update_state} (when Overseer cluster state updater persists changes in Zookeeper)</li>
65+ * </ul>
66+ * For each key, the value is a map composed of:
67+ * <ul>
68+ * <li>{@code requests}: success count of the given operation </li>
69+ * <li>{@code errors}: error count of the operation </li>
70+ * <li>More metrics (see below)</li>
71+ * </ul>
72+ * </li>
73+ * <li><b>{@code collection_operations}:</b> map (of maps) of success and error counts for collection related operations.
74+ * The operations (keys) tracked in this map are <b>all operations</b> that start with {@code collection_}, but the
75+ * {@code collection_} prefix is <b>stripped</b> from the returned key. Possible keys are therefore:
76+ * <ul>
77+ * <li>{@code am_i_leader}: originating in a stat called {@code collection_am_i_leader} representing Overseer checking
78+ * it is still the elected Overseer as it processes Collection API and Config Set API messages.</li>
79+ * <li>Collection API operation names from {@link org.apache.solr.common.params.CollectionParams.CollectionAction} (the
80+ * stripped {@code collection_} prefix gets added in {@link OverseerCollectionMessageHandler#getTimerName(String)})</li>
81+ * </ul>
82+ * For each key, the value is a map composed of:
83+ * <ul>
84+ * <li>{@code requests}: success count of the given operation </li>
85+ * <li>{@code errors}: error count of the operation </li>
86+ * <li>{@code recent_failures}: an <b>optional</b> entry containing a list of maps, each map having two entries, one
87+ * with key {@code request} holding the failed request's properties (a {@link ZkNodeProps}) and the other with key
88+ * {@code response} with the corresponding response properties (a {@link org.apache.solr.client.solrj.SolrResponse}).</li>
89+ * <li>More metrics (see below)</li>
90+ * </ul>
91+ * </li>
92+ * <li><b>{@code overseer_queue}:</b> metrics on operations done on the Zookeeper queue {@code /overseer/queue} (see
93+ * metrics below).<br>
94+ * The operations that can be done on the queue and that can be keys whose values are a metrics map are:
95+ * <ul>
96+ * <li>{@code offer}</li>
97+ * <li>{@code peek}</li>
98+ * <li>{@code peek_wait}</li>
99+ * <li>{@code peek_wait_forever}</li>
100+ * <li>{@code peekTopN_wait}</li>
101+ * <li>{@code peekTopN_wait_forever}</li>
102+ * <li>{@code poll}</li>
103+ * <li>{@code remove}</li>
104+ * <li>{@code remove_event}</li>
105+ * <li>{@code take}</li>
106+ * </ul>
107+ * </li>
108+ * <li><b>{@code overseer_internal_queue}:</b> same as above but for queue {@code /overseer/queue-work}</li>
109+ * <li><b>{@code collection_queue}:</b> same as above but for queue {@code /overseer/collection-queue-work}</li>
110+ * </ul>
111+ *
112+ * <p>
113+ * Maps returned as values of keys in <b>{@code overseer_operations}</b>, <b>{@code collection_operations}</b>,
114+ * <b>{@code overseer_queue}</b>, <b>{@code overseer_internal_queue}</b> and <b>{@code collection_queue}</b> include
115+ * additional stats. These stats are provided by {@link MetricUtils}, and represent metrics on each type of operation
116+ * execution (be it failed or successful), see calls to {@link Stats#time(String)}. The metric keys are:
117+ * <ul>
118+ * <li>{@code avgRequestsPerSecond}</li>
119+ * <li>{@code 5minRateRequestsPerSecond}</li>
120+ * <li>{@code 15minRateRequestsPerSecond}</li>
121+ * <li>{@code avgTimePerRequest}</li>
122+ * <li>{@code medianRequestTime}</li>
123+ * <li>{@code 75thPcRequestTime}</li>
124+ * <li>{@code 95thPcRequestTime}</li>
125+ * <li>{@code 99thPcRequestTime}</li>
126+ * <li>{@code 999thPcRequestTime}</li>
127+ * </ul>
128+ */
39129public class OverseerStatusCmd implements CollApiCmds .CollectionApiCommand {
40130 private static final Logger log = LoggerFactory .getLogger (MethodHandles .lookup ().lookupClass ());
41131 private final CollectionCommandContext ccc ;
@@ -60,16 +150,6 @@ public void call(ClusterState state, ZkNodeProps message, @SuppressWarnings({"ra
60150 zkStateReader .getZkClient ().getData ("/overseer/collection-queue-work" ,null , stat , true );
61151 results .add ("overseer_collection_queue_size" , stat .getNumChildren ());
62152
63- // Overseer reported stats below are tracked in the Overseer cluster state updater when it performs certain operations.
64- // Sharing the ocmh.stats variable between the cluster state updater and the Collection API (this command) is by the way
65- // about the only thing that ties the cluster state updater to the collection api message handler and that takes
66- // advantage of the fact that both run on the same node (the Overseer node). (recently added PerReplicaStates also
67- // take advantage of this through method Overseer.submit() accessed via CollectionCommandContext.submitIntraProcessMessage()).
68- // When distributed updates are enabled, cluster state updates are not done by the Overseer (it doesn't even see them)
69- // and therefore can't report them. The corresponding data in OVERSEERSTATUS (all data built below) is no longer returned.
70- // This means top level keys "overseer_operations", "collection_operations", "overseer_queue", "overseer_internal_queue"
71- // and "collection_queue" are either empty or do not contain all expected information when cluster state updates are distributed.
72-
73153 @ SuppressWarnings ({"rawtypes" })
74154 NamedList overseerStats = new NamedList ();
75155 @ SuppressWarnings ({"rawtypes" })
@@ -80,47 +160,45 @@ public void call(ClusterState state, ZkNodeProps message, @SuppressWarnings({"ra
80160 NamedList workQueueStats = new NamedList ();
81161 @ SuppressWarnings ({"rawtypes" })
82162 NamedList collectionQueueStats = new NamedList ();
83- // stats below do not make sense when cluster state updates are distributed. Return them empty.
84- if (!ccc .getDistributedClusterStateUpdater ().isDistributedStateUpdate ()) {
85- Stats stats = ccc .getOverseerStats ();
86- for (Map .Entry <String , Stats .Stat > entry : stats .getStats ().entrySet ()) {
87- String key = entry .getKey ();
88- NamedList <Object > lst = new SimpleOrderedMap <>();
89- if (key .startsWith ("collection_" )) {
90- collectionStats .add (key .substring (11 ), lst );
91- int successes = stats .getSuccessCount (entry .getKey ());
92- int errors = stats .getErrorCount (entry .getKey ());
93- lst .add ("requests" , successes );
94- lst .add ("errors" , errors );
95- List <Stats .FailedOp > failureDetails = stats .getFailureDetails (key );
96- if (failureDetails != null ) {
97- List <SimpleOrderedMap <Object >> failures = new ArrayList <>();
98- for (Stats .FailedOp failedOp : failureDetails ) {
99- SimpleOrderedMap <Object > fail = new SimpleOrderedMap <>();
100- fail .add ("request" , failedOp .req .getProperties ());
101- fail .add ("response" , failedOp .resp .getResponse ());
102- failures .add (fail );
103- }
104- lst .add ("recent_failures" , failures );
163+ Stats stats = ccc .getOverseerStats ();
164+ for (Map .Entry <String , Stats .Stat > entry : stats .getStats ().entrySet ()) {
165+ String key = entry .getKey ();
166+ NamedList <Object > lst = new SimpleOrderedMap <>();
167+ if (key .startsWith ("collection_" )) {
168+ collectionStats .add (key .substring (11 ), lst );
169+ int successes = stats .getSuccessCount (entry .getKey ());
170+ int errors = stats .getErrorCount (entry .getKey ());
171+ lst .add ("requests" , successes );
172+ lst .add ("errors" , errors );
173+ List <Stats .FailedOp > failureDetails = stats .getFailureDetails (key );
174+ if (failureDetails != null ) {
175+ List <SimpleOrderedMap <Object >> failures = new ArrayList <>();
176+ for (Stats .FailedOp failedOp : failureDetails ) {
177+ SimpleOrderedMap <Object > fail = new SimpleOrderedMap <>();
178+ fail .add ("request" , failedOp .req .getProperties ());
179+ fail .add ("response" , failedOp .resp .getResponse ());
180+ failures .add (fail );
105181 }
106- } else if (key .startsWith ("/overseer/queue_" )) {
107- stateUpdateQueueStats .add (key .substring (16 ), lst );
108- } else if (key .startsWith ("/overseer/queue-work_" )) {
109- workQueueStats .add (key .substring (21 ), lst );
110- } else if (key .startsWith ("/overseer/collection-queue-work_" )) {
111- collectionQueueStats .add (key .substring (32 ), lst );
112- } else {
113- // overseer stats
114- overseerStats .add (key , lst );
115- int successes = stats .getSuccessCount (entry .getKey ());
116- int errors = stats .getErrorCount (entry .getKey ());
117- lst .add ("requests" , successes );
118- lst .add ("errors" , errors );
182+ lst .add ("recent_failures" , failures );
119183 }
120- Timer timer = entry .getValue ().requestTime ;
121- MetricUtils .addMetrics (lst , timer );
184+ } else if (key .startsWith ("/overseer/queue_" )) {
185+ stateUpdateQueueStats .add (key .substring (16 ), lst );
186+ } else if (key .startsWith ("/overseer/queue-work_" )) {
187+ workQueueStats .add (key .substring (21 ), lst );
188+ } else if (key .startsWith ("/overseer/collection-queue-work_" )) {
189+ collectionQueueStats .add (key .substring (32 ), lst );
190+ } else {
191+ // overseer stats
192+ overseerStats .add (key , lst );
193+ int successes = stats .getSuccessCount (entry .getKey ());
194+ int errors = stats .getErrorCount (entry .getKey ());
195+ lst .add ("requests" , successes );
196+ lst .add ("errors" , errors );
122197 }
198+ Timer timer = entry .getValue ().requestTime ;
199+ MetricUtils .addMetrics (lst , timer );
123200 }
201+
124202 results .add ("overseer_operations" , overseerStats );
125203 results .add ("collection_operations" , collectionStats );
126204 results .add ("overseer_queue" , stateUpdateQueueStats );
0 commit comments