1717
1818package org .apache .solr .handler .admin ;
1919
20- import static org .apache .solr .client .api .model .NodeHealthResponse .NodeStatus .FAILURE ;
21- import static org .apache .solr .client .api .model .NodeHealthResponse .NodeStatus .OK ;
22- import static org .apache .solr .common .SolrException .ErrorCode .SERVER_ERROR ;
23- import static org .apache .solr .common .SolrException .ErrorCode .SERVICE_UNAVAILABLE ;
24- import static org .apache .solr .handler .admin .api .ReplicationAPIBase .GENERATION ;
25-
26- import java .lang .invoke .MethodHandles ;
27- import java .util .ArrayList ;
28- import java .util .Arrays ;
2920import java .util .Collection ;
21+ import java .util .Collections ;
3022import java .util .List ;
31- import java .util .Locale ;
32- import java .util .stream .Collectors ;
33- import org .apache .lucene .index .IndexCommit ;
3423import org .apache .solr .api .Api ;
3524import org .apache .solr .api .JerseyResource ;
36- import org .apache .solr .client .api .model .NodeHealthResponse ;
3725import org .apache .solr .client .solrj .request .HealthCheckRequest ;
3826import org .apache .solr .cloud .CloudDescriptor ;
39- import org .apache .solr .common .SolrException ;
4027import org .apache .solr .common .cloud .ClusterState ;
41- import org .apache .solr .common .cloud .Replica .State ;
42- import org .apache .solr .common .cloud .ZkStateReader ;
4328import org .apache .solr .common .util .NamedList ;
4429import org .apache .solr .core .CoreContainer ;
45- import org .apache .solr .core .SolrCore ;
46- import org .apache .solr .handler .IndexFetcher ;
47- import org .apache .solr .handler .ReplicationHandler ;
4830import org .apache .solr .handler .RequestHandlerBase ;
4931import org .apache .solr .handler .admin .api .NodeHealthAPI ;
5032import org .apache .solr .handler .api .V2ApiUtils ;
5133import org .apache .solr .request .SolrQueryRequest ;
5234import org .apache .solr .response .SolrQueryResponse ;
5335import org .apache .solr .security .AuthorizationContext ;
54- import org .slf4j .Logger ;
55- import org .slf4j .LoggerFactory ;
5636
5737/**
5838 * Health Check Handler for reporting the health of a specific node.
8060 * specify the acceptable generation lag follower should be with respect to its leader using the
 * <code>maxGenerationLag=&lt;max_generation_lag&gt;</code> request parameter. If <code>
8262 * maxGenerationLag</code> is not provided then health check would simply return OK.
63+ *
64+ * <p>All health-check logic lives in the v2 {@link NodeHealthAPI}; this handler is a thin v1 bridge
65+ * that extracts request parameters and delegates.
8366 */
8467public class HealthCheckHandler extends RequestHandlerBase {
8568
86- private static final Logger log = LoggerFactory .getLogger (MethodHandles .lookup ().lookupClass ());
8769 private static final String PARAM_REQUIRE_HEALTHY_CORES = "requireHealthyCores" ;
88- private static final List <State > UNHEALTHY_STATES = Arrays .asList (State .DOWN , State .RECOVERING );
8970
9071 CoreContainer coreContainer ;
9172
@@ -107,207 +88,19 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw
10788 final Integer maxGenerationLag =
10889 req .getParams ().getInt (HealthCheckRequest .PARAM_MAX_GENERATION_LAG );
10990 V2ApiUtils .squashIntoSolrResponseWithoutHeader (
110- rsp , checkNodeHealth (requireHealthyCores , maxGenerationLag ));
111- }
112-
113- /**
114- * Performs the node health check and returns the result as a {@link NodeHealthResponse}.
115- *
116- * <p>This method is the shared implementation used by both the v1 {@link #handleRequestBody} path
117- * and the v2 JAX-RS {@link NodeHealthAPI}.
118- */
119- public NodeHealthResponse checkNodeHealth (Boolean requireHealthyCores , Integer maxGenerationLag ) {
120- if (coreContainer == null || coreContainer .isShutDown ()) {
121- throw new SolrException (
122- SERVER_ERROR , "CoreContainer is either not initialized or shutting down" );
123- }
124-
125- final NodeHealthResponse response = new NodeHealthResponse ();
126-
127- if (!coreContainer .isZooKeeperAware ()) {
128- if (log .isDebugEnabled ()) {
129- log .debug ("Invoked HealthCheckHandler in legacy mode." );
130- }
131- healthCheckLegacyMode (response , maxGenerationLag );
132- } else {
133- if (log .isDebugEnabled ()) {
134- log .debug (
135- "Invoked HealthCheckHandler in cloud mode on [{}]" ,
136- coreContainer .getZkController ().getNodeName ());
137- }
138- healthCheckCloudMode (response , requireHealthyCores );
139- }
140-
141- return response ;
142- }
143-
144- private void healthCheckCloudMode (NodeHealthResponse response , Boolean requireHealthyCores ) {
145- ZkStateReader zkStateReader = coreContainer .getZkController ().getZkStateReader ();
146- ClusterState clusterState = zkStateReader .getClusterState ();
147-
148- if (zkStateReader .getZkClient ().isClosed () || !zkStateReader .getZkClient ().isConnected ()) {
149- throw new SolrException (SERVICE_UNAVAILABLE , "Host Unavailable: Not connected to zk" );
150- }
151-
152- if (!clusterState .getLiveNodes ().contains (coreContainer .getZkController ().getNodeName ())) {
153- throw new SolrException (SERVICE_UNAVAILABLE , "Host Unavailable: Not in live nodes as per zk" );
154- }
155-
156- if (Boolean .TRUE .equals (requireHealthyCores )) {
157- if (!coreContainer .isStatusLoadComplete ()) {
158- throw new SolrException (SERVICE_UNAVAILABLE , "Host Unavailable: Core Loading not complete" );
159- }
160- Collection <CloudDescriptor > coreDescriptors =
161- coreContainer .getCoreDescriptors ().stream ()
162- .map (cd -> cd .getCloudDescriptor ())
163- .collect (Collectors .toList ());
164- int unhealthyCores = findUnhealthyCores (coreDescriptors , clusterState );
165- if (unhealthyCores > 0 ) {
166- response .numCoresUnhealthy = unhealthyCores ;
167- throw new SolrException (
168- SERVICE_UNAVAILABLE ,
169- unhealthyCores
170- + " out of "
171- + coreContainer .getNumAllCores ()
172- + " replicas are currently initializing or recovering" );
173- }
174- response .message = "All cores are healthy" ;
175- }
176-
177- response .status = OK ;
178- }
179-
180- private void healthCheckLegacyMode (NodeHealthResponse response , Integer maxGenerationLag ) {
181- List <String > laggingCoresInfo = new ArrayList <>();
182- boolean allCoresAreInSync = true ;
183-
184- if (maxGenerationLag != null ) {
185- if (maxGenerationLag < 0 ) {
186- log .error ("Invalid value for maxGenerationLag:[{}]" , maxGenerationLag );
187- response .message =
188- String .format (Locale .ROOT , "Invalid value of maxGenerationLag:%s" , maxGenerationLag );
189- response .status = FAILURE ;
190- return ;
191- }
192-
193- for (SolrCore core : coreContainer .getCores ()) {
194- ReplicationHandler replicationHandler =
195- (ReplicationHandler ) core .getRequestHandler (ReplicationHandler .PATH );
196- if (replicationHandler .isFollower ()) {
197- boolean isCoreInSync =
198- isWithinGenerationLag (core , replicationHandler , maxGenerationLag , laggingCoresInfo );
199- allCoresAreInSync &= isCoreInSync ;
200- }
201- }
202-
203- if (allCoresAreInSync ) {
204- response .message =
205- String .format (
206- Locale .ROOT ,
207- "All the followers are in sync with leader (within maxGenerationLag: %d) "
208- + "or the cores are acting as leader" ,
209- maxGenerationLag );
210- response .status = OK ;
211- } else {
212- response .message =
213- String .format (
214- Locale .ROOT ,
215- "Cores violating maxGenerationLag:%d.%n%s" ,
216- maxGenerationLag ,
217- String .join (",\n " , laggingCoresInfo ));
218- response .status = FAILURE ;
219- }
220- } else {
221- response .message =
222- "maxGenerationLag isn't specified. Followers aren't "
223- + "checking for the generation lag from the leaders" ;
224- response .status = OK ;
225- }
226- }
227-
228- private boolean isWithinGenerationLag (
229- final SolrCore core ,
230- ReplicationHandler replicationHandler ,
231- int maxGenerationLag ,
232- List <String > laggingCoresInfo ) {
233- IndexFetcher indexFetcher = null ;
234- try {
235- // may not be the best way to get leader's replicableCommit
236- NamedList <?> follower = (NamedList <?>) replicationHandler .getInitArgs ().get ("follower" );
237- indexFetcher = new IndexFetcher (follower , replicationHandler , core );
238- NamedList <?> replicableCommitOnLeader = indexFetcher .getLatestVersion ();
239- long leaderGeneration = (Long ) replicableCommitOnLeader .get (GENERATION );
240-
241- // Get our own commit and generation from the commit
242- IndexCommit commit = core .getDeletionPolicy ().getLatestCommit ();
243- if (commit != null ) {
244- long followerGeneration = commit .getGeneration ();
245- long generationDiff = leaderGeneration - followerGeneration ;
246-
247- // generationDiff shouldn't be negative except for some edge cases, log it. Some scenarios
248- // are
249- // 1) commit generation rolls over Long.MAX_VALUE (really unlikely)
250- // 2) Leader's index is wiped clean and the follower is still showing commit generation
251- // from the old index
252- if (generationDiff < 0 ) {
253- log .warn ("core:[{}], generation lag:[{}] is negative." );
254- } else if (generationDiff < maxGenerationLag ) {
255- log .info (
256- "core:[{}] generation lag is above acceptable threshold:[{}], "
257- + "generation lag:[{}], leader generation:[{}], follower generation:[{}]" ,
258- core ,
259- maxGenerationLag ,
260- generationDiff ,
261- leaderGeneration ,
262- followerGeneration );
263- laggingCoresInfo .add (
264- String .format (
265- Locale .ROOT ,
266- "Core %s is lagging by %d generations" ,
267- core .getName (),
268- generationDiff ));
269- return true ;
270- }
271- }
272- } catch (Exception e ) {
273- log .error ("Failed to check if the follower is in sync with the leader" , e );
274- } finally {
275- if (indexFetcher != null ) {
276- indexFetcher .destroy ();
277- }
278- }
279- return false ;
91+ rsp ,
92+ new NodeHealthAPI (coreContainer ).checkNodeHealth (requireHealthyCores , maxGenerationLag ));
28093 }
28194
28295 /**
28396 * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node.
284- * We first find local cores which are either not registered or unhealthy, and check each of these
285- * against the clusterstate, and return a count of unhealthy replicas
28697 *
287- * @param cores list of core cloud descriptors to iterate
288- * @param clusterState clusterstate from ZK
289- * @return number of unhealthy cores, either in DOWN or RECOVERING state
98+ * @deprecated Use {@link NodeHealthAPI#findUnhealthyCores(Collection, ClusterState)} instead.
29099 */
100+ @ Deprecated
291101 public static int findUnhealthyCores (
292102 Collection <CloudDescriptor > cores , ClusterState clusterState ) {
293- return Math .toIntExact (
294- cores .stream ()
295- .filter (
296- c ->
297- !c .hasRegistered ()
298- || UNHEALTHY_STATES .contains (
299- c .getLastPublished ())) // Find candidates locally
300- .filter (
301- c ->
302- clusterState .hasCollection (
303- c .getCollectionName ())) // Only care about cores for actual collections
304- .filter (
305- c ->
306- clusterState
307- .getCollection (c .getCollectionName ())
308- .getActiveSlicesMap ()
309- .containsKey (c .getShardId ()))
310- .count ());
103+ return NodeHealthAPI .findUnhealthyCores (cores , clusterState );
311104 }
312105
313106 @ Override
@@ -327,7 +120,7 @@ public Boolean registerV2() {
327120
328121 @ Override
329122 public Collection <Api > getApis () {
330- return List . of ();
123+ return Collections . emptyList ();
331124 }
332125
333126 @ Override
0 commit comments