diff --git a/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml b/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml new file mode 100644 index 000000000000..a9979fae4929 --- /dev/null +++ b/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml @@ -0,0 +1,7 @@ +title: "SolrJ now offers a SolrRequest class allowing users to perform single-node healthchecks: NodeApi.Healthcheck" +type: added +authors: + - name: Eric Pugh +links: + - name: SOLR-16458 + url: https://issues.apache.org/jira/browse/SOLR-16458 diff --git a/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java b/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java new file mode 100644 index 000000000000..e453af68201a --- /dev/null +++ b/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.client.api.endpoint; + +import io.swagger.v3.oas.annotations.Operation; +import jakarta.ws.rs.GET; +import jakarta.ws.rs.Path; +import jakarta.ws.rs.QueryParam; +import org.apache.solr.client.api.model.NodeHealthResponse; + +/** V2 API definition for checking the health of a Solr node. 
*/ +@Path("/node/health") +public interface NodeHealthApi { + + @GET + @Operation( + summary = "Determine the health of a Solr node.", + tags = {"node"}) + NodeHealthResponse healthcheck(@QueryParam("requireHealthyCores") Boolean requireHealthyCores); +} diff --git a/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java b/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java new file mode 100644 index 000000000000..a0be8723b98a --- /dev/null +++ b/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.client.api.model; + +import com.fasterxml.jackson.annotation.JsonProperty; + +/** Response body for the '/api/node/health' endpoint. */ +public class NodeHealthResponse extends SolrJerseyResponse { + + /** The possible health statuses for a Solr node. 
*/ + public enum NodeStatus { + OK, + FAILURE + } + + @JsonProperty public NodeStatus status; + + @JsonProperty public String message; + + @JsonProperty("num_cores_unhealthy") + public Integer numCoresUnhealthy; +} diff --git a/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java index 1ecf959e49ed..c26565d5f431 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java @@ -17,39 +17,22 @@ package org.apache.solr.handler.admin; -import static org.apache.solr.common.params.CommonParams.FAILURE; -import static org.apache.solr.common.params.CommonParams.OK; -import static org.apache.solr.common.params.CommonParams.STATUS; -import static org.apache.solr.handler.admin.api.ReplicationAPIBase.GENERATION; - -import java.lang.invoke.MethodHandles; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; -import java.util.Locale; -import java.util.stream.Collectors; -import org.apache.lucene.index.IndexCommit; -import org.apache.solr.api.AnnotatedApi; import org.apache.solr.api.Api; +import org.apache.solr.api.JerseyResource; +import org.apache.solr.client.api.model.NodeHealthResponse; import org.apache.solr.client.solrj.request.HealthCheckRequest; -import org.apache.solr.cloud.CloudDescriptor; import org.apache.solr.common.SolrException; -import org.apache.solr.common.cloud.ClusterState; -import org.apache.solr.common.cloud.Replica.State; -import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.CoreContainer; -import org.apache.solr.core.SolrCore; -import org.apache.solr.handler.IndexFetcher; -import org.apache.solr.handler.ReplicationHandler; import org.apache.solr.handler.RequestHandlerBase; -import 
org.apache.solr.handler.admin.api.NodeHealthAPI; +import org.apache.solr.handler.admin.api.NodeHealth; +import org.apache.solr.handler.api.V2ApiUtils; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.security.AuthorizationContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Health Check Handler for reporting the health of a specific node. @@ -77,12 +60,13 @@ * specify the acceptable generation lag follower should be with respect to its leader using the * maxGenerationLag=<max_generation_lag> request parameter. If * maxGenerationLag is not provided then health check would simply return OK. + * + *
<p>
All health-check logic lives in the v2 {@link NodeHealth}; this handler is a thin v1 bridge + * that extracts request parameters and delegates. */ public class HealthCheckHandler extends RequestHandlerBase { - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final String PARAM_REQUIRE_HEALTHY_CORES = "requireHealthyCores"; - private static final List UNHEALTHY_STATES = Arrays.asList(State.DOWN, State.RECOVERING); CoreContainer coreContainer; @@ -100,224 +84,19 @@ public CoreContainer getCoreContainer() { @Override public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { rsp.setHttpCaching(false); - - // Core container should not be null and active (redundant check) - if (coreContainer == null || coreContainer.isShutDown()) { - rsp.setException( - new SolrException( - SolrException.ErrorCode.SERVER_ERROR, - "CoreContainer is either not initialized or shutting down")); - return; - } - if (!coreContainer.isZooKeeperAware()) { - if (log.isDebugEnabled()) { - log.debug("Invoked HealthCheckHandler in legacy mode."); - } - healthCheckLegacyMode(req, rsp); - } else { - if (log.isDebugEnabled()) { - log.debug( - "Invoked HealthCheckHandler in cloud mode on [{}]", - this.coreContainer.getZkController().getNodeName()); - } - healthCheckCloudMode(req, rsp); - } - } - - private void healthCheckCloudMode(SolrQueryRequest req, SolrQueryResponse rsp) { - ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader(); - ClusterState clusterState = zkStateReader.getClusterState(); - // Check for isConnected and isClosed - if (zkStateReader.getZkClient().isClosed() || !zkStateReader.getZkClient().isConnected()) { - rsp.add(STATUS, FAILURE); - rsp.setException( - new SolrException( - SolrException.ErrorCode.SERVICE_UNAVAILABLE, - "Host Unavailable: Not connected to zk")); - return; - } - - // Fail if not in live_nodes - if 
(!clusterState.getLiveNodes().contains(coreContainer.getZkController().getNodeName())) { - rsp.add(STATUS, FAILURE); - rsp.setException( - new SolrException( - SolrException.ErrorCode.SERVICE_UNAVAILABLE, - "Host Unavailable: Not in live nodes as per zk")); - return; - } - - // Optionally require that all cores on this node are active if param 'requireHealthyCores=true' - if (req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES, false)) { - if (!coreContainer.isStatusLoadComplete()) { - rsp.add(STATUS, FAILURE); - rsp.setException( - new SolrException( - SolrException.ErrorCode.SERVICE_UNAVAILABLE, - "Host Unavailable: Core Loading not complete")); - return; - } - Collection coreDescriptors = - coreContainer.getCoreDescriptors().stream() - .map(cd -> cd.getCloudDescriptor()) - .collect(Collectors.toList()); - long unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState); - if (unhealthyCores > 0) { - rsp.add(STATUS, FAILURE); - rsp.add("num_cores_unhealthy", unhealthyCores); - rsp.setException( - new SolrException( - SolrException.ErrorCode.SERVICE_UNAVAILABLE, - unhealthyCores - + " out of " - + coreContainer.getNumAllCores() - + " replicas are currently initializing or recovering")); - return; - } - rsp.add("message", "All cores are healthy"); - } - - // All lights green, report healthy - rsp.add(STATUS, OK); - } - - private void healthCheckLegacyMode(SolrQueryRequest req, SolrQueryResponse rsp) { - Integer maxGenerationLag = req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG); - List laggingCoresInfo = new ArrayList<>(); - boolean allCoresAreInSync = true; - - // check only if max generation lag is specified - if (maxGenerationLag != null) { - // if is not negative - if (maxGenerationLag < 0) { - log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag); - rsp.add( - "message", - String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s", maxGenerationLag)); - rsp.add(STATUS, FAILURE); - } else { - for (SolrCore core 
: coreContainer.getCores()) { - ReplicationHandler replicationHandler = - (ReplicationHandler) core.getRequestHandler(ReplicationHandler.PATH); - if (replicationHandler.isFollower()) { - boolean isCoreInSync = - isWithinGenerationLag(core, replicationHandler, maxGenerationLag, laggingCoresInfo); - - allCoresAreInSync &= isCoreInSync; - } - } - } - if (allCoresAreInSync) { - rsp.add( - "message", - String.format( - Locale.ROOT, - "All the followers are in sync with leader (within maxGenerationLag: %d) " - + "or the cores are acting as leader", - maxGenerationLag)); - rsp.add(STATUS, OK); - } else { - rsp.add( - "message", - String.format( - Locale.ROOT, - "Cores violating maxGenerationLag:%d.%n%s", - maxGenerationLag, - String.join(",\n", laggingCoresInfo))); - rsp.add(STATUS, FAILURE); - } - } else { // if maxGeneration lag is not specified (is null) we aren't checking for lag - rsp.add( - "message", - "maxGenerationLag isn't specified. Followers aren't " - + "checking for the generation lag from the leaders"); - rsp.add(STATUS, OK); - } - } - - private boolean isWithinGenerationLag( - final SolrCore core, - ReplicationHandler replicationHandler, - int maxGenerationLag, - List laggingCoresInfo) { - IndexFetcher indexFetcher = null; + final Boolean requireHealthyCores = req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES); + final Integer maxGenerationLag = + req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG); try { - // may not be the best way to get leader's replicableCommit - NamedList follower = (NamedList) replicationHandler.getInitArgs().get("follower"); - - indexFetcher = new IndexFetcher(follower, replicationHandler, core); - - NamedList replicableCommitOnLeader = indexFetcher.getLatestVersion(); - long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION); - - // Get our own commit and generation from the commit - IndexCommit commit = core.getDeletionPolicy().getLatestCommit(); - if (commit != null) { - long followerGeneration 
= commit.getGeneration(); - long generationDiff = leaderGeneration - followerGeneration; - - // generationDiff shouldn't be negative except for some edge cases, log it. Some scenarios - // are - // 1) commit generation rolls over Long.MAX_VALUE (really unlikely) - // 2) Leader's index is wiped clean and the follower is still showing commit generation - // from the old index - if (generationDiff < 0) { - log.warn("core:[{}], generation lag:[{}] is negative."); - } else if (generationDiff < maxGenerationLag) { - log.info( - "core:[{}] generation lag is above acceptable threshold:[{}], " - + "generation lag:[{}], leader generation:[{}], follower generation:[{}]", - core, - maxGenerationLag, - generationDiff, - leaderGeneration, - followerGeneration); - - laggingCoresInfo.add( - String.format( - Locale.ROOT, - "Core %s is lagging by %d generations", - core.getName(), - generationDiff)); - return true; - } - } - } catch (Exception e) { - log.error("Failed to check if the follower is in sync with the leader", e); - } finally { - if (indexFetcher != null) { - indexFetcher.destroy(); - } + V2ApiUtils.squashIntoSolrResponseWithoutHeader( + rsp, + new NodeHealth(coreContainer).checkNodeHealth(requireHealthyCores, maxGenerationLag)); + } catch (SolrException e) { + final NodeHealthResponse failureResponse = new NodeHealthResponse(); + failureResponse.status = NodeHealthResponse.NodeStatus.FAILURE; + V2ApiUtils.squashIntoSolrResponseWithoutHeader(rsp, failureResponse); + rsp.setException(e); } - return false; - } - - /** - * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node. 
- * We first find local cores which are either not registered or unhealthy, and check each of these - * against the clusterstate, and return a count of unhealthy replicas - * - * @param cores list of core cloud descriptors to iterate - * @param clusterState clusterstate from ZK - * @return number of unhealthy cores, either in DOWN or RECOVERING state - */ - static long findUnhealthyCores(Collection cores, ClusterState clusterState) { - return cores.stream() - .filter( - c -> - !c.hasRegistered() - || UNHEALTHY_STATES.contains(c.getLastPublished())) // Find candidates locally - .filter( - c -> - clusterState.hasCollection( - c.getCollectionName())) // Only care about cores for actual collections - .filter( - c -> - clusterState - .getCollection(c.getCollectionName()) - .getActiveSlicesMap() - .containsKey(c.getShardId())) - .count(); } @Override @@ -337,7 +116,12 @@ public Boolean registerV2() { @Override public Collection getApis() { - return AnnotatedApi.getApis(new NodeHealthAPI(this)); + return Collections.emptyList(); + } + + @Override + public Collection> getJerseyResources() { + return List.of(NodeHealth.class); } @Override diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java new file mode 100644 index 000000000000..72028c6a40a4 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.admin.api; + +import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.FAILURE; +import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK; +import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR; +import static org.apache.solr.common.SolrException.ErrorCode.SERVICE_UNAVAILABLE; +import static org.apache.solr.handler.admin.api.ReplicationAPIBase.GENERATION; +import static org.apache.solr.security.PermissionNameProvider.Name.HEALTH_PERM; + +import jakarta.inject.Inject; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Locale; +import java.util.stream.Collectors; +import org.apache.lucene.index.IndexCommit; +import org.apache.solr.api.JerseyResource; +import org.apache.solr.client.api.endpoint.NodeHealthApi; +import org.apache.solr.client.api.model.NodeHealthResponse; +import org.apache.solr.cloud.CloudDescriptor; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.cloud.ClusterState; +import org.apache.solr.common.cloud.Replica.State; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.CoreContainer; +import org.apache.solr.core.CoreDescriptor; +import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.IndexFetcher; +import org.apache.solr.handler.ReplicationHandler; +import org.apache.solr.jersey.PermissionName; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * V2 API for checking the health of the receiving node. + * + *

<p>This API (GET /v2/node/health) is analogous to the v1 /admin/info/health. + * + *
<p>
The v1 {@link org.apache.solr.handler.admin.HealthCheckHandler} delegates to this class. + */ +public class NodeHealth extends JerseyResource implements NodeHealthApi { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final List UNHEALTHY_STATES = Arrays.asList(State.DOWN, State.RECOVERING); + + private final CoreContainer coreContainer; + + @Inject + public NodeHealth(CoreContainer coreContainer) { + this.coreContainer = coreContainer; + } + + @Override + @PermissionName(HEALTH_PERM) + public NodeHealthResponse healthcheck(Boolean requireHealthyCores) { + return checkNodeHealth(requireHealthyCores, null); + } + + /** + * Performs the node health check and returns the result as a {@link NodeHealthResponse}. + * + *
<p>
This overload is used by the v1 {@link + * org.apache.solr.handler.admin.HealthCheckHandler#handleRequestBody} path, which can supply the + * legacy {@code maxGenerationLag} parameter that is not exposed via the v2 endpoint. + */ + public NodeHealthResponse checkNodeHealth(Boolean requireHealthyCores, Integer maxGenerationLag) { + if (coreContainer == null || coreContainer.isShutDown()) { + throw new SolrException( + SERVER_ERROR, "CoreContainer is either not initialized or shutting down"); + } + + final NodeHealthResponse response = instantiateJerseyResponse(NodeHealthResponse.class); + + if (!coreContainer.isZooKeeperAware()) { + if (log.isDebugEnabled()) { + log.debug("Invoked HealthCheckHandler in legacy mode."); + } + healthCheckLegacyMode(response, maxGenerationLag); + } else { + if (log.isDebugEnabled()) { + log.debug( + "Invoked HealthCheckHandler in cloud mode on [{}]", + coreContainer.getZkController().getNodeName()); + } + healthCheckCloudMode(response, requireHealthyCores); + } + + return response; + } + + private void healthCheckCloudMode(NodeHealthResponse response, Boolean requireHealthyCores) { + ClusterState clusterState = getClusterState(); + + if (Boolean.TRUE.equals(requireHealthyCores)) { + if (!coreContainer.isStatusLoadComplete()) { + throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Core Loading not complete"); + } + Collection coreDescriptors = + coreContainer.getCoreDescriptors().stream() + .map(CoreDescriptor::getCloudDescriptor) + .collect(Collectors.toList()); + int unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState); + if (unhealthyCores > 0) { + response.numCoresUnhealthy = unhealthyCores; + throw new SolrException( + SERVICE_UNAVAILABLE, + unhealthyCores + + " out of " + + coreContainer.getNumAllCores() + + " replicas are currently initializing or recovering"); + } + response.message = "All cores are healthy"; + } + + response.status = OK; + } + + private ClusterState getClusterState() { + ZkStateReader 
zkStateReader = coreContainer.getZkController().getZkStateReader(); + ClusterState clusterState = zkStateReader.getClusterState(); + + if (zkStateReader.getZkClient().isClosed() || !zkStateReader.getZkClient().isConnected()) { + throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Not connected to zk"); + } + + if (!clusterState.getLiveNodes().contains(coreContainer.getZkController().getNodeName())) { + throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Not in live nodes as per zk"); + } + return clusterState; + } + + private void healthCheckLegacyMode(NodeHealthResponse response, Integer maxGenerationLag) { + List laggingCoresInfo = new ArrayList<>(); + boolean allCoresAreInSync = true; + + if (maxGenerationLag != null) { + if (maxGenerationLag < 0) { + log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag); + response.message = + String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s", maxGenerationLag); + response.status = FAILURE; + return; + } + + for (SolrCore core : coreContainer.getCores()) { + ReplicationHandler replicationHandler = + (ReplicationHandler) core.getRequestHandler(ReplicationHandler.PATH); + if (replicationHandler.isFollower()) { + boolean isCoreInSync = + isWithinGenerationLag(core, replicationHandler, maxGenerationLag, laggingCoresInfo); + allCoresAreInSync &= isCoreInSync; + } + } + + if (allCoresAreInSync) { + response.message = + String.format( + Locale.ROOT, + "All the followers are in sync with leader (within maxGenerationLag: %d) " + + "or the cores are acting as leader", + maxGenerationLag); + response.status = OK; + } else { + response.message = + String.format( + Locale.ROOT, + "Cores violating maxGenerationLag:%d.%n%s", + maxGenerationLag, + String.join(",\n", laggingCoresInfo)); + response.status = FAILURE; + } + } else { + response.message = + "maxGenerationLag isn't specified. 
Followers aren't " + + "checking for the generation lag from the leaders"; + response.status = OK; + } + } + + private boolean isWithinGenerationLag( + final SolrCore core, + ReplicationHandler replicationHandler, + int maxGenerationLag, + List laggingCoresInfo) { + IndexFetcher indexFetcher = null; + try { + // may not be the best way to get leader's replicableCommit; NamedList is unavoidable here + // as it is the init-args format used by ReplicationHandler + NamedList follower = (NamedList) replicationHandler.getInitArgs().get("follower"); + indexFetcher = new IndexFetcher(follower, replicationHandler, core); + // getLatestVersion() returns a NamedList from the IndexFetcher network API + NamedList replicableCommitOnLeader = indexFetcher.getLatestVersion(); + long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION); + + // Get our own commit and generation from the commit + IndexCommit commit = core.getDeletionPolicy().getLatestCommit(); + if (commit != null) { + long followerGeneration = commit.getGeneration(); + long generationDiff = leaderGeneration - followerGeneration; + + // generationDiff shouldn't be negative except for some edge cases, log it. 
Some scenarios + // are: + // 1) commit generation rolls over Long.MAX_VALUE (really unlikely) + // 2) Leader's index is wiped clean and the follower is still showing commit generation + // from the old index + if (generationDiff < 0) { + log.warn("core:[{}], generation lag:[{}] is negative.", core, generationDiff); + } else if (generationDiff > maxGenerationLag) { + log.info( + "core:[{}] generation lag is above acceptable threshold:[{}], " + + "generation lag:[{}], leader generation:[{}], follower generation:[{}]", + core, + maxGenerationLag, + generationDiff, + leaderGeneration, + followerGeneration); + laggingCoresInfo.add( + String.format( + Locale.ROOT, + "Core %s is lagging by %d generations", + core.getName(), + generationDiff)); + return false; + } + } + } catch (Exception e) { + log.error("Failed to check if the follower is in sync with the leader", e); + return false; + } finally { + if (indexFetcher != null) { + indexFetcher.destroy(); + } + } + return true; + } + + /** + * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node. + * We first find local cores which are either not registered or unhealthy, and check each of these + * against the clusterstate, and return a count of unhealthy replicas. 
+ * + * @param cores list of core cloud descriptors to iterate + * @param clusterState clusterstate from ZK + * @return number of unhealthy cores, either in DOWN or RECOVERING state + */ + public static int findUnhealthyCores( + Collection cores, ClusterState clusterState) { + return Math.toIntExact( + cores.stream() + .filter( + c -> + !c.hasRegistered() + || UNHEALTHY_STATES.contains( + c.getLastPublished())) // Find candidates locally + .filter( + c -> + clusterState.hasCollection( + c.getCollectionName())) // Only care about cores for actual collections + .filter( + c -> + clusterState + .getCollection(c.getCollectionName()) + .getActiveSlicesMap() + .containsKey(c.getShardId())) + .count()); + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java deleted file mode 100644 index df5f64900f03..000000000000 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.handler.admin.api; - -import static org.apache.solr.client.solrj.SolrRequest.METHOD.GET; -import static org.apache.solr.security.PermissionNameProvider.Name.HEALTH_PERM; - -import org.apache.solr.api.EndPoint; -import org.apache.solr.handler.admin.HealthCheckHandler; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.response.SolrQueryResponse; - -/** - * V2 API for checking the health of the receiving node. - * - *
<p>
This API (GET /v2/node/health) is analogous to the v1 /admin/info/health. - */ -public class NodeHealthAPI { - private final HealthCheckHandler handler; - - public NodeHealthAPI(HealthCheckHandler handler) { - this.handler = handler; - } - - // TODO Update permission here once SOLR-11623 lands. - @EndPoint( - path = {"/node/health"}, - method = GET, - permission = HEALTH_PERM) - public void getSystemInformation(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { - handler.handleRequestBody(req, rsp); - } -} diff --git a/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java index 43838707d057..fe4b7cbae171 100644 --- a/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java @@ -20,9 +20,14 @@ import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH; import java.io.IOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; import java.util.Arrays; import java.util.Collection; import java.util.Properties; +import java.util.concurrent.TimeUnit; import org.apache.solr.client.solrj.RemoteSolrException; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrRequest; @@ -30,10 +35,8 @@ import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.GenericSolrRequest; import org.apache.solr.client.solrj.request.HealthCheckRequest; -import org.apache.solr.client.solrj.request.V2Request; import org.apache.solr.client.solrj.response.CollectionAdminResponse; import org.apache.solr.client.solrj.response.HealthCheckResponse; -import org.apache.solr.client.solrj.response.V2Response; import org.apache.solr.cloud.CloudDescriptor; import org.apache.solr.cloud.ClusterStateMockUtil; 
import org.apache.solr.cloud.SolrCloudTestCase; @@ -44,6 +47,7 @@ import org.apache.solr.common.params.CommonParams; import org.apache.solr.core.CoreDescriptor; import org.apache.solr.embedded.JettySolrRunner; +import org.apache.solr.handler.admin.api.NodeHealth; import org.junit.BeforeClass; import org.junit.Test; @@ -104,11 +108,7 @@ public void testHealthCheckHandler() throws Exception { // negative check of our (new) "broken" node that we deliberately put into an unhealthy state RemoteSolrException e = - expectThrows( - RemoteSolrException.class, - () -> { - runHealthcheckWithClient(solrClient); - }); + expectThrows(RemoteSolrException.class, () -> runHealthcheckWithClient(solrClient)); assertTrue(e.getMessage(), e.getMessage().contains("Host Unavailable")); assertEquals(SolrException.ErrorCode.SERVICE_UNAVAILABLE.code, e.code()); } finally { @@ -135,37 +135,57 @@ public void testHealthCheckHandlerSolrJ() throws IOException, SolrServerExceptio } } + /** + * Verifies that the v1 health-check response body contains {@code "status":"FAILURE"} when the + * node is absent from ZooKeeper's live-nodes set. + * + * <p>
This is a regression test for the refactoring that delegated health-check logic to {@link + * NodeHealth}: after that change, {@link SolrException} thrown by {@link NodeHealth} would escape + * {@link HealthCheckHandler#handleRequestBody} before the {@code status} field was written to the + * response, leaving callers without a machine-readable failure indicator in the body. + * + * <p>
The node's ZK session is kept alive so that only the live-nodes check fires, not the "not + * connected to ZK" check, isolating the specific code path under test. + */ @Test - public void testHealthCheckV2Api() throws Exception { - V2Response res = new V2Request.Builder("/node/health").build().process(cluster.getSolrClient()); - assertEquals(0, res.getStatus()); - assertEquals(CommonParams.OK, res.getResponse().get(CommonParams.STATUS)); - - // add a new node for the purpose of negative testing + public void testV1FailureResponseIncludesStatusField() throws Exception { JettySolrRunner newJetty = cluster.startJettySolrRunner(); try (SolrClient solrClient = getHttpSolrClient(newJetty.getBaseUrl().toString())) { + // Sanity check: the new node is initially healthy. + assertEquals(CommonParams.OK, runHealthcheckWithClient(solrClient).getNodeStatus()); - // positive check that our (new) "healthy" node works with direct http client - assertEquals( - CommonParams.OK, - new V2Request.Builder("/node/health") - .build() - .process(solrClient) - .getResponse() - .get(CommonParams.STATUS)); - - // now "break" our (new) node - newJetty.getCoreContainer().getZkController().getZkClient().close(); - - // negative check of our (new) "broken" node that we deliberately put into an unhealthy state - RemoteSolrException e = - expectThrows( - RemoteSolrException.class, - () -> { - new V2Request.Builder("/node/health").build().process(solrClient); - }); - assertTrue(e.getMessage(), e.getMessage().contains("Host Unavailable")); - assertEquals(SolrException.ErrorCode.SERVICE_UNAVAILABLE.code, e.code()); + String nodeName = newJetty.getCoreContainer().getZkController().getNodeName(); + + // Remove the node from ZooKeeper's live_nodes without closing the ZK session. + // This ensures the "ZK not connected" check passes and only the "not in live nodes" + // check fires, exercising the specific failure branch we fixed. 
+ newJetty.getCoreContainer().getZkController().removeEphemeralLiveNode(); + + // Wait for the node's own ZkStateReader to reflect the removal before querying. + newJetty + .getCoreContainer() + .getZkController() + .getZkStateReader() + .waitForLiveNodes(10, TimeUnit.SECONDS, missingLiveNode(nodeName)); + + // Use a raw HTTP request so we can inspect the full response body. + // SolrJ's HealthCheckRequest throws RemoteSolrException on non-200 responses and does + // not expose the response body, so we go below SolrJ here. + try (HttpClient httpClient = HttpClient.newHttpClient()) { + HttpResponse response = + httpClient.send( + HttpRequest.newBuilder() + .uri(URI.create(newJetty.getBaseUrl() + HEALTH_CHECK_HANDLER_PATH)) + .build(), + HttpResponse.BodyHandlers.ofString()); + + assertEquals("Expected 503 SERVICE_UNAVAILABLE", 503, response.statusCode()); + assertTrue( + "v1 error response body must contain status=FAILURE so body-inspecting clients get" + + " a clear signal; body was: " + + response.body(), + response.body().contains("FAILURE")); + } } finally { newJetty.stop(); } @@ -193,7 +213,7 @@ public void testFindUnhealthyCores() { mockCD("invalid", "invalid", "slice1", false, Replica.State.RECOVERING), // A core for a slice that is not an active slice will not fail the check mockCD("collection1", "invalid_replica1", "invalid", true, Replica.State.DOWN)); - long unhealthy1 = HealthCheckHandler.findUnhealthyCores(node1Cores, clusterState); + long unhealthy1 = NodeHealth.findUnhealthyCores(node1Cores, clusterState); assertEquals(2, unhealthy1); // Node 2 @@ -203,7 +223,7 @@ public void testFindUnhealthyCores() { mockCD("collection1", "slice1_replica4", "slice1", true, Replica.State.DOWN), mockCD( "collection2", "slice1_replica1", "slice1", true, Replica.State.RECOVERY_FAILED)); - long unhealthy2 = HealthCheckHandler.findUnhealthyCores(node2Cores, clusterState); + long unhealthy2 = NodeHealth.findUnhealthyCores(node2Cores, clusterState); assertEquals(1, 
unhealthy2); } } diff --git a/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthTest.java b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthTest.java new file mode 100644 index 000000000000..7d82a8a178f2 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthTest.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.admin.api; + +import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.FAILURE; +import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK; + +import java.util.concurrent.TimeUnit; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.NodeApi; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.embedded.JettySolrRunner; +import org.apache.solr.util.SolrJettyTestRule; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; + +public class NodeHealthTest extends SolrCloudTestCase { + + /** + * A standalone (non-ZooKeeper) Jetty instance used by the legacy-mode tests. The + * {@code @ClassRule} ensures it is shut down after all tests in this class complete. 
+ */ + @ClassRule public static SolrJettyTestRule standaloneJetty = new SolrJettyTestRule(); + + @BeforeClass + public static void setupCluster() throws Exception { + configureCluster(1).addConfig("conf", configset("cloud-minimal")).configure(); + standaloneJetty.startSolr(createTempDir()); + + CollectionAdminRequest.createCollection(DEFAULT_TEST_COLLECTION_NAME, "conf", 1, 1) + .process(cluster.getSolrClient()); + } + + @Test + public void testCloudMode_HealthyNodeReturnsOkStatus() throws Exception { + final var request = new NodeApi.Healthcheck(); + final var response = request.process(cluster.getSolrClient()); + + assertNotNull(response); + assertEquals(OK, response.status); + assertNull("Expected no error on a healthy node", response.error); + } + + @Test + public void testCloudMode_RequireHealthyCoresReturnOkWhenAllCoresHealthy() throws Exception { + final var request = new NodeApi.Healthcheck(); + request.setRequireHealthyCores(true); + final var response = request.process(cluster.getSolrClient()); + + assertNotNull(response); + assertEquals(OK, response.status); + assertEquals("All cores are healthy", response.message); + } + + @Test + public void testCloudMode_UnhealthyWhenZkClientClosed() throws Exception { + // Use a fresh node so closing its ZK client does not break the primary cluster node + JettySolrRunner newJetty = cluster.startJettySolrRunner(); + try (SolrClient nodeClient = newJetty.newClient()) { + // Sanity check: the new node should start out healthy + assertEquals(OK, new NodeApi.Healthcheck().process(nodeClient).status); + + // Break the ZK connection to put the node into an unhealthy state + newJetty.getCoreContainer().getZkController().getZkClient().close(); + + SolrException e = + assertThrows(SolrException.class, () -> new NodeApi.Healthcheck().process(nodeClient)); + assertEquals(ErrorCode.SERVICE_UNAVAILABLE.code, e.code()); + assertTrue( + "Expected 'Host Unavailable' in exception message", + e.getMessage().contains("Host 
Unavailable")); + } finally { + newJetty.stop(); + } + } + + /** + * Verifies that when the node's name is absent from ZooKeeper's live-nodes set (while the ZK + * session itself is still connected), the v2 health-check API throws a {@code + * SERVICE_UNAVAILABLE} exception with a message identifying the live-nodes check as the cause. + * + * <p>
This specifically exercises the code path at NodeHealth#getClusterState() that checks {@code + * clusterState.getLiveNodes().contains(nodeName)}. + */ + @Test + public void testCloudMode_NotInLiveNodes_ThrowsServiceUnavailable() throws Exception { + JettySolrRunner newJetty = cluster.startJettySolrRunner(); + try (SolrClient nodeClient = newJetty.newClient()) { + // Sanity check: the new node should start out healthy + assertEquals(OK, new NodeApi.Healthcheck().process(nodeClient).status); + + String nodeName = newJetty.getCoreContainer().getZkController().getNodeName(); + + // Remove the node from ZooKeeper's live_nodes without closing the ZK session. + // This ensures the "ZK not connected" check passes and only the "not in live nodes" + // check fires, isolating the code path under test. + newJetty.getCoreContainer().getZkController().removeEphemeralLiveNode(); + + // Wait for the node's own ZkStateReader to reflect the removal before querying it. + newJetty + .getCoreContainer() + .getZkController() + .getZkStateReader() + .waitForLiveNodes(10, TimeUnit.SECONDS, missingLiveNode(nodeName)); + + SolrException e = + assertThrows(SolrException.class, () -> new NodeApi.Healthcheck().process(nodeClient)); + assertEquals(ErrorCode.SERVICE_UNAVAILABLE.code, e.code()); + assertTrue( + "Expected 'Not in live nodes' in exception message", + e.getMessage().contains("Not in live nodes")); + } finally { + newJetty.stop(); + } + } + + @Test + public void testLegacyMode_WithoutMaxGenerationLagReturnsOk() throws Exception { + + final var request = new NodeApi.Healthcheck(); + final var response = request.process(standaloneJetty.getAdminClient()); + + assertNotNull(response); + assertEquals(OK, response.status); + assertTrue( + "Expected message about maxGenerationLag not being specified", + response.message.contains("maxGenerationLag isn't specified")); + } + + @Test + public void testLegacyMode_WithNegativeMaxGenerationLagReturnsFailure() { + // maxGenerationLag is a v1-only 
parameter: NodeHealth.healthcheck() (v2) hardcodes it to + // null and never forwards it from request params. NodeApi.Healthcheck therefore cannot be used + // to exercise this code path, so we call the JAX-RS implementation directly. + // FIXME: Interesting! Is the missing maxGenerationLag support a gap in the v2 API? + final var response = + new NodeHealth(standaloneJetty.getCoreContainer()).checkNodeHealth(null, -1); + + assertNotNull(response); + assertEquals(FAILURE, response.status); + assertTrue( + "Expected message about invalid maxGenerationLag", + response.message.contains("Invalid value of maxGenerationLag")); + } +} diff --git a/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java b/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java index c0ca1bff0c00..c8000d4bbf2e 100644 --- a/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java +++ b/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java @@ -35,7 +35,6 @@ import org.apache.solr.common.util.ContentStreamBase; import org.apache.solr.handler.RequestHandlerBase; import org.apache.solr.handler.admin.CoreAdminHandler; -import org.apache.solr.handler.admin.HealthCheckHandler; import org.apache.solr.handler.admin.InfoHandler; import org.apache.solr.handler.admin.LoggingHandler; import org.apache.solr.handler.admin.PropertiesRequestHandler; @@ -58,7 +57,6 @@ public class V2NodeAPIMappingTest extends SolrTestCaseJ4 { private SystemInfoHandler mockSystemInfoHandler; private LoggingHandler mockLoggingHandler; private PropertiesRequestHandler mockPropertiesHandler; - private HealthCheckHandler mockHealthCheckHandler; private ThreadDumpHandler mockThreadDumpHandler; @BeforeClass @@ -73,14 +71,12 @@ public void setupApiBag() { mockSystemInfoHandler = mock(SystemInfoHandler.class); mockLoggingHandler = mock(LoggingHandler.class); mockPropertiesHandler = mock(PropertiesRequestHandler.class); - mockHealthCheckHandler = mock(HealthCheckHandler.class);
mockThreadDumpHandler = mock(ThreadDumpHandler.class); queryRequestCaptor = ArgumentCaptor.forClass(SolrQueryRequest.class); when(infoHandler.getSystemInfoHandler()).thenReturn(mockSystemInfoHandler); when(infoHandler.getLoggingHandler()).thenReturn(mockLoggingHandler); when(infoHandler.getPropertiesHandler()).thenReturn(mockPropertiesHandler); - when(infoHandler.getHealthCheckHandler()).thenReturn(mockHealthCheckHandler); when(infoHandler.getThreadDumpHandler()).thenReturn(mockThreadDumpHandler); apiBag = new ApiBag(false); @@ -156,19 +152,6 @@ public void testSystemInfoApiAllProperties() throws Exception { assertEquals("anyParamValue", v1Params.get("anyParamName")); } - @Test - public void testHealthCheckApiAllProperties() throws Exception { - final ModifiableSolrParams solrParams = new ModifiableSolrParams(); - solrParams.add("requireHealthyCores", "true"); - solrParams.add("maxGenerationLag", "123"); - final SolrParams v1Params = - captureConvertedHealthCheckV1Params("/node/health", "GET", solrParams); - - // All parameters are passed through to v1 API as-is. 
- assertEquals(true, v1Params.getBool("requireHealthyCores")); - assertEquals(123, v1Params.getPrimitiveInt("maxGenerationLag")); - } - private SolrParams captureConvertedCoreV1Params(String path, String method, String v2RequestBody) throws Exception { return doCaptureParams( @@ -185,11 +168,6 @@ private SolrParams captureConvertedPropertiesV1Params( return doCaptureParams(path, method, inputParams, null, mockPropertiesHandler); } - private SolrParams captureConvertedHealthCheckV1Params( - String path, String method, SolrParams inputParams) throws Exception { - return doCaptureParams(path, method, inputParams, null, mockHealthCheckHandler); - } - private SolrParams captureConvertedThreadDumpV1Params( String path, String method, SolrParams inputParams) throws Exception { return doCaptureParams(path, method, inputParams, null, mockThreadDumpHandler); @@ -233,6 +211,5 @@ private static void registerAllNodeApis( apiBag.registerObject(new NodePropertiesAPI(infoHandler.getPropertiesHandler())); apiBag.registerObject(new NodeThreadsAPI(infoHandler.getThreadDumpHandler())); apiBag.registerObject(new NodeSystemInfoAPI(infoHandler.getSystemInfoHandler())); - apiBag.registerObject(new NodeHealthAPI(infoHandler.getHealthCheckHandler())); } } diff --git a/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc b/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc index 16b2f691281e..a840e903b07d 100644 --- a/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc +++ b/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc @@ -47,7 +47,9 @@ Health:: Report the health of the node (_available only in SolrCloud mode_) |API Endpoints |Class & Javadocs |Paramset |v1: `solr/admin/info/health` -v2: `api/node/health` |{solr-javadocs}/core/org/apache/solr/handler/admin/HealthCheckHandler.html[HealthCheckHandler] | +v2: `api/node/health` |v1: 
{solr-javadocs}/core/org/apache/solr/handler/admin/HealthCheckHandler.html[HealthCheckHandler] + +v2: {solr-javadocs}/core/org/apache/solr/handler/admin/api/NodeHealth.html[NodeHealth] | |=== + This endpoint also accepts additional request parameters. diff --git a/solr/solrj/src/java/org/apache/solr/common/util/Utils.java b/solr/solrj/src/java/org/apache/solr/common/util/Utils.java index 1e2f69da9727..435fea18d1a2 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/Utils.java +++ b/solr/solrj/src/java/org/apache/solr/common/util/Utils.java @@ -845,6 +845,12 @@ public static void reflectWrite(MapWriter.EntryWriter ew, Object o) { * @return a serializable version of the object */ public static Object getReflectWriter(Object o) { + // Enums serialized as their declared name so that javabin/NamedList consumers + // (e.g. HealthCheckHandlerTest comparing against CommonParams.OK == "OK") see + // a plain string rather than "pkg.EnumClass:NAME". + if (o instanceof Enum) { + return ((Enum) o).name(); + } List fieldWriters = null; try { fieldWriters =