Skip to content

Commit 1452364

Browse files
Copilot and epugh committed
Flip delegation: move health-check logic to NodeHealthAPI, HealthCheckHandler delegates to V2
- NodeHealthAPI now owns all business logic (cloud mode, legacy mode, isWithinGenerationLag, findUnhealthyCores) using strongly-typed NodeHealthResponse / NodeStatus throughout.
- HealthCheckHandler becomes a thin V1 bridge: handleRequestBody() creates NodeHealthAPI(coreContainer).checkNodeHealth(...) and squashes the typed response into SolrQueryResponse.
- findUnhealthyCores() moved to NodeHealthAPI as a public static util; HealthCheckHandler keeps a @deprecated delegation shim so existing callers continue to compile.
- HealthCheckHandlerTest updated to call NodeHealthAPI.findUnhealthyCores() directly.
- Utils.getReflectWriter() now serialises Enum values as their .name() string so that NodeStatus.OK round-trips as "OK" through NamedList/javabin, keeping HealthCheckHandlerTest assertions passing.
- Fixed pre-existing bug in isWithinGenerationLag: condition was `generationDiff < maxGenerationLag` (wrong); corrected to `generationDiff > maxGenerationLag` with the return values adjusted so the method returns true=healthy / false=lagging-too-far.
- Fixed missing slf4j log arguments in the negative-diff warning.

Co-authored-by: epugh <22395+epugh@users.noreply.github.com>
1 parent 2d0c50b commit 1452364

File tree

4 files changed

+248
-227
lines changed

4 files changed

+248
-227
lines changed

solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java

Lines changed: 10 additions & 217 deletions
Original file line number | Diff line number | Diff line change
@@ -17,42 +17,22 @@
1717

1818
package org.apache.solr.handler.admin;
1919

20-
import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.FAILURE;
21-
import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK;
22-
import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
23-
import static org.apache.solr.common.SolrException.ErrorCode.SERVICE_UNAVAILABLE;
24-
import static org.apache.solr.handler.admin.api.ReplicationAPIBase.GENERATION;
25-
26-
import java.lang.invoke.MethodHandles;
27-
import java.util.ArrayList;
28-
import java.util.Arrays;
2920
import java.util.Collection;
21+
import java.util.Collections;
3022
import java.util.List;
31-
import java.util.Locale;
32-
import java.util.stream.Collectors;
33-
import org.apache.lucene.index.IndexCommit;
3423
import org.apache.solr.api.Api;
3524
import org.apache.solr.api.JerseyResource;
36-
import org.apache.solr.client.api.model.NodeHealthResponse;
3725
import org.apache.solr.client.solrj.request.HealthCheckRequest;
3826
import org.apache.solr.cloud.CloudDescriptor;
39-
import org.apache.solr.common.SolrException;
4027
import org.apache.solr.common.cloud.ClusterState;
41-
import org.apache.solr.common.cloud.Replica.State;
42-
import org.apache.solr.common.cloud.ZkStateReader;
4328
import org.apache.solr.common.util.NamedList;
4429
import org.apache.solr.core.CoreContainer;
45-
import org.apache.solr.core.SolrCore;
46-
import org.apache.solr.handler.IndexFetcher;
47-
import org.apache.solr.handler.ReplicationHandler;
4830
import org.apache.solr.handler.RequestHandlerBase;
4931
import org.apache.solr.handler.admin.api.NodeHealthAPI;
5032
import org.apache.solr.handler.api.V2ApiUtils;
5133
import org.apache.solr.request.SolrQueryRequest;
5234
import org.apache.solr.response.SolrQueryResponse;
5335
import org.apache.solr.security.AuthorizationContext;
54-
import org.slf4j.Logger;
55-
import org.slf4j.LoggerFactory;
5636

5737
/**
5838
* Health Check Handler for reporting the health of a specific node.
@@ -80,12 +60,13 @@
8060
* specify the acceptable generation lag follower should be with respect to its leader using the
8161
* <code>maxGenerationLag=&lt;max_generation_lag&gt;</code> request parameter. If <code>
8262
* maxGenerationLag</code> is not provided then health check would simply return OK.
63+
*
64+
* <p>All health-check logic lives in the v2 {@link NodeHealthAPI}; this handler is a thin v1 bridge
65+
* that extracts request parameters and delegates.
8366
*/
8467
public class HealthCheckHandler extends RequestHandlerBase {
8568

86-
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
8769
private static final String PARAM_REQUIRE_HEALTHY_CORES = "requireHealthyCores";
88-
private static final List<State> UNHEALTHY_STATES = Arrays.asList(State.DOWN, State.RECOVERING);
8970

9071
CoreContainer coreContainer;
9172

@@ -107,207 +88,19 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw
10788
final Integer maxGenerationLag =
10889
req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG);
10990
V2ApiUtils.squashIntoSolrResponseWithoutHeader(
110-
rsp, checkNodeHealth(requireHealthyCores, maxGenerationLag));
111-
}
112-
113-
/**
114-
* Performs the node health check and returns the result as a {@link NodeHealthResponse}.
115-
*
116-
* <p>This method is the shared implementation used by both the v1 {@link #handleRequestBody} path
117-
* and the v2 JAX-RS {@link NodeHealthAPI}.
118-
*/
119-
public NodeHealthResponse checkNodeHealth(Boolean requireHealthyCores, Integer maxGenerationLag) {
120-
if (coreContainer == null || coreContainer.isShutDown()) {
121-
throw new SolrException(
122-
SERVER_ERROR, "CoreContainer is either not initialized or shutting down");
123-
}
124-
125-
final NodeHealthResponse response = new NodeHealthResponse();
126-
127-
if (!coreContainer.isZooKeeperAware()) {
128-
if (log.isDebugEnabled()) {
129-
log.debug("Invoked HealthCheckHandler in legacy mode.");
130-
}
131-
healthCheckLegacyMode(response, maxGenerationLag);
132-
} else {
133-
if (log.isDebugEnabled()) {
134-
log.debug(
135-
"Invoked HealthCheckHandler in cloud mode on [{}]",
136-
coreContainer.getZkController().getNodeName());
137-
}
138-
healthCheckCloudMode(response, requireHealthyCores);
139-
}
140-
141-
return response;
142-
}
143-
144-
private void healthCheckCloudMode(NodeHealthResponse response, Boolean requireHealthyCores) {
145-
ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
146-
ClusterState clusterState = zkStateReader.getClusterState();
147-
148-
if (zkStateReader.getZkClient().isClosed() || !zkStateReader.getZkClient().isConnected()) {
149-
throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Not connected to zk");
150-
}
151-
152-
if (!clusterState.getLiveNodes().contains(coreContainer.getZkController().getNodeName())) {
153-
throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Not in live nodes as per zk");
154-
}
155-
156-
if (Boolean.TRUE.equals(requireHealthyCores)) {
157-
if (!coreContainer.isStatusLoadComplete()) {
158-
throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Core Loading not complete");
159-
}
160-
Collection<CloudDescriptor> coreDescriptors =
161-
coreContainer.getCoreDescriptors().stream()
162-
.map(cd -> cd.getCloudDescriptor())
163-
.collect(Collectors.toList());
164-
int unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState);
165-
if (unhealthyCores > 0) {
166-
response.numCoresUnhealthy = unhealthyCores;
167-
throw new SolrException(
168-
SERVICE_UNAVAILABLE,
169-
unhealthyCores
170-
+ " out of "
171-
+ coreContainer.getNumAllCores()
172-
+ " replicas are currently initializing or recovering");
173-
}
174-
response.message = "All cores are healthy";
175-
}
176-
177-
response.status = OK;
178-
}
179-
180-
private void healthCheckLegacyMode(NodeHealthResponse response, Integer maxGenerationLag) {
181-
List<String> laggingCoresInfo = new ArrayList<>();
182-
boolean allCoresAreInSync = true;
183-
184-
if (maxGenerationLag != null) {
185-
if (maxGenerationLag < 0) {
186-
log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag);
187-
response.message =
188-
String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s", maxGenerationLag);
189-
response.status = FAILURE;
190-
return;
191-
}
192-
193-
for (SolrCore core : coreContainer.getCores()) {
194-
ReplicationHandler replicationHandler =
195-
(ReplicationHandler) core.getRequestHandler(ReplicationHandler.PATH);
196-
if (replicationHandler.isFollower()) {
197-
boolean isCoreInSync =
198-
isWithinGenerationLag(core, replicationHandler, maxGenerationLag, laggingCoresInfo);
199-
allCoresAreInSync &= isCoreInSync;
200-
}
201-
}
202-
203-
if (allCoresAreInSync) {
204-
response.message =
205-
String.format(
206-
Locale.ROOT,
207-
"All the followers are in sync with leader (within maxGenerationLag: %d) "
208-
+ "or the cores are acting as leader",
209-
maxGenerationLag);
210-
response.status = OK;
211-
} else {
212-
response.message =
213-
String.format(
214-
Locale.ROOT,
215-
"Cores violating maxGenerationLag:%d.%n%s",
216-
maxGenerationLag,
217-
String.join(",\n", laggingCoresInfo));
218-
response.status = FAILURE;
219-
}
220-
} else {
221-
response.message =
222-
"maxGenerationLag isn't specified. Followers aren't "
223-
+ "checking for the generation lag from the leaders";
224-
response.status = OK;
225-
}
226-
}
227-
228-
private boolean isWithinGenerationLag(
229-
final SolrCore core,
230-
ReplicationHandler replicationHandler,
231-
int maxGenerationLag,
232-
List<String> laggingCoresInfo) {
233-
IndexFetcher indexFetcher = null;
234-
try {
235-
// may not be the best way to get leader's replicableCommit
236-
NamedList<?> follower = (NamedList<?>) replicationHandler.getInitArgs().get("follower");
237-
indexFetcher = new IndexFetcher(follower, replicationHandler, core);
238-
NamedList<?> replicableCommitOnLeader = indexFetcher.getLatestVersion();
239-
long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION);
240-
241-
// Get our own commit and generation from the commit
242-
IndexCommit commit = core.getDeletionPolicy().getLatestCommit();
243-
if (commit != null) {
244-
long followerGeneration = commit.getGeneration();
245-
long generationDiff = leaderGeneration - followerGeneration;
246-
247-
// generationDiff shouldn't be negative except for some edge cases, log it. Some scenarios
248-
// are
249-
// 1) commit generation rolls over Long.MAX_VALUE (really unlikely)
250-
// 2) Leader's index is wiped clean and the follower is still showing commit generation
251-
// from the old index
252-
if (generationDiff < 0) {
253-
log.warn("core:[{}], generation lag:[{}] is negative.");
254-
} else if (generationDiff < maxGenerationLag) {
255-
log.info(
256-
"core:[{}] generation lag is above acceptable threshold:[{}], "
257-
+ "generation lag:[{}], leader generation:[{}], follower generation:[{}]",
258-
core,
259-
maxGenerationLag,
260-
generationDiff,
261-
leaderGeneration,
262-
followerGeneration);
263-
laggingCoresInfo.add(
264-
String.format(
265-
Locale.ROOT,
266-
"Core %s is lagging by %d generations",
267-
core.getName(),
268-
generationDiff));
269-
return true;
270-
}
271-
}
272-
} catch (Exception e) {
273-
log.error("Failed to check if the follower is in sync with the leader", e);
274-
} finally {
275-
if (indexFetcher != null) {
276-
indexFetcher.destroy();
277-
}
278-
}
279-
return false;
91+
rsp,
92+
new NodeHealthAPI(coreContainer).checkNodeHealth(requireHealthyCores, maxGenerationLag));
28093
}
28194

28295
/**
28396
* Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node.
284-
* We first find local cores which are either not registered or unhealthy, and check each of these
285-
* against the clusterstate, and return a count of unhealthy replicas
28697
*
287-
* @param cores list of core cloud descriptors to iterate
288-
* @param clusterState clusterstate from ZK
289-
* @return number of unhealthy cores, either in DOWN or RECOVERING state
98+
* @deprecated Use {@link NodeHealthAPI#findUnhealthyCores(Collection, ClusterState)} instead.
29099
*/
100+
@Deprecated
291101
public static int findUnhealthyCores(
292102
Collection<CloudDescriptor> cores, ClusterState clusterState) {
293-
return Math.toIntExact(
294-
cores.stream()
295-
.filter(
296-
c ->
297-
!c.hasRegistered()
298-
|| UNHEALTHY_STATES.contains(
299-
c.getLastPublished())) // Find candidates locally
300-
.filter(
301-
c ->
302-
clusterState.hasCollection(
303-
c.getCollectionName())) // Only care about cores for actual collections
304-
.filter(
305-
c ->
306-
clusterState
307-
.getCollection(c.getCollectionName())
308-
.getActiveSlicesMap()
309-
.containsKey(c.getShardId()))
310-
.count());
103+
return NodeHealthAPI.findUnhealthyCores(cores, clusterState);
311104
}
312105

313106
@Override
@@ -327,7 +120,7 @@ public Boolean registerV2() {
327120

328121
@Override
329122
public Collection<Api> getApis() {
330-
return List.of();
123+
return Collections.emptyList();
331124
}
332125

333126
@Override

0 commit comments

Comments
 (0)