From 7a0c287bc5ecfc8684abb9bbfc9583170524dcea Mon Sep 17 00:00:00 2001 From: wankai123 Date: Mon, 30 Jun 2025 14:02:05 +0800 Subject: [PATCH 1/3] * OAP gRPC-Client support `Health Check`. * [Break Change] `Health Check` make response 1 represents healthy, 0 represents unhealthy. --- docs/en/changes/changes.md | 2 + docs/en/setup/backend/backend-health-check.md | 4 +- .../server/core/query/type/HealthStatus.java | 2 +- .../provider/HealthCheckerHttpService.java | 2 +- .../provider/HealthCheckerProvider.java | 24 +++++---- .../library/client/grpc/GRPCClient.java | 54 +++++++++++++++++-- .../src/main/resources/query-protocol | 2 +- .../telemetry/api/HealthCheckMetrics.java | 6 +-- .../server/telemetry/api/MetricsCreator.java | 2 +- 9 files changed, 75 insertions(+), 23 deletions(-) diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md index 6c69b4db643d..03d0b2080f07 100644 --- a/docs/en/changes/changes.md +++ b/docs/en/changes/changes.md @@ -36,6 +36,8 @@ * chore: add `toString` implementation for `StorageID`. * chore: add a warning log when connecting to ES takes too long. * Fix the query time range in the metadata API. +* OAP gRPC-Client support `Health Check`. +* [Break Change] `Health Check` make response 1 represents healthy, 0 represents unhealthy. #### UI diff --git a/docs/en/setup/backend/backend-health-check.md b/docs/en/setup/backend/backend-health-check.md index c717851ca16d..a8201ab5a0c4 100644 --- a/docs/en/setup/backend/backend-health-check.md +++ b/docs/en/setup/backend/backend-health-check.md @@ -36,7 +36,7 @@ If the OAP server is healthy, the response should be { "data": { "checkHealth": { - "score": 0, + "score": 1, "details": "" } } @@ -49,7 +49,7 @@ If some modules are unhealthy (e.g. 
storage H2 is down), then the result may loo { "data": { "checkHealth": { - "score": 1, + "score": 0, "details": "storage_h2," } } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/HealthStatus.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/HealthStatus.java index ac83e8bfb0b7..ab073d4d5863 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/HealthStatus.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/HealthStatus.java @@ -26,7 +26,7 @@ @Setter @ToString public class HealthStatus { - // score == 0 means healthy, otherwise it's unhealthy. + // score == 1 means healthy, otherwise it's unhealthy. private int score; private String details; } diff --git a/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerHttpService.java b/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerHttpService.java index 356d76a79efe..7b520fcaa5c9 100644 --- a/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerHttpService.java +++ b/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerHttpService.java @@ -37,7 +37,7 @@ public HttpResponse healthcheck(ServiceRequestContext ctx, HttpRequest req) thro final var status = healthQueryService.checkHealth(); log.info("Health status: {}", status); - if (status.getScore() == 0) { + if (status.getScore() == 1) { return HttpResponse.of(HttpStatus.OK); } return HttpResponse.of(HttpStatus.SERVICE_UNAVAILABLE); diff --git a/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerProvider.java 
b/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerProvider.java index 29760f02ac27..2051b5b0df3c 100644 --- a/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerProvider.java +++ b/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerProvider.java @@ -25,8 +25,8 @@ import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; -import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.CoreModule; import org.apache.skywalking.oap.server.core.server.HTTPHandlerRegister; @@ -100,16 +100,18 @@ public void onInitialized(final HealthCheckerConfig initialized) { @Override public void notifyAfterCompleted() throws ServiceNotProvidedException, ModuleStartException { ses.scheduleAtFixedRate(() -> { StringBuilder unhealthyModules = new StringBuilder(); - score.set(Stream.ofAll(collector.collect()) - .flatMap(metricFamily -> metricFamily.samples) - .filter(sample -> metricsCreator.isHealthCheckerMetrics(sample.name)) - .peek(sample -> { - if (sample.value > 0.0) { - unhealthyModules.append(metricsCreator.extractModuleName(sample.name)).append(","); - } - }) - .map(sample -> sample.value) - .collect(Collectors.summingDouble(Double::doubleValue))); + AtomicBoolean hasUnhealthyModule = new AtomicBoolean(false); + Stream.ofAll(collector.collect()) + .flatMap(metricFamily -> metricFamily.samples) + .filter(sample -> metricsCreator.isHealthCheckerMetrics(sample.name)) + .forEach(sample -> { + if (sample.value < 1) { + unhealthyModules.append(metricsCreator.extractModuleName(sample.name)).append(","); + hasUnhealthyModule.set(true); + } + }); + + 
score.set(hasUnhealthyModule.get() ? 0 : 1); details.set(unhealthyModules.toString()); }, 2, config.getCheckIntervalSeconds(), TimeUnit.SECONDS); diff --git a/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java b/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java index c11f590e2f25..3799db6bd971 100644 --- a/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java +++ b/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java @@ -18,16 +18,23 @@ package org.apache.skywalking.oap.server.library.client.grpc; +import io.grpc.ConnectivityState; import io.grpc.ManagedChannel; import io.grpc.ManagedChannelBuilder; import io.grpc.netty.NettyChannelBuilder; import io.netty.handler.ssl.SslContext; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; import lombok.Getter; import org.apache.skywalking.oap.server.library.client.Client; +import org.apache.skywalking.oap.server.library.client.healthcheck.DelegatedHealthChecker; +import org.apache.skywalking.oap.server.library.client.healthcheck.HealthCheckable; +import org.apache.skywalking.oap.server.library.util.HealthChecker; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GRPCClient implements Client { +public class GRPCClient implements Client, HealthCheckable { private static final Logger LOGGER = LoggerFactory.getLogger(GRPCClient.class); @@ -41,6 +48,10 @@ public class GRPCClient implements Client { private ManagedChannel channel; + private final DelegatedHealthChecker healthChecker = new DelegatedHealthChecker(); + + private boolean enableHealthCheck = false; + public GRPCClient(String host, int port) { this.host = host; this.port = port; @@ 
-55,9 +66,12 @@ public GRPCClient(String host, int port, final SslContext sslContext) { public void connect() { if (sslContext == null) { channel = ManagedChannelBuilder.forAddress(host, port).usePlaintext().build(); - return; + } else { + channel = NettyChannelBuilder.forAddress(host, port).sslContext(sslContext).build(); + } + if (enableHealthCheck) { + checkHealth(); } - channel = NettyChannelBuilder.forAddress(host, port).sslContext(sslContext).build(); } @Override @@ -77,4 +91,38 @@ public ManagedChannel getChannel() { public String toString() { return host + ":" + port; } + + /** + * Must register a HealthChecker before calling connect() if you want to enable health check. + * @param healthChecker HealthChecker to be registered. + */ + @Override + public void registerChecker(final HealthChecker healthChecker) { + this.healthChecker.register(healthChecker); + this.enableHealthCheck = true; + } + + private void checkHealth() { + ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor(); + scheduler.scheduleAtFixedRate(() -> { + ConnectivityState currentState = channel.getState(true); // true means try to connect + handleStateChange(currentState); + }, 5, 10, TimeUnit.SECONDS); + } + + private void handleStateChange(ConnectivityState newState) { + switch (newState) { + case READY: + case IDLE: + case CONNECTING: + this.healthChecker.health(); + break; + case TRANSIENT_FAILURE: + this.healthChecker.unHealth("gRPC connection failed, will retry. Host: " + host + ", Port: " + port); + break; + case SHUTDOWN: + this.healthChecker.unHealth("gRPC channel is shutting down. 
Host: " + host + ", Port: " + port); + break; + } + } } diff --git a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol index e9d4f81bb2bd..021c0ad768f8 160000 --- a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol +++ b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol @@ -1 +1 @@ -Subproject commit e9d4f81bb2bde6eb92bf7595c1257cc8d60470f5 +Subproject commit 021c0ad768f8f6f64dceead9d79a3dd7e9ad8dd9 diff --git a/oap-server/server-telemetry/telemetry-api/src/main/java/org/apache/skywalking/oap/server/telemetry/api/HealthCheckMetrics.java b/oap-server/server-telemetry/telemetry-api/src/main/java/org/apache/skywalking/oap/server/telemetry/api/HealthCheckMetrics.java index cc2ea679b2f6..4ef958891b5c 100644 --- a/oap-server/server-telemetry/telemetry-api/src/main/java/org/apache/skywalking/oap/server/telemetry/api/HealthCheckMetrics.java +++ b/oap-server/server-telemetry/telemetry-api/src/main/java/org/apache/skywalking/oap/server/telemetry/api/HealthCheckMetrics.java @@ -36,18 +36,18 @@ public HealthCheckMetrics(GaugeMetrics metrics) { @Override public void health() { - metrics.setValue(0); + metrics.setValue(1); } @Override public void unHealth(Throwable t) { log.error("Health check fails", t); - metrics.setValue(1); + metrics.setValue(0); } @Override public void unHealth(String reason) { log.warn("Health check fails. 
reason: {}", reason); - metrics.setValue(1); + metrics.setValue(0); } } diff --git a/oap-server/server-telemetry/telemetry-api/src/main/java/org/apache/skywalking/oap/server/telemetry/api/MetricsCreator.java b/oap-server/server-telemetry/telemetry-api/src/main/java/org/apache/skywalking/oap/server/telemetry/api/MetricsCreator.java index 5bd89d26a4b3..476bf3f0de15 100644 --- a/oap-server/server-telemetry/telemetry-api/src/main/java/org/apache/skywalking/oap/server/telemetry/api/MetricsCreator.java +++ b/oap-server/server-telemetry/telemetry-api/src/main/java/org/apache/skywalking/oap/server/telemetry/api/MetricsCreator.java @@ -53,7 +53,7 @@ HistogramMetrics createHistogramMetric(String name, String tips, MetricsTag.Keys default HealthCheckMetrics createHealthCheckerGauge(String name, MetricsTag.Keys tagKeys, MetricsTag.Values tagValues) { Preconditions.checkArgument(!Strings.isNullOrEmpty(name), "Require non-null or empty metric name"); return new HealthCheckMetrics(createGauge(Strings.lenientFormat("%s%s", HEALTH_METRIC_PREFIX, name), - Strings.lenientFormat("%s health check", name), + Strings.lenientFormat("%s health check. 
1 health, 0 not health, -1 unknown", name), tagKeys, tagValues)); } From 118f4982b9ba09510c9e0f1813d0b9ebf53c4cca Mon Sep 17 00:00:00 2001 From: wankai123 Date: Mon, 30 Jun 2025 16:32:20 +0800 Subject: [PATCH 2/3] add create and shutdown logic --- .../library/client/grpc/GRPCClient.java | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java b/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java index 3799db6bd971..935a9d855ecb 100644 --- a/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java +++ b/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java @@ -50,6 +50,8 @@ public class GRPCClient implements Client, HealthCheckable { private final DelegatedHealthChecker healthChecker = new DelegatedHealthChecker(); + private ScheduledExecutorService healthCheckExecutor; + private boolean enableHealthCheck = false; public GRPCClient(String host, int port) { @@ -80,6 +82,12 @@ public void shutdown() { channel.shutdownNow(); } catch (Throwable t) { LOGGER.error(t.getMessage(), t); + } finally { + if (healthCheckExecutor != null) { + healthCheckExecutor.shutdownNow(); + healthChecker.unHealth("gRPC channel is shutting down. Host: " + host + ", Port: " + port); + healthCheckExecutor = null; + } } } @@ -94,6 +102,10 @@ public String toString() { /** * Must register a HealthChecker before calling connect() if you want to enable health check. + * If the channel is shutdown by client side, the health check will not be performed. 
+ * Note: If you register a `org.apache.skywalking.oap.server.telemetry.api.HealthCheckMetrics` here + * or the metric name start with `org.apache.skywalking.oap.server.telemetry.api.MetricsCreator.HEALTH_METRIC_PREFIX`, + * this healthy status will be included in the whole OAP health evaluate. * @param healthChecker HealthChecker to be registered. */ @Override @@ -103,20 +115,26 @@ public void registerChecker(final HealthChecker healthChecker) { } private void checkHealth() { - ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor(); - scheduler.scheduleAtFixedRate(() -> { - ConnectivityState currentState = channel.getState(true); // true means try to connect - handleStateChange(currentState); - }, 5, 10, TimeUnit.SECONDS); + if (healthCheckExecutor == null) { + healthCheckExecutor = Executors.newSingleThreadScheduledExecutor(); + healthCheckExecutor.scheduleAtFixedRate( + () -> { + ConnectivityState currentState = channel.getState(true); // true means try to connect + handleStateChange(currentState); + }, 5, 10, TimeUnit.SECONDS + ); + } } private void handleStateChange(ConnectivityState newState) { switch (newState) { case READY: case IDLE: - case CONNECTING: this.healthChecker.health(); break; + case CONNECTING: + this.healthChecker.unHealth("gRPC connecting, waiting for ready. Host: " + host + ", Port: " + port); + break; case TRANSIENT_FAILURE: this.healthChecker.unHealth("gRPC connection failed, will retry. 
Host: " + host + ", Port: " + port); break; From 21d4269346c7c9aabc9e04e12cb7c752d18a6969 Mon Sep 17 00:00:00 2001 From: wankai123 Date: Mon, 30 Jun 2025 17:24:35 +0800 Subject: [PATCH 3/3] comment --- .../skywalking/oap/server/library/client/grpc/GRPCClient.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java b/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java index 935a9d855ecb..ce54de19a516 100644 --- a/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java +++ b/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java @@ -104,7 +104,7 @@ public String toString() { * Must register a HealthChecker before calling connect() if you want to enable health check. * If the channel is shutdown by client side, the health check will not be performed. * Note: If you register a `org.apache.skywalking.oap.server.telemetry.api.HealthCheckMetrics` here - * or the metric name start with `org.apache.skywalking.oap.server.telemetry.api.MetricsCreator.HEALTH_METRIC_PREFIX`, + * and the metric name starts with `org.apache.skywalking.oap.server.telemetry.api.MetricsCreator.HEALTH_METRIC_PREFIX`, * this healthy status will be included in the whole OAP health evaluate. * @param healthChecker HealthChecker to be registered. */