Skip to content

Commit bf1ab2c

Browse files
committed
Improve Node health checks by batch runs
Signed-off-by: Viet Nguyen Duc <[email protected]>
1 parent b27e15c commit bf1ab2c

File tree

2 files changed

+37
-29
lines changed

2 files changed

+37
-29
lines changed

java/src/org/openqa/selenium/grid/distributor/local/LocalDistributor.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -183,6 +183,7 @@ public LocalDistributor(
183183
new LocalNodeRegistry(
184184
tracer,
185185
bus,
186+
newSessionThreadPoolSize,
186187
this.clientFactory,
187188
this.registrationSecret,
188189
this.healthcheckInterval,

java/src/org/openqa/selenium/grid/distributor/local/LocalNodeRegistry.java

Lines changed: 36 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -32,8 +32,9 @@
3232
import java.util.Map;
3333
import java.util.Optional;
3434
import java.util.Set;
35-
import java.util.concurrent.CompletableFuture;
3635
import java.util.concurrent.ConcurrentHashMap;
36+
import java.util.concurrent.ExecutorService;
37+
import java.util.concurrent.Executors;
3738
import java.util.concurrent.ScheduledExecutorService;
3839
import java.util.concurrent.TimeUnit;
3940
import java.util.concurrent.locks.Lock;
@@ -85,12 +86,15 @@ public class LocalNodeRegistry implements NodeRegistry {
8586
private final Map<NodeId, Runnable> allChecks = new ConcurrentHashMap<>();
8687
private final ReadWriteLock lock = new ReentrantReadWriteLock(/* fair */ true);
8788
private final ScheduledExecutorService nodeHealthCheckService;
89+
private final ExecutorService nodeHealthCheckExecutor;
8890
private final Duration purgeNodesInterval;
8991
private final ScheduledExecutorService purgeDeadNodesService;
92+
private final int newSessionThreadPoolSize;
9093

9194
public LocalNodeRegistry(
9295
Tracer tracer,
9396
EventBus bus,
97+
int newSessionThreadPoolSize,
9498
HttpClient.Factory clientFactory,
9599
Secret registrationSecret,
96100
Duration healthcheckInterval,
@@ -106,6 +110,7 @@ public LocalNodeRegistry(
106110
Require.nonNull("Node health check service", nodeHealthCheckService);
107111
this.purgeNodesInterval = Require.nonNull("Purge nodes interval", purgeNodesInterval);
108112
this.purgeDeadNodesService = Require.nonNull("Purge dead nodes service", purgeDeadNodesService);
113+
this.newSessionThreadPoolSize = newSessionThreadPoolSize;
109114

110115
this.model = new LocalGridModel(bus);
111116
this.nodes = new ConcurrentHashMap<>();
@@ -134,6 +139,16 @@ public LocalNodeRegistry(
134139
healthcheckInterval.toMillis(),
135140
TimeUnit.MILLISECONDS);
136141

142+
this.nodeHealthCheckExecutor =
143+
Executors.newFixedThreadPool(
144+
this.newSessionThreadPoolSize,
145+
r -> {
146+
Thread t = new Thread(r);
147+
t.setName("node-health-check-" + t.getId());
148+
t.setDaemon(true);
149+
return t;
150+
});
151+
137152
// Schedule node purging if interval is non-zero
138153
if (!this.purgeNodesInterval.isZero()) {
139154
this.purgeDeadNodesService.scheduleAtFixedRate(
@@ -309,10 +324,9 @@ public void runHealthChecks() {
309324

310325
// Large deployments: process in parallel batches with controlled concurrency
311326
int batchSize = Math.max(10, total / 10);
312-
int maxConcurrentBatches = Math.min(5, Runtime.getRuntime().availableProcessors());
313327

314328
List<List<Runnable>> batches = partition(checks, batchSize);
315-
processBatchesInParallel(batches, maxConcurrentBatches);
329+
processBatchesInParallel(batches);
316330
}
317331

318332
@Override
@@ -400,35 +414,28 @@ public boolean isReady() {
400414
}
401415
}
402416

403-
private void processBatchesInParallel(List<List<Runnable>> batches, int maxConcurrentBatches) {
417+
private void processBatchesInParallel(List<List<Runnable>> batches) {
404418
if (batches.isEmpty()) {
405419
return;
406420
}
407-
List<CompletableFuture<Void>> inFlight = new ArrayList<>();
408-
for (List<Runnable> batch : batches) {
409-
CompletableFuture<Void> fut =
410-
CompletableFuture.runAsync(
411-
() ->
412-
batch.parallelStream()
413-
.forEach(
414-
r -> {
415-
try {
416-
r.run();
417-
} catch (Throwable t) {
418-
LOG.log(
419-
getDebugLogLevel(), "Health check execution failed in batch", t);
420-
}
421-
}),
422-
nodeHealthCheckService);
423-
inFlight.add(fut);
424-
if (inFlight.size() >= maxConcurrentBatches) {
425-
CompletableFuture.allOf(inFlight.toArray(new CompletableFuture[0])).join();
426-
inFlight.clear();
427-
}
428-
}
429-
if (!inFlight.isEmpty()) {
430-
CompletableFuture.allOf(inFlight.toArray(new CompletableFuture[0])).join();
431-
}
421+
422+
// Process all batches with controlled parallelism
423+
batches.forEach(
424+
batch ->
425+
nodeHealthCheckExecutor.submit(
426+
() ->
427+
batch.parallelStream()
428+
.forEach(
429+
r -> {
430+
try {
431+
r.run();
432+
} catch (Throwable t) {
433+
LOG.log(
434+
getDebugLogLevel(),
435+
"Health check execution failed in batch",
436+
t);
437+
}
438+
})));
432439
}
433440

434441
private static List<List<Runnable>> partition(List<Runnable> list, int size) {

0 commit comments

Comments
 (0)