3232import java .util .Map ;
3333import java .util .Optional ;
3434import java .util .Set ;
35- import java .util .concurrent .CompletableFuture ;
3635import java .util .concurrent .ConcurrentHashMap ;
36+ import java .util .concurrent .ExecutorService ;
37+ import java .util .concurrent .Executors ;
3738import java .util .concurrent .ScheduledExecutorService ;
3839import java .util .concurrent .TimeUnit ;
3940import java .util .concurrent .locks .Lock ;
@@ -85,12 +86,15 @@ public class LocalNodeRegistry implements NodeRegistry {
8586 private final Map <NodeId , Runnable > allChecks = new ConcurrentHashMap <>();
8687 private final ReadWriteLock lock = new ReentrantReadWriteLock (/* fair */ true );
8788 private final ScheduledExecutorService nodeHealthCheckService ;
89+ private final ExecutorService nodeHealthCheckExecutor ;
8890 private final Duration purgeNodesInterval ;
8991 private final ScheduledExecutorService purgeDeadNodesService ;
92+ private final int newSessionThreadPoolSize ;
9093
9194 public LocalNodeRegistry (
9295 Tracer tracer ,
9396 EventBus bus ,
97+ int newSessionThreadPoolSize ,
9498 HttpClient .Factory clientFactory ,
9599 Secret registrationSecret ,
96100 Duration healthcheckInterval ,
@@ -106,6 +110,7 @@ public LocalNodeRegistry(
106110 Require .nonNull ("Node health check service" , nodeHealthCheckService );
107111 this .purgeNodesInterval = Require .nonNull ("Purge nodes interval" , purgeNodesInterval );
108112 this .purgeDeadNodesService = Require .nonNull ("Purge dead nodes service" , purgeDeadNodesService );
113+ this .newSessionThreadPoolSize = newSessionThreadPoolSize ;
109114
110115 this .model = new LocalGridModel (bus );
111116 this .nodes = new ConcurrentHashMap <>();
@@ -134,6 +139,16 @@ public LocalNodeRegistry(
134139 healthcheckInterval .toMillis (),
135140 TimeUnit .MILLISECONDS );
136141
142+ this .nodeHealthCheckExecutor =
143+ Executors .newFixedThreadPool (
144+ this .newSessionThreadPoolSize ,
145+ r -> {
146+ Thread t = new Thread (r );
147+ t .setName ("node-health-check-" + t .getId ());
148+ t .setDaemon (true );
149+ return t ;
150+ });
151+
137152 // Schedule node purging if interval is non-zero
138153 if (!this .purgeNodesInterval .isZero ()) {
139154 this .purgeDeadNodesService .scheduleAtFixedRate (
@@ -309,10 +324,9 @@ public void runHealthChecks() {
309324
310325 // Large deployments: process in parallel batches with controlled concurrency
311326 int batchSize = Math .max (10 , total / 10 );
312- int maxConcurrentBatches = Math .min (5 , Runtime .getRuntime ().availableProcessors ());
313327
314328 List <List <Runnable >> batches = partition (checks , batchSize );
315- processBatchesInParallel (batches , maxConcurrentBatches );
329+ processBatchesInParallel (batches );
316330 }
317331
318332 @ Override
@@ -400,35 +414,28 @@ public boolean isReady() {
400414 }
401415 }
402416
403- private void processBatchesInParallel (List <List <Runnable >> batches , int maxConcurrentBatches ) {
417+ private void processBatchesInParallel (List <List <Runnable >> batches ) {
404418 if (batches .isEmpty ()) {
405419 return ;
406420 }
407- List <CompletableFuture <Void >> inFlight = new ArrayList <>();
408- for (List <Runnable > batch : batches ) {
409- CompletableFuture <Void > fut =
410- CompletableFuture .runAsync (
411- () ->
412- batch .parallelStream ()
413- .forEach (
414- r -> {
415- try {
416- r .run ();
417- } catch (Throwable t ) {
418- LOG .log (
419- getDebugLogLevel (), "Health check execution failed in batch" , t );
420- }
421- }),
422- nodeHealthCheckService );
423- inFlight .add (fut );
424- if (inFlight .size () >= maxConcurrentBatches ) {
425- CompletableFuture .allOf (inFlight .toArray (new CompletableFuture [0 ])).join ();
426- inFlight .clear ();
427- }
428- }
429- if (!inFlight .isEmpty ()) {
430- CompletableFuture .allOf (inFlight .toArray (new CompletableFuture [0 ])).join ();
431- }
421+
422+ // Submit each batch to the fixed-size health-check pool (fire-and-forget; pool size caps concurrent batches)
423+ batches .forEach (
424+ batch ->
425+ nodeHealthCheckExecutor .submit (
426+ () ->
427+ batch .parallelStream ()
428+ .forEach (
429+ r -> {
430+ try {
431+ r .run ();
432+ } catch (Throwable t ) {
433+ LOG .log (
434+ getDebugLogLevel (),
435+ "Health check execution failed in batch" ,
436+ t );
437+ }
438+ })));
432439 }
433440
434441 private static List <List <Runnable >> partition (List <Runnable > list , int size ) {
0 commit comments