1111import java .util .concurrent .Executors ;
1212import java .util .concurrent .Future ;
1313import java .util .concurrent .ScheduledExecutorService ;
14+ import java .util .concurrent .ThreadFactory ;
1415import java .util .concurrent .atomic .AtomicReference ;
1516import java .util .function .Supplier ;
1617import java .util .stream .Collectors ;
1718import java .util .stream .Stream ;
1819
1920import com .google .common .base .Preconditions ;
21+ import com .google .common .util .concurrent .ThreadFactoryBuilder ;
2022import org .slf4j .Logger ;
2123import org .slf4j .LoggerFactory ;
2224
3537import tech .ydb .core .Status ;
3638import tech .ydb .core .StatusCode ;
3739
38- // TODO: документцаия / логгирование / рекомендации по коду
40+ /**
41+ * A distributed leader election implementation using coordination services.
42+ * This class provides a mechanism for multiple instances to compete for leadership
43+ * of a named resource, with exactly one instance becoming the leader at any time.
44+ *
45+ * <p>The election process uses a semaphore-based approach where:
46+ * <ul>
47+ * <li>The leader holds the semaphore lock</li>
48+ * <li>Other participants wait in a queue</li>
49+ * <li>Leadership can be voluntarily released or lost due to session issues</li>
50+ * </ul>
51+ *
52+ * <p>Thread safety: This class is thread-safe. All public methods can be called
53+ * from multiple threads concurrently.
54+ */
3955public class LeaderElection implements Closeable , SessionListenableProvider {
4056 private static final Logger logger = LoggerFactory .getLogger (LeaderElection .class );
57+ private static final ThreadFactory threadFactory = new ThreadFactoryBuilder ()
58+ .setNameFormat ("ydb-leader-election-%d" )
59+ .setDaemon (true )
60+ .build ();
4161 private static final long MAX_LEASE = 1L ;
4262
4363 private final LeaderElectionListener leaderElectionListener ;
@@ -68,6 +88,15 @@ private enum State {
6888 CLOSED
6989 }
7090
91+ /**
92+ * Creates a new LeaderElection instance with default settings.
93+ *
94+ * @param client the coordination client to use
95+ * @param coordinationNodePath path to the coordination node
96+ * @param electionName name of the election (must be unique per coordination node)
97+ * @param data optional data to associate with the leader (visible to all participants)
98+ * @param leaderElectionListener callback for leadership events
99+ */
71100 public LeaderElection (
72101 CoordinationClient client ,
73102 String coordinationNodePath ,
@@ -86,6 +115,17 @@ public LeaderElection(
86115 );
87116 }
88117
118+ /**
119+ * Creates a new LeaderElection instance with custom settings.
120+ *
121+ * @param client the coordination client to use
122+ * @param coordinationNodePath path to the coordination node
123+ * @param electionName name of the election (must be unique per coordination node)
124+ * @param data optional data to associate with the leader (visible to all participants)
125+ * @param leaderElectionListener callback for leadership events
126+ * @param settings configuration settings for the election process
127+ * @throws NullPointerException if any required parameter is null
128+ */
89129 public LeaderElection (
90130 CoordinationClient client ,
91131 String coordinationNodePath ,
@@ -94,21 +134,28 @@ public LeaderElection(
94134 LeaderElectionListener leaderElectionListener ,
95135 LeaderElectionSettings settings
96136 ) {
137+ Preconditions .checkNotNull (client , "CoordinationClient cannot be null" );
138+ Preconditions .checkNotNull (coordinationNodePath , "Coordination node path cannot be null" );
139+ Preconditions .checkNotNull (electionName , "Election name cannot be null" );
140+ Preconditions .checkNotNull (leaderElectionListener , "LeaderElectionListener cannot be null" );
141+ Preconditions .checkNotNull (settings , "LeaderElectionSettings cannot be null" );
142+
97143 this .coordinationNodePath = coordinationNodePath ;
98144 this .electionName = electionName ;
99145 this .data = data ;
100146 this .leaderElectionListener = leaderElectionListener ;
101147 this .scheduledExecutor = settings .getScheduledExecutor ();
102- this .blockingExecutor = Executors .newSingleThreadExecutor (); // TODO: thread factory
148+ this .blockingExecutor = Executors .newSingleThreadExecutor (threadFactory );
103149 this .retryPolicy = settings .getRetryPolicy ();
104150
105151 this .coordinationSession = client .createSession (coordinationNodePath );
106152 this .sessionListenable = new ListenableContainer <>();
107153 coordinationSession .addStateListener (sessionState -> {
108- if (sessionState == CoordinationSession .State .LOST || sessionState == CoordinationSession .State .CLOSED ) {
154+ if (!state .get ().equals (State .CLOSED ) && (sessionState == CoordinationSession .State .LOST ||
155+ sessionState == CoordinationSession .State .CLOSED )) {
109156 logger .error ("Coordination session unexpectedly changed to {} state, marking election as FAILED" ,
110157 sessionState );
111- state . set (State .FAILED );
158+ stopInternal (State .FAILED );
112159 }
113160 sessionListenable .notifyListeners (sessionState );
114161 });
@@ -127,6 +174,11 @@ public LeaderElection(
127174 );
128175 }
129176
177+ /**
178+ * Starts the leader election process.
179+ *
180+ * @throws IllegalStateException if the election is already started or closed
181+ */
130182 public void start () {
131183 Preconditions .checkState (
132184 state .compareAndSet (State .INITIAL , State .STARTING ),
@@ -159,9 +211,7 @@ public void start() {
159211 return semaphoreStatus ;
160212 }).exceptionally (ex -> {
161213 logger .error ("Leader election initializing task failed" , ex );
162- state .set (State .FAILED );
163- semaphoreObserver .close ();
164- startingLatch .countDown ();
214+ stopInternal (State .FAILED );
165215 return Status .of (StatusCode .CLIENT_INTERNAL_ERROR );
166216 });
167217
@@ -176,20 +226,30 @@ private CompletableFuture<Status> executeWithRetry(Supplier<CompletableFuture<St
176226 return new RetryableTask ("leaderElectionInitialize" , taskSupplier , scheduledExecutor , retryPolicy ).execute ();
177227 }
178228
229+ /**
230+ * Enables automatic requeueing when leadership is lost.
231+ * If called before start election will be started immediately.
232+ */
179233 public void autoRequeue () {
180234 autoRequeue = true ;
181235 }
182236
237+ /**
238+ * Checks if this instance is currently the leader.
239+ *
240+ * @return true if this instance is the leader, false otherwise
241+ */
183242 public boolean isLeader () {
184243 return isLeader ;
185244 }
186245
187246 /**
188247 * Re-queue an attempt for leadership. If this instance is already queued, nothing
189248 * happens and false is returned. If the instance was not queued, it is re-queued and true
190- * is returned
249+ * is returned.
191250 *
192- * @return true if re-enqueue was successful
251+ * @return true if re-queue was successful
252+ * @throws IllegalStateException if the election is not in STARTED or STARTING state
193253 */
194254 public boolean requeue () {
195255 State localState = state .get ();
@@ -201,6 +261,11 @@ public boolean requeue() {
201261 return enqueueElection ();
202262 }
203263
264+ /**
265+ * Interrupts the current leadership attempt if one is in progress.
266+ *
267+ * @return true if leadership was interrupted, false if no attempt was in progress
268+ */
204269 public synchronized boolean interruptLeadership () {
205270 Future <?> localTask = electionTask ;
206271 if (localTask != null ) {
@@ -231,11 +296,16 @@ public Void call() throws Exception {
231296 return false ;
232297 }
233298
299+ /**
300+ * Main work loop for leadership acquisition and maintenance.
301+ *
302+ * @throws Exception if the leadership attempt fails
303+ */
234304 private void doWork () throws Exception {
235305 isLeader = false ;
236306
237307 try {
238- waitStartedState ();
308+ waitStartedStateOrFail ();
239309 lock .tryAcquire (
240310 null ,
241311 true ,
@@ -248,7 +318,7 @@ private void doWork() throws Exception {
248318 Thread .currentThread ().interrupt ();
249319 throw e ;
250320 } catch (Throwable e ) {
251- logger .debug ( "takeLeadership exception " , e );
321+ logger .error ( "Unexpected error in takeLeadership " , e );
252322 }
253323 } catch (InterruptedException e ) {
254324 Thread .currentThread ().interrupt ();
@@ -270,7 +340,7 @@ private void doWork() throws Exception {
270340 }
271341 }
272342
273- private void waitStartedState () throws InterruptedException {
343+ private void waitStartedStateOrFail () throws InterruptedException {
274344 State localState = state .get ();
275345 if (localState == State .STARTING ) {
276346 startingLatch .await ();
@@ -295,9 +365,10 @@ private boolean isQueued() {
295365 }
296366
297367 /**
298- * Не гарантированы все, кроме лидера
368+ * Gets all participants in the election.
369+ * Note: Due to observer limitations, waiters may be visible only eventually (after lease changes).
299370 *
300- * @return
371+ * @return list of election participants (owners and visible waiters)
301372 */
302373 public List <ElectionParticipant > getParticipants () {
303374 SemaphoreDescription semaphoreDescription = semaphoreObserver .getCachedData ();
@@ -313,6 +384,11 @@ public List<ElectionParticipant> getParticipants() {
313384 ).collect (Collectors .toList ());
314385 }
315386
387+ /**
388+ * Gets the current leader if one exists.
389+ *
390+ * @return Optional containing the current leader, or empty if no leader exists
391+ */
316392 public Optional <ElectionParticipant > getCurrentLeader () {
317393 SemaphoreDescription semaphoreDescription = semaphoreObserver .getCachedData ();
318394 if (semaphoreDescription == null ) {
@@ -336,18 +412,59 @@ public Listenable<CoordinationSession.State> getSessionListenable() {
336412 return sessionListenable ;
337413 }
338414
415+ /**
416+ * Closes the leader election and releases all resources.
417+ * After closing, the instance cannot be reused.
418+ */
339419 @ Override
340420 public synchronized void close () {
341- // TODO: Учесть все стейты
342- Preconditions .checkState (state .compareAndSet (State .STARTED , State .CLOSED ), "Already closed" );
421+ stopInternal (State .CLOSED );
422+ }
423+
424+ /**
425+ * Internal method to stop the election with the specified termination state.
426+ *
427+ * @param terminationState the state to transition to (FAILED or CLOSED)
428+ * @return true if the state was changed, false if already terminated
429+ */
430+ private synchronized boolean stopInternal (State terminationState ) {
431+ State localState = state .get ();
432+ if (localState == State .FAILED || localState == State .CLOSED ) {
433+ logger .warn ("Already stopped leader election {} with status: {}" , electionName , localState );
434+ return false ;
435+ }
436+ logger .debug ("Transitioning leader election {} from {} to {}" , electionName , localState , terminationState );
437+
438+ // change state
439+ state .set (terminationState );
343440
441+ // unblock starting latch if not yet
442+ startingLatch .countDown ();
443+
444+ // stop tasks
445+ Future <Status > localInitializingTask = initializingTask .get ();
446+ if (localInitializingTask != null ) {
447+ localInitializingTask .cancel (true );
448+ initializingTask .set (null );
449+ }
344450 Future <Void > localTask = electionTask ;
345451 if (localTask != null ) {
346452 localTask .cancel (true );
347453 electionTask = null ;
348454 }
349455
350- blockingExecutor .shutdown ();
351- semaphoreObserver .close ();
456+ // Clean up resources
457+ try {
458+ semaphoreObserver .close ();
459+ } catch (Exception e ) {
460+ logger .warn ("Error closing semaphore observer for {}: {}" , electionName , e .getMessage ());
461+ }
462+
463+ try {
464+ blockingExecutor .shutdown ();
465+ } catch (Exception e ) {
466+ logger .warn ("Error shutting down executor for {}: {}" , electionName , e .getMessage ());
467+ }
468+ return true ;
352469 }
353470}
0 commit comments