1010import java .io .IOException ;
1111import java .net .InetSocketAddress ;
1212import java .time .Instant ;
13- import java .util .Objects ;
1413import java .util .concurrent .TimeUnit ;
1514import java .util .stream .Collectors ;
1615import javax .annotation .Nullable ;
@@ -42,12 +41,6 @@ public class Pauser {
4241 private final TargetSelector targetSelector ;
4342 private Instant startTime ;
4443 private Instant endTime ;
45- private PauseFailedException pauseFailedException ;
46- private UnpauseFailedException unpauseFailedException ;
47- private StatusCheckFailedException statusCheckFailedException ;
48- private boolean pauseSuccessful = false ;
49- private boolean unpauseSuccessful = false ;
50- private boolean compareTargetSuccessful = false ;
5144
5245 /**
5346 * @param namespace The namespace where the pods are deployed.
@@ -105,16 +98,17 @@ public PausedDuration pause(int pauseDuration, @Nullable Long maxPauseWaitTime)
10598 }
10699
107100 // Run pause operation.
101+ PauseFailedException pauseFailedException = null ;
108102 try {
109- pauseSuccessful = pauseInternal (requestCoordinator , pauseDuration , maxPauseWaitTime );
103+ pauseInternal (requestCoordinator , pauseDuration , maxPauseWaitTime );
110104 } catch (Exception e ) {
111105 pauseFailedException = new PauseFailedException ("Pause operation failed." , e );
112106 }
113107
114108 // Run unpause operation.
109+ UnpauseFailedException unpauseFailedException = null ;
115110 try {
116- unpauseSuccessful =
117- unpauseWithRetry (requestCoordinator , MAX_UNPAUSE_RETRY_COUNT , targetBeforePause );
111+ unpauseWithRetry (requestCoordinator , MAX_UNPAUSE_RETRY_COUNT );
118112 } catch (Exception e ) {
119113 unpauseFailedException = new UnpauseFailedException ("Unpause operation failed." , e );
120114 }
@@ -130,82 +124,82 @@ public PausedDuration pause(int pauseDuration, @Nullable Long maxPauseWaitTime)
130124 e );
131125 }
132126
127+ // Prepare error messages for each process.
128+ String unpauseErrorMessage =
129+ String .format (
130+ "Unpause operation failed. Scalar products might still be in a paused state. You"
131+ + " must restart related pods by using the `kubectl rollout restart deployment"
132+ + " %s` command to unpause all pods. " ,
133+ targetBeforePause .getDeployment ().getMetadata ().getName ());
134+ String pauseErrorMessage =
135+ "Pause operation failed. You cannot use the backup that was taken during this pause"
136+ + " duration. You need to retry the pause operation from the beginning to"
137+ + " take a backup. " ;
138+ String statusCheckErrorMessage =
139+ "Status check failed. You cannot use the backup that was taken during this pause"
140+ + " duration. You need to retry the pause operation from the beginning to"
141+ + " take a backup. " ;
142+ String statusDifferentErrorMessage =
143+ "The target pods were updated during the pause duration. You cannot use the backup that"
144+ + " was taken during this pause duration. " ;
145+ String unpauseFailedButBackupOkErrorMessage =
146+ String .format (
147+ "Note that the pause operations for taking backup succeeded. You can use a backup that"
148+ + " was taken during this pause duration: Start Time = %s, End Time = %s. " ,
149+ startTime , endTime );
150+
133151 // Check if pods and deployment information are the same between before pause and after pause.
152+ boolean compareTargetSuccessful ;
134153 try {
135- compareTargetSuccessful = compareTargetStates (targetBeforePause , targetAfterPause );
154+ compareTargetSuccessful = compareTargetStatus (targetBeforePause , targetAfterPause );
136155 } catch (Exception e ) {
137- statusCheckFailedException = new StatusCheckFailedException ("Status check failed." , e );
156+ if (unpauseFailedException == null ) {
157+ throw new StatusCheckFailedException (statusCheckErrorMessage , e );
158+ } else {
159+ throw new UnpauseFailedException (unpauseErrorMessage , e );
160+ }
138161 }
139162
140163 // If both the pause operation and status check succeeded, you can use the backup that was taken
141164 // during the pause duration.
142- boolean backupOk = pauseSuccessful && compareTargetSuccessful ;
165+ boolean backupOk = (pauseFailedException == null ) && compareTargetSuccessful ;
166+
167+ // Create error message if any of the operations failed.
168+ StringBuilder errorMessageBuilder = new StringBuilder ();
169+ if (unpauseFailedException != null ) {
170+ errorMessageBuilder .append (unpauseErrorMessage );
171+ if (backupOk ) {
172+ errorMessageBuilder .append (unpauseFailedButBackupOkErrorMessage );
173+ }
174+ }
175+ if (pauseFailedException != null ) {
176+ errorMessageBuilder .append (pauseErrorMessage );
177+ }
178+ if (!compareTargetSuccessful ) {
179+ errorMessageBuilder .append (statusDifferentErrorMessage );
180+ }
181+ String errorMessage = errorMessageBuilder .toString ();
143182
144183 // Return the final result based on each process.
145- if (backupOk ) { // Backup OK
146- if (unpauseSuccessful ) { // Backup OK and Unpause OK
147- return new PausedDuration (startTime , endTime );
148- } else { // Backup OK but Unpause NG
149- String errorMessage =
150- String .format (
151- "Unpause operation failed. Scalar products might still be in a paused state. You"
152- + " must restart related pods by using the `kubectl rollout restart deployment"
153- + " %s` command to unpause all pods. However, the pause operations for taking"
154- + " backup succeeded. You can use a backup that was taken during this pause"
155- + " duration: Start Time = %s, End Time = %s." ,
156- Objects .requireNonNull (targetBeforePause .getDeployment ().getMetadata ()).getName (),
157- startTime ,
158- endTime );
159- // Users who directly utilize this library, bypassing our CLI, are responsible for proper
160- // exception handling. However, this scenario represents a critical issue. Consequently,
161- // we output the error message here regardless of whether the calling code handles the
162- // exception.
163- logger .error (errorMessage );
164- throw new UnpauseFailedException (errorMessage , unpauseFailedException );
165- }
166- } else { // Backup NG
167- if (unpauseSuccessful ) { // Backup NG but Unpause OK
168- if (!pauseSuccessful ) { // Backup NG (Pause operation failed) but Unpause OK
169- String errorMessage =
170- String .format (
171- "Pause operation failed. You cannot use the backup that was taken during this"
172- + " pause duration. You need to retry the pause operation from the beginning"
173- + " to take a backup." );
174- throw new PauseFailedException (errorMessage , pauseFailedException );
175- } else { // Backup NG (Status check failed) but Unpause OK
176- String errorMessage =
177- String .format (
178- "Status check failed. You cannot use the backup that was taken during this pause"
179- + " duration. You need to retry the pause operation from the beginning to"
180- + " take a backup." );
181- throw new StatusCheckFailedException (errorMessage , statusCheckFailedException );
182- }
183- } else { // Backup NG and Unpause NG
184- String errorMessage =
185- String .format (
186- "Pause and unpause operation failed. Scalar products might still be in a paused"
187- + " state. You must restart related pods by using the `kubectl rollout restart"
188- + " deployment %s` command to unpause all pods." ,
189- Objects .requireNonNull (targetBeforePause .getDeployment ().getMetadata ()).getName ());
190- // Users who directly utilize this library, bypassing our CLI, are responsible for proper
191- // exception handling. However, this scenario represents a critical issue. Consequently,
192- // we output the error message here regardless of whether the calling code handles the
193- // exception.
194- logger .error (errorMessage );
195- throw new UnpauseFailedException (errorMessage , unpauseFailedException );
196- }
184+ if (unpauseFailedException != null ) { // Unpause issue is the most critical.
185+ throw new UnpauseFailedException (errorMessage , unpauseFailedException );
186+ } else if (pauseFailedException
187+ != null ) { // Pause issue might be caused by configuration error.
188+ throw new PauseFailedException (errorMessage , pauseFailedException );
189+ } else if (!compareTargetSuccessful ) { // Status check issue might be caused by temporary issue.
190+ throw new PauseFailedException (errorMessage );
191+ } else { // All operations succeeded.
192+ return new PausedDuration (startTime , endTime );
197193 }
198194 }
199195
200196 @ VisibleForTesting
201- boolean unpauseWithRetry (RequestCoordinator coordinator , int maxRetryCount , TargetSnapshot target )
202- throws PauserException {
197+ void unpauseWithRetry (RequestCoordinator coordinator , int maxRetryCount ) {
203198 int retryCounter = 0 ;
204-
205199 while (true ) {
206200 try {
207201 coordinator .unpause ();
208- return true ;
202+ return ;
209203 } catch (Exception e ) {
210204 if (++retryCounter >= maxRetryCount ) {
211205 throw e ;
@@ -227,18 +221,16 @@ RequestCoordinator getRequestCoordinator(TargetSnapshot target) {
227221 .collect (Collectors .toList ()));
228222 }
229223
230- private boolean pauseInternal (
224+ void pauseInternal (
231225 RequestCoordinator requestCoordinator , int pauseDuration , @ Nullable Long maxPauseWaitTime ) {
232226
233227 requestCoordinator .pause (true , maxPauseWaitTime );
234228 startTime = Instant .now ();
235229 Uninterruptibles .sleepUninterruptibly (pauseDuration , TimeUnit .MILLISECONDS );
236230 endTime = Instant .now ();
237-
238- return true ;
239231 }
240232
241- private boolean compareTargetStates (TargetSnapshot before , TargetSnapshot after ) {
233+ boolean compareTargetStatus (TargetSnapshot before , TargetSnapshot after ) {
242234 return before .getStatus ().equals (after .getStatus ());
243235 }
244236}
0 commit comments